From 31823e8179d276943bfa8eca0beb55cb8f324e63 Mon Sep 17 00:00:00 2001 From: Rohit Subramanian <244568360+rosubra@users.noreply.github.com> Date: Sat, 21 Feb 2026 00:20:06 +0100 Subject: [PATCH] =?UTF-8?q?Fix=20all=20issues=20=F0=9F=94=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 8 - BUILDING_LINUX.txt | 72 - Cargo.toml | 35 - DESIGN_DOC.md | 385 -- LICENSE | 121 - README.md | 209 +- current_tasks/.keep | 0 .../fix_arm_asm_caspal_instruction.txt | 20 - .../fix_arm_asm_global_branch_relocs.txt | 19 - current_tasks/fix_arm_asm_org_directive.txt | 16 - .../fix_arm_asm_quad_prel64_relocation.txt | 19 - .../fix_arm_movw_symbolic_relocations.txt | 19 - current_tasks/fix_dash.txt | 3 - .../fix_i686_double_param_high_word_store.txt | 12 - .../fix_macro_param_prefix_substitution.txt | 16 - current_tasks/fix_pcre2_stack_frame_bloat.txt | 16 - .../fix_riscv_va_arg_long_double_struct.txt | 11 - .../fix_x86_asm_ifnb_ifb_conditional.txt | 25 - .../fix_x86_standalone_kernel_link_errors.txt | 13 - ...implement_string_literal_deduplication.txt | 18 - ideas/.keep | 0 ideas/cleanup_i686_dead_peephole_passes.txt | 24 - ideas/docs_verified_2026_01_29.txt | 140 - ideas/fix_arm_defconfig_kvm_va_layout.txt | 34 - .../fix_ir_unsigned_subint_representation.txt | 21 - ideas/fix_riscv_kernel_boot_hang.txt | 23 - ideas/fix_riscv_va_arg_runtime.txt | 14 - ideas/fix_x86_full_f128_80bit_precision.txt | 43 - ideas/high_codegen_runtime_perf.txt | 37 - ideas/high_compile_speed_improvements.txt | 49 - ideas/high_i686_boot_code_size_reduction.txt | 54 - ideas/high_sema_expansion_typed_ast.txt | 33 - ideas/high_use_def_chains.txt | 46 - ideas/high_value_location_abstraction.txt | 24 - ideas/low_structured_error_infrastructure.txt | 137 - ideas/new_projects.txt | 19 - ideas/new_projects_myasm.txt | 290 -- ideas/optimization_passes_future.txt | 29 - ideas/qemu_build_support.txt | 71 - .../reduce_stack_frame_size_for_postgres.txt | 31 - 
ideas/register_allocator.txt | 23 - include/arm_neon.h | 3122 --------------- include/avx2intrin.h | 1195 ------ include/avx512fintrin.h | 499 --- include/avxintrin.h | 460 --- include/bmi2intrin.h | 93 - include/emmintrin.h | 1653 -------- include/fmaintrin.h | 75 - include/immintrin.h | 73 - include/mmintrin.h | 852 ----- include/nmmintrin.h | 7 - include/pmmintrin.h | 61 - include/shaintrin.h | 232 -- include/smmintrin.h | 350 -- include/tmmintrin.h | 279 -- include/wmmintrin.h | 49 - include/x86intrin.h | 158 - include/xmmintrin.h | 1287 ------- projects/cleanup_code_quality.txt | 80 - src/backend/README.md | 1271 ------- src/backend/arm/README.md | 45 - src/backend/arm/asm_stub.sh | 7 - src/backend/arm/assembler/README.md | 512 --- src/backend/arm/assembler/elf_writer.rs | 788 ---- src/backend/arm/assembler/encoder/bitfield.rs | 246 -- .../arm/assembler/encoder/compare_branch.rs | 322 -- .../arm/assembler/encoder/data_processing.rs | 1067 ------ .../arm/assembler/encoder/fp_scalar.rs | 271 -- .../arm/assembler/encoder/load_store.rs | 966 ----- src/backend/arm/assembler/encoder/mod.rs | 993 ----- src/backend/arm/assembler/encoder/neon.rs | 1854 --------- src/backend/arm/assembler/encoder/system.rs | 613 --- src/backend/arm/assembler/mod.rs | 381 -- src/backend/arm/assembler/parser.rs | 2644 ------------- src/backend/arm/codegen/README.md | 795 ---- src/backend/arm/codegen/alu.rs | 229 -- src/backend/arm/codegen/asm_emitter.rs | 525 --- src/backend/arm/codegen/atomics.rs | 146 - src/backend/arm/codegen/calls.rs | 263 -- src/backend/arm/codegen/cast_ops.rs | 139 - src/backend/arm/codegen/comparison.rs | 75 - src/backend/arm/codegen/emit.rs | 1999 ---------- src/backend/arm/codegen/f128.rs | 334 -- src/backend/arm/codegen/float_ops.rs | 43 - src/backend/arm/codegen/globals.rs | 30 - src/backend/arm/codegen/i128_ops.rs | 322 -- src/backend/arm/codegen/inline_asm.rs | 362 -- src/backend/arm/codegen/intrinsics.rs | 297 -- src/backend/arm/codegen/memory.rs | 252 -- 
src/backend/arm/codegen/mod.rs | 18 - src/backend/arm/codegen/peephole.rs | 1223 ------ src/backend/arm/codegen/prologue.rs | 339 -- src/backend/arm/codegen/returns.rs | 106 - src/backend/arm/codegen/variadic.rs | 359 -- src/backend/arm/ld_stub.sh | 7 - src/backend/arm/linker/README.md | 633 ---- src/backend/arm/linker/elf.rs | 75 - src/backend/arm/linker/emit_dynamic.rs | 868 ----- src/backend/arm/linker/emit_shared.rs | 1100 ------ src/backend/arm/linker/emit_static.rs | 645 ---- src/backend/arm/linker/input.rs | 91 - src/backend/arm/linker/link.rs | 410 -- src/backend/arm/linker/mod.rs | 42 - src/backend/arm/linker/plt_got.rs | 130 - src/backend/arm/linker/reloc.rs | 540 --- src/backend/arm/linker/types.rs | 93 - src/backend/arm/mod.rs | 7 - src/backend/asm_expr.rs | 409 -- src/backend/asm_preprocess.rs | 1434 ------- src/backend/call_abi.rs | 870 ----- src/backend/cast.rs | 261 -- src/backend/common.rs | 1742 --------- src/backend/elf/archive.rs | 241 -- src/backend/elf/constants.rs | 187 - src/backend/elf/io.rs | 204 - src/backend/elf/linker_symbols.rs | 153 - src/backend/elf/mod.rs | 81 - src/backend/elf/numeric_labels.rs | 285 -- src/backend/elf/object_writer.rs | 554 --- src/backend/elf/parse_string.rs | 98 - src/backend/elf/section_flags.rs | 70 - src/backend/elf/string_table.rs | 53 - src/backend/elf/symbol_table.rs | 183 - src/backend/elf/writer_base.rs | 674 ---- src/backend/elf_writer_common.rs | 1700 --------- src/backend/f128_softfloat.rs | 786 ---- src/backend/generation.rs | 1406 ------- src/backend/i686/README.md | 44 - src/backend/i686/assembler/README.md | 440 --- src/backend/i686/assembler/elf_writer.rs | 170 - src/backend/i686/assembler/encoder/core.rs | 180 - .../i686/assembler/encoder/gp_integer.rs | 1390 ------- src/backend/i686/assembler/encoder/mod.rs | 770 ---- .../i686/assembler/encoder/registers.rs | 138 - src/backend/i686/assembler/encoder/sse.rs | 385 -- src/backend/i686/assembler/encoder/system.rs | 379 -- 
src/backend/i686/assembler/encoder/x87.rs | 279 -- src/backend/i686/assembler/mod.rs | 31 - src/backend/i686/codegen/README.md | 707 ---- src/backend/i686/codegen/alu.rs | 135 - src/backend/i686/codegen/asm_emitter.rs | 672 ---- src/backend/i686/codegen/atomics.rs | 163 - src/backend/i686/codegen/calls.rs | 216 -- src/backend/i686/codegen/casts.rs | 601 --- src/backend/i686/codegen/comparison.rs | 216 -- src/backend/i686/codegen/emit.rs | 2274 ----------- src/backend/i686/codegen/float_ops.rs | 18 - src/backend/i686/codegen/globals.rs | 43 - src/backend/i686/codegen/i128_ops.rs | 450 --- src/backend/i686/codegen/inline_asm.rs | 113 - src/backend/i686/codegen/intrinsics.rs | 606 --- src/backend/i686/codegen/memory.rs | 506 --- src/backend/i686/codegen/mod.rs | 17 - src/backend/i686/codegen/peephole.rs | 1947 ---------- src/backend/i686/codegen/prologue.rs | 662 ---- src/backend/i686/codegen/returns.rs | 87 - src/backend/i686/codegen/variadic.rs | 71 - src/backend/i686/linker/README.md | 566 --- src/backend/i686/linker/dynsym.rs | 310 -- src/backend/i686/linker/emit.rs | 1248 ------ src/backend/i686/linker/gnu_hash.rs | 76 - src/backend/i686/linker/input.rs | 362 -- src/backend/i686/linker/link.rs | 216 -- src/backend/i686/linker/mod.rs | 52 - src/backend/i686/linker/parse.rs | 195 - src/backend/i686/linker/reloc.rs | 302 -- src/backend/i686/linker/sections.rs | 131 - src/backend/i686/linker/shared.rs | 1057 ------ src/backend/i686/linker/symbols.rs | 356 -- src/backend/i686/linker/types.rs | 332 -- src/backend/i686/mod.rs | 7 - src/backend/inline_asm.rs | 1122 ------ src/backend/linker_common/README.md | 52 - src/backend/linker_common/archive.rs | 225 -- src/backend/linker_common/args.rs | 121 - src/backend/linker_common/check.rs | 38 - src/backend/linker_common/dynamic.rs | 229 -- src/backend/linker_common/dynstr.rs | 39 - src/backend/linker_common/eh_frame.rs | 398 -- src/backend/linker_common/gc_sections.rs | 114 - src/backend/linker_common/hash.rs | 27 - 
src/backend/linker_common/merge.rs | 158 - src/backend/linker_common/mod.rs | 128 - src/backend/linker_common/parse_object.rs | 178 - src/backend/linker_common/parse_shared.rs | 483 --- src/backend/linker_common/resolve_lib.rs | 37 - src/backend/linker_common/section_map.rs | 25 - src/backend/linker_common/symbols.rs | 147 - src/backend/linker_common/types.rs | 87 - src/backend/linker_common/write.rs | 64 - src/backend/liveness.rs | 1211 ------ src/backend/mod.rs | 387 -- src/backend/peephole_common.rs | 217 -- src/backend/regalloc.rs | 573 --- src/backend/riscv/README.md | 44 - src/backend/riscv/asm_stub.sh | 7 - src/backend/riscv/assembler/README.md | 525 --- src/backend/riscv/assembler/compress.rs | 848 ----- src/backend/riscv/assembler/elf_writer.rs | 1422 ------- .../riscv/assembler/encoder/atomics.rs | 106 - src/backend/riscv/assembler/encoder/base.rs | 320 -- .../riscv/assembler/encoder/compressed.rs | 196 - src/backend/riscv/assembler/encoder/float.rs | 191 - src/backend/riscv/assembler/encoder/mod.rs | 926 ----- src/backend/riscv/assembler/encoder/pseudo.rs | 608 --- src/backend/riscv/assembler/encoder/system.rs | 115 - src/backend/riscv/assembler/encoder/vector.rs | 216 -- src/backend/riscv/assembler/mod.rs | 101 - src/backend/riscv/assembler/parser.rs | 1062 ------ src/backend/riscv/codegen/README.md | 926 ----- src/backend/riscv/codegen/alu.rs | 77 - src/backend/riscv/codegen/asm_emitter.rs | 538 --- src/backend/riscv/codegen/atomics.rs | 581 --- src/backend/riscv/codegen/calls.rs | 493 --- src/backend/riscv/codegen/cast_ops.rs | 207 - src/backend/riscv/codegen/comparison.rs | 143 - src/backend/riscv/codegen/emit.rs | 759 ---- src/backend/riscv/codegen/f128.rs | 355 -- src/backend/riscv/codegen/float_ops.rs | 53 - src/backend/riscv/codegen/globals.rs | 37 - src/backend/riscv/codegen/i128_ops.rs | 354 -- src/backend/riscv/codegen/inline_asm.rs | 266 -- src/backend/riscv/codegen/intrinsics.rs | 599 --- src/backend/riscv/codegen/memory.rs | 252 -- 
src/backend/riscv/codegen/mod.rs | 18 - src/backend/riscv/codegen/peephole.rs | 1309 ------- src/backend/riscv/codegen/prologue.rs | 672 ---- src/backend/riscv/codegen/returns.rs | 91 - src/backend/riscv/codegen/variadic.rs | 87 - src/backend/riscv/ld_stub.sh | 7 - src/backend/riscv/linker/README.md | 689 ---- src/backend/riscv/linker/elf_read.rs | 90 - src/backend/riscv/linker/emit_exec.rs | 1169 ------ src/backend/riscv/linker/emit_shared.rs | 1047 ----- src/backend/riscv/linker/input.rs | 406 -- src/backend/riscv/linker/link.rs | 282 -- src/backend/riscv/linker/mod.rs | 39 - src/backend/riscv/linker/reloc.rs | 584 --- src/backend/riscv/linker/relocations.rs | 696 ---- src/backend/riscv/linker/sections.rs | 114 - src/backend/riscv/linker/symbols.rs | 267 -- src/backend/riscv/mod.rs | 7 - src/backend/stack_layout/README.md | 67 - src/backend/stack_layout/alloca_coalescing.rs | 247 -- src/backend/stack_layout/analysis.rs | 148 - src/backend/stack_layout/copy_coalescing.rs | 310 -- src/backend/stack_layout/inline_asm.rs | 189 - src/backend/stack_layout/mod.rs | 313 -- src/backend/stack_layout/regalloc_helpers.rs | 82 - src/backend/stack_layout/slot_assignment.rs | 889 ----- src/backend/state.rs | 476 --- src/backend/traits.rs | 1661 -------- src/backend/x86/README.md | 65 - src/backend/x86/asm_stub.sh | 7 - src/backend/x86/assembler/README.md | 791 ---- src/backend/x86/assembler/elf_writer.rs | 92 - src/backend/x86/assembler/encoder/avx.rs | 1456 ------- src/backend/x86/assembler/encoder/core.rs | 269 -- .../x86/assembler/encoder/gp_integer.rs | 1151 ------ src/backend/x86/assembler/encoder/mod.rs | 1545 -------- .../x86/assembler/encoder/registers.rs | 296 -- src/backend/x86/assembler/encoder/sse.rs | 653 ---- src/backend/x86/assembler/encoder/system.rs | 527 --- src/backend/x86/assembler/encoder/x87_misc.rs | 385 -- src/backend/x86/assembler/mod.rs | 29 - src/backend/x86/assembler/parser.rs | 2180 ----------- src/backend/x86/codegen/README.md | 670 ---- 
src/backend/x86/codegen/alu.rs | 171 - src/backend/x86/codegen/asm_emitter.rs | 677 ---- src/backend/x86/codegen/atomics.rs | 93 - src/backend/x86/codegen/calls.rs | 354 -- src/backend/x86/codegen/cast_ops.rs | 135 - src/backend/x86/codegen/comparison.rs | 203 - src/backend/x86/codegen/emit.rs | 1463 ------- src/backend/x86/codegen/f128.rs | 624 --- src/backend/x86/codegen/float_ops.rs | 90 - src/backend/x86/codegen/globals.rs | 60 - src/backend/x86/codegen/i128_ops.rs | 291 -- src/backend/x86/codegen/inline_asm.rs | 168 - src/backend/x86/codegen/intrinsics.rs | 564 --- src/backend/x86/codegen/memory.rs | 411 -- src/backend/x86/codegen/mod.rs | 18 - src/backend/x86/codegen/peephole/README.md | 724 ---- src/backend/x86/codegen/peephole/mod.rs | 26 - .../codegen/peephole/passes/callee_saves.rs | 155 - .../codegen/peephole/passes/compare_branch.rs | 165 - .../peephole/passes/copy_propagation.rs | 227 -- .../x86/codegen/peephole/passes/dead_code.rs | 404 -- .../codegen/peephole/passes/frame_compact.rs | 315 -- .../x86/codegen/peephole/passes/helpers.rs | 293 -- .../codegen/peephole/passes/local_patterns.rs | 490 --- .../peephole/passes/loop_trampoline.rs | 517 --- .../codegen/peephole/passes/memory_fold.rs | 154 - .../x86/codegen/peephole/passes/mod.rs | 1204 ------ .../x86/codegen/peephole/passes/push_pop.rs | 172 - .../peephole/passes/store_forwarding.rs | 385 -- .../x86/codegen/peephole/passes/tail_call.rs | 423 --- src/backend/x86/codegen/peephole/types.rs | 1275 ------- src/backend/x86/codegen/prologue.rs | 421 -- src/backend/x86/codegen/returns.rs | 196 - src/backend/x86/codegen/variadic.rs | 378 -- src/backend/x86/ld_stub.sh | 7 - src/backend/x86/linker/README.md | 974 ----- src/backend/x86/linker/elf.rs | 84 - src/backend/x86/linker/emit_exec.rs | 1248 ------ src/backend/x86/linker/emit_shared.rs | 1196 ------ src/backend/x86/linker/input.rs | 78 - src/backend/x86/linker/link.rs | 391 -- src/backend/x86/linker/mod.rs | 30 - src/backend/x86/linker/plt_got.rs | 
141 - src/backend/x86/linker/types.rs | 89 - src/backend/x86/mod.rs | 7 - src/backend/x86_common.rs | 399 -- src/bin/ccc_arm.rs | 3 - src/bin/ccc_i686.rs | 3 - src/bin/ccc_riscv.rs | 3 - src/bin/ccc_x86.rs | 3 - src/common/README.md | 832 ---- src/common/asm_constraints.rs | 38 - src/common/const_arith.rs | 602 --- src/common/const_eval.rs | 346 -- src/common/encoding.rs | 185 - src/common/error.rs | 903 ----- src/common/fx_hash.rs | 81 - src/common/long_double.rs | 3369 ----------------- src/common/mod.rs | 12 - src/common/source.rs | 596 --- src/common/symbol_table.rs | 66 - src/common/temp_files.rs | 94 - src/common/type_builder.rs | 351 -- src/common/types.rs | 1890 --------- src/driver/README.md | 705 ---- src/driver/cli.rs | 718 ---- src/driver/external_tools.rs | 353 -- src/driver/file_types.rs | 112 - src/driver/mod.rs | 6 - src/driver/pipeline.rs | 1140 ------ src/frontend/README.md | 434 --- src/frontend/lexer/README.md | 387 -- src/frontend/lexer/mod.rs | 4 - src/frontend/lexer/scan.rs | 1188 ------ src/frontend/lexer/token.rs | 458 --- src/frontend/mod.rs | 4 - src/frontend/parser/README.md | 830 ---- src/frontend/parser/ast.rs | 839 ---- src/frontend/parser/declarations.rs | 1281 ------- src/frontend/parser/declarators.rs | 827 ---- src/frontend/parser/expressions.rs | 731 ---- src/frontend/parser/mod.rs | 9 - src/frontend/parser/parse.rs | 1308 ------- src/frontend/parser/statements.rs | 446 --- src/frontend/parser/types.rs | 1014 ----- src/frontend/preprocessor/README.md | 621 --- src/frontend/preprocessor/builtin_macros.rs | 464 --- src/frontend/preprocessor/conditionals.rs | 812 ---- src/frontend/preprocessor/expr_eval.rs | 391 -- src/frontend/preprocessor/includes.rs | 832 ---- src/frontend/preprocessor/macro_defs.rs | 1486 -------- src/frontend/preprocessor/mod.rs | 12 - src/frontend/preprocessor/pipeline.rs | 968 ----- src/frontend/preprocessor/pragmas.rs | 210 - .../preprocessor/predefined_macros.rs | 755 ---- 
src/frontend/preprocessor/text_processing.rs | 298 -- src/frontend/preprocessor/utils.rs | 112 - src/frontend/sema/README.md | 657 ---- src/frontend/sema/analysis.rs | 2034 ---------- src/frontend/sema/builtins.rs | 774 ---- src/frontend/sema/const_eval.rs | 1225 ------ src/frontend/sema/mod.rs | 8 - src/frontend/sema/type_checker.rs | 953 ----- src/frontend/sema/type_context.rs | 516 --- src/ir/README.md | 891 ----- src/ir/analysis.rs | 376 -- src/ir/constants.rs | 644 ---- src/ir/instruction.rs | 559 --- src/ir/intrinsics.rs | 226 -- src/ir/lowering/README.md | 468 --- src/ir/lowering/complex.rs | 963 ----- src/ir/lowering/const_eval.rs | 663 ---- src/ir/lowering/const_eval_global_addr.rs | 975 ----- src/ir/lowering/const_eval_init_size.rs | 323 -- src/ir/lowering/definitions.rs | 457 --- src/ir/lowering/expr.rs | 558 --- src/ir/lowering/expr_access.rs | 1437 ------- src/ir/lowering/expr_assign.rs | 911 ----- src/ir/lowering/expr_atomics.rs | 364 -- src/ir/lowering/expr_builtins.rs | 894 ----- src/ir/lowering/expr_builtins_fpclass.rs | 357 -- src/ir/lowering/expr_builtins_intrin.rs | 198 - src/ir/lowering/expr_builtins_overflow.rs | 479 --- src/ir/lowering/expr_calls.rs | 1076 ------ src/ir/lowering/expr_ops.rs | 844 ----- src/ir/lowering/expr_sizeof.rs | 500 --- src/ir/lowering/expr_types.rs | 1525 -------- src/ir/lowering/func_lowering.rs | 801 ---- src/ir/lowering/func_state.rs | 315 -- src/ir/lowering/global_decl.rs | 621 --- src/ir/lowering/global_init.rs | 1938 ---------- src/ir/lowering/global_init_bytes.rs | 1398 ------- src/ir/lowering/global_init_compound.rs | 425 --- src/ir/lowering/global_init_compound_ptrs.rs | 985 ----- .../lowering/global_init_compound_struct.rs | 761 ---- src/ir/lowering/global_init_helpers.rs | 268 -- src/ir/lowering/lower.rs | 1424 ------- src/ir/lowering/lvalue.rs | 696 ---- src/ir/lowering/mod.rs | 43 - src/ir/lowering/pointer_analysis.rs | 512 --- src/ir/lowering/ref_collection.rs | 303 -- src/ir/lowering/stmt.rs | 1379 
------- src/ir/lowering/stmt_asm.rs | 661 ---- src/ir/lowering/stmt_control_flow.rs | 290 -- src/ir/lowering/stmt_init.rs | 1111 ------ src/ir/lowering/stmt_return.rs | 389 -- src/ir/lowering/stmt_switch.rs | 233 -- src/ir/lowering/struct_init.rs | 920 ----- src/ir/lowering/structs.rs | 1050 ----- src/ir/lowering/types.rs | 904 ----- src/ir/lowering/types_ctype.rs | 519 --- src/ir/lowering/types_seed.rs | 304 -- src/ir/mem2reg/README.md | 364 -- src/ir/mem2reg/mod.rs | 6 - src/ir/mem2reg/phi_eliminate.rs | 585 --- src/ir/mem2reg/promote.rs | 1350 ------- src/ir/mod.rs | 9 - src/ir/module.rs | 339 -- src/ir/ops.rs | 220 -- src/ir/reexports.rs | 15 - src/lib.rs | 45 - src/main.rs | 3 - src/passes/README.md | 911 ----- src/passes/cfg_simplify.rs | 1515 -------- src/passes/constant_fold.rs | 1118 ------ src/passes/copy_prop.rs | 544 --- src/passes/dce.rs | 474 --- src/passes/dead_statics.rs | 435 --- src/passes/div_by_const.rs | 1984 ---------- src/passes/gvn.rs | 1723 --------- src/passes/if_convert.rs | 874 ----- src/passes/inline.rs | 1821 --------- src/passes/ipcp.rs | 507 --- src/passes/iv_strength_reduce.rs | 873 ----- src/passes/licm.rs | 1517 -------- src/passes/loop_analysis.rs | 132 - src/passes/mod.rs | 551 --- src/passes/narrow.rs | 990 ----- src/passes/resolve_asm.rs | 139 - src/passes/simplify.rs | 2960 --------------- 447 files changed, 1 insertion(+), 217081 deletions(-) delete mode 100644 .gitignore delete mode 100644 BUILDING_LINUX.txt delete mode 100644 Cargo.toml delete mode 100644 DESIGN_DOC.md delete mode 100644 LICENSE delete mode 100644 current_tasks/.keep delete mode 100644 current_tasks/fix_arm_asm_caspal_instruction.txt delete mode 100644 current_tasks/fix_arm_asm_global_branch_relocs.txt delete mode 100644 current_tasks/fix_arm_asm_org_directive.txt delete mode 100644 current_tasks/fix_arm_asm_quad_prel64_relocation.txt delete mode 100644 current_tasks/fix_arm_movw_symbolic_relocations.txt delete mode 100644 current_tasks/fix_dash.txt delete 
mode 100644 current_tasks/fix_i686_double_param_high_word_store.txt delete mode 100644 current_tasks/fix_macro_param_prefix_substitution.txt delete mode 100644 current_tasks/fix_pcre2_stack_frame_bloat.txt delete mode 100644 current_tasks/fix_riscv_va_arg_long_double_struct.txt delete mode 100644 current_tasks/fix_x86_asm_ifnb_ifb_conditional.txt delete mode 100644 current_tasks/fix_x86_standalone_kernel_link_errors.txt delete mode 100644 current_tasks/implement_string_literal_deduplication.txt delete mode 100644 ideas/.keep delete mode 100644 ideas/cleanup_i686_dead_peephole_passes.txt delete mode 100644 ideas/docs_verified_2026_01_29.txt delete mode 100644 ideas/fix_arm_defconfig_kvm_va_layout.txt delete mode 100644 ideas/fix_ir_unsigned_subint_representation.txt delete mode 100644 ideas/fix_riscv_kernel_boot_hang.txt delete mode 100644 ideas/fix_riscv_va_arg_runtime.txt delete mode 100644 ideas/fix_x86_full_f128_80bit_precision.txt delete mode 100644 ideas/high_codegen_runtime_perf.txt delete mode 100644 ideas/high_compile_speed_improvements.txt delete mode 100644 ideas/high_i686_boot_code_size_reduction.txt delete mode 100644 ideas/high_sema_expansion_typed_ast.txt delete mode 100644 ideas/high_use_def_chains.txt delete mode 100644 ideas/high_value_location_abstraction.txt delete mode 100644 ideas/low_structured_error_infrastructure.txt delete mode 100644 ideas/new_projects.txt delete mode 100644 ideas/new_projects_myasm.txt delete mode 100644 ideas/optimization_passes_future.txt delete mode 100644 ideas/qemu_build_support.txt delete mode 100644 ideas/reduce_stack_frame_size_for_postgres.txt delete mode 100644 ideas/register_allocator.txt delete mode 100644 include/arm_neon.h delete mode 100644 include/avx2intrin.h delete mode 100644 include/avx512fintrin.h delete mode 100644 include/avxintrin.h delete mode 100644 include/bmi2intrin.h delete mode 100644 include/emmintrin.h delete mode 100644 include/fmaintrin.h delete mode 100644 include/immintrin.h delete mode 
100644 include/mmintrin.h delete mode 100644 include/nmmintrin.h delete mode 100644 include/pmmintrin.h delete mode 100644 include/shaintrin.h delete mode 100644 include/smmintrin.h delete mode 100644 include/tmmintrin.h delete mode 100644 include/wmmintrin.h delete mode 100644 include/x86intrin.h delete mode 100644 include/xmmintrin.h delete mode 100644 projects/cleanup_code_quality.txt delete mode 100644 src/backend/README.md delete mode 100644 src/backend/arm/README.md delete mode 100755 src/backend/arm/asm_stub.sh delete mode 100644 src/backend/arm/assembler/README.md delete mode 100644 src/backend/arm/assembler/elf_writer.rs delete mode 100644 src/backend/arm/assembler/encoder/bitfield.rs delete mode 100644 src/backend/arm/assembler/encoder/compare_branch.rs delete mode 100644 src/backend/arm/assembler/encoder/data_processing.rs delete mode 100644 src/backend/arm/assembler/encoder/fp_scalar.rs delete mode 100644 src/backend/arm/assembler/encoder/load_store.rs delete mode 100644 src/backend/arm/assembler/encoder/mod.rs delete mode 100644 src/backend/arm/assembler/encoder/neon.rs delete mode 100644 src/backend/arm/assembler/encoder/system.rs delete mode 100644 src/backend/arm/assembler/mod.rs delete mode 100644 src/backend/arm/assembler/parser.rs delete mode 100644 src/backend/arm/codegen/README.md delete mode 100644 src/backend/arm/codegen/alu.rs delete mode 100644 src/backend/arm/codegen/asm_emitter.rs delete mode 100644 src/backend/arm/codegen/atomics.rs delete mode 100644 src/backend/arm/codegen/calls.rs delete mode 100644 src/backend/arm/codegen/cast_ops.rs delete mode 100644 src/backend/arm/codegen/comparison.rs delete mode 100644 src/backend/arm/codegen/emit.rs delete mode 100644 src/backend/arm/codegen/f128.rs delete mode 100644 src/backend/arm/codegen/float_ops.rs delete mode 100644 src/backend/arm/codegen/globals.rs delete mode 100644 src/backend/arm/codegen/i128_ops.rs delete mode 100644 src/backend/arm/codegen/inline_asm.rs delete mode 100644 
src/backend/arm/codegen/intrinsics.rs delete mode 100644 src/backend/arm/codegen/memory.rs delete mode 100644 src/backend/arm/codegen/mod.rs delete mode 100644 src/backend/arm/codegen/peephole.rs delete mode 100644 src/backend/arm/codegen/prologue.rs delete mode 100644 src/backend/arm/codegen/returns.rs delete mode 100644 src/backend/arm/codegen/variadic.rs delete mode 100755 src/backend/arm/ld_stub.sh delete mode 100644 src/backend/arm/linker/README.md delete mode 100644 src/backend/arm/linker/elf.rs delete mode 100644 src/backend/arm/linker/emit_dynamic.rs delete mode 100644 src/backend/arm/linker/emit_shared.rs delete mode 100644 src/backend/arm/linker/emit_static.rs delete mode 100644 src/backend/arm/linker/input.rs delete mode 100644 src/backend/arm/linker/link.rs delete mode 100644 src/backend/arm/linker/mod.rs delete mode 100644 src/backend/arm/linker/plt_got.rs delete mode 100644 src/backend/arm/linker/reloc.rs delete mode 100644 src/backend/arm/linker/types.rs delete mode 100644 src/backend/arm/mod.rs delete mode 100644 src/backend/asm_expr.rs delete mode 100644 src/backend/asm_preprocess.rs delete mode 100644 src/backend/call_abi.rs delete mode 100644 src/backend/cast.rs delete mode 100644 src/backend/common.rs delete mode 100644 src/backend/elf/archive.rs delete mode 100644 src/backend/elf/constants.rs delete mode 100644 src/backend/elf/io.rs delete mode 100644 src/backend/elf/linker_symbols.rs delete mode 100644 src/backend/elf/mod.rs delete mode 100644 src/backend/elf/numeric_labels.rs delete mode 100644 src/backend/elf/object_writer.rs delete mode 100644 src/backend/elf/parse_string.rs delete mode 100644 src/backend/elf/section_flags.rs delete mode 100644 src/backend/elf/string_table.rs delete mode 100644 src/backend/elf/symbol_table.rs delete mode 100644 src/backend/elf/writer_base.rs delete mode 100644 src/backend/elf_writer_common.rs delete mode 100644 src/backend/f128_softfloat.rs delete mode 100644 src/backend/generation.rs delete mode 100644 
src/backend/i686/README.md delete mode 100644 src/backend/i686/assembler/README.md delete mode 100644 src/backend/i686/assembler/elf_writer.rs delete mode 100644 src/backend/i686/assembler/encoder/core.rs delete mode 100644 src/backend/i686/assembler/encoder/gp_integer.rs delete mode 100644 src/backend/i686/assembler/encoder/mod.rs delete mode 100644 src/backend/i686/assembler/encoder/registers.rs delete mode 100644 src/backend/i686/assembler/encoder/sse.rs delete mode 100644 src/backend/i686/assembler/encoder/system.rs delete mode 100644 src/backend/i686/assembler/encoder/x87.rs delete mode 100644 src/backend/i686/assembler/mod.rs delete mode 100644 src/backend/i686/codegen/README.md delete mode 100644 src/backend/i686/codegen/alu.rs delete mode 100644 src/backend/i686/codegen/asm_emitter.rs delete mode 100644 src/backend/i686/codegen/atomics.rs delete mode 100644 src/backend/i686/codegen/calls.rs delete mode 100644 src/backend/i686/codegen/casts.rs delete mode 100644 src/backend/i686/codegen/comparison.rs delete mode 100644 src/backend/i686/codegen/emit.rs delete mode 100644 src/backend/i686/codegen/float_ops.rs delete mode 100644 src/backend/i686/codegen/globals.rs delete mode 100644 src/backend/i686/codegen/i128_ops.rs delete mode 100644 src/backend/i686/codegen/inline_asm.rs delete mode 100644 src/backend/i686/codegen/intrinsics.rs delete mode 100644 src/backend/i686/codegen/memory.rs delete mode 100644 src/backend/i686/codegen/mod.rs delete mode 100644 src/backend/i686/codegen/peephole.rs delete mode 100644 src/backend/i686/codegen/prologue.rs delete mode 100644 src/backend/i686/codegen/returns.rs delete mode 100644 src/backend/i686/codegen/variadic.rs delete mode 100644 src/backend/i686/linker/README.md delete mode 100644 src/backend/i686/linker/dynsym.rs delete mode 100644 src/backend/i686/linker/emit.rs delete mode 100644 src/backend/i686/linker/gnu_hash.rs delete mode 100644 src/backend/i686/linker/input.rs delete mode 100644 
src/backend/i686/linker/link.rs delete mode 100644 src/backend/i686/linker/mod.rs delete mode 100644 src/backend/i686/linker/parse.rs delete mode 100644 src/backend/i686/linker/reloc.rs delete mode 100644 src/backend/i686/linker/sections.rs delete mode 100644 src/backend/i686/linker/shared.rs delete mode 100644 src/backend/i686/linker/symbols.rs delete mode 100644 src/backend/i686/linker/types.rs delete mode 100644 src/backend/i686/mod.rs delete mode 100644 src/backend/inline_asm.rs delete mode 100644 src/backend/linker_common/README.md delete mode 100644 src/backend/linker_common/archive.rs delete mode 100644 src/backend/linker_common/args.rs delete mode 100644 src/backend/linker_common/check.rs delete mode 100644 src/backend/linker_common/dynamic.rs delete mode 100644 src/backend/linker_common/dynstr.rs delete mode 100644 src/backend/linker_common/eh_frame.rs delete mode 100644 src/backend/linker_common/gc_sections.rs delete mode 100644 src/backend/linker_common/hash.rs delete mode 100644 src/backend/linker_common/merge.rs delete mode 100644 src/backend/linker_common/mod.rs delete mode 100644 src/backend/linker_common/parse_object.rs delete mode 100644 src/backend/linker_common/parse_shared.rs delete mode 100644 src/backend/linker_common/resolve_lib.rs delete mode 100644 src/backend/linker_common/section_map.rs delete mode 100644 src/backend/linker_common/symbols.rs delete mode 100644 src/backend/linker_common/types.rs delete mode 100644 src/backend/linker_common/write.rs delete mode 100644 src/backend/liveness.rs delete mode 100644 src/backend/mod.rs delete mode 100644 src/backend/peephole_common.rs delete mode 100644 src/backend/regalloc.rs delete mode 100644 src/backend/riscv/README.md delete mode 100755 src/backend/riscv/asm_stub.sh delete mode 100644 src/backend/riscv/assembler/README.md delete mode 100644 src/backend/riscv/assembler/compress.rs delete mode 100644 src/backend/riscv/assembler/elf_writer.rs delete mode 100644 
src/backend/riscv/assembler/encoder/atomics.rs delete mode 100644 src/backend/riscv/assembler/encoder/base.rs delete mode 100644 src/backend/riscv/assembler/encoder/compressed.rs delete mode 100644 src/backend/riscv/assembler/encoder/float.rs delete mode 100644 src/backend/riscv/assembler/encoder/mod.rs delete mode 100644 src/backend/riscv/assembler/encoder/pseudo.rs delete mode 100644 src/backend/riscv/assembler/encoder/system.rs delete mode 100644 src/backend/riscv/assembler/encoder/vector.rs delete mode 100644 src/backend/riscv/assembler/mod.rs delete mode 100644 src/backend/riscv/assembler/parser.rs delete mode 100644 src/backend/riscv/codegen/README.md delete mode 100644 src/backend/riscv/codegen/alu.rs delete mode 100644 src/backend/riscv/codegen/asm_emitter.rs delete mode 100644 src/backend/riscv/codegen/atomics.rs delete mode 100644 src/backend/riscv/codegen/calls.rs delete mode 100644 src/backend/riscv/codegen/cast_ops.rs delete mode 100644 src/backend/riscv/codegen/comparison.rs delete mode 100644 src/backend/riscv/codegen/emit.rs delete mode 100644 src/backend/riscv/codegen/f128.rs delete mode 100644 src/backend/riscv/codegen/float_ops.rs delete mode 100644 src/backend/riscv/codegen/globals.rs delete mode 100644 src/backend/riscv/codegen/i128_ops.rs delete mode 100644 src/backend/riscv/codegen/inline_asm.rs delete mode 100644 src/backend/riscv/codegen/intrinsics.rs delete mode 100644 src/backend/riscv/codegen/memory.rs delete mode 100644 src/backend/riscv/codegen/mod.rs delete mode 100644 src/backend/riscv/codegen/peephole.rs delete mode 100644 src/backend/riscv/codegen/prologue.rs delete mode 100644 src/backend/riscv/codegen/returns.rs delete mode 100644 src/backend/riscv/codegen/variadic.rs delete mode 100755 src/backend/riscv/ld_stub.sh delete mode 100644 src/backend/riscv/linker/README.md delete mode 100644 src/backend/riscv/linker/elf_read.rs delete mode 100644 src/backend/riscv/linker/emit_exec.rs delete mode 100644 
src/backend/riscv/linker/emit_shared.rs delete mode 100644 src/backend/riscv/linker/input.rs delete mode 100644 src/backend/riscv/linker/link.rs delete mode 100644 src/backend/riscv/linker/mod.rs delete mode 100644 src/backend/riscv/linker/reloc.rs delete mode 100644 src/backend/riscv/linker/relocations.rs delete mode 100644 src/backend/riscv/linker/sections.rs delete mode 100644 src/backend/riscv/linker/symbols.rs delete mode 100644 src/backend/riscv/mod.rs delete mode 100644 src/backend/stack_layout/README.md delete mode 100644 src/backend/stack_layout/alloca_coalescing.rs delete mode 100644 src/backend/stack_layout/analysis.rs delete mode 100644 src/backend/stack_layout/copy_coalescing.rs delete mode 100644 src/backend/stack_layout/inline_asm.rs delete mode 100644 src/backend/stack_layout/mod.rs delete mode 100644 src/backend/stack_layout/regalloc_helpers.rs delete mode 100644 src/backend/stack_layout/slot_assignment.rs delete mode 100644 src/backend/state.rs delete mode 100644 src/backend/traits.rs delete mode 100644 src/backend/x86/README.md delete mode 100755 src/backend/x86/asm_stub.sh delete mode 100644 src/backend/x86/assembler/README.md delete mode 100644 src/backend/x86/assembler/elf_writer.rs delete mode 100644 src/backend/x86/assembler/encoder/avx.rs delete mode 100644 src/backend/x86/assembler/encoder/core.rs delete mode 100644 src/backend/x86/assembler/encoder/gp_integer.rs delete mode 100644 src/backend/x86/assembler/encoder/mod.rs delete mode 100644 src/backend/x86/assembler/encoder/registers.rs delete mode 100644 src/backend/x86/assembler/encoder/sse.rs delete mode 100644 src/backend/x86/assembler/encoder/system.rs delete mode 100644 src/backend/x86/assembler/encoder/x87_misc.rs delete mode 100644 src/backend/x86/assembler/mod.rs delete mode 100644 src/backend/x86/assembler/parser.rs delete mode 100644 src/backend/x86/codegen/README.md delete mode 100644 src/backend/x86/codegen/alu.rs delete mode 100644 src/backend/x86/codegen/asm_emitter.rs 
delete mode 100644 src/backend/x86/codegen/atomics.rs delete mode 100644 src/backend/x86/codegen/calls.rs delete mode 100644 src/backend/x86/codegen/cast_ops.rs delete mode 100644 src/backend/x86/codegen/comparison.rs delete mode 100644 src/backend/x86/codegen/emit.rs delete mode 100644 src/backend/x86/codegen/f128.rs delete mode 100644 src/backend/x86/codegen/float_ops.rs delete mode 100644 src/backend/x86/codegen/globals.rs delete mode 100644 src/backend/x86/codegen/i128_ops.rs delete mode 100644 src/backend/x86/codegen/inline_asm.rs delete mode 100644 src/backend/x86/codegen/intrinsics.rs delete mode 100644 src/backend/x86/codegen/memory.rs delete mode 100644 src/backend/x86/codegen/mod.rs delete mode 100644 src/backend/x86/codegen/peephole/README.md delete mode 100644 src/backend/x86/codegen/peephole/mod.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/callee_saves.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/compare_branch.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/copy_propagation.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/dead_code.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/frame_compact.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/helpers.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/local_patterns.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/loop_trampoline.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/memory_fold.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/mod.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/push_pop.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/store_forwarding.rs delete mode 100644 src/backend/x86/codegen/peephole/passes/tail_call.rs delete mode 100644 src/backend/x86/codegen/peephole/types.rs delete mode 100644 src/backend/x86/codegen/prologue.rs delete mode 100644 src/backend/x86/codegen/returns.rs delete mode 100644 
src/backend/x86/codegen/variadic.rs delete mode 100755 src/backend/x86/ld_stub.sh delete mode 100644 src/backend/x86/linker/README.md delete mode 100644 src/backend/x86/linker/elf.rs delete mode 100644 src/backend/x86/linker/emit_exec.rs delete mode 100644 src/backend/x86/linker/emit_shared.rs delete mode 100644 src/backend/x86/linker/input.rs delete mode 100644 src/backend/x86/linker/link.rs delete mode 100644 src/backend/x86/linker/mod.rs delete mode 100644 src/backend/x86/linker/plt_got.rs delete mode 100644 src/backend/x86/linker/types.rs delete mode 100644 src/backend/x86/mod.rs delete mode 100644 src/backend/x86_common.rs delete mode 100644 src/bin/ccc_arm.rs delete mode 100644 src/bin/ccc_i686.rs delete mode 100644 src/bin/ccc_riscv.rs delete mode 100644 src/bin/ccc_x86.rs delete mode 100644 src/common/README.md delete mode 100644 src/common/asm_constraints.rs delete mode 100644 src/common/const_arith.rs delete mode 100644 src/common/const_eval.rs delete mode 100644 src/common/encoding.rs delete mode 100644 src/common/error.rs delete mode 100644 src/common/fx_hash.rs delete mode 100644 src/common/long_double.rs delete mode 100644 src/common/mod.rs delete mode 100644 src/common/source.rs delete mode 100644 src/common/symbol_table.rs delete mode 100644 src/common/temp_files.rs delete mode 100644 src/common/type_builder.rs delete mode 100644 src/common/types.rs delete mode 100644 src/driver/README.md delete mode 100644 src/driver/cli.rs delete mode 100644 src/driver/external_tools.rs delete mode 100644 src/driver/file_types.rs delete mode 100644 src/driver/mod.rs delete mode 100644 src/driver/pipeline.rs delete mode 100644 src/frontend/README.md delete mode 100644 src/frontend/lexer/README.md delete mode 100644 src/frontend/lexer/mod.rs delete mode 100644 src/frontend/lexer/scan.rs delete mode 100644 src/frontend/lexer/token.rs delete mode 100644 src/frontend/mod.rs delete mode 100644 src/frontend/parser/README.md delete mode 100644 src/frontend/parser/ast.rs 
delete mode 100644 src/frontend/parser/declarations.rs delete mode 100644 src/frontend/parser/declarators.rs delete mode 100644 src/frontend/parser/expressions.rs delete mode 100644 src/frontend/parser/mod.rs delete mode 100644 src/frontend/parser/parse.rs delete mode 100644 src/frontend/parser/statements.rs delete mode 100644 src/frontend/parser/types.rs delete mode 100644 src/frontend/preprocessor/README.md delete mode 100644 src/frontend/preprocessor/builtin_macros.rs delete mode 100644 src/frontend/preprocessor/conditionals.rs delete mode 100644 src/frontend/preprocessor/expr_eval.rs delete mode 100644 src/frontend/preprocessor/includes.rs delete mode 100644 src/frontend/preprocessor/macro_defs.rs delete mode 100644 src/frontend/preprocessor/mod.rs delete mode 100644 src/frontend/preprocessor/pipeline.rs delete mode 100644 src/frontend/preprocessor/pragmas.rs delete mode 100644 src/frontend/preprocessor/predefined_macros.rs delete mode 100644 src/frontend/preprocessor/text_processing.rs delete mode 100644 src/frontend/preprocessor/utils.rs delete mode 100644 src/frontend/sema/README.md delete mode 100644 src/frontend/sema/analysis.rs delete mode 100644 src/frontend/sema/builtins.rs delete mode 100644 src/frontend/sema/const_eval.rs delete mode 100644 src/frontend/sema/mod.rs delete mode 100644 src/frontend/sema/type_checker.rs delete mode 100644 src/frontend/sema/type_context.rs delete mode 100644 src/ir/README.md delete mode 100644 src/ir/analysis.rs delete mode 100644 src/ir/constants.rs delete mode 100644 src/ir/instruction.rs delete mode 100644 src/ir/intrinsics.rs delete mode 100644 src/ir/lowering/README.md delete mode 100644 src/ir/lowering/complex.rs delete mode 100644 src/ir/lowering/const_eval.rs delete mode 100644 src/ir/lowering/const_eval_global_addr.rs delete mode 100644 src/ir/lowering/const_eval_init_size.rs delete mode 100644 src/ir/lowering/definitions.rs delete mode 100644 src/ir/lowering/expr.rs delete mode 100644 
src/ir/lowering/expr_access.rs delete mode 100644 src/ir/lowering/expr_assign.rs delete mode 100644 src/ir/lowering/expr_atomics.rs delete mode 100644 src/ir/lowering/expr_builtins.rs delete mode 100644 src/ir/lowering/expr_builtins_fpclass.rs delete mode 100644 src/ir/lowering/expr_builtins_intrin.rs delete mode 100644 src/ir/lowering/expr_builtins_overflow.rs delete mode 100644 src/ir/lowering/expr_calls.rs delete mode 100644 src/ir/lowering/expr_ops.rs delete mode 100644 src/ir/lowering/expr_sizeof.rs delete mode 100644 src/ir/lowering/expr_types.rs delete mode 100644 src/ir/lowering/func_lowering.rs delete mode 100644 src/ir/lowering/func_state.rs delete mode 100644 src/ir/lowering/global_decl.rs delete mode 100644 src/ir/lowering/global_init.rs delete mode 100644 src/ir/lowering/global_init_bytes.rs delete mode 100644 src/ir/lowering/global_init_compound.rs delete mode 100644 src/ir/lowering/global_init_compound_ptrs.rs delete mode 100644 src/ir/lowering/global_init_compound_struct.rs delete mode 100644 src/ir/lowering/global_init_helpers.rs delete mode 100644 src/ir/lowering/lower.rs delete mode 100644 src/ir/lowering/lvalue.rs delete mode 100644 src/ir/lowering/mod.rs delete mode 100644 src/ir/lowering/pointer_analysis.rs delete mode 100644 src/ir/lowering/ref_collection.rs delete mode 100644 src/ir/lowering/stmt.rs delete mode 100644 src/ir/lowering/stmt_asm.rs delete mode 100644 src/ir/lowering/stmt_control_flow.rs delete mode 100644 src/ir/lowering/stmt_init.rs delete mode 100644 src/ir/lowering/stmt_return.rs delete mode 100644 src/ir/lowering/stmt_switch.rs delete mode 100644 src/ir/lowering/struct_init.rs delete mode 100644 src/ir/lowering/structs.rs delete mode 100644 src/ir/lowering/types.rs delete mode 100644 src/ir/lowering/types_ctype.rs delete mode 100644 src/ir/lowering/types_seed.rs delete mode 100644 src/ir/mem2reg/README.md delete mode 100644 src/ir/mem2reg/mod.rs delete mode 100644 src/ir/mem2reg/phi_eliminate.rs delete mode 100644 
src/ir/mem2reg/promote.rs delete mode 100644 src/ir/mod.rs delete mode 100644 src/ir/module.rs delete mode 100644 src/ir/ops.rs delete mode 100644 src/ir/reexports.rs delete mode 100644 src/lib.rs delete mode 100644 src/main.rs delete mode 100644 src/passes/README.md delete mode 100644 src/passes/cfg_simplify.rs delete mode 100644 src/passes/constant_fold.rs delete mode 100644 src/passes/copy_prop.rs delete mode 100644 src/passes/dce.rs delete mode 100644 src/passes/dead_statics.rs delete mode 100644 src/passes/div_by_const.rs delete mode 100644 src/passes/gvn.rs delete mode 100644 src/passes/if_convert.rs delete mode 100644 src/passes/inline.rs delete mode 100644 src/passes/ipcp.rs delete mode 100644 src/passes/iv_strength_reduce.rs delete mode 100644 src/passes/licm.rs delete mode 100644 src/passes/loop_analysis.rs delete mode 100644 src/passes/mod.rs delete mode 100644 src/passes/narrow.rs delete mode 100644 src/passes/resolve_asm.rs delete mode 100644 src/passes/simplify.rs diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 7d2059cb51..0000000000 --- a/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -/target/ -Cargo.lock -*.o -*.s -*.core -a.out -dump.rdb -core diff --git a/BUILDING_LINUX.txt b/BUILDING_LINUX.txt deleted file mode 100644 index 60ddf45024..0000000000 --- a/BUILDING_LINUX.txt +++ /dev/null @@ -1,72 +0,0 @@ -Below is a (human-written) transcript of how to get the Linux kernel building with `make defconfig`. -It works on Claude's development machine but I have not tested it on any other machine. 
- -cargo build --release - -wget https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.9.tar.xz -xz -d linux-6.9.tar.xz -tar -xf linux-6.9.tar -cd linux-6.9/ - -make ARCH=riscv CC=/workspace/code/target/release/ccc-riscv HOSTCC=/workspace/code/target/release/ccc-x86 CROSS_COMPILE=riscv64-linux-gnu- defconfig -make ARCH=riscv CC=/workspace/code/target/release/ccc-riscv HOSTCC=/workspace/code/target/release/ccc-x86 CROSS_COMPILE=riscv64-linux-gnu- -j$(nproc) Image - -# 4. Create the init script -cat > basic-init <<'EOF' -#!/bin/busybox sh -/bin/busybox mkdir -p /dev /etc /proc /sys /bin /sbin /usr/bin/games /usr/sbin -/bin/busybox mount -t proc proc /proc -/bin/busybox mount -t sysfs sys /sys -/bin/busybox mount -t devtmpfs dev /dev -/bin/busybox --install -s 2>/dev/null -export TERM=linux -export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/bin/games -echo 1 > /proc/sys/kernel/printk -sleep 0.2 -exec /bin/busybox setsid /bin/busybox cttyhack /bin/sh -EOF - -wget https://busybox.net/downloads/busybox-1.36.1.tar.bz2 -tar xf busybox-1.36.1.tar.bz2 -cd busybox-1.36.1 - -# Configure -make ARCH=riscv CROSS_COMPILE=riscv64-linux-gnu- defconfig -sed -i 's/# CONFIG_STATIC is not set/CONFIG_STATIC=y/' .config -sed -i 's/CONFIG_TC=y/# CONFIG_TC is not set/' .config - -make ARCH=riscv CROSS_COMPILE=riscv64-linux-gnu- -j$(nproc) - -cd .. - -# 5. Create the initramfs list -cat > initramfs.list <<'EOF' -dir /dev 0755 0 0 -nod /dev/console 0600 0 0 c 5 1 -nod /dev/tty 0666 0 0 c 5 0 -nod /dev/null 0666 0 0 c 1 3 -dir /proc 0755 0 0 -dir /sys 0755 0 0 -dir /root 0700 0 0 -dir /bin 0755 0 0 -dir /sbin 0755 0 0 -dir /usr 0755 0 0 -dir /usr/bin 0755 0 0 -dir /usr/bin/games 0755 0 0 -file /init ./basic-init 0755 0 0 -file /bin/busybox busybox-1.36.1/busybox 0755 0 0 -EOF - - -# 6. 
Generate the initramfs -usr/gen_init_cpio initramfs.list | gzip -9 > initramfs.cpio.gz - -qemu-system-riscv64 -M virt -m 512 \ - -kernel arch/riscv/boot/Image \ - -initrd ./initramfs.cpio.gz \ - -append "console=ttyS0" \ - -nographic \ - -no-reboot - -# dmesg | head -n 1 -[ 0.000000] Linux version 6.9.0 (appuser@3e854e88ac71) (ccc (Claude's C Compiler, GCC-compatible) 14.2.0, GNU ld (GNU Binutils for Ubuntu) 2.42) #1 SMP Thu Feb 5 16:59:16 UTC 2026 diff --git a/Cargo.toml b/Cargo.toml deleted file mode 100644 index 6d29dab924..0000000000 --- a/Cargo.toml +++ /dev/null @@ -1,35 +0,0 @@ -[package] -name = "ccc" -version = "0.1.0" -edition = "2021" -description = "Claude's C Compiler — a C compiler targeting x86-64, i686, ARM64, and RISC-V" -autobins = false - -[[bin]] -name = "ccc" -path = "src/main.rs" - -[[bin]] -name = "ccc-x86" -path = "src/bin/ccc_x86.rs" - -[[bin]] -name = "ccc-arm" -path = "src/bin/ccc_arm.rs" - -[[bin]] -name = "ccc-riscv" -path = "src/bin/ccc_riscv.rs" - -[[bin]] -name = "ccc-i686" -path = "src/bin/ccc_i686.rs" - -[features] -# Allow GCC as a linker fallback. When enabled and GCC is used, a warning is printed. -gcc_linker = [] -# Allow GCC as an assembler fallback. When enabled and GCC is used, a warning is printed. -gcc_assembler = [] -# Allow GCC passthrough for -m16 mode (16-bit real-mode boot code). -# When enabled and GCC is used, a prominent warning is printed. -gcc_m16 = [] diff --git a/DESIGN_DOC.md b/DESIGN_DOC.md deleted file mode 100644 index d1601d2ef4..0000000000 --- a/DESIGN_DOC.md +++ /dev/null @@ -1,385 +0,0 @@ -# CCC Design Document - -This document describes the architecture and implementation of CCC -(Claude's C Compiler). For building, usage, and status information, see -[README.md](README.md). Each `src/` subdirectory also has its own -`README.md` with detailed per-module documentation. - ---- - -## Table of Contents - -1. [High-Level Pipeline](#high-level-pipeline) -2. [Source Tree](#source-tree) -3. 
[Compilation Pipeline (Data Flow)](#compilation-pipeline-data-flow) -4. [Key Design Decisions](#key-design-decisions) -5. [Design Philosophy](#design-philosophy) -6. [Assembler and Linker Architecture](#assembler-and-linker-architecture) -7. [Sub-Module Documentation](#sub-module-documentation) - ---- - -## High-Level Pipeline - -The compiler is a multi-phase pipeline. Each phase is a separate Rust module -with a well-defined input/output interface. The entire flow -- from C source -to ELF executable -- is handled internally with no external tools. - -``` - +---------------------------------------------------------------------+ - | C Source Files (.c, .h) | - +----------------------------------+----------------------------------+ - | - +----------------------------------v----------------------------------+ - | FRONTEND (src/frontend/) | - | | - | +--------------+ +-------+ +--------+ +--------------+ | - | | Preprocessor |---»| Lexer |---»| Parser |---»| Sema | | - | | | | | | | | | | - | | macro expand,| |tokens | |spanned | | type check, | | - | | #include, | | with | | AST | | const eval, | | - | | #ifdef | | spans | | | | symbol table | | - | +--------------+ +-------+ +--------+ +------+-------+ | - +------------------------------------------------------------+-------+ - | - AST + SemaResult (TypeContext, - expr types, const values) - | - +------------------------------------------------------------v-------+ - | IR SUBSYSTEM (src/ir/) | - | | - | +------------------+ +------------------------------+ | - | | IR Lowering |--------»| mem2reg | | - | | | | | | - | | AST -> alloca- | | SSA promotion via dominator | | - | | based IR (every | | frontiers; insert phi nodes, | | - | | local is a stack | | rename values | | - | | slot) | | | | - | +------------------+ +--------------+---------------+ | - +----------------------------------------------+---------------------+ - | SSA IR - +----------------------------------------------v---------------------+ - | 
OPTIMIZATION PASSES (src/passes/) | - | | - | Phase 0: Inlining + post-inline cleanup | - | (inline -> mem2reg -> constant_fold -> copy_prop -> simplify | - | -> constant_fold -> copy_prop -> resolve_asm) | - | | | - | Main Loop (up to 3 iterations, dirty-tracked): | - | cfg_simplify -> copy_prop -> div_by_const -> narrow -> simplify | - | -> constant_fold -> gvn -> licm -> iv_strength_reduce | - | -> if_convert -> copy_prop -> dce -> cfg_simplify -> ipcp | - | | | - | Dead static elimination | - | | | - | Phi Elimination (SSA -> register copies) | - +----------------------------------+---------------------------------+ - | non-SSA IR - +----------------------------------v---------------------------------+ - | BACKEND (src/backend/) | - | | - | +---------------------------------------------------------+ | - | | Code Generation (ArchCodegen trait) | | - | | | | - | | +----------+ +----------+ +----------+ +----------+ | | - | | | x86-64 | | i686 | | AArch64 | | RISC-V64 | | | - | | | SysV ABI | | cdecl | | AAPCS64 | | LP64D | | | - | | +----+-----+ +----+-----+ +----+-----+ +----+-----+ | | - | +-------+-------------+------------+---------------+-------+ | - | | | | | | - | +-------v-------------v------------v---------------v-------+ | - | | Peephole Optimizer (per-arch) | | - | | store/load forwarding, dead stores, copy prop, branches | | - | +-------+-------------+------------+---------------+-------+ | - | | | | | | - | +-------v-------------v------------v---------------v-------+ | - | | Builtin Assembler (per-arch) | | - | | parse asm text -> encode instructions -> write ELF .o | | - | +-------+-------------+------------+---------------+-------+ | - | | | | | | - | +-------v-------------v------------v---------------v-------+ | - | | Builtin Linker (per-arch) | | - | | read .o + CRT + libs -> resolve symbols -> write ELF | | - | +-------+-------------+------------+---------------+-------+ | - +----------+-------------+------------+---------------+---------------+ 
- | | | | - v v v v - ELF ELF ELF ELF -``` - ---- - -## Source Tree - -``` -src/ - frontend/ C source -> typed AST - preprocessor/ Macro expansion, #include, #ifdef, #pragma once - lexer/ Tokenization with source locations - parser/ Recursive descent, produces spanned AST - sema/ Type checking, symbol table, const evaluation - - ir/ Target-independent SSA IR - lowering/ AST -> alloca-based IR - mem2reg/ SSA promotion (dominator tree, phi insertion) - - passes/ SSA optimization passes - constant_fold Constant folding and propagation - copy_prop Copy propagation - dce Dead code elimination - gvn Global value numbering - licm Loop-invariant code motion - simplify Algebraic simplification - cfg_simplify CFG cleanup, branch threading - inline Function inlining (always_inline + small static) - if_convert Diamond if-conversion to select (cmov/csel) - narrow Integer narrowing (eliminate promotion overhead) - div_by_const Division strength reduction (mul+shift) - ipcp Interprocedural constant propagation - iv_strength_reduce Induction variable strength reduction - loop_analysis Shared natural loop detection (used by LICM, IVSR) - dead_statics Dead static function/global elimination - resolve_asm Post-inline asm symbol resolution - - backend/ IR -> assembly -> machine code -> ELF - traits.rs ArchCodegen trait with shared default implementations - generation.rs IR instruction dispatch to trait methods - liveness.rs Live interval computation for register allocation - regalloc.rs Linear scan register allocator - state.rs Shared codegen state (stack slots, register cache) - stack_layout/ Stack frame layout with liveness-based slot packing - call_abi.rs Unified ABI classification (caller + callee) - cast.rs Shared cast and float operation classification - f128_softfloat.rs IEEE binary128 soft-float (ARM + RISC-V) - inline_asm.rs Shared inline assembly framework - common.rs Data sections, external tool fallback invocation - x86_common.rs Shared x86/i686 register names, condition 
codes - elf/ ELF constants, archive reading, shared types - elf_writer_common.rs Common ELF object file writing utilities - linker_common/ Shared linker types (symbols, dynamic linking, EH frame) - asm_preprocess.rs Assembly text preprocessing (macro expansion, conditionals) - asm_expr.rs Assembly expression evaluation - peephole_common.rs Shared peephole optimizer utilities (word matching, line store) - x86/ - codegen/ x86-64 code generation (SysV AMD64 ABI) + peephole - assembler/ Builtin x86-64 assembler (parser, encoder, ELF writer) - linker/ Builtin x86-64 linker (dynamic linking, PLT/GOT, TLS) - i686/ - codegen/ i686 code generation (cdecl, ILP32) + peephole - assembler/ Builtin i686 assembler (reuses x86 parser, 32-bit encoder) - linker/ Builtin i686 linker (32-bit ELF, R_386 relocations) - arm/ - codegen/ AArch64 code generation (AAPCS64) + peephole - assembler/ Builtin AArch64 assembler (parser, encoder, ELF writer) - linker/ Builtin AArch64 linker (static + dynamic linking, IFUNC/TLS) - riscv/ - codegen/ RISC-V 64 code generation (LP64D) + peephole - assembler/ Builtin RV64 assembler (parser, encoder, RV64C compress) - linker/ Builtin RV64 linker (dynamic linking) - - common/ Shared types, symbol table, diagnostics - driver/ CLI parsing, pipeline orchestration -``` - ---- - -## Compilation Pipeline (Data Flow) - -Each phase transforms the program into a progressively lower-level -representation. 
The concrete Rust types flowing between phases are: - -``` - &str (C source text) - | - | Preprocessor::preprocess() - v - String (expanded text with line markers) - | - | Lexer::tokenize() - v - Vec (each Token = { kind: TokenKind, span: Span }) - | - | Parser::parse() - v - TranslationUnit (AST: Vec with source spans) - | - | SemanticAnalyzer::analyze() - v - TranslationUnit + SemaResult - | SemaResult bundles: - | - functions: FxHashMap - | - type_context: TypeContext (struct layouts, typedefs, enums) - | - expr_types: FxHashMap - | - const_values: FxHashMap - | - | Lowerer::lower() - v - IrModule (alloca-based IR: every local is a stack slot) - | - | promote_allocas() (mem2reg) - v - IrModule (SSA form: phi nodes, virtual registers) - | - | run_passes() (up to 3 iterations with dirty tracking) - v - IrModule (optimized SSA) - | - | eliminate_phis() - v - IrModule (non-SSA: phi nodes lowered to register copies) - | - | Target::generate_assembly_with_opts_and_debug() (ArchCodegen dispatch) - v - String (target-specific assembly text) - | - | Builtin assembler (parse -> encode -> ELF .o) - v - ELF object file (.o) - | - | Builtin linker (resolve symbols -> apply relocs -> write ELF) - v - ELF executable -``` - ---- - -## Key Design Decisions - -- **SSA IR**: The IR uses SSA form with phi nodes, constructed via mem2reg over - alloca-based lowering. This is the same approach as LLVM. - -- **Trait-based backends**: All four backends implement the `ArchCodegen` trait - (~185 methods). Shared logic (call ABI classification, inline asm framework, - f128 soft-float) lives in default trait methods and shared modules. - -- **Linear scan register allocation**: Loop-aware liveness analysis feeds a - linear scan allocator (callee-saved + caller-saved) on all four backends. - Register-allocated values bypass stack slots entirely. - -- **Text-to-text preprocessor**: The preprocessor operates on raw text, emitting - GCC-style `# line "file"` markers for source location tracking. 
Include guard - detection avoids re-processing headers. - -- **Peephole optimization**: Each backend has a post-codegen peephole optimizer - that eliminates redundant patterns (store/load forwarding, dead stores, copy - propagation) from the stack-based code generator. The x86 peephole is the most - mature with 15 distinct pass functions. - -- **Builtin assembler and linker**: Each architecture has a native assembler - (AT&T/ARM/RV syntax parser, instruction encoder, ELF object writer) and a - native linker (symbol resolution, relocation application, ELF executable - writer). No external toolchain is required. - -- **Dual type system**: CType represents C-level types (preserving `int` vs - `long` distinctions for type checking), while IrType is a flat machine-level - enumeration (`I8`..`I128`, `U8`..`U128`, `F32`, `F64`, `F128`, `Ptr`, - `Void`). The lowering phase bridges between them. - ---- - -## Design Philosophy - -- **Separation of concerns through representations.** Each major phase works on - its own representation: the frontend on text/tokens/AST, the IR subsystem on - alloca-based IR, the optimizer on SSA IR, and the backend on non-SSA IR. Phase - boundaries are explicit ownership transfers, not shared mutable state. - -- **Alloca-then-promote for SSA construction.** Rather than constructing SSA - directly during AST lowering (which interleaves C semantics with SSA - bookkeeping), the lowerer emits simple alloca/load/store sequences. The - mem2reg pass then promotes these to SSA independently. This is the same - strategy LLVM uses and cleanly separates the two concerns. - -- **Trait-based backend abstraction.** The `ArchCodegen` trait (~185 methods) - captures the interface between the shared code generation framework and - architecture-specific instruction emission. Default implementations express - algorithms once (e.g., the 7-phase call sequence in `emit_call`), while backends supply - only the architecture-specific primitives. 
- -- **Zero external dependencies for compilation.** The entire compilation - pipeline -- from C source to ELF executable -- is self-contained. No lexer - generators, parser generators, register allocator libraries, external - assemblers, or external linkers are required. Every component is implemented - from scratch using only general-purpose Rust crates. - ---- - -## Assembler and Linker Architecture - -The builtin assembler and linker are the default when compiling without -the `gcc_assembler` or `gcc_linker` Cargo features. The selection is done -at compile time via `#[cfg(feature = "...")]` -- there are no runtime -environment variables. - -### Builtin Assembler - -Each assembler follows a three-stage pipeline: - -``` - Assembly text (String) - | - | Parser - v - Vec (parsed instructions, directives, labels) - | - | Encoder - v - Vec (encoded machine code bytes + relocation entries) - | - | ELF Writer - v - ELF object file (.o) -``` - -| Architecture | Parser | Encoder | Extra Features | -|-------------|--------|---------|---------------| -| x86-64 | AT&T syntax, shared | REX prefixes, ModR/M, SIB | SSE/AES-NI encoding | -| i686 | Reuses x86 parser | No REX, 32-bit operands | ELFCLASS32, Elf32_Rel | -| AArch64 | ARM assembly syntax | Fixed 32-bit encoding | imm12 auto-shift | -| RISC-V | RV assembly syntax | Fixed 32-bit encoding | RV64C compression | - -### Builtin Linker - -Each linker reads ELF object files and static archives, resolves symbols, -applies relocations, and writes a complete ELF executable: - -| Architecture | Link Mode | Key Relocations | Special Features | -|-------------|-----------|-----------------|-----------------| -| x86-64 | Dynamic | R_X86_64_64, PC32, PLT32, GOTPCREL | PLT/GOT, TLS | -| i686 | Dynamic | R_386_32, PC32, PLT32, GOTPC, GOTOFF | 32-bit ELF, `.rel` | -| AArch64 | Static + Dynamic | ADR_PREL_PG_HI21, ADD_ABS_LO12_NC, CALL26 | PLT/GOT, IFUNC/IPLT, TLS | -| RISC-V | Dynamic | HI20, LO12_I, LO12_S, CALL, PCREL_HI20 | TLS 
GD→LE relaxation | - -### GCC Fallback - -When compiled with the `gcc_assembler` and/or `gcc_linker` Cargo features, -the compiler delegates to the GCC cross-compiler toolchain for the -corresponding stages. A warning is printed when the fallback is used. This -mode is useful for debugging: compile a binary with GCC features and compare -byte-level output against the standalone binary. - ---- - -## Sub-Module Documentation - -Each compiler subsystem has its own detailed design document: - -| Module | README | -|--------|--------| -| Frontend (preprocessor, lexer, parser, sema) | [`src/frontend/README.md`](src/frontend/README.md) | -| IR subsystem (lowering, mem2reg) | [`src/ir/README.md`](src/ir/README.md) | -| Optimization passes | [`src/passes/README.md`](src/passes/README.md) | -| Backend (codegen, assembler, linker) | [`src/backend/README.md`](src/backend/README.md) | -| Common (types, diagnostics, source) | [`src/common/README.md`](src/common/README.md) | -| Driver (CLI, pipeline) | [`src/driver/README.md`](src/driver/README.md) | - -Per-architecture backend documentation: - -| Architecture | Overview | Code Generation | Assembler | Linker | -|-------------|----------|----------------|-----------|--------| -| x86-64 | [`x86/README.md`](src/backend/x86/README.md) | [`x86/codegen/README.md`](src/backend/x86/codegen/README.md) | [`x86/assembler/README.md`](src/backend/x86/assembler/README.md) | [`x86/linker/README.md`](src/backend/x86/linker/README.md) | -| i686 | [`i686/README.md`](src/backend/i686/README.md) | [`i686/codegen/README.md`](src/backend/i686/codegen/README.md) | [`i686/assembler/README.md`](src/backend/i686/assembler/README.md) | [`i686/linker/README.md`](src/backend/i686/linker/README.md) | -| AArch64 | [`arm/README.md`](src/backend/arm/README.md) | [`arm/codegen/README.md`](src/backend/arm/codegen/README.md) | [`arm/assembler/README.md`](src/backend/arm/assembler/README.md) | [`arm/linker/README.md`](src/backend/arm/linker/README.md) | -| RISC-V 
64 | [`riscv/README.md`](src/backend/riscv/README.md) | [`riscv/codegen/README.md`](src/backend/riscv/codegen/README.md) | [`riscv/assembler/README.md`](src/backend/riscv/assembler/README.md) | [`riscv/linker/README.md`](src/backend/riscv/linker/README.md) | - -The x86-64 peephole optimizer has its own detailed documentation: [`src/backend/x86/codegen/peephole/README.md`](src/backend/x86/codegen/peephole/README.md). diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 0e259d42c9..0000000000 --- a/LICENSE +++ /dev/null @@ -1,121 +0,0 @@ -Creative Commons Legal Code - -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. 
-These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. 
To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. 
In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. 
Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. diff --git a/README.md b/README.md index 2038aae6df..e08baee632 100644 --- a/README.md +++ b/README.md @@ -1,208 +1 @@ -# CCC — Claude's C Compiler - -A C compiler written entirely from scratch in Rust, targeting x86-64, i686, -AArch64, and RISC-V 64. Zero compiler-specific dependencies — the frontend, -SSA-based IR, optimizer, code generator, peephole optimizers, assembler, -linker, and DWARF debug info generation are all implemented from scratch. -Claude's C Compiler produces ELF executables without any external toolchain. - -> Note: With the exception of this one paragraph that was written by a human, 100% of the code and documentation in this repository was written by Claude Opus 4.6. A human guided some of this process by writing test cases that Claude was told to pass, but never interactively pair-programmed with Claude to debug or to provide feedback on code quality. As a result, I do not recommend you use this code! None of it has been validated for correctness. Claude wrote this exclusively on a Linux host; it probably will not work on MacOS/Windows — neither I nor Claude have tried. The docs may be wrong and make claims that are false. See [our blog post](https://anthropic.com/engineering/building-c-compiler) for more detail. 
- -## Prerequisites - -- **Rust** (stable, 2021 edition) — install via [rustup](https://rustup.rs/) -- **Linux host** — the compiler targets Linux ELF executables and relies on - Linux system headers / C runtime libraries (glibc or musl) being installed - on the host -- For cross-compilation targets (ARM, RISC-V, i686), the corresponding - cross-compilation sysroots should be installed (e.g., - `aarch64-linux-gnu-gcc`, `riscv64-linux-gnu-gcc`) - -## Building - -```bash -cargo build --release -``` - -This produces five binaries in `target/release/`, all compiled from the same -source. The target architecture is selected by the binary name at runtime: - -| Binary | Target | -|--------|--------| -| `ccc` | x86-64 (default) | -| `ccc-x86` | x86-64 | -| `ccc-arm` | AArch64 | -| `ccc-riscv` | RISC-V 64 | -| `ccc-i686` | i686 (32-bit x86) | - -## Quick Start - -Compile and run a simple C program: - -```bash -# Write a test program -cat > hello.c << 'EOF' -#include <stdio.h> -int main(void) { - printf("Hello from CCC!\n"); - return 0; -} -EOF - -# Compile and run (x86-64) -./target/release/ccc -o hello hello.c -./hello - -# Cross-compile for AArch64 and run under QEMU -./target/release/ccc-arm -o hello-arm hello.c -qemu-aarch64 -L /usr/aarch64-linux-gnu ./hello-arm -``` - -CCC works as a drop-in GCC replacement. Point your build system at it: - -```bash -# Build a project with make -make CC=/path/to/ccc-x86 - -# Build a project with CMake -cmake -DCMAKE_C_COMPILER=/path/to/ccc-x86 .. 
- -# Build a project with configure scripts -./configure CC=/path/to/ccc-x86 -``` - -## Usage - -```bash -# Compile and link -ccc -o output input.c # x86-64 -ccc-arm -o output input.c # AArch64 -ccc-riscv -o output input.c # RISC-V 64 -ccc-i686 -o output input.c # i686 - -# GCC-compatible flags -ccc -S input.c # Emit assembly -ccc -c input.c # Compile to object file -ccc -E input.c # Preprocess only -ccc -O2 -o output input.c # Optimize (accepts -O0 through -O3, -Os, -Oz) -ccc -g -o output input.c # DWARF debug info -ccc -DFOO=1 -Iinclude/ input.c # Define macros, add include paths -ccc -Werror -Wall input.c # Warning control -ccc -fPIC -shared -o lib.so lib.c # Position-independent code -ccc -x c -E - # Read from stdin - -# Build system integration (reports as GCC 14.2.0 for compatibility) -ccc -dumpmachine # x86_64-linux-gnu / aarch64-linux-gnu / riscv64-linux-gnu / i686-linux-gnu -ccc -dumpversion # 14 -``` - -The compiler accepts most GCC flags. Unrecognized flags (e.g., architecture- -specific `-m` flags, unknown `-f` flags) are silently ignored so `ccc` can -serve as a drop-in GCC replacement in build systems. - -### Assembler and Linker Modes - -By default, the compiler uses its **builtin assembler and linker** for all -four architectures. No external toolchain is required. You can verify this -with `--version`, which shows `Backend: standalone` when using the builtin -tools. 
- -To build with optional GCC fallback support (e.g., for debugging), enable -Cargo features at compile time: - -```bash -# Build with GCC assembler and linker fallback -cargo build --release --features gcc_assembler,gcc_linker - -# Build with GCC fallback for -m16 boot code only -cargo build --release --features gcc_m16 -``` - -| Feature | Description | -|---------|-------------| -| `gcc_assembler` | Use GCC as the assembler instead of the builtin | -| `gcc_linker` | Use GCC as the linker instead of the builtin | -| `gcc_m16` | Use GCC for `-m16` (16-bit real mode boot code) | - -When compiled with GCC fallback features enabled, `--version` shows which -components use GCC (e.g., `Backend: gcc_assembler, gcc_linker`). - -## Status - -The compiler can build real-world C codebases across all four architectures, -including the Linux kernel. Projects that compile and pass their test suites -include PostgreSQL (all 237 regression tests), SQLite, QuickJS, zlib, Lua, -libsodium, libpng, jq, libjpeg-turbo, mbedTLS, libuv, Redis, libffi, musl, -TCC, and DOOM — all using the fully standalone assembler and linker with no -external toolchain. Over 150 additional projects have also been built -successfully, including FFmpeg (all 7331 FATE checkasm tests on x86-64 and -AArch64), GNU coreutils, Busybox, CPython, QEMU, and LuaJIT. - -### Known Limitations - -- **Optimization levels**: All levels (`-O0` through `-O3`, `-Os`, `-Oz`) run - the same optimization pipeline. Separate tiers will be added as the compiler - matures. -- **Long double**: x86 80-bit extended precision is supported via x87 FPU - instructions. On ARM/RISC-V, `long double` is IEEE binary128 via - compiler-rt/libgcc soft-float libcalls. -- **Complex numbers**: `_Complex` arithmetic has some edge-case failures. -- **GNU extensions**: Partial `__attribute__` support. NEON intrinsics are - partially implemented (core 128-bit operations work). 
-- **Atomics**: `_Atomic` is parsed but treated as the underlying type (the - qualifier is not tracked through the type system). - -## Testing - -The compiler has two kinds of tests: - -**Unit tests** (in-source `#[test]` functions for individual passes and modules): - -```bash -cargo test --release -``` - -**Integration tests** (end-to-end compilation tests in `tests/`). Each test is -a directory containing a `main.c` source file and expected output files: - -``` -tests/ - some-test-name/ - main.c # C source to compile - expected.stdout # Expected stdout (if any) - expected.ret # Expected exit code (if any) - expected.skip.arm # Skip marker for specific architectures (optional) -``` - -Tests are run by compiling `main.c` with `ccc`, executing the resulting binary, -and comparing stdout and the exit code against the expected files. - -## Environment Variables - -| Variable | Purpose | -|----------|---------| -| `CCC_TIME_PHASES` | Print per-phase compilation timing to stderr | -| `CCC_TIME_PASSES` | Print per-pass optimization timing and change counts to stderr | -| `CCC_DISABLE_PASSES` | Disable specific optimization passes (comma-separated, or `all`) | -| `CCC_KEEP_ASM` | Preserve intermediate `.s` files next to output | -| `CCC_ASM_DEBUG` | Dump preprocessed assembly to `/tmp/asm_debug_.s` | - -## Project Organization - -``` -src/ Compiler source code (Rust) - frontend/ C source -> typed AST (preprocessor, lexer, parser, sema) - ir/ Target-independent SSA IR (lowering, mem2reg) - passes/ SSA optimization passes (15 passes + shared loop analysis) - backend/ IR -> assembly -> machine code -> ELF (4 architectures) - common/ Shared types, symbol table, diagnostics - driver/ CLI parsing, pipeline orchestration - -include/ Bundled C headers (x86 SIMD: SSE through AVX-512, AES-NI, FMA, SHA, BMI2; ARM NEON) -tests/ Compiler tests (each test is a directory with main.c and expected output) -ideas/ Future work proposals and improvement notes -``` - -Each `src/` 
subdirectory has its own `README.md` with detailed design -documentation. For the full architecture, compilation pipeline data flow, -and key design decisions, see [DESIGN_DOC.md](DESIGN_DOC.md). +💩 diff --git a/current_tasks/.keep b/current_tasks/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/current_tasks/fix_arm_asm_caspal_instruction.txt b/current_tasks/fix_arm_asm_caspal_instruction.txt deleted file mode 100644 index 3a28ae1934..0000000000 --- a/current_tasks/fix_arm_asm_caspal_instruction.txt +++ /dev/null @@ -1,20 +0,0 @@ -Fix ARM assembler: add support for CASP/CASPA/CASPL/CASPAL instructions - -These are LSE (Large System Extensions) atomic compare-and-swap pair -instructions used by the Linux kernel in mm/slub.o and other core -memory management code. - -The CAS (single register) family is already implemented, but the CASP -(register pair) family is missing entirely. - -CASP format: caspal x0, x1, x2, x3, [x11] - - 5 operands: compare pair (Rs, Rs+1), swap pair (Rt, Rt+1), memory [Rn] - - Encoding: sz 001000 0 L 1 Rs o0 11111 Rn Rt - - Key difference from CAS: bit 23 is 0 (vs 1 for CAS) - -Files to modify: -- src/backend/arm/assembler/encoder/mod.rs (dispatch) -- src/backend/arm/assembler/encoder/load_store.rs (new encode_casp fn) -- src/backend/arm/codegen/peephole.rs (skip casp in register prop) - -Started: 2026-02-05 diff --git a/current_tasks/fix_arm_asm_global_branch_relocs.txt b/current_tasks/fix_arm_asm_global_branch_relocs.txt deleted file mode 100644 index 5438bf4927..0000000000 --- a/current_tasks/fix_arm_asm_global_branch_relocs.txt +++ /dev/null @@ -1,19 +0,0 @@ -Fix ARM assembler: branches to global symbols in same section missing relocations - -Bug: The ARM/AArch64 assembler resolves bl/b instructions to global symbols within -the same section at assembly time instead of emitting R_AARCH64_CALL26/JUMP26 -relocations. GAS always emits relocations for global symbols. 
- -Root cause: In elf_writer.rs, the triage logic at line 643 uses: - if is_local || is_branch_reloc_type(elf_type) -This unconditionally defers ALL branch instructions, and resolve_local_branches -then resolves same-section references directly without checking if the target -is a global symbol. - -The x86 assembler handles this correctly (checks is_local_symbol before resolving), -and the RISC-V assembler only defers syntactically local symbols (.L* prefixed). - -Fix: In resolve_local_branches(), check if the target symbol is global before -resolving in-place. If global, emit a relocation instead. - -Started: 2026-02-05 diff --git a/current_tasks/fix_arm_asm_org_directive.txt b/current_tasks/fix_arm_asm_org_directive.txt deleted file mode 100644 index 751955c04c..0000000000 --- a/current_tasks/fix_arm_asm_org_directive.txt +++ /dev/null @@ -1,16 +0,0 @@ -Fix ARM assembler .org directive implementation - -The .org directive is completely ignored in the ARM assembler (parser.rs -line 1636-1639, treated as AsmDirective::Ignored). This is critical for -the Linux kernel's vector table, which uses `.org .Lventry_start + 128` -to pad each vector entry to exactly 128 bytes. - -Without .org support, the vector table entries are not properly padded, -causing all branch targets to be wrong and the ARM kernel to fail to boot -in standalone mode (even though it builds successfully). - -Fix: Implement .org as a directive that pads the current section to the -specified offset with zeros (matching GAS behavior). - -Started: 2026-02-05 -Locked by: 3364d85074230a922419bccfdeeea40597ce9665 at 2026-02-05 15:46:20 diff --git a/current_tasks/fix_arm_asm_quad_prel64_relocation.txt b/current_tasks/fix_arm_asm_quad_prel64_relocation.txt deleted file mode 100644 index 559bd616dc..0000000000 --- a/current_tasks/fix_arm_asm_quad_prel64_relocation.txt +++ /dev/null @@ -1,19 +0,0 @@ -Fix ARM assembler: .quad symbol+offset-. 
emits wrong R_AARCH64_PREL32 instead of R_AARCH64_PREL64 - -Three interconnected bugs in the AArch64 assembler: - -1. Parser (parser.rs try_parse_symbol_diff): does not decompose sym_a "symbol+offset" into - separate symbol and addend. E.g., "cgroup_bpf_enabled_key+48" was treated as a single - symbol name instead of symbol "cgroup_bpf_enabled_key" with addend 48. - -2. ELF writer (elf_writer.rs resolve_sym_diffs): always emits R_AARCH64_PREL32 (32-bit - PC-relative) regardless of data size. For .quad (8 bytes), it should emit R_AARCH64_PREL64. - -3. Encoder (encoder.rs RelocType): missing Prel64 variant (ELF type 260) needed for - 64-bit PC-relative relocations. - -This breaks the kernel's __jump_table section which uses: - .quad cgroup_bpf_enabled_key+48 - . -to encode static key references. The wrong relocation causes link failure: - undefined reference to `cgroup_bpf_enabled_key+48' - relocation truncated to fit: R_AARCH64_PREL32 diff --git a/current_tasks/fix_arm_movw_symbolic_relocations.txt b/current_tasks/fix_arm_movw_symbolic_relocations.txt deleted file mode 100644 index 59cc368170..0000000000 --- a/current_tasks/fix_arm_movw_symbolic_relocations.txt +++ /dev/null @@ -1,19 +0,0 @@ -Fix ARM assembler: MOVW relocations for symbolic abs_g* modifiers - -The ARM assembler fails when movz/movk/movn use :abs_g*: modifiers with -symbolic references (not pure constants). When resolve_abs_g_modifier() -returns None (symbol can't be resolved to a constant), the encoder falls -through to get_imm() which fails with "expected immediate at operand". 
- -This breaks the kernel's tramp_alias macro which uses: - .set .Lalias\@, -10526720 + \sym - .entry.tramp.text - movz \dst, :abs_g2_s:.Lalias\@ - movk \dst, :abs_g1_nc:.Lalias\@ - movk \dst, :abs_g0_nc:.Lalias\@ - -Fix: Add MOVW relocation types (MovwUabsG0Nc, MovwUabsG1Nc, MovwUabsG2Nc, -MovwUabsG3, MovwSabsG0, MovwSabsG1, MovwSabsG2) to the RelocType enum -and emit WordWithReloc in encode_movz/encode_movk/encode_movn when the -modifier contains a symbolic reference. - -Started: 2026-02-05 diff --git a/current_tasks/fix_dash.txt b/current_tasks/fix_dash.txt deleted file mode 100644 index 40f35a8aeb..0000000000 --- a/current_tasks/fix_dash.txt +++ /dev/null @@ -1,3 +0,0 @@ -Fixing dash project for riscv target. -dash is a POSIX shell. FAIL on riscv, PASS on x86/i686/arm. -Started: 2026-02-05 diff --git a/current_tasks/fix_i686_double_param_high_word_store.txt b/current_tasks/fix_i686_double_param_high_word_store.txt deleted file mode 100644 index 26ae433078..0000000000 --- a/current_tasks/fix_i686_double_param_high_word_store.txt +++ /dev/null @@ -1,12 +0,0 @@ -Fix i686 codegen: missing high-word store for double parameter copy - -Bug: When a function takes a double parameter on i686, the codegen copies -the parameter from the stack to a local alloca. For 64-bit values (double), -this requires two 32-bit stores (low word and high word). In certain cases -(triggered by including stdlib.h + math.h with long double functions present), -the high-word store (movl %eax, -4(%ebp)) is dropped, leaving the upper -half of the double uninitialized. 
- -Reproducer: /clean_tests/compiler_suite_0149_0011 - - f01(double) -> unsigned int returns 0 instead of 10 - - Only the low 32 bits of the double are stored; high 32 bits are garbage diff --git a/current_tasks/fix_macro_param_prefix_substitution.txt b/current_tasks/fix_macro_param_prefix_substitution.txt deleted file mode 100644 index 4050be595b..0000000000 --- a/current_tasks/fix_macro_param_prefix_substitution.txt +++ /dev/null @@ -1,16 +0,0 @@ -Fix macro parameter substitution prefix-matching bug - -Bug: When a .macro has parameters like `orig` and `orig_len`, the naive -String::replace in declaration order causes `\orig` to match inside -`\orig_len`, producing corrupted output like `140b_len` instead of -the value of `orig_len`. - -This causes undefined references to `140b_len` and `143f_len` in the -Linux kernel's .altinstructions section when building in standalone mode. - -Fix: Sort macro parameters by name length (longest first) before -substitution, matching what the ARM parser already does correctly. - -Files to fix: -- src/backend/asm_preprocess.rs (expand_macros, expand_macros_with) -- src/backend/x86/assembler/parser.rs (expand_gas_macros_with_state) diff --git a/current_tasks/fix_pcre2_stack_frame_bloat.txt b/current_tasks/fix_pcre2_stack_frame_bloat.txt deleted file mode 100644 index 92ec65cc08..0000000000 --- a/current_tasks/fix_pcre2_stack_frame_bloat.txt +++ /dev/null @@ -1,16 +0,0 @@ -Fix pcre2 x86 build: stack frame bloat causing stack overflow in deeply recursive code. - -The pcre2 project crashes with a segfault in test 8 (compile_branch with 792 nested -parentheses). Root cause: our compiler allocates 10320-byte stack frames for -compile_branch (vs GCC's ~500 bytes), and 792 recursive levels * 10.3KB > 8MB stack limit. - -The bloat is from using 8-byte minimum stack slots for ALL values on x86-64, even when -the IR type is 32-bit or smaller. Each of ~1289 SSA temporaries gets an 8-byte slot. 
- -Fix: Enable 4-byte stack slots for small values (I8, U8, I16, U16, I32, U32, F32) on -x86-64. This requires updating: -1. stack_layout/mod.rs: use slot_size=4 for small values -2. x86 codegen emit.rs: store_rax_to and value_to_reg use movl for small slots -3. x86 prologue.rs: adjust alignment in assign_slot closure - -Started: 2026-02-05 diff --git a/current_tasks/fix_riscv_va_arg_long_double_struct.txt b/current_tasks/fix_riscv_va_arg_long_double_struct.txt deleted file mode 100644 index 46a668ea02..0000000000 --- a/current_tasks/fix_riscv_va_arg_long_double_struct.txt +++ /dev/null @@ -1,11 +0,0 @@ -Fix RISC-V va_arg for struct containing long double (misaligned stack) - -When a struct containing a long double is passed as a variadic argument on -the stack (all registers exhausted), va_arg reads garbage instead of the -correct value. The issue is that long double (f128) structs need 16-byte -alignment when read from the va_list stack area, but the current va_arg -implementation doesn't properly align the stack pointer before reading. - -Test case: compiler_suite_0172_0039, specifically test_misalign_r10 - -Started: 2026-02-05 diff --git a/current_tasks/fix_x86_asm_ifnb_ifb_conditional.txt b/current_tasks/fix_x86_asm_ifnb_ifb_conditional.txt deleted file mode 100644 index 47cefbf88c..0000000000 --- a/current_tasks/fix_x86_asm_ifnb_ifb_conditional.txt +++ /dev/null @@ -1,25 +0,0 @@ -Task: Implement .ifnb/.ifb conditional assembly directives in x86 assembler - -Problem: -The x86 assembler does not support .ifnb (if not blank) and .ifb (if blank) -conditional assembly directives. These are GAS directives used in Linux kernel -assembly macros (e.g., IBRS_ENTER in arch/x86/entry/entry_64.S) to conditionally -include code based on whether a macro argument is blank or not. 
- -When the kernel calls IBRS_ENTER without a save_reg argument, the .ifnb block -should be skipped, but instead the assembler processes the instructions inside -the .ifnb block, leading to "mov requires 2 operands, got 1" errors because -`mov %rax, \save_reg` expands to `mov %rax` when save_reg is blank. - -Root Cause: -x86 assembler parser.rs expand_gas_macros_with_state() handles .if, .ifc, .ifdef, -.ifndef but NOT .ifnb or .ifb. These directives are simply not recognized. - -Fix: -Add .ifnb and .ifb handling in expand_gas_macros_with_state() following the same -pattern as .ifc, and update is_if_start() to recognize them. - -Files to modify: -- src/backend/x86/assembler/parser.rs - -Status: IN PROGRESS diff --git a/current_tasks/fix_x86_standalone_kernel_link_errors.txt b/current_tasks/fix_x86_standalone_kernel_link_errors.txt deleted file mode 100644 index 858ce163ec..0000000000 --- a/current_tasks/fix_x86_standalone_kernel_link_errors.txt +++ /dev/null @@ -1,13 +0,0 @@ -Fix x86 standalone kernel link errors: - -1. level1_fixmap_pgt + (N << 12): parse_data_values fails to parse - complex "symbol + expr - const + const" expressions. The " - " check - splits on the wrong boundary, leaving "symbol + (expr)" as a symbol name. - -2. Undefined reference to '0f': a numeric forward label reference not - resolved during altinstructions macro expansion. - -3. Mangled expression '(((1' in .altinstructions: likely an expression - fragment from alt_max_2/alt_max_3 macros not being properly evaluated. - -Started: 2026-02-05 diff --git a/current_tasks/implement_string_literal_deduplication.txt b/current_tasks/implement_string_literal_deduplication.txt deleted file mode 100644 index fe154ce8f1..0000000000 --- a/current_tasks/implement_string_literal_deduplication.txt +++ /dev/null @@ -1,18 +0,0 @@ -Implement string literal deduplication (-fmerge-constants) - -GCC deduplicates identical string literals by default (placing them at -the same address in .rodata). 
Our compiler currently emits separate copies -for each occurrence, even when the strings are byte-identical. - -This causes test failures (e.g., compiler_suite_0011_0092 on RISC-V) -where code uses memcmp on struct fields containing pointers to "identical" -string literals. GCC merges them so the pointers match; ours don't. - -Plan: -- In the linker/ELF writer for all backends, deduplicate string literals - in the .rodata section (or wherever they end up) -- Specifically, when emitting string constants, check if an identical - string has already been emitted and reuse its address -- This should be done at the IR lowering or codegen level - -Started: 2026-02-05 diff --git a/ideas/.keep b/ideas/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/ideas/cleanup_i686_dead_peephole_passes.txt b/ideas/cleanup_i686_dead_peephole_passes.txt deleted file mode 100644 index 09bb2f4cf1..0000000000 --- a/ideas/cleanup_i686_dead_peephole_passes.txt +++ /dev/null @@ -1,24 +0,0 @@ -Clean up dead/disabled peephole passes in i686 backend -====================================================== -Priority: LOW - -The i686 peephole optimizer (src/backend/i686/codegen/peephole.rs) has three -disabled passes marked with #[allow(dead_code)]: - -1. Global store forwarding (line ~851): Disabled due to 21 FP computation - regressions. FP load/store forwarding patterns need investigation. - -2. Callee-save push/pop elimination (line ~1475): Disabled due to stack - misalignment causing 97+ segfault regressions. Breaks the - leal -N(%ebp),%esp epilogue pattern. - -3. Function-level callee-save removal (line ~1542): Disabled because it - removes push/pops that the epilogue pattern depends on. 
- -These should either be: -a) Fixed and re-enabled (preferred), or -b) Removed entirely if superseded by other approaches - -The x86-64 peephole has working versions of all these passes, so the i686 -versions may be able to borrow from the x86-64 implementation with -appropriate 32-bit adjustments. diff --git a/ideas/docs_verified_2026_01_29.txt b/ideas/docs_verified_2026_01_29.txt deleted file mode 100644 index bfa5329ecd..0000000000 --- a/ideas/docs_verified_2026_01_29.txt +++ /dev/null @@ -1,140 +0,0 @@ -Documentation Verification Changelog -===================================== - -Last audited: 2026-02-04 - -All README.md files have been systematically verified against the actual -source code. This file tracks what was checked and what was fixed. - -README/DESIGN_DOC Split + MY_ASM/MY_LD Correction (2026-02-04): - - Split top-level README.md into focused README (building, usage, status) - and DESIGN_DOC.md (architecture, pipeline, design decisions) - - Fixed incorrect MY_ASM/MY_LD environment variable documentation across - all READMEs. The actual mechanism is compile-time Cargo features - (gcc_assembler, gcc_linker, gcc_m16), not runtime env vars. The builtin - assembler and linker are the default; no env vars needed. 
- - Updated: README.md, DESIGN_DOC.md (new), src/backend/README.md, - src/driver/README.md, src/backend/x86/README.md, - src/backend/x86/assembler/README.md, src/backend/arm/README.md, - src/backend/arm/assembler/README.md, src/backend/arm/linker/README.md, - src/backend/riscv/README.md, src/backend/riscv/linker/README.md - -Builtin Assembler/Linker Documentation Update (2026-02-04): - README.md (top-level): - - Fixed false claim "Assembly and linking currently delegate to GNU toolchain" - - Removed "External toolchain required" from Known Limitations - - Updated pipeline diagram to show builtin assembler and linker stages - - Added "Assembler and Linker Selection" section with MY_ASM/MY_LD docs - - Updated source tree to include assembler/ and linker/ per architecture - - Updated data flow diagram to show builtin assembler/linker path - - Added "Builtin assembler and linker" to Key Design Decisions - - Fixed "No compiler-specific dependencies" to "Zero external dependencies" - - Removed reference to ideas/native_elf_writer.txt (now completed) - - src/backend/README.md: - - Fixed opening paragraph: "delegates to external GCC" -> "assembles and links - into ELF executables" with builtin assembler/linker description - - Updated pipeline diagram to show builtin assembler and linker stages - - Updated step 5 description to mention builtin or external toolchain - - Added assembler/linker subdirectories to directory layout - - Replaced "External Toolchain Integration" section with comprehensive - "Builtin Assembler" and "Builtin Linker" design sections - - Added "Assembler and Linker Selection" section - - src/backend/x86/README.md: - - Added builtin assembler/linker mention to opening paragraph - - src/backend/arm/README.md: - - Fixed "assembled by the system assembler" to mention builtin assembler/linker - - src/backend/i686/README.md: - - Added builtin assembler/linker mention to overview - - New design documents: - - src/backend/x86/assembler/README.md (new) - - 
src/backend/x86/linker/README.md (new) - - src/backend/arm/assembler/README.md (new) - - src/backend/arm/linker/README.md (new) - - src/backend/riscv/assembler/README.md (new) - - src/backend/riscv/linker/README.md (new) - - src/backend/i686/assembler/README.md (new) - - src/backend/i686/linker/README.md (new) - - Deleted completed ideas: - - ideas/native_elf_writer.txt (all 4 architectures now have builtin asm+ld) - -Accuracy Audit (2026-02-03): - src/frontend/README.md: - - Fixed `$` in identifiers claim (always permitted, not gated by gnu_extensions) - - Fixed Token described as tuple -> struct with named fields - - Fixed Parser::new() signature (takes only Vec) - - Fixed "five impl Parser blocks" -> six (includes parser.rs core block) - - Fixed TranslationUnit described as Vec -> struct { decls: Vec } - - src/ir/README.md: - - Added missing IrType variants: U8, U16, U32, U64, U128, Void - - Fixed instruction count from "approximately 30" to 38 - - Removed false claim that IrBinOp has eval_f64 method - - Fixed CfgAnalysis "loop information" claim (contains block count, not loops) - - Fixed Pclmulqdq -> Pclmulqdq128 intrinsic name - - src/passes/README.md: - - Fixed div_by_const annotation from "(64-bit)" to "(disabled on 32-bit)" - - Rewrote dependency graph to match actual should_run! edges - - Fixed IPCP description (uses dirty tracking, not should_run! 
system) - - src/backend/README.md: - - Fixed method count to ~185 methods, ~50 defaults - - Removed nonexistent emit_call_indirect method - - Fixed "6-phase call sequence" to 8-phase - - Fixed "approximately 17 codegen files" to 18-19 - - Fixed "Backends provide free functions" to "traits.rs provides free functions" - - Added note about x86 peephole being a subdirectory - - src/common/README.md: - - Fixed LP64 int/long claim ("both 64 bits" is wrong; int is 32, long is 64) - - Fixed decode_pua_byte signature (takes &[u8]+pos, not a char) - - Fixed eval_const_binop_int described as "public" (it is module-private) - - src/driver/README.md: - - Fixed --print-search-dirs to -print-search-dirs (single dash, matches GCC) - - Expanded target detection to document aarch64 and i386 triggers - - README.md (top-level): - - Added IrType U8-U128 and Void to dual type system description - - Added Design Philosophy section - - Updated ArchCodegen method count to ~185 - - src/ir/instruction.rs: - - Fixed doc comment instruction count from "~30" to "38" - -Full Overhaul (2026-02-02): -- README.md: Complete rewrite as proper project README - - Fixed GCC version to "14.2.0"/"14" (matches actual -dumpversion output) - - Added -Os/-Oz to optimization levels - - Accurate long double description (x87 + ARM/RISC-V soft-float) - - Added _Atomic, optimization level uniformity notes - - Listed ALL backend shared files - - Added dead_statics and resolve_asm to passes listing - - Added peephole optimization as key design decision - - Added phi elimination step to pipeline diagram - -- src/backend/x86/README.md: Complete rewrite with actual codegen file table -- src/backend/arm/README.md: Complete rewrite with codegen file table -- src/backend/riscv/README.md: Updated file table -- src/backend/i686/README.md: Updated file table -- src/ir/lowering/README.md: Added missing expr_sizeof.rs -- src/frontend/README.md: Improved module descriptions -- src/frontend/lexer/README.md: Expanded design notes 
- -Verified Correct (no changes needed, 2026-02-02): -- src/backend/README.md: All shared files match disk -- src/passes/README.md: All submodule files match disk; pipeline order accurate -- src/common/README.md: All modules match disk -- src/ir/README.md: All files + directories match disk -- src/frontend/preprocessor/README.md: All files match disk -- src/frontend/parser/README.md: All files match disk -- src/frontend/sema/README.md: All files match disk -- src/ir/mem2reg/README.md: All files match disk -- src/backend/x86/codegen/peephole/README.md: Accurate pass pipeline -- src/driver/README.md: Accurate pipeline and module descriptions diff --git a/ideas/fix_arm_defconfig_kvm_va_layout.txt b/ideas/fix_arm_defconfig_kvm_va_layout.txt deleted file mode 100644 index 51600ea42a..0000000000 --- a/ideas/fix_arm_defconfig_kvm_va_layout.txt +++ /dev/null @@ -1,34 +0,0 @@ -Fix ARM64 defconfig: KVM va_layout BUG in kvm_update_va_mask - -The ARM64 defconfig kernel crashes during boot with: - kernel BUG at arch/arm64/kvm/va_layout.c:182! - pc : kvm_update_va_mask+0x298/0x38c - -The BUG_ON fires because compute_instruction() returns AARCH64_BREAK_FAULT, -meaning aarch64_insn_gen_logical_immediate() failed to encode the va_mask -as a logical immediate. This indicates va_mask has an invalid value (0 or -all-ones) at runtime. - -Root cause is likely a CCC codegen bug in kvm_compute_layout(), which -computes va_mask using: - tag_lsb = fls64(phys_to_virt(memblock_start_of_DRAM()) ^ (high_memory - 1)); - va_mask = GENMASK_ULL(tag_lsb - 1, 0); - -The function involves complex bit manipulation using vabits_actual (derived -from TCR_EL1), fls64/__builtin_clzl, and GENMASK_ULL. GCC produces much -more compact code for this function (97 lines vs CCC's 480+), suggesting -CCC may have a codegen issue with the bitwise operations, TCR_EL1 register -read, or the fls64 always_inline chain. 
- -This crash is pre-existing (not caused by the inline fix) but was previously -masked by earlier crashes (cpucap BRK at boot, rmqueue_bulk stack overflow). - -Debugging steps: -1. Compare CCC vs GCC disassembly of kvm_compute_layout byte-by-byte -2. Check if TCR_EL1 MRS lowering is correct -3. Check GENMASK_ULL macro expansion and bit shifting operations -4. Test with a standalone C program that exercises the same bit patterns - -Files: -- arch/arm64/kvm/va_layout.c (kvm_compute_layout, compute_instruction) -- arch/arm64/lib/insn.c (aarch64_insn_gen_logical_immediate, aarch64_encode_immediate) diff --git a/ideas/fix_ir_unsigned_subint_representation.txt b/ideas/fix_ir_unsigned_subint_representation.txt deleted file mode 100644 index d139b63191..0000000000 --- a/ideas/fix_ir_unsigned_subint_representation.txt +++ /dev/null @@ -1,21 +0,0 @@ -Fix IrConst Unsigned Sub-Int Type Representation -================================================= -Date: 2026-01-26 -Priority: MOSTLY FIXED (2026-02-05) - -STATUS UPDATE (2026-02-05) --------------------------- -Fixed narrowed_to() in constants.rs to use I64 with zero-extension for all -unsigned types (U8, U16, U32), matching the from_i64() convention. This was -causing compound assignments like `unsigned int ui = UINT_MAX; ui /= 5000.0;` -to produce wrong results because mem2reg's narrowed_to() stored UINT_MAX as -I32(-1), and constant folding then interpreted -1 as UINT64_MAX. - -REMAINING ISSUE ---------------- -from_i64() correctly stores U8/U16/U32 as I64 with zero-extension. -narrowed_to() now also correctly stores unsigned types as I64 with zero-extension. - -The main remaining concern is places where optimization passes create IrConst -values directly (not through from_i64 or narrowed_to) for unsigned types. -These should be audited to ensure they follow the I64 zero-extension convention. 
diff --git a/ideas/fix_riscv_kernel_boot_hang.txt b/ideas/fix_riscv_kernel_boot_hang.txt deleted file mode 100644 index 88976b0531..0000000000 --- a/ideas/fix_riscv_kernel_boot_hang.txt +++ /dev/null @@ -1,23 +0,0 @@ -Fix RISC-V kernel boot hang on QEMU - -The CCC-compiled RISC-V kernel (Linux 6.9, defconfig) successfully builds -and produces a valid Image file, but hangs during early kernel init when -booted on qemu-system-riscv64 -M virt. OpenSBI runs fine and transfers -control to the kernel, but no Linux messages appear. - -In contrast, the same kernel compiled with riscv64-linux-gnu-gcc boots -fully to a shell. - -This is likely a codegen bug in the RISC-V backend that manifests in -early kernel init code (head.S -> start_kernel path). Possible areas: - -1. Inline assembly issues in arch/riscv/kernel/head.S processing -2. Miscompilation of early init C code (setup_arch, etc.) -3. Memory barrier/fence issues affecting early console setup -4. CSR access instruction encoding problems - -To debug: -- Build with earlycon=sbi to get SBI console output -- Compare objdump of start_kernel between GCC and CCC builds -- Use QEMU -d in_asm,cpu to trace instruction execution -- Binary search: build most files with GCC, swap CCC objects one at a time diff --git a/ideas/fix_riscv_va_arg_runtime.txt b/ideas/fix_riscv_va_arg_runtime.txt deleted file mode 100644 index 3b9589f60c..0000000000 --- a/ideas/fix_riscv_va_arg_runtime.txt +++ /dev/null @@ -1,14 +0,0 @@ -RISC-V va_arg Remaining Issues -=============================== - -Five items previously fixed (register save area layout, named float params, -va_list parameter passing, register allocation awareness, multi-register -param counting). Two issues remain: - -1. Struct arguments in variadic calls are passed by pointer instead of by value. - Small structs (<= 2*XLEN = 16 bytes) should be flattened into GP registers - at the call site. Affects struct va_arg (e.g., va_arg(ap, struct { float f; })). 
- Fix needed in: IR lowering of call args or backend emit_call. - -2. Long double (f128) va_arg may have alignment issues for structs containing - long double members. diff --git a/ideas/fix_x86_full_f128_80bit_precision.txt b/ideas/fix_x86_full_f128_80bit_precision.txt deleted file mode 100644 index 5a90cfc0e4..0000000000 --- a/ideas/fix_x86_full_f128_80bit_precision.txt +++ /dev/null @@ -1,43 +0,0 @@ -Full 80-bit precision for x86 long double (F128) - -Status: Partially done. Casts and binops now use x87 FPU, but the f64 register -intermediate still limits precision. - -Problem: -The x86 backend stores F128 values as f64 bit-patterns in %rax registers, -converting to/from x87 80-bit format only for memory load/store. This means: -1. Values that exceed f64's 53-bit mantissa lose precision when passing - through registers (e.g., INT64_MAX as long double) -2. Constants like LDBL_MIN (3.36e-4932) underflow to 0.0 in f64 -3. x87 intermediate precision in binops helps but can't recover already-lost data - -Known failure cases: -- islessgreater with LDBL_MIN (underflows to 0 in f64) -- long long -= long double (i64 precision loss) -- Other long double precision-sensitive operations - -Solution approaches (from easiest to hardest): - -A) Memory-based F128 protocol (recommended): - Instead of f64 bits in %rax, F128 values would be stored in stack alloca - slots in 80-bit format. Operations would load/store directly from/to these - slots using fldt/fstpt. Register "values" for F128 would be pointers to - the stack slots. This requires: - - New F128 alloca allocation in codegen state - - Modified operand_to_rax for F128: loads into x87 ST0 instead - - Modified store_rax_to for F128: stores from ST0 via fstpt - - All F128 operations work directly on x87 stack - - ~500 LOC change estimate - -B) x87 stack-based protocol: - F128 values live in x87 FPU registers (ST0-ST7). Limited to 8 live values - but sufficient for most code. Requires tracking x87 stack depth. 
- More complex than A, harder to integrate with existing register allocator. - -C) Hybrid approach: - Keep f64 in registers for most operations, but add special IR annotations - for F128 operations that need full precision. The codegen would fuse - cast+store and load+binop patterns to avoid the f64 intermediate when - the full chain is F128. - -To verify: test islessgreater with LDBL_MIN and long long -= long double. diff --git a/ideas/high_codegen_runtime_perf.txt b/ideas/high_codegen_runtime_perf.txt deleted file mode 100644 index df2d4e82a2..0000000000 --- a/ideas/high_codegen_runtime_perf.txt +++ /dev/null @@ -1,37 +0,0 @@ -Runtime Performance Improvements -================================= -Priority: HIGH - -Remaining bottlenecks (profiled on zlib vs GCC -O2): - -1. LOOP INDUCTION VARIABLE STRENGTH REDUCTION - Array accesses in loops compute index*stride every iteration: - movslq i, %rax; shlq $2, %rax; addq base, %rax; movl (%rax), ... - Should increment a pointer instead: addq $4, %rdi; movl (%rdi), ... - Fix: Loop strength reduction pass (iv_strength_reduce.rs exists but - may need extension for pointer-based induction variables). - -2. REDUNDANT SIGN EXTENSIONS (partially addressed) - The lowering emits Cast i32->i64 for array indices even when the - value is already 64-bit. Codegen emits redundant movslq/cltq. - The peephole catches some but not all cases. - DONE: Narrow pass now handles AShr/LShr shift narrowing (eliminates - widen-shift-narrow pattern for right shifts, e.g. DOOM's fixed-point - frac>>16 pattern). Reduced DOOM text by ~6KB, movslq by ~390. - Remaining: Array index casts for pointer arithmetic (no narrowing - cast follows, so the narrow pass can't help). Would need either IR-level - analysis or backend-level elimination of redundant sign extensions. - -3. REDUNDANT REGISTER-REGISTER MOVES - Patterns like: movq %rax, %r14; movq %r14, %r15 - Arise because codegen routes through %rax as accumulator, then - copies to callee-saved registers. 
- Fix: Better peephole patterns, or teach the register allocator - to avoid the accumulator roundtrip for register-to-register ops. - -Key files: -- src/backend/regalloc.rs -- src/backend/liveness.rs -- src/backend/x86/codegen/peephole/ -- src/passes/simplify.rs -- src/passes/iv_strength_reduce.rs diff --git a/ideas/high_compile_speed_improvements.txt b/ideas/high_compile_speed_improvements.txt deleted file mode 100644 index 0644da1ac6..0000000000 --- a/ideas/high_compile_speed_improvements.txt +++ /dev/null @@ -1,49 +0,0 @@ -Compile Speed Improvements -=========================== -Priority: HIGH - -Profiled on sqlite3.c (callgrind, 20.2B instructions after fixes). -Previous profile: kernel/softirq.c (2.31B instructions). - -FIXED: Peephole loop trampoline O(n*labels) quadratic scan was 19.76% -of total compile time. Pre-built reverse index reduces it to 0.21%. -Overall 22.7% instruction count reduction (26.1B -> 20.2B). - -Remaining bottlenecks: - -1. EXTERNAL ASSEMBLER - The compiler shells out to gcc/as for assembling text assembly into - object files. Process spawn overhead matters for parallel kernel builds. - Fix: Native assembler + ELF writer (see native_elf_writer.txt). - Expected improvement: eliminates fork/exec per compilation unit. - -2. ALLOCATION OVERHEAD (~17.5% of total) - malloc/free/realloc/memcpy dominate the profile (5.5% _int_malloc, - 5% memcpy, 4.5% _int_free, 3% malloc, 1.7% free, etc.). - Partially fixed: build_cfg uses FlatAdj CSR format, CFG/dominator - analysis shared via CfgAnalysis. Pipeline clone eliminated. - Further: arena/bump allocators, string interning, reuse Vec buffers. - -3. PREPROCESSOR (~17.6% of total) - preprocess_source is the single hottest function. Macro expansion - (expand_text) is 5.8%, process_directive is 3.5%. 
- Partially fixed: FxHashSet reuse, include caching, Cow, - set_file() fast path for __FILE__ (avoids MacroDef alloc per #include), - reusable directive_expanding set for handle_if/elif/line/error, - batch slice copies in expand_text/substitute_params inner loops. - Further: string interning for macro names in expanding set, - reusable Vec for parse_macro_args, MacroDef clone avoidance. - -4. STRING INTERNING (potential ~5% improvement) - Every identifier token allocates a heap String. Same function/type - names are re-allocated at each compiler stage (lexer -> AST -> IR). - Fix: String interning with u32 symbol IDs would eliminate most of - the per-identifier allocation overhead. - -5. LEXER (~4.2% of total) - Lexer::tokenize is the 2nd hottest ccc function. Dominated by - identifier scanning and keyword lookup (~70-arm match). - Fix: Perfect hash for keywords, reduced from_utf8 overhead. - -The native assembler (item 1) and string interning (item 4) are the -highest-impact remaining changes. diff --git a/ideas/high_i686_boot_code_size_reduction.txt b/ideas/high_i686_boot_code_size_reduction.txt deleted file mode 100644 index 7bd51e93f2..0000000000 --- a/ideas/high_i686_boot_code_size_reduction.txt +++ /dev/null @@ -1,54 +0,0 @@ -HIGH PRIORITY: i686 Boot Code Size Reduction (Linux kernel "Setup too big" fix) - -Problem: -The Linux kernel's 16-bit boot code (arch/x86/boot/) must fit in 32KB. -GCC with -Os -mregparm=3 -fomit-frame-pointer produces ~20KB. -CCC currently produces ~65KB, exceeding the limit by 2x. - -Root Cause Analysis (comparing CCC vs GCC for tty.c): -- CCC: 394 lines of assembly, ~1300 bytes text -- GCC: 179 lines of assembly, ~400 bytes text -- CCC is ~3x larger per file - -Key improvements needed (in priority order): - -1. 
i686 Peephole Optimizer (HIGHEST IMPACT, ~30-40% code reduction) - Port key passes from the x86-64 peephole to i686: - - Dead store elimination: movl %eax, N(%ebp) followed by movl %X, N(%ebp) - - Store-load forwarding: movl %eax, N(%ebp) + movl N(%ebp), %eax -> eliminate second - - Self-move elimination: movl %eax, %eax -> nop - - Copy propagation: movl %eax, %ebx + movl %ebx, %eax -> eliminate second - - Compare-branch fusion - The x86-64 peephole infrastructure is well-tested and could be adapted. - -2. -fomit-frame-pointer for i686 (~10-15% code reduction) - Replace EBP-relative addressing with ESP-relative for all slot accesses. - Challenges: - - ESP changes during function calls (subl/addl around calls) - - Need to track ESP offset at each instruction - - All slot references need offset adjustment - - Frees EBP as a general-purpose register (4th allocatable reg!) - GCC uses ESP-relative extensively for -Os code. - -3. Register allocation improvements (~10-20% code reduction) - Currently only 3 callee-saved regs are allocatable (ebx, esi, edi). - With frame pointer omission, ebp becomes available (4th reg). - Better register allocation reduces spills to stack. - -4. 
-Os specific optimizations - - Prefer shorter instruction encodings (e.g., xorl %eax, %eax vs movl $0, %eax) - - More aggressive inlining threshold reduction - - Tail call optimization for leaf functions - -Status: -- -mregparm=3 has been implemented -- i686 peephole optimizer has been implemented (Phase 1 done): - - Store/load forwarding, self-move elimination, dead store elimination, - compare-branch fusion (enhanced with store/load pair skipping), - memory operand folding, branch inversion, reverse move elimination, - never-read store elimination - - Strength reduction: addl $1 → incl, subl $1 → decl, movl $0 → xorl - - Redundant movsbl elimination -- Current boot size: ~63KB (target: 32KB) -- Remaining gap requires items 2-4 above (frame pointer omission, better - register allocation, -Os specific optimizations) diff --git a/ideas/high_sema_expansion_typed_ast.txt b/ideas/high_sema_expansion_typed_ast.txt deleted file mode 100644 index 513a41ef7d..0000000000 --- a/ideas/high_sema_expansion_typed_ast.txt +++ /dev/null @@ -1,33 +0,0 @@ -Expand Sema to Produce Typed AST -================================== -Priority: HIGH - -Steps 1 through 5.6 are complete: -- TypeContext in sema/type_context.rs -- ExprTypeChecker in sema/type_checker.rs with CType inference -- CType arithmetic methods (usual_arithmetic_conversion etc.) in common/types.rs -- ExprTypeMap: sema annotates every expression with its CType -- ConstMap: sema pre-computes compile-time constants via SemaConstEval -- Shared const_eval arithmetic in common/const_arith.rs and common/const_eval.rs -- Lowerer consumes sema type annotations and const map as O(1) fast paths - -Remaining steps: - -Step 6: Add actual type error diagnostics in sema - - Sema should emit structured diagnostics for type mismatches, undeclared - identifiers, invalid conversions, etc. instead of deferring to lowering panics. 
- -Step 7: Simplify lowering to assume types are correct - - With sema validating types, lowering can drop most of its type inference - code and assume the AST is well-typed. Expected reduction: ~18K to ~12K lines. - -Benefits: -- Type errors become diagnostics instead of IR lowering panics -- Clearer module boundary: sema validates, lowering emits -- Enables future type-system features and better error messages - -Key files: -- src/frontend/sema/sema.rs -- src/frontend/sema/type_context.rs, type_checker.rs, const_eval.rs -- src/common/types.rs, const_arith.rs, const_eval.rs -- src/ir/lowering/expr_types.rs, const_eval.rs, lowering.rs diff --git a/ideas/high_use_def_chains.txt b/ideas/high_use_def_chains.txt deleted file mode 100644 index c17c9ca398..0000000000 --- a/ideas/high_use_def_chains.txt +++ /dev/null @@ -1,46 +0,0 @@ -HIGH PRIORITY: Add use-def chains to the IR - -Problem: -Every optimization pass independently scans all instructions to find uses of values. -DCE builds a HashSet by walking everything (collect_used_values, dce.rs:58-72). -Constant folding runs fixpoint loops that can never propagate constants to downstream -users because it only sees literal Operand::Const values, not "this Value was folded -to a constant". GVN replaces redundant instructions with Copy but cannot propagate -the replacement to users. Simplify cannot see that an operand was defined as zero -in a prior instruction. - -The pipeline now runs 14 passes across 3 iterations, meaning each function -undergoes 30+ full instruction scans per compilation. Each scan walks every -block, every instruction, and pattern-matches all 28+ Instruction variants. - -Current state (passes/mod.rs, run_passes): - 14 passes (cfg_simplify, copy_prop, div_by_const, narrow, simplify, - constant_fold, gvn, licm, ivsr, if_convert, copy_prop, dce, cfg_simplify, - ipcp), iterated up to 3 times with dirty tracking and dependency-based - skip logic. 
Each pass takes &mut IrModule, returns usize count, no shared - state between passes. - -What to do: -1. Add a UseDefInfo struct that maintains, for each Value: - - A list of instructions that use this value (use-chain) - - The instruction that defines this value (def-chain) -2. Build it once before the pass pipeline starts -3. Update it incrementally as passes mutate the IR (instruction replacement, - deletion, operand rewriting) -4. Replace collect_used_values in DCE with a use-count check (use_count == 0 => dead) -5. Enable constant folding to rewrite operands of downstream users when a value - is folded to a constant -6. Enable copy propagation (when %x = copy %y, replace all uses of %x with %y) - -This is the single highest-leverage infrastructure improvement for the optimizer. -It turns O(passes * instructions) repeated scanning into O(1) per-value lookups, -and unlocks optimizations that are currently impossible (forward propagation of -constants and copies). - -Key files: -- src/passes/dce.rs (collect_used_values, collect_instruction_uses: 110-line match) -- src/passes/constant_fold.rs (try_fold: cannot propagate, fixpoint is useless) -- src/passes/gvn.rs (replaces with Copy, cannot propagate to users) -- src/passes/simplify.rs (cannot see defining instruction of operands) -- src/passes/mod.rs (run_passes: no shared context between passes) -- src/ir/ir.rs (Instruction enum, Operand enum, Value type) diff --git a/ideas/high_value_location_abstraction.txt b/ideas/high_value_location_abstraction.txt deleted file mode 100644 index 18dbf63aa3..0000000000 --- a/ideas/high_value_location_abstraction.txt +++ /dev/null @@ -1,24 +0,0 @@ -ValueLocation Abstraction for Codegen -====================================== -Priority: HIGH - -Phase 1 (RegCache) and Phase 2 (push/pop elimination for x86) are complete. - -Remaining work: - -1. Introduce ValueLocation enum: - enum ValueLocation { Stack(StackSlot), Register(PhysReg) } -2. Change value_locations to HashMap -3. 
Update emit_load_operand / emit_store_result to dispatch on ValueLocation -4. Extend RegCache to track the secondary register (rcx/x1/t1) -5. Add invalidate_value() for targeted invalidation on stores -6. Apply push/pop elimination to ARM and RISC-V backends -7. Remaining x86 push/pop patterns: i128 binop prep, i128 mul/divrem - register rearrangement, inline asm output scratch, variadic call AL - -Key files: -- src/backend/state.rs (CodegenState, StackSlot, RegCache) -- src/backend/generation.rs (cache invalidation) -- src/backend/x86/codegen/codegen.rs -- src/backend/arm/codegen/codegen.rs -- src/backend/riscv/codegen/codegen.rs diff --git a/ideas/low_structured_error_infrastructure.txt b/ideas/low_structured_error_infrastructure.txt deleted file mode 100644 index 6a6a4e48d7..0000000000 --- a/ideas/low_structured_error_infrastructure.txt +++ /dev/null @@ -1,137 +0,0 @@ -LOW PRIORITY: Structured error infrastructure follow-ups - -COMPLETED (2026-01-28): -- Created DiagnosticEngine in common/error.rs with Diagnostic, Severity, source snippets -- Wired into parser: all 4 eprintln! error sites replaced with emit_error() -- Wired into sema: errors and warnings go through DiagnosticEngine -- Wired into preprocessor: #warning collected as warnings (not bare eprintln!) 
-- Driver threads DiagnosticEngine through parser -> sema pipeline -- Source snippet rendering with caret/underline using SourceManager::get_source_line() -- GCC-compatible output format: "file:line:col: error: message" - -COMPLETED (2026-01-28): -- Added WarningKind enum with categorized warnings (Undeclared, ImplicitFunctionDeclaration, Cpp) -- Added WarningConfig for per-warning enable/disable and error promotion -- Implemented -Werror (global), -Werror=, -Wno-error= for selective promotion -- Implemented -Wall, -Wextra, -W, -Wno- for warning control -- Left-to-right flag processing matching GCC semantics -- GCC-style [-W] and [-Werror=] suffixes on diagnostic messages -- Updated sema and preprocessor warning sites with WarningKind tags -- Wired WarningConfig from CLI through Driver to DiagnosticEngine - -COMPLETED (2026-01-28): -- Added source spans to all 3 sema diagnostic sites: - - "switch quantity is not an integer" error now includes the expression span - - "'name' undeclared" warning now includes the identifier span - - "implicit declaration of function 'name'" warning now includes the callee span -- Removed legacy `errors: Vec` from SemanticAnalyzer; now uses - DiagnosticEngine.has_errors() / error_count() exclusively -- Changed analyze() return type from Result<(), Vec> to Result<(), usize> -- Added first "note:" follow-up message: switch type error now includes - "note: expression has type 'Foo'" with span pointing to the expression -- Removed #[allow(dead_code)] from Diagnostic::with_note() (now in active use) - -COMPLETED (2026-01-28): -- Wired DiagnosticEngine into the IR lowering phase: - - Added DiagnosticEngine field (RefCell-wrapped) to Lowerer struct - - Driver threads DiagnosticEngine: parser -> sema -> lowerer pipeline - - Lowerer::lower() returns (IrModule, DiagnosticEngine) tuple - - Driver checks for lowering errors after IR generation - - Added emit_warning() helper using RefCell for interior mutability - (many lowering methods take 
&self, same pattern as expr_ctype_cache) - - Added diagnostics for unresolved typeof expressions (was a TODO) - - Added diagnostics for __auto_type inference failures - - All diagnostics include source spans for GCC-compatible location output - -COMPLETED (2026-01-28): -- Added Display impl for CType in common/types.rs to show user-friendly C-style - type names in diagnostics (e.g., "unsigned int" instead of "UInt", "char *" - instead of "Pointer(Char)", "void (*)(int)" for function pointers) -- Updated sema diagnostic site to use Display (format with {}) instead of Debug ({:?}) - for CType in the "expression has type '...'" note message - -COMPLETED (2026-01-30): -- Added fix-it hint infrastructure: Diagnostic.fix_hint field rendered below snippets -- Added expect_after() parser method for context-aware error messages - (e.g., "expected ';' after return statement" instead of "expected ';' before '}'") -- Added expect_closing() parser method for delimiter matching notes - (e.g., "note: to match this '(' at file.c:10:5") -- Updated all statement semicolons with context: return, break, continue, goto, - do-while, for-loop, expression statements -- Updated struct/enum/initializer closing braces with opening brace notes -- Updated function calls, parenthesized expressions, array subscripts, - _Alignof, __alignof__, __builtin_va_arg, __builtin_types_compatible_p, - _Generic, _Static_assert with closing paren notes -- Updated declaration semicolons (external, local, K&R params, struct fields) - -COMPLETED (2026-01-30): -- Added ANSI color output to diagnostics matching GCC's color scheme: - - Bold red for "error:", bold magenta for "warning:", bold cyan for "note:" - - Bold white for file:line:col location and message text - - Bold green for caret/underline and fix-it hints -- Added ColorMode enum (Auto/Always/Never) with isatty detection (std::io::IsTerminal) -- Added -fdiagnostics-color={auto,always,never} flag to driver CLI -- Added 
-fcolor-diagnostics/-fno-color-diagnostics (Clang-compatible aliases) -- Default mode is Auto: colors when stderr is a terminal, plain when piped - -COMPLETED (2026-01-30): -- Added GCC-style "In file included from" include chain traces in diagnostics: - - Preprocessor now emits GCC-style flags in line markers (flag 1 = enter include, - flag 2 = return from include) - - SourceManager.build_line_map() parses flags to build include origin map - - SourceManager.get_include_chain() walks parent links to build full chain - - DiagnosticEngine.render_include_trace() emits chain before errors in headers - - Matches GCC output format: "In file included from X:Y," / "from X:Y:" - - Chain is only shown once per file (consecutive errors skip repeat traces) - -COMPLETED (2026-01-30): -- Added expect_context() parser method for contextual error messages with fix-it hints - (e.g., "expected '(' after 'if'" instead of "expected '(' before 'x'") -- Added fix-it hints to expect_closing() (e.g., "fix-it hint: insert ')'") -- Converted 30+ bare expect() calls across parser to contextual versions: - - statements.rs: if/while/for/switch/do-while/case/default/asm with context strings - - expressions.rs: _Alignof/__alignof__/__builtin_va_arg/__builtin_types_compatible_p/ - _Generic/ternary operator with context strings - - types.rs: _Atomic/typeof/struct/enum bodies with context strings - - declarations.rs: _Static_assert/designator brackets with context - - declarators.rs: parameter lists/array brackets with context -- Fixed 3 silent error sites where error_count++ had no diagnostic message: - - __int128 on 32-bit: now emits "__int128 is not supported on this target" - - __uint128_t on 32-bit: now emits "__uint128_t is not supported on this target" - - TI mode on 32-bit: now emits "TI mode is not supported on 32-bit targets" - -COMPLETED (2026-01-30): -- Added macro expansion tracing: "note: in expansion of macro 'X'" diagnostic notes - - MacroTable tracks which macros are expanded 
during each expand_line_reuse() call - - Preprocessor collects per-line macro expansion metadata during preprocessing - - SourceManager stores and indexes macro expansion info by pp output line number - - DiagnosticEngine renders "in expansion of macro 'X'" notes for errors/warnings - in macro-expanded code, showing up to 3 levels of nested expansion chain - - Filters out uninteresting macros (__builtin_*, NULL, INT_MAX, etc.) to reduce noise - - Supports ANSI color output matching GCC's color scheme - - Driver threads macro expansion info from preprocessor to source manager - -REMAINING FOLLOW-UPS: -1. [DONE] Add source locations to preprocessor errors (#error, #warning, missing #include) - - Added PreprocessorDiagnostic struct with file:line:col in preprocessor.rs - - Added explicit_location field to Diagnostic for pre-SourceManager diagnostics - - Driver now forwards preprocessor diagnostics with GCC-compatible file:line:col: prefix -2. Add more WarningKind categories as compiler implements more warnings - (UnusedVariable, SignCompare, ImplicitConversion, etc.) - NOTE: ReturnType was implemented (2026-01-30) — -Wreturn-type warning in sema -3. Add more "note:" follow-ups: "previous declaration was here", "did you mean to include
?" -4. Add more lowering diagnostics: unresolved lvalue in assignments, unresolved typedefs, - vector type mismatches in compound assignment -5. [DONE] Add macro expansion tracing ("in expansion of macro 'X'") -6. Improve macro expansion tracing to token-level granularity (currently line-level) - - Would require preprocessor to track byte ranges per expansion, not just line numbers - - Current approach tracks which macros were expanded on each output line - -Key files: -- src/common/error.rs (DiagnosticEngine, Diagnostic, Severity, with_note, with_fix_hint) -- src/common/source.rs (SourceManager with line map, get_source_line for snippets) -- src/frontend/parser/parser.rs (uses emit_error/emit_warning, expect_after, expect_closing, expect_context) -- src/frontend/sema/sema.rs (uses diagnostics.error/warning_with_kind with spans) -- src/ir/lowering/lowering.rs (uses RefCell for &self methods) -- src/frontend/preprocessor/preprocessor.rs (PreprocessorDiagnostic, collects errors and warnings) -- src/driver/driver.rs (creates and threads DiagnosticEngine through phases) diff --git a/ideas/new_projects.txt b/ideas/new_projects.txt deleted file mode 100644 index 255357510a..0000000000 --- a/ideas/new_projects.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Project Status Tracking -# Format: project_name: PASS/FAIL (notes) - -json-c: PASS (all 4 backends: x86, i686, arm, riscv) -coreutils_GNU: PASS (all 4 backends: x86, i686, arm, riscv) - fixed by adding struct member access validation in sema -sqlite: PASS (all 4 backends: x86, i686, arm, riscv) -blosc: PASS (all 4 backends: x86 1630/1630 tests, i686, arm, riscv) -tcpdump: PASS (all 4 backends: x86, i686, arm, riscv) -busybox: PASS (all 4 backends: x86, i686, arm, riscv) -QEMU: PASS (x86 - compiles 1806 files, links, boots Linux kernel with serial output; riscv64 compiles+links; aarch64 has translate-sve.c errors) -LuaJIT: PASS (x86 - fixed by adding .eh_frame/CFI generation) -postgres: PASS (x86 backend: all 237 regression tests 
pass) -Emacs_Lisp_core: PASS x86 only (i686 has pre-existing codegen crash in signal_or_quit; no arm/riscv check scripts) -QuickJS: PASS (x86; fixed is_all_ones i128 truncation bug in simplify pass) -CPython: PASS (x86) - fixed AVX2 intrinsics, nested struct global init with PointerMemberAccess -clay: PASS (all 4 backends: x86, i686, arm, riscv) - fixed by adding missing ARM NEON intrinsics (vorrq_u64, vshlq_n_u64, vld1_u64, vdup_n_u64, vminvq_u32) -alpine: PASS (all 4 backends: x86, i686, arm, riscv) - fixed by adding sizeof incomplete struct/union type check in sema -criterion: PASS (all 4 backends: x86, i686, arm, riscv) - fixed preprocessor deferred expansion (DEFER pattern) -FFmpeg: PASS (x86, arm) - compiles all libraries (libavutil, libavcodec, libavformat, libavfilter, libswscale, libswresample, libavdevice), ffmpeg and ffprobe binaries work, all 7331 checkasm tests pass on both x86 and ARM (standalone assembler+linker) diff --git a/ideas/new_projects_myasm.txt b/ideas/new_projects_myasm.txt deleted file mode 100644 index 899018a6c8..0000000000 --- a/ideas/new_projects_myasm.txt +++ /dev/null @@ -1,290 +0,0 @@ -Tracking project build/test results with MY_ASM=builtin MY_LD=builtin - -| Project | x86 | i686 | arm | riscv | Notes | -|---------|-----|------|-----|-------|-------| -| lz4 | PASS | PASS | PASS | PASS | i686 fixed by local symbol reloc fix; arm/riscv now pass | -| libyaml | PASS | PASS | PASS | PASS | i686 fixed by local symbol reloc fix | -| tomlc99 | PASS | PASS | PASS | PASS | i686 fixed by local symbol reloc fix | -| parson | PASS | PASS | PASS | PASS | i686 fixed by local symbol reloc fix | -| jsmn | PASS | PASS | N/A | N/A | Fixed i686: IFUNC support (IPLT/IRELATIVE), TLS, local sym resolution, generic section writes | -| zstd | PASS | TODO | FAIL (runtime) | PASS | Fixed __ASSEMBLER__ for .S files, .pushsection/.popsection, BMI2 instrs, bitwise expr parsing; ARM now compiles (added missing NEON u16 intrinsics), runtime crash in fullbench; 
RISC-V fixed: get_expr_type returned U64 for UIntLiteral on 64-bit targets (should be U32), causing narrow pass to miss LShr narrowing → mulw sign-extended result fed into 64-bit srl → DeBruijn array OOB segfault | -| 8cc | PASS | PASS | TODO | TODO | Fixed x86 builtin linker copy-reloc st_shndx and .gnu.hash; i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| bzip2 | PASS | PASS | PASS | PASS | i686 fixed by local symbol reloc fix | -| miniz | PASS | PASS | PASS | PASS | i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| utf8proc | PASS | PASS | PASS | PASS | i686 fixed by local symbol reloc fix | -| chibicc | PASS | TODO | TODO | PASS | Passes x86; riscv now passes | -| QuickJS | PASS | PASS | FAIL (frontend codegen bug: test_cyclic_import.js) | PASS | Fixed i686: added --defsym linker support for fmod=__ieee754_fmod alias | -| cJSON | PASS | PASS | PASS | PASS | All arches pass with builtin asm/ld | -| monocypher | PASS | PASS | PASS | PASS | Fixed ARM asm: symbol names matching register names (e.g. 
x16) | -| sundown | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| termbox | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| dash | PASS | PASS | PASS | FAIL | Fixed x86: COPY reloc weak alias resolution (environ/__environ); i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| cmark | PASS | PASS | PASS | PASS | Fixed x86/riscv asm octal escape parsing (\0 prefix consumed too early); i686 fixed by ParamRef coalescing + reg cache invalidation fixes | -| Duktape | PASS | PASS | PASS | PASS | Passes all architectures out of the box | -| Wren | PASS (build script fix) | TODO | TODO | TODO | Build script fix: increase test.py Timer from 5s to 120s (sed -i 's/Timer(5,/Timer(120,/' util/test.py); our code is correct but ~30-200x slower than GCC due to register allocator quality in large switch-based interpreter loop | -| json-c | PASS | PASS | PASS | PASS | Fixed i686 linker local symbol collision (GOTOFF relocations); i686 fully fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| nanomsg | PASS | PASS | PASS | PASS | Fixed: linker now rejects undefined symbols instead of silently resolving to 0; i686 fixed by PIC codegen fixes | -| lmdb | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| unqlite | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| libev | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| vedis | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| hiredis | PASS | PASS | PASS | PASS | i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| mongoose | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| freetype | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| nuklear | PASS | PASS | PASS | PASS | i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| inih | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| sds | PASS | TODO | TODO | TODO | 
Passes x86 | -| klib | PASS | TODO | TODO | TODO | Passes x86 | -| oniguruma | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| stb_image | PASS | TODO | TODO | TODO | Passes x86 | -| xxhash | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| yyjson | PASS | PASS | PASS | PASS | Fixed i686: incl/decl peephole broke carry flag for 64-bit negation (addl $1 before adcl) | -| smaz | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| heatshrink | PASS | PASS | PASS | PASS | Fixed i686: linker rejected .os/.od archive members and input files (extension-based filters) | -| bearssl | PASS | TODO (slow) | PASS | PASS | Fixed: added bundled x86intrin.h + _mm_cmpeq_epi64 + RDRAND/RDSEED intrinsics (no build script fix needed). i686 correct but crypto tests too slow (~15x vs GCC) | -| tinyexpr | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| sqlite | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| genann | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| munit | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| llhttp | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| gumbo | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| md4c | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| libtommath | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| flatcc | PASS | PASS | PASS | PASS | Fixed by pre-store register conflict fix (same as libharu) | -| mbedtls | PASS | PASS | PASS | PASS | Fixed i686 asm: added AES-NI instructions; fixed SUB/SBB misencoding as byte ops (mnemonic_size_suffix bug) | -| giflib | PASS | PASS | PASS | PASS | i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes | -| arithcode | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| tweetnacl | PASS | PASS | PASS | PASS | All architectures pass out of the box 
| -| kann | PASS | PASS | FAIL (arm NEON) | PASS | Fixed: added MXCSR macros/functions to xmmintrin.h, added stmxcsr/ldmxcsr to i686 asm | -| libucl | PASS | PASS | PASS | PASS | Fixed by pre-store register conflict fix (same as libharu) | -| mpack | PASS | PASS | PASS | PASS | Fixed ARM: FP param spill clobbered GP arg regs (x0) in prologue | -| argtable3 | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| cwalk | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| linenoise | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| log.c | PASS | PASS | PASS | PASS | Fixed i686: missing libgcc.a (demand-driven archive extraction), TLS relocs, R_386_TLS_LE/TPOFF32 formula swap (exit crash) | -| picohttpparser | PASS | PASS | PASS | PASS | Fixed ARM: stack-passed params in callee prologue clobbered x0 (emit_store_stack_params used x0 as scratch) | -| http-parser | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| tiny-regex-c | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| binn | PASS | PASS | PASS | PASS | Fixed ARM: mixed float/int param clobbering (pre-store promoted GP params) | -| coremark | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| tinycthread | PASS | FAIL (i686 thread-exit) | FAIL (arm thread-exit) | FAIL (riscv thread-exit) | Fixed x86: .tbss-only TLS missing PT_TLS. 
i686/arm/riscv fail on thread-exit (pthread_exit static linking issue, separate bug) | -| dlmalloc | PASS | TODO | TODO | TODO | Passes x86 | -| c4 | PASS | TODO | TODO | TODO | Passes x86 | -| shoco | PASS | TODO | TODO | TODO | Passes x86 | -| ncompress | PASS | TODO | TODO | TODO | Passes x86 | -| lzf | PASS | TODO | TODO | TODO | Passes x86 | -| whetstone | PASS | TODO | TODO | TODO | Passes x86 | -| dhrystone | PASS | TODO | TODO | TODO | Passes x86 | -| tdb | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| jansson | PASS | PASS | PASS | PASS | Fixed i686 asm: SymbolDiff (.long A-B) addend correction for PIC jump tables; R_386_GOTPC for _GLOBAL_OFFSET_TABLE_; linker COMDAT dedup | -| cc65 | PASS | TODO | PASS | TODO | Fixed ARM asm: preserve original case for condition code/barrier symbol names (CS was lowercased to cs); x86 now passes | -| brieflz | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| nettle | PASS | TODO | TODO | TODO | Fixed x86 asm: added rdsspq/rdsspd (CET shadow stack) instructions; fixed escape_string \0 -> \000 to prevent octal escape consuming adjacent digits | -| tinn | PASS | TODO | TODO | TODO | Passes x86 | -| tinycbor | PASS | TODO | TODO | TODO | Passes x86 | -| brotli | PASS | PASS | SKIP | PASS | i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes; no arm check script; riscv now passes | -| msgpack-c | PASS | TODO | TODO | TODO | Passes x86 | -| Lua | PASS | TODO | TODO | TODO | Passes x86 | -| samurai | PASS | TODO | TODO | TODO | Passes x86 | -| cmocka | PASS | TODO | TODO | TODO | Passes x86 | -| gawk | PASS | TODO | TODO | TODO | Passes x86 | -| mawk | PASS | TODO | TODO | TODO | Passes x86 | -| rc | PASS | TODO | TODO | TODO | Passes x86 | -| s7 | PASS | TODO | TODO | TODO | Passes x86 | -| wolfssl | PASS | TODO | TODO | TODO | Passes x86 | -| darkhttpd | PASS | PASS | PASS | PASS | Fixed i686: set_permissions on /dev/null was hard-erroring (now ignores errors like other 
backends) | -| libconfig | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| libcsv | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| nghttp2 | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| discount | PASS | PASS | PASS | PASS | Fixed x86 linker: local symbol collision (local STB_LOCAL symbols were resolved to global symbols with same name from other object files) | -| gdbm | PASS | PASS | PASS | PASS | Fixed RISC-V linker: added _Unwind/gcc_personality to linker_defined list + fixed .so search in linker scripts | -| xz_liblzma | PASS | PASS | PASS | PASS | All architectures now pass (CLMUL CRC and other issues fixed by subsequent compiler improvements) | -| libharu | PASS | PASS | PASS | PASS | Fixed ARM segfault: pre-store optimization register conflict in prologue codegen (param dest registers could collide when regalloc reused callee-saved regs) | -| map | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| sort.h | PASS | TODO | TODO | TODO | Passes x86 | -| fastlz | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| huf | PASS | TODO | TODO | TODO | Passes x86 | -| stb_ds | PASS | TODO | TODO | TODO | Passes x86 | -| microtar | PASS | TODO | TODO | TODO | Passes x86 | -| minilzo | PASS | TODO | TODO | TODO | Passes x86 | -| qoi | PASS | TODO | TODO | TODO | Passes x86 | -| stb_truetype | PASS | TODO | TODO | TODO | Passes x86 | -| pl_mpeg | PASS | TODO | TODO | TODO | Passes x86 | -| tree | PASS | TODO | TODO | TODO | Passes x86 | -| jo_gif | PASS | TODO | TODO | TODO | Passes x86 | -| quicklz | PASS | TODO | TODO | TODO | Passes x86 | -| lemon | PASS | TODO | TODO | TODO | Passes x86 | -| incbin | PASS | PASS | PASS | PASS | Added .incbin directive to x86/i686/ARM assemblers; fixed .int directive recognition; fixed i686 SymbolDiff resolution | -| uzlib | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| spng | PASS | PASS | PASS | 
PASS | All architectures pass out of the box | -| protobuf-c | PASS | TODO | TODO | TODO | Passes x86 | -| cdb | PASS | TODO | TODO | TODO | Passes x86 | -| FFmpeg | PASS | TODO | PASS | TODO | Compiles all libraries + ffmpeg/ffprobe binaries; all 7331 checkasm tests pass on x86 and ARM (standalone asm+ld) | -| scc | PASS | PASS | PASS | PASS | Fixed: typeof treated as keyword in -std=c99 mode; now only keyword in GNU mode | -| nano | PASS | TODO | TODO | TODO | Fixed: added INPUT() directive support to linker script parser (libncurses.so is a linker script) | -| zlib | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| femtolisp | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| crush | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| ini | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| check | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| tree-sitter | PASS | PASS | TODO | TODO | i686 fixed by COMDAT groups + GOTPC + PIC codegen fixes; x86 now passes | -| miniaudio | PASS | PASS | FAIL (ARM NEON parse) | PASS | Fixed x86/i686 asm: added rsqrtss/rcpss instructions | -| civetweb | PASS | TODO | TODO | TODO | Passes x86 | -| lzw | PASS | TODO | TODO | TODO | Passes x86 | -| lzo | PASS | TODO | TODO | TODO | Passes x86 | -| lizard | PASS | TODO | TODO | TODO | Passes x86 | -| rans | PASS | TODO | TODO | TODO | Passes x86 | -| wflz | PASS | TODO | TODO | TODO | Passes x86 | -| stc | PASS | TODO | TODO | TODO | Passes x86 | -| stretchy_buffer | PASS | TODO | TODO | TODO | Passes x86 | -| vec | PASS | TODO | TODO | TODO | Passes x86 | -| uthash | PASS | TODO | TODO | TODO | Passes x86 | -| rax | PASS | TODO | TODO | TODO | Passes x86 | -| par | PASS | TODO | TODO | TODO | Passes x86 | -| dr_libs | PASS | PASS | PASS | PASS | Fixed by pre-store register conflict fix (same as libharu) | -| fenster | PASS | PASS | PASS | PASS | All architectures pass | -| 
microui | PASS | TODO | TODO | TODO | Passes x86 | -| nanovg | PASS | TODO | TODO | TODO | Passes x86 | -| tinybasic | PASS | PASS | PASS | PASS | All architectures pass | -| tinyobjloader-c | PASS | TODO | TODO | TODO | Passes x86 | -| minmea | PASS | PASS | PASS | PASS | All architectures pass | -| lwrb | PASS | PASS | PASS | PASS | All architectures pass | -| olive.c | PASS | PASS | PASS | PASS | All architectures pass | -| 2048.c | PASS | PASS | PASS | PASS | All architectures pass | -| tinflate | PASS | PASS | PASS | PASS | All architectures pass | -| minizip | PASS | TODO | TODO | TODO | Passes x86 | -| himeno | PASS | PASS | PASS | PASS | All architectures pass | -| nbench | PASS | PASS | PASS | PASS | All architectures pass | -| sokol | PASS | TODO | TODO | TODO | Passes x86 | -| libjpeg_IJG_original | PASS | PASS | PASS | PASS | All architectures pass | -| libiconv | PASS | TODO | TODO | TODO | Passes x86 | -| kilo | PASS | PASS | PASS | PASS | All architectures pass | -| nanopb | PASS | TODO | TODO | TODO | Passes x86 | -| asn1c | PASS | TODO | TODO | TODO | Passes x86 | -| pkgconf | PASS | PASS | PASS | PASS | All architectures pass | -| bison | PASS | TODO | TODO | TODO | Passes x86 | -| less | PASS | PASS | PASS | PASS | All architectures pass | -| joe | PASS | PASS | PASS | PASS | All architectures pass | -| picocom | PASS | PASS | PASS | PASS | All architectures pass | -| mini_httpd | PASS | PASS | PASS | PASS | All architectures pass | -| thttpd | PASS | PASS | PASS | PASS | All architectures pass | -| peg_leg | PASS | PASS | PASS | PASS | All architectures pass | -| cfitsio | PASS | PASS | PASS | PASS | All architectures pass | -| cgltf | PASS | PASS | PASS | PASS | All architectures pass | -| cello | PASS | PASS | PASS | PASS | All architectures pass | -| greatest | PASS | TODO | TODO | TODO | Passes x86 | -| cNBT | PASS | PASS | PASS | PASS | All architectures pass | -| chipmunk2d | PASS | TODO | TODO | TODO | Passes x86 | -| minunit | PASS | 
PASS | PASS | PASS | All architectures pass | -| json-parser | PASS | PASS | PASS | PASS | All architectures pass | -| gb_string | PASS | PASS | PASS | PASS | All architectures pass | -| embedded-cli | PASS | PASS | PASS | PASS | All architectures pass | -| FSE | PASS | TODO | TODO | TODO | Passes x86 | -| c-algorithms | PASS | PASS | PASS | PASS | All architectures pass | -| c-ares | PASS | PASS | PASS | PASS | All architectures pass | -| box2d-lite | PASS | TODO | TODO | TODO | Passes x86 | -| cproc | PASS | TODO | TODO | TODO | Passes x86 | -| lacc | PASS | PASS | PASS | PASS | All architectures pass | -| lcc | PASS | TODO | TODO | TODO | Passes x86 | -| pcc | PASS | TODO | TODO | TODO | Passes x86 | -| vbcc | PASS | TODO | TODO | TODO | Passes x86 | -| speex_speexdsp | PASS | PASS | PASS | PASS | All architectures pass | -| lzfse | PASS | PASS | PASS | PASS | All architectures pass | -| getopt_standalone | PASS | PASS | PASS | PASS | All architectures pass | -| finite-state-entropy | PASS | PASS | PASS | PASS | All architectures pass | -| pngcrush | PASS | TODO | TODO | TODO | Passes x86 | -| bc_GNU | PASS | TODO | TODO | TODO | Passes x86 | -| toybox | PASS | TODO | TODO | TODO | Passes x86 | -| zopfli | PASS | TODO | TODO | TODO | Passes x86 | -| optparse | PASS | PASS | PASS | PASS | All architectures pass | -| dnsmasq | PASS | TODO | TODO | TODO | Passes x86 | -| mandoc | PASS | PASS | PASS | PASS | All architectures pass | -| dropbear | PASS | TODO | TODO | TODO | Passes x86 | -| pigz | PASS | TODO | TODO | TODO | Passes x86 | -| Chibi-Scheme | PASS | TODO | TODO | TODO | Fixed x86: shared lib linker now includes libgcc.a for __udivti3 etc.; check script needs ANSI strip fix | -| miniaudio | PASS | PASS | FAIL (arm NEON int32x4x2_t) | PASS | Fixed: added rsqrtss/rsqrtps/rcpss/rcpps/sqrtps/sqrtpd to x86+i686 assemblers | -| ini | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| shapelib | PASS | PASS | PASS | PASS | All architectures 
pass | -| stc | PASS | PASS | PASS | PASS | All architectures pass | -| libflac | PASS | PASS | FAIL (arm) | PASS | ARM fails (separate issue) | -| jbigkit | PASS | PASS | PASS | PASS | All architectures pass | -| minimp3 | PASS | PASS | PASS | PASS | Fixed ARM: added float32_t typedef + 16 float32 NEON intrinsics to arm_neon.h | -| libfastcdc | PASS | PASS | PASS | PASS | All architectures pass | -| sod | PASS | PASS | PASS | PASS | All architectures pass | -| stretchy_buffer | PASS | PASS | PASS | PASS | All architectures pass | -| rans | PASS | PASS | PASS | PASS | All architectures pass | -| wflz | PASS | PASS | PASS | PASS | All architectures pass | -| lzw | PASS | PASS | PASS | PASS | All architectures pass | -| vec | PASS | PASS | PASS | PASS | All architectures pass | -| hdiffpatch | PASS | PASS | FAIL (ARM NEON vrev32q_u8) | PASS | Added SSSE3 _mm_alignr_epi8, SHA-NI intrinsics (sha256rnds2/msg1/msg2), BMI2 _bzhi_u64/_bzhi_u32, i686 xgetbv instruction | -| Icon | PASS | TODO | TODO | TODO | Fixed x86: added section headers to linker executables (needed for strip command) | -| libmpg123 | PASS | TODO | TODO | TODO | Fixed x86 asm: added SSE/AVX cmp pseudo-ops (cmpnleps/vcmpnleps), vmovlhps, vmovddup, vmulss, vmovss, vpermilps, vextractf128, vcvtps2dq; fixed parser for constant+symbol displacement (32+maxmin_avx) | -| open62541 | PASS | TODO | TODO | TODO | Fixed x86: decimal literal type promotion (C11 6.4.4.1) + bitfield static initializer compound path overlap | -| Janet | PASS | TODO | TODO | TODO | Fixed x86 linker: emit .gnu.version/.gnu.version_r sections for GNU symbol versioning (realpath@@GLIBC_2.3 resolved to old GLIBC_2.2.5 without version info) | -| MuJS | PASS | TODO | TODO | TODO | Passes x86 | -| Gravity | PASS | TODO | TODO | TODO | Passes x86 | -| whitedb | PASS | TODO | TODO | TODO | Passes x86 | -| STREAM | PASS | TODO | TODO | TODO | Passes x86 | -| JerryScript | PASS | TODO | TODO | TODO | Fixed x86: peephole frame_compact NOP'd struct 
param stores (range-based overlap check) | -| openssh | PASS | PASS | N/A | N/A | Fixed x86/i686 asm: added 16-bit (w-suffix) cmov instructions (cmovnew etc.) with 0x66 operand-size prefix | -| wasm-micro-runtime | PASS | FAIL (codegen bug) | PASS | PASS | Fixed ARM: # comment handling in .s files; Added i686 loop instruction; i686 has runtime codegen bug (wasm loader fails) | -| lynx | PASS | TODO | TODO | TODO | Fixed: added incompatible pointer subtraction type check (C11 6.5.6p3) so configure correctly detects dirent64 compatibility | -| libtomcrypt | PASS | PASS | PASS | PASS | Fixed RISC-V: narrow pass incorrectly eliminated AND zero-extension mask (uint32 & 0xFFFFFFFF) when narrowing I64→U32, causing sign-extended addw results to leak | -| libpng | PASS | PASS | PASS | PASS | Fixed: -Wl flag splitting broke -soname/-rpath multi-arg flags; inliner now avoids bloating recursive function frames | -| s2n-tls | PASS* | TODO | TODO | TODO | x86: 281/282 tests pass (1 codegen segfault in s2n_tls13_key_schedule_test). Fixed: COMMON symbol archive extraction, decimal constant type promotion (C11 6.4.4.1), nested designated initializer pointer placement | -| crun | PASS | PASS | PASS | PASS | Fixed i686: DT_TEXTREL for WEAK dynamic data symbols (environ) referenced by R_386_32; riscv now passes | -| lame | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| sdcc | PASS (build script fix) | TODO | TODO | TODO | Build script fix: repo has stale RISC-V .o files; distclean bug (clean.mk lists "pic" not "pic14") leaves them; need find . 
-name "*.o" -delete before build | -| blosc | PASS | PASS | PASS | PASS | Fixed: weak undefined symbols in shared library .dynsym were emitted as STB_GLOBAL instead of STB_WEAK, causing runtime "undefined symbol" errors for ZSTD_trace_*; also fixed x86/i686 asm jmp/jcc @PLT suffix not stripped (baked @PLT into symbol name instead of using R_X86_64_PLT32/R_386_PLT32) | -| TinyScheme | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| MicroPython | PASS | TODO | PASS | TODO | Fixed x86: implemented --gc-sections in linker; ARM: ported --gc-sections to ARM linker (same BFS reachability algorithm) | -| mksh | PASS | TODO | TODO | TODO | Fixed: static init array-to-pointer decay on struct array element field (arr[N].text); get_struct_layout_of_expr now handles ArraySubscript | -| binutils | PASS | TODO | TODO | TODO | Fixed: (1) preprocessor #define inside #ifdef leaked in included files during multi-line accumulation, (2) anonymous enum constants not registered in function definition return types, (3) linker as-needed behavior for shared libraries | -| CPython | PASS | TODO | TODO | TODO | Fixed: __STRICT_ANSI__ for -std=c11, -Xlinker flag, -export-dynamic, shared lib R_X86_64_64 dynamic relocs for external symbols, compound literal address-of in static struct array initializer | -| curl | PASS | TODO | TODO | TODO | Fixed: reject implicit pointer<->float conversions (C11 6.5.16.1p1); cmake strerror_r detection passed both POSIX and GLIBC tests | -| lexbor | PASS | TODO | TODO | TODO | Fixed x86 shared lib linker: emit R_X86_64_64 dynamic relocs for named global symbols in data sections (function pointer tables); scale .gnu.hash bloom filter for large symbol tables | -| libressl | PASS (build script fix) | TODO | TODO | TODO | Build script fix: -DENABLE_ASM=OFF (s2n-bignum uses Intel syntax); fixed x86 asm pinsrw/pinsrd/pinsrb/pinsrq memory operand encoding | -| libtiff | PASS | TODO | TODO | TODO | Fixed x86: added section headers to shared 
library linker output (GNU ld needs section headers to link against .so files) | -| coreutils_GNU | PASS | PASS | PASS | PASS | x86/i686 pass out of the box; ARM fixed: strip_c_comments was not string-aware (/* inside .ascii strings); RISC-V fixed: demand-driven archive extraction for inline .a files | -| mimalloc | PASS | PASS | PASS | PASS | Fixed: (1) x86/i686 asm @PLT suffix leaked into ELF symbol names (strip in add_relocation), (2) RISC-V linker strong-over-strong symbol overwrite (first definition wins) | -| gforth | PASS | TODO | TODO | TODO | Fixed: (1) preprocessor #line directives consumed during multi-line accumulation in included files, (2) cast-wrapped &&label in static initializers (e.g. (Label)&&I_noop) no longer zero-initialized | -| wasm3 | PASS | TODO | TODO | TODO | Fixed: (1) x86 tail call optimization peephole pass for threaded interpreter dispatch, (2) 3D function pointer array initializer inner stride bug in global_init, (3) tail call safety check for lea-of-local (dangling stack pointer) | -| openjpeg | PASS | PASS | PASS | PASS | All architectures pass out of the box | -| zydis | PASS | TODO | TODO | TODO | Fixed: self-referential static initializer with multi-dim array member subscripts (&FORMATTER.number_format[1][0].string_data); compound init for multi-dim arrays of structs with pointer fields | -| libfuse | PASS | TODO | TODO | TODO | Fixed: (1) __attribute__((symver("..."))) support across full pipeline (parser->AST->IR->codegen->assembler->linker), (2) jmp @PLT suffix handling (strip suffix, use R_X86_64_PLT32 relocation instead of creating literal symbol) | -| sed_GNU | PASS | TODO | TODO | TODO | Fixed: (1) __builtin_mul_overflow same-size signed-to-unsigned overflow detection, (2) _Static_assert now rejects non-constant variable references, (3) added missing __SCHAR_WIDTH__ and other GCC WIDTH macros | -| minimap2 | PASS | TODO | TODO | TODO | Fixed: added missing SSE intrinsics: _mm_cmplt_epi8/_mm_cmplt_epi32 (emmintrin.h), 
_mm_max_epi8/_mm_min_epi8 + many SSE4.1 intrinsics (smmintrin.h) | -| ECL | PASS | TODO | TODO | TODO | Fixed x86/i686 asm: jmp/jcc @PLT labels now strip suffix and use PLT32 relocation instead of PC32 with @PLT in symbol name | -| nelua | PASS | TODO | TODO | TODO | Fixed: added bundled x86intrin.h (rdtsc/rdtscp/bsf/bsr/bswap/rotate intrinsics) to prevent fallthrough to GCC's vector-type-heavy header | -| SDL2 | PASS | TODO | TODO | TODO | Fixed: (1) complete MMX intrinsics in mmintrin.h, (2) _mm_packs_epi16 SSE2 intrinsic (packsswb128), (3) _mm_stream_ps in xmmintrin.h, (4) peephole optimizer incorrectly eliminating pushq/popq pairs across pushfq/popfq | -| s6 | PASS | TODO | TODO | TODO | Fixed x86: added IFUNC/IRELATIVE support to x86-64 static linker (IPLT stubs, IFUNC GOT, R_X86_64_IRELATIVE relocations); glibc memcpy/memset/strcmp are IFUNC symbols resolved at startup | -| libnl | PASS | TODO | TODO | TODO | Fixed: (1) compound literal zero-fill for arrays with fewer initializers than elements (C11 6.7.9p21), (2) linker -Wl,-rpath -Wl,/path two-arg form parsing | -| FreeBSD_kernel | PASS | TODO | TODO | TODO | Passes x86 out of the box (builds contrib/bearssl component) | -| diffutils | PASS | TODO | TODO | TODO | Fixed: LDBL_MAX/MIN/EPSILON predefined macros had truncated decimal values causing rounding errors; added IEEE 754 round-to-nearest-even in decimal-to-f128 bigint conversion | -| privoxy | PASS | TODO | TODO | TODO | Fixed: preprocessor -E mode now outputs text even when #error directives encountered (matching GCC behavior); configure scripts grep preprocessed output ignoring exit code | -| libtasn1 | PASS | TODO | TODO | TODO | Passes x86 out of the box | -| Perl | PASS | TODO | TODO | TODO | Fixed: (1) sub-int condition evaluation after narrow pass (upper register bits stale, while(uchar) never terminated), (2) anonymous struct pointer-array subtraction (re-resolving type spec generated different anon struct keys per declarator), (3) preprocessor 
#line directives used absolute paths causing Perl makedepend to add spurious inline recipe for util.o from vutil.c | -| iperf3 | PASS | PASS | PASS | PASS | Fixed: -pthread flag now defines _REENTRANT=1 (matching GCC/Clang); configure scripts use ax_pthread.m4 which checks for _REENTRANT | -| libgrapheme | PASS | TODO | TODO | TODO | Passes x86 out of the box (was previously failing, now fixed by other improvements) | -| h2o | PASS | TODO | TODO | TODO | Passes x86 out of the box (was previously failing, now fixed by other improvements) | -| redis | PASS | TODO | TODO | TODO | Fixed x86: added missing SSSE3 (hadd_epi32, etc.), AVX2 (cvtepi8_epi16, madd_epi16, etc.), and AVX-512 (add_epi32, xor_si512, extracti64x4, cvtepi8_epi16, madd_epi16, reduce_add_epi32, etc.) intrinsics to bundled headers | -| libseccomp | PASS | TODO | TODO | TODO | Passes x86 out of the box (was previously failing, now fixed by other improvements) | -| ldns | PASS | TODO | TODO | TODO | Passes x86 out of the box (was previously failing, now fixed by other improvements) | -| io_uring_liburing | PASS | TODO | TODO | TODO | Passes x86 out of the box (was previously failing, now fixed by other improvements) | -| libexpat | PASS | TODO | TODO | TODO | Fixed x86: GOTPCREL codegen for external function address-of (PIE-compatible); linker GOTPCREL+PLT coexistence (dedicated GOT entry filled with PLT address) | -| criterion | PASS | TODO | TODO | TODO | Fixed: (1) x86 linker missing ld-linux-x86-64.so.2 in default lib search (for _r_debug), (2) shared lib linker exported LOCAL TLS symbols to .dynsym as GLOBAL UNDEFINED (GOT loop missing is_local filter), (3) assembler .fill directive silently ignored (emitted 0 bytes, broke boxfort trampoline addr slot) | -| radare2_portions | PASS | TODO | TODO | TODO | Passes x86 out of the box | -| libvorbis_libogg | PASS | TODO | TODO | TODO | Passes x86 out of the box | -| squashfuse | PASS | TODO | TODO | TODO | Fixed: implemented --whole-archive / 
--no-whole-archive in x86 shared library linker; archive members now unconditionally included when flag is active (libtool creates .so from .a convenience archives) | -| canfestival | PASS | PASS | PASS | PASS | Fixed: C99 inline-only functions (inline without extern/static) were completely skipped during lowering, causing undefined symbol errors. Now lowered as static functions so bodies are available for inlining. | -| ghostscript | PASS | PASS | PASS | PASS | Passes all architectures out of the box | -| nng | PASS | TODO | TODO | TODO | Fixed x86: (1) sema rejected member access on undefined struct types (cmake check_struct_has_member falsely passed for sockpeercred), (2) linker now errors on missing -l libraries (cmake check_library_exists falsely passed for libnsl/libsocket) | -| libmagic_file | PASS | TODO | TODO | TODO | Passes x86 out of the box | -| hostapd | PASS | TODO | TODO | TODO | Fixed x86: linker group resolution included direct .a archives in iterative re-scanning loop (circular archive deps with -Wl,--start-group) | -| pngquant | PASS | TODO | TODO | TODO | Passes x86 out of the box (was previously failing, now fixed by other improvements) | -| libunistring | PASS | TODO | TODO | TODO | Fixed: (1) preprocessor va_list/gnuc_va_list typedef text injection via pending_injections broke when stdarg.h included from nested headers (typedef emitted inside array initializer); moved to sema seed_builtin_typedefs. (2) x87 is_inf() mask 0x3FFF... stripped QNaN bit 62, making NaN look like infinity in constant folding (0.0L/0.0L); fixed with exact mantissa==0x8000... check. | -| libsoxr | PASS (build script fix) | PASS | PASS | PASS | Fixed x86: (1) peephole copy propagation rewrote fnstsw %ax to invalid register (added x87 FPU status/control instrs to has_implicit_reg_usage), (2) added missing AVX double-precision intrinsics to avxintrin.h. 
Build script fix: disable SIMD engines (-DWITH_CR32S=OFF -DWITH_CR64S=OFF) due to pffft AVX convolution codegen crash | -| libsodium | PASS | TODO | TODO | TODO | Passes x86 (was previously failing, now fixed by other improvements) | -| libsrtp | PASS | TODO | TODO | TODO | Passes x86 (was previously failing, now fixed by other improvements) | -| libmicrohttpd | PASS | TODO | TODO | TODO | Passes x86 (was previously failing, now fixed by other improvements) | -| haproxy | PASS | TODO | TODO | TODO | Passes x86 (was previously failing, now fixed by other improvements) | diff --git a/ideas/optimization_passes_future.txt b/ideas/optimization_passes_future.txt deleted file mode 100644 index d1ef41c8df..0000000000 --- a/ideas/optimization_passes_future.txt +++ /dev/null @@ -1,29 +0,0 @@ -Future Optimization Passes -=========================== -Priority: MEDIUM - -Currently implemented passes: -- constant_fold.rs Constant folding and propagation -- copy_prop.rs Copy propagation -- dce.rs Dead code elimination -- gvn.rs Dominator-based GVN/CSE -- simplify.rs Algebraic simplification, strength reduction -- cfg_simplify.rs Dead block removal, jump threading -- licm.rs Loop-invariant code motion -- inline.rs Function inlining (always_inline + small static) -- if_convert.rs Diamond if-then-else to conditional moves -- narrow.rs Integer narrowing -- div_by_const.rs Division by constant strength reduction -- ipcp.rs Interprocedural constant propagation -- iv_strength_reduce.rs Induction variable strength reduction - -Shared infrastructure: -- ir/analysis.rs: CFG, dominator tree (Cooper-Harvey-Kennedy), dominance - frontiers. Used by mem2reg, GVN, LICM, IVSR. - -Next passes to add: -1. SCCP (Sparse Conditional Constant Propagation): propagate constants - through the CFG taking branch conditions into account. - -2. Loop strength reduction extensions: extend iv_strength_reduce to handle - pointer-based induction variables and more complex patterns. 
diff --git a/ideas/qemu_build_support.txt b/ideas/qemu_build_support.txt deleted file mode 100644 index a4c23739aa..0000000000 --- a/ideas/qemu_build_support.txt +++ /dev/null @@ -1,71 +0,0 @@ -QEMU Build Support Status -========================= - -Summary: QEMU 10.2.50 compiles, links, and runs correctly using ccc-x86. -The emulator boots a Linux kernel with serial console output (console=ttyS0) -working properly, including running busybox init from an initramfs. -RISC-V softmmu target also compiles and links. ARM aarch64 target has -compilation errors in translate-sve.c (1915 undeclared helper symbols). - -Build Status: -- Configure: PASS -- Compilation: PASS (all 1806 files for x86_64-softmmu) -- Linking: PASS -- Runtime: PASS - boots Linux kernel with serial output, runs busybox shell -- Full make build: 2911/2913 targets succeed (2 test binaries fail to link) - -Supported Targets: -- x86_64-softmmu: FULL PASS (compile, link, boot Linux, serial output works) -- riscv64-softmmu: compile + link OK, runtime untested -- aarch64-softmmu: FAIL (1915 undeclared helper symbols in translate-sve.c) - -Build Steps: - 1. Copy QEMU source to writable location (cp -r /deps/qemu /tmp/qemu_build) - 2. Configure out-of-tree: - cd /tmp/qemu_build && mkdir build && cd build - ../configure --cc=/path/to/ccc-x86 - --target-list=x86_64-softmmu --disable-docs --disable-gtk --disable-sdl - --disable-opengl --disable-virglrenderer --disable-spice --disable-vte - --disable-curses - 3. Build: ninja -j$(nproc) qemu-system-x86_64 - 4. Test: ./qemu-system-x86_64 -kernel bzImage -initrd initramfs.cpio.gz - -append "console=ttyS0 rdinit=/init" -nographic -no-reboot -m 256M - -Known Issues Fixed: - 1. __has_attribute macro aliasing (cleanup attribute) - FIXED - 2. Bitfield ICE in global_init_compound_struct - FIXED - 3. Response file (@file) support - FIXED - 4. Compound literal array init (CPU double-registration) - FIXED - 5. Bool pointer array init (migration crash) - FIXED - 6. 
Preprocessor MAX_PENDING_NEWLINES too low for qapi-introspect.c - FIXED - 7. _Generic selection in global/static initializer arrays - FIXED - 8. xgetbv inline assembly instruction not handled - FIXED - 9. Thin archive (.a with ! magic) not supported by linker - FIXED - 10. Assembler C-comment stripping inside .asciz strings - FIXED - The strip_c_comments() function stripped /* ... */ patterns even inside - quoted string literals in .asciz directives. QEMU's test-crypto-der.c - has DER-encoded RSA keys containing literal 0x2F 0x2A (/*) byte sequences - which were being treated as comment starts, truncating the string data. - Fixed in all three assemblers (x86, ARM, RISC-V) by adding string-aware - tracking to strip_c_comments(). - 11. Serial console output fixed (was previously reported as missing) - FIXED - Serial output from guest Linux kernel now works correctly with -nographic. - 12. Enum pointer subtraction rejected (6 files failed to compile) - FIXED - QEMU's VMSTATE_UINT32 macro does (uint32_t*)0 - (typeof(field)*)0 as a - compile-time type assertion. When the field is an enum type, this was - rejected because pointee_types_compatible() didn't treat enums as - compatible with integer types. Fixed by adding enum-to-integer - compatibility check in sema/analysis.rs. - 13. Standalone linker: shared libraries not in group resolution loop - FIXED - Bare .so files from command-line args were loaded before the archive - group loop, so symbols introduced by archive member extraction - (e.g., libqemuutil.a members needing libglib-2.0.so) were not resolved. - Fixed by deferring .so files into the group resolution loop alongside - .a archives. - -Remaining Issues: - 1. ARM aarch64-softmmu target: 1915 undeclared helper symbols in - translate-sve.c. These are gen_helper_sme2_* and gen_helper_sve2p1_* - functions generated from DEF_HELPER macros in helper-sme.h. 
- Root cause needs investigation (likely a preprocessor or macro - expansion issue specific to the ARM TCG helper generation). diff --git a/ideas/reduce_stack_frame_size_for_postgres.txt b/ideas/reduce_stack_frame_size_for_postgres.txt deleted file mode 100644 index 6e2e3f7640..0000000000 --- a/ideas/reduce_stack_frame_size_for_postgres.txt +++ /dev/null @@ -1,31 +0,0 @@ -Reduce Stack Frame Size for PostgreSQL -======================================= - -PostgreSQL's plpgsql "recursion_test(4,3)" fails with "stack depth limit -exceeded" at max_stack_depth=2048kB. Our stack frames are ~3.8x larger -than GCC's. - -Root causes: -1. Every SSA temporary gets an 8-byte stack slot, even for i32/i16/i8 types. -2. Intermediate SSA values from expression evaluation each get their own slot. - -Remaining fixes: -1. Use 4-byte slots for i32 and smaller types (HIGH IMPACT but UNSAFE as-is) - - Attempted: movl store/load for I32 values causes sign-extension bugs. - When an I32 value (e.g. -1 = 0xFFFFFFFF) is stored with movl and loaded - back with movl, it zero-extends to 0x00000000FFFFFFFF instead of - sign-extending to 0xFFFFFFFFFFFFFFFF. This breaks code like - `sign * (zic_t) hh` where a 32-bit int is implicitly widened to 64 bits. - - The codegen's accumulator-based model doesn't distinguish between - "32-bit value that will only be used in 32-bit ops" vs "32-bit value - that will be sign/zero-extended to 64 bits". Until the IR tracks this - distinction, 4-byte store/load is not safe. - - Alternative approach: reduce slot usage through better register - allocation and elimination of redundant spills. - -2. Skip slot allocation for single-use values consumed immediately (HIGH IMPACT) - - If a value is defined at instruction N and only used at N+1, it doesn't - need a stack slot (stays in accumulator register) - - Could eliminate 30-50% of slots - -Note: Caller-saved register allocation is now done on all four backends. 
diff --git a/ideas/register_allocator.txt b/ideas/register_allocator.txt deleted file mode 100644 index c9bd38dc38..0000000000 --- a/ideas/register_allocator.txt +++ /dev/null @@ -1,23 +0,0 @@ -Register Allocator Improvements -=============================== - -The three-phase linear scan allocator is implemented on all four backends: -- x86-64: rbx, r12-r15 (callee) + r11, r10, r8, r9 (caller) = 9 registers -- AArch64: x20-x28 (callee) + x13, x14 (caller) = 11 registers -- RISC-V: s1, s7-s11 (callee) = 6 registers -- i686: ebx, esi, edi (callee) = 3 registers - -Liveness analysis uses backward dataflow iteration with loop awareness. -ARM skips variadic functions to avoid callee-save/VA-save area conflicts. - -Remaining improvements: -1. Eliminate write-through: skip the stack store when the value is - register-allocated and has no other readers. -2. Register-to-register operations: emit ops directly on callee-saved - registers instead of routing through the accumulator. -3. Spill/reload insertion: move values between registers and stack at - optimal points instead of falling back to the stack slot. -4. Call-clobber handling: insert saves around clobbering instructions - instead of bailing out for functions with inline asm or atomics. -5. ARM variadic support: handle callee-save area placement relative - to VA register save areas. 
diff --git a/include/arm_neon.h b/include/arm_neon.h deleted file mode 100644 index 8a25462b37..0000000000 --- a/include/arm_neon.h +++ /dev/null @@ -1,3122 +0,0 @@ -/* CCC compiler bundled arm_neon.h - ARM NEON intrinsics */ -#ifndef _ARM_NEON_H_INCLUDED -#define _ARM_NEON_H_INCLUDED - -/* ===== Scalar types (ACLE) ===== */ -typedef float float32_t; -typedef double float64_t; -typedef unsigned long long poly64_t; -typedef __uint128_t poly128_t; - -/* ===== 128-bit vector types (Q registers) ===== */ - -typedef struct __attribute__((__aligned__(16))) { - unsigned char __val[16]; -} uint8x16_t; - -typedef struct __attribute__((__aligned__(16))) { - signed char __val[16]; -} int8x16_t; - -typedef struct __attribute__((__aligned__(16))) { - unsigned short __val[8]; -} uint16x8_t; - -typedef struct __attribute__((__aligned__(16))) { - short __val[8]; -} int16x8_t; - -typedef struct __attribute__((__aligned__(16))) { - unsigned int __val[4]; -} uint32x4_t; - -typedef struct __attribute__((__aligned__(16))) { - int __val[4]; -} int32x4_t; - -typedef struct __attribute__((__aligned__(16))) { - unsigned long long __val[2]; -} uint64x2_t; - -typedef struct __attribute__((__aligned__(16))) { - long long __val[2]; -} int64x2_t; - -typedef struct __attribute__((__aligned__(16))) { - float __val[4]; -} float32x4_t; - -typedef struct __attribute__((__aligned__(16))) { - double __val[2]; -} float64x2_t; - -/* Polynomial 128-bit types */ -typedef struct __attribute__((__aligned__(16))) { - unsigned char __val[16]; -} poly8x16_t; - -typedef struct __attribute__((__aligned__(16))) { - unsigned short __val[8]; -} poly16x8_t; - -typedef struct __attribute__((__aligned__(16))) { - unsigned long long __val[2]; -} poly64x2_t; - -/* ===== 64-bit vector types (D registers) ===== */ - -typedef struct __attribute__((__aligned__(8))) { - unsigned char __val[8]; -} uint8x8_t; - -typedef struct __attribute__((__aligned__(8))) { - signed char __val[8]; -} int8x8_t; - -typedef struct 
__attribute__((__aligned__(8))) { - unsigned short __val[4]; -} uint16x4_t; - -typedef struct __attribute__((__aligned__(8))) { - short __val[4]; -} int16x4_t; - -typedef struct __attribute__((__aligned__(8))) { - unsigned int __val[2]; -} uint32x2_t; - -typedef struct __attribute__((__aligned__(8))) { - int __val[2]; -} int32x2_t; - -typedef struct __attribute__((__aligned__(8))) { - unsigned long long __val[1]; -} uint64x1_t; - -typedef struct __attribute__((__aligned__(8))) { - long long __val[1]; -} int64x1_t; - -typedef struct __attribute__((__aligned__(8))) { - float __val[2]; -} float32x2_t; - -typedef struct __attribute__((__aligned__(8))) { - unsigned char __val[8]; -} poly8x8_t; - -typedef struct __attribute__((__aligned__(8))) { - unsigned long long __val[1]; -} poly64x1_t; - -/* ===== Array-of-vectors (structure) types ===== */ - -/* uint8x8 multi-vector */ -typedef struct { uint8x8_t val[2]; } uint8x8x2_t; -typedef struct { uint8x8_t val[3]; } uint8x8x3_t; -typedef struct { uint8x8_t val[4]; } uint8x8x4_t; - -/* uint8x16 multi-vector */ -typedef struct { uint8x16_t val[2]; } uint8x16x2_t; -typedef struct { uint8x16_t val[3]; } uint8x16x3_t; -typedef struct { uint8x16_t val[4]; } uint8x16x4_t; - -/* uint16x4 multi-vector */ -typedef struct { uint16x4_t val[2]; } uint16x4x2_t; -typedef struct { uint16x4_t val[4]; } uint16x4x4_t; - -/* uint16x8 multi-vector */ -typedef struct { uint16x8_t val[2]; } uint16x8x2_t; -typedef struct { uint16x8_t val[4]; } uint16x8x4_t; - -/* uint32x2 multi-vector */ -typedef struct { uint32x2_t val[2]; } uint32x2x2_t; -typedef struct { uint32x2_t val[4]; } uint32x2x4_t; - -/* uint32x4 multi-vector */ -typedef struct { uint32x4_t val[2]; } uint32x4x2_t; -typedef struct { uint32x4_t val[4]; } uint32x4x4_t; - -/* int8x8 multi-vector */ -typedef struct { int8x8_t val[2]; } int8x8x2_t; - -/* ================================================================== */ -/* LOAD INTRINSICS */ -/* 
================================================================== */ - -/* --- vld1q: load one 128-bit vector --- */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vld1q_u8(const unsigned char *__p) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, __p, 16); - return __ret; -} - -static __inline__ int8x16_t __attribute__((__always_inline__)) -vld1q_s8(const signed char *__p) -{ - int8x16_t __ret; - __builtin_memcpy(&__ret, __p, 16); - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vld1q_u16(const unsigned short *__p) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret, __p, 16); - return __ret; -} - -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vld1q_u32(const unsigned int *__p) -{ - uint32x4_t __ret; - __builtin_memcpy(&__ret, __p, 16); - return __ret; -} - -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vld1q_u64(const unsigned long long *__p) -{ - uint64x2_t __ret; - __builtin_memcpy(&__ret, __p, 16); - return __ret; -} - -/* --- vld1q_dup: load and broadcast --- */ - -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vld1q_dup_u32(const unsigned int *__p) -{ - uint32x4_t __ret; - unsigned int __v = *__p; - __ret.__val[0] = __v; - __ret.__val[1] = __v; - __ret.__val[2] = __v; - __ret.__val[3] = __v; - return __ret; -} - -/* --- vld1q_lane: load one lane --- */ - -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vld1q_lane_u32(const unsigned int *__p, uint32x4_t __v, int __lane) -{ - __v.__val[__lane] = *__p; - return __v; -} - -/* --- vld1q multi-load --- */ - -static __inline__ uint8x16x4_t __attribute__((__always_inline__)) -vld1q_u8_x4(const unsigned char *__p) -{ - uint8x16x4_t __ret; - __builtin_memcpy(&__ret, __p, 64); - return __ret; -} - -/* --- vld1: load one 64-bit vector --- */ - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vld1_u8(const unsigned char *__p) -{ - uint8x8_t __ret; - __builtin_memcpy(&__ret, 
__p, 8); - return __ret; -} - -/* --- Structure loads (de-interleave) --- */ - -static __inline__ uint8x16x3_t __attribute__((__always_inline__)) -vld3q_u8(const unsigned char *__p) -{ - /* De-interleave 48 bytes into 3 vectors of 16 bytes each */ - uint8x16x3_t __ret; - for (int __i = 0; __i < 16; __i++) { - __ret.val[0].__val[__i] = __p[__i * 3 + 0]; - __ret.val[1].__val[__i] = __p[__i * 3 + 1]; - __ret.val[2].__val[__i] = __p[__i * 3 + 2]; - } - return __ret; -} - -static __inline__ uint8x8x3_t __attribute__((__always_inline__)) -vld3_dup_u8(const unsigned char *__p) -{ - /* Load 3 bytes and duplicate each into all lanes */ - uint8x8x3_t __ret; - for (int __i = 0; __i < 8; __i++) { - __ret.val[0].__val[__i] = __p[0]; - __ret.val[1].__val[__i] = __p[1]; - __ret.val[2].__val[__i] = __p[2]; - } - return __ret; -} - -static __inline__ uint8x8x3_t __attribute__((__always_inline__)) -vld3_lane_u8(const unsigned char *__p, uint8x8x3_t __v, int __lane) -{ - __v.val[0].__val[__lane] = __p[0]; - __v.val[1].__val[__lane] = __p[1]; - __v.val[2].__val[__lane] = __p[2]; - return __v; -} - -static __inline__ uint32x2x4_t __attribute__((__always_inline__)) -vld4_u32(const unsigned int *__p) -{ - /* De-interleave 8 uint32s into 4 vectors of 2 */ - uint32x2x4_t __ret; - for (int __i = 0; __i < 2; __i++) { - __ret.val[0].__val[__i] = __p[__i * 4 + 0]; - __ret.val[1].__val[__i] = __p[__i * 4 + 1]; - __ret.val[2].__val[__i] = __p[__i * 4 + 2]; - __ret.val[3].__val[__i] = __p[__i * 4 + 3]; - } - return __ret; -} - -/* ================================================================== */ -/* STORE INTRINSICS */ -/* ================================================================== */ - -static __inline__ void __attribute__((__always_inline__)) -vst1q_u8(unsigned char *__p, uint8x16_t __a) -{ - __builtin_memcpy(__p, &__a, 16); -} - -static __inline__ void __attribute__((__always_inline__)) -vst1q_s8(signed char *__p, int8x16_t __a) -{ - __builtin_memcpy(__p, &__a, 16); -} - -static 
__inline__ void __attribute__((__always_inline__)) -vst1q_u16(unsigned short *__p, uint16x8_t __a) -{ - __builtin_memcpy(__p, &__a, 16); -} - -static __inline__ void __attribute__((__always_inline__)) -vst1q_u32(unsigned int *__p, uint32x4_t __a) -{ - __builtin_memcpy(__p, &__a, 16); -} - -static __inline__ void __attribute__((__always_inline__)) -vst1q_u64(unsigned long long *__p, uint64x2_t __a) -{ - __builtin_memcpy(__p, &__a, 16); -} - -/* --- vst1_lane: store one lane --- */ - -static __inline__ void __attribute__((__always_inline__)) -vst1_lane_u32(unsigned int *__p, uint32x2_t __a, int __lane) -{ - *__p = __a.__val[__lane]; -} - -/* --- Structure stores (interleave) --- */ - -static __inline__ void __attribute__((__always_inline__)) -vst3_u8(unsigned char *__p, uint8x8x3_t __a) -{ - for (int __i = 0; __i < 8; __i++) { - __p[__i * 3 + 0] = __a.val[0].__val[__i]; - __p[__i * 3 + 1] = __a.val[1].__val[__i]; - __p[__i * 3 + 2] = __a.val[2].__val[__i]; - } -} - -static __inline__ void __attribute__((__always_inline__)) -vst4q_u8(unsigned char *__p, uint8x16x4_t __a) -{ - for (int __i = 0; __i < 16; __i++) { - __p[__i * 4 + 0] = __a.val[0].__val[__i]; - __p[__i * 4 + 1] = __a.val[1].__val[__i]; - __p[__i * 4 + 2] = __a.val[2].__val[__i]; - __p[__i * 4 + 3] = __a.val[3].__val[__i]; - } -} - -static __inline__ void __attribute__((__always_inline__)) -vst4_lane_u32(unsigned int *__p, uint32x2x4_t __a, int __lane) -{ - __p[0] = __a.val[0].__val[__lane]; - __p[1] = __a.val[1].__val[__lane]; - __p[2] = __a.val[2].__val[__lane]; - __p[3] = __a.val[3].__val[__lane]; -} - -/* ================================================================== */ -/* BITWISE OPERATIONS */ -/* ================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -veorq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] ^ __b.__val[__i]; - return 
__ret; -} - -static __inline__ uint64x2_t __attribute__((__always_inline__)) -veorq_u64(uint64x2_t __a, uint64x2_t __b) -{ - uint64x2_t __ret; - __ret.__val[0] = __a.__val[0] ^ __b.__val[0]; - __ret.__val[1] = __a.__val[1] ^ __b.__val[1]; - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vandq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] & __b.__val[__i]; - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vandq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] & __b.__val[__i]; - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vorrq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] | __b.__val[__i]; - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vmvnq_u8(uint8x16_t __a) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = ~__a.__val[__i]; - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vbicq_u8(uint8x16_t __a, uint8x16_t __b) -{ - /* Bit clear: a & ~b */ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] & ~__b.__val[__i]; - return __ret; -} - -/* --- 64-bit bitwise --- */ - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vbsl_u8(uint8x8_t __sel, uint8x8_t __a, uint8x8_t __b) -{ - /* Bitwise select: (sel & a) | (~sel & b) */ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__sel.__val[__i] & __a.__val[__i]) | - (~__sel.__val[__i] & __b.__val[__i]); - return __ret; -} - -/* ================================================================== */ -/* SHIFT OPERATIONS */ -/* 
================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vshlq_n_u8(uint8x16_t __a, int __n) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = (unsigned char)(__a.__val[__i] << __n); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vshrq_n_u8(uint8x16_t __a, int __n) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] >> __n; - return __ret; -} - -static __inline__ int8x16_t __attribute__((__always_inline__)) -vshrq_n_s8(int8x16_t __a, int __n) -{ - int8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] >> __n; - return __ret; -} - -/* vshlq_n_u64: shift left uint64x2_t by immediate */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vshlq_n_u64(uint64x2_t __a, int __n) -{ - uint64x2_t __ret; - __ret.__val[0] = __a.__val[0] << __n; - __ret.__val[1] = __a.__val[1] << __n; - return __ret; -} - -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vshrq_n_u64(uint64x2_t __a, int __n) -{ - uint64x2_t __ret; - __ret.__val[0] = __a.__val[0] >> __n; - __ret.__val[1] = __a.__val[1] >> __n; - return __ret; -} - -/* ================================================================== */ -/* DUPLICATE (BROADCAST) */ -/* ================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vdupq_n_u8(unsigned char __a) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a; - return __ret; -} - -static __inline__ int8x16_t __attribute__((__always_inline__)) -vdupq_n_s8(signed char __a) -{ - int8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a; - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vdupq_n_u16(unsigned short __a) -{ - uint16x8_t __ret; - for (int __i = 0; 
__i < 8; __i++) - __ret.__val[__i] = __a; - return __ret; -} - -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vdupq_n_u32(unsigned int __a) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = __a; - return __ret; -} - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vdup_n_u8(unsigned char __a) -{ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a; - return __ret; -} - -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vmov_n_u64(unsigned long long __a) -{ - uint64x1_t __ret; - __ret.__val[0] = __a; - return __ret; -} - -/* ================================================================== */ -/* COMBINE / SPLIT */ -/* ================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vcombine_u8(uint8x8_t __lo, uint8x8_t __hi) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret.__val[0], &__lo.__val[0], 8); - __builtin_memcpy(&__ret.__val[8], &__hi.__val[0], 8); - return __ret; -} - -static __inline__ int8x16_t __attribute__((__always_inline__)) -vcombine_s8(int8x8_t __lo, int8x8_t __hi) -{ - int8x16_t __ret; - __builtin_memcpy(&__ret.__val[0], &__lo.__val[0], 8); - __builtin_memcpy(&__ret.__val[8], &__hi.__val[0], 8); - return __ret; -} - -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vcombine_u64(uint64x1_t __lo, uint64x1_t __hi) -{ - uint64x2_t __ret; - __ret.__val[0] = __lo.__val[0]; - __ret.__val[1] = __hi.__val[0]; - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vcombine_u16(uint16x4_t __lo, uint16x4_t __hi) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret.__val[0], &__lo.__val[0], 8); - __builtin_memcpy(&__ret.__val[4], &__hi.__val[0], 8); - return __ret; -} - -/* --- Get high/low halves --- */ - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vget_low_u8(uint8x16_t __a) -{ - uint8x8_t __ret; - 
__builtin_memcpy(&__ret.__val[0], &__a.__val[0], 8); - return __ret; -} - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vget_high_u8(uint8x16_t __a) -{ - uint8x8_t __ret; - __builtin_memcpy(&__ret.__val[0], &__a.__val[8], 8); - return __ret; -} - -static __inline__ int8x8_t __attribute__((__always_inline__)) -vget_low_s8(int8x16_t __a) -{ - int8x8_t __ret; - __builtin_memcpy(&__ret.__val[0], &__a.__val[0], 8); - return __ret; -} - -static __inline__ int8x8_t __attribute__((__always_inline__)) -vget_high_s8(int8x16_t __a) -{ - int8x8_t __ret; - __builtin_memcpy(&__ret.__val[0], &__a.__val[8], 8); - return __ret; -} - -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vget_low_u64(uint64x2_t __a) -{ - uint64x1_t __ret; - __ret.__val[0] = __a.__val[0]; - return __ret; -} - -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vget_high_u64(uint64x2_t __a) -{ - uint64x1_t __ret; - __ret.__val[0] = __a.__val[1]; - return __ret; -} - -static __inline__ poly64x1_t __attribute__((__always_inline__)) -vget_low_p64(poly64x2_t __a) -{ - poly64x1_t __ret; - __ret.__val[0] = __a.__val[0]; - return __ret; -} - -/* ================================================================== */ -/* LANE ACCESS */ -/* ================================================================== */ - -static __inline__ signed char __attribute__((__always_inline__)) -vget_lane_s8(int8x8_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ unsigned char __attribute__((__always_inline__)) -vget_lane_u8(uint8x8_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ unsigned short __attribute__((__always_inline__)) -vget_lane_u16(uint16x4_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ short __attribute__((__always_inline__)) -vget_lane_s16(int16x4_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ unsigned int __attribute__((__always_inline__)) -vget_lane_u32(uint32x2_t __a, 
int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ int __attribute__((__always_inline__)) -vget_lane_s32(int32x2_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ float __attribute__((__always_inline__)) -vget_lane_f32(float32x2_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ unsigned char __attribute__((__always_inline__)) -vgetq_lane_u8(uint8x16_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ signed char __attribute__((__always_inline__)) -vgetq_lane_s8(int8x16_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ unsigned short __attribute__((__always_inline__)) -vgetq_lane_u16(uint16x8_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ short __attribute__((__always_inline__)) -vgetq_lane_s16(int16x8_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ int __attribute__((__always_inline__)) -vgetq_lane_s32(int32x4_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ unsigned int __attribute__((__always_inline__)) -vgetq_lane_u32(uint32x4_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ float __attribute__((__always_inline__)) -vgetq_lane_f32(float32x4_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -static __inline__ double __attribute__((__always_inline__)) -vgetq_lane_f64(float64x2_t __a, int __lane) -{ - return __a.__val[__lane]; -} - -/* vset_lane: set a single lane in a vector */ -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vset_lane_u8(unsigned char __val, uint8x8_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vsetq_lane_u8(unsigned char __val, uint8x16_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ uint16x4_t __attribute__((__always_inline__)) -vset_lane_u16(unsigned short __val, uint16x4_t __a, int 
__lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vsetq_lane_u16(unsigned short __val, uint16x8_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vset_lane_u32(unsigned int __val, uint32x2_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vsetq_lane_u32(unsigned int __val, uint32x4_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ int32x2_t __attribute__((__always_inline__)) -vset_lane_s32(int __val, int32x2_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ int32x4_t __attribute__((__always_inline__)) -vsetq_lane_s32(int __val, int32x4_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ float32x2_t __attribute__((__always_inline__)) -vset_lane_f32(float __val, float32x2_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -static __inline__ float32x4_t __attribute__((__always_inline__)) -vsetq_lane_f32(float __val, float32x4_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -#define vset_lane_u64(__val, __a, __lane) ({ uint64x1_t __r = (__a); __r.__val[(__lane)] = (__val); __r; }) -#define vsetq_lane_u64(__val, __a, __lane) ({ uint64x2_t __r = (__a); __r.__val[(__lane)] = (__val); __r; }) -#define vset_lane_s64(__val, __a, __lane) ({ int64x1_t __r = (__a); __r.__val[(__lane)] = (__val); __r; }) -#define vsetq_lane_s64(__val, __a, __lane) ({ int64x2_t __r = (__a); __r.__val[(__lane)] = (__val); __r; }) -#define vsetq_lane_f64(__val, __a, __lane) ({ float64x2_t __r = (__a); __r.__val[(__lane)] = (__val); __r; }) - -/* ================================================================== */ -/* ARITHMETIC OPERATIONS */ -/* 
================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vaddq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __ret; -} - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vadd_u8(uint8x8_t __a, uint8x8_t __b) -{ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __ret; -} - -/* --- Widening arithmetic --- */ - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vaddl_u8(uint8x8_t __a, uint8x8_t __b) -{ - /* Widening add: u8 + u8 -> u16 */ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (unsigned short)__a.__val[__i] + (unsigned short)__b.__val[__i]; - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vabdl_u8(uint8x8_t __a, uint8x8_t __b) -{ - /* Widening absolute difference: |a - b| -> u16 */ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - int __d = (int)__a.__val[__i] - (int)__b.__val[__i]; - __ret.__val[__i] = (unsigned short)(__d < 0 ? -__d : __d); - } - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vabdq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - int __d = (int)__a.__val[__i] - (int)__b.__val[__i]; - __ret.__val[__i] = (unsigned short)(__d < 0 ? 
-__d : __d); - } - return __ret; -} - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vhadd_u8(uint8x8_t __a, uint8x8_t __b) -{ - /* Halving add: (a + b) >> 1 without overflow */ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = ((unsigned short)__a.__val[__i] + (unsigned short)__b.__val[__i]) >> 1; - return __ret; -} - -/* --- Narrowing --- */ - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vmovn_u16(uint16x8_t __a) -{ - /* Narrow: take low 8 bits of each u16 */ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (unsigned char)__a.__val[__i]; - return __ret; -} - -/* ================================================================== */ -/* COMPARISON OPERATIONS */ -/* ================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vceqq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = (__a.__val[__i] == __b.__val[__i]) ? 0xFF : 0x00; - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vcleq_u16(uint16x8_t __a, uint16x8_t __b) -{ - /* Compare less-than-or-equal unsigned */ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__a.__val[__i] <= __b.__val[__i]) ? 0xFFFF : 0x0000; - return __ret; -} - -/* ================================================================== */ -/* EXTRACT / SHUFFLE */ -/* ================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vextq_u8(uint8x16_t __a, uint8x16_t __b, int __n) -{ - /* Extract: concatenate a and b, then extract 16 bytes starting at byte __n */ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - int __idx = __i + __n; - __ret.__val[__i] = (__idx < 16) ? 
__a.__val[__idx] : __b.__val[__idx - 16]; - } - return __ret; -} - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vext_u8(uint8x8_t __a, uint8x8_t __b, int __n) -{ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - int __idx = __i + __n; - __ret.__val[__i] = (__idx < 8) ? __a.__val[__idx] : __b.__val[__idx - 8]; - } - return __ret; -} - -/* ================================================================== */ -/* POLYNOMIAL MULTIPLICATION */ -/* ================================================================== */ - -static __inline__ poly8x16_t __attribute__((__always_inline__)) -vmulq_p8(poly8x16_t __a, poly8x16_t __b) -{ - /* Polynomial (carry-less) multiply per byte */ - poly8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __r = 0; - unsigned char __av = __a.__val[__i]; - unsigned char __bv = __b.__val[__i]; - for (int __j = 0; __j < 8; __j++) { - if (__bv & (1 << __j)) - __r ^= __av << __j; - } - __ret.__val[__i] = __r; - } - return __ret; -} - -/* 64-bit polynomial multiply (for crypto/GHASH). - * Scalar C implementation of carry-less multiplication - functionally correct - * but much slower than the hardware PMULL instruction. 
*/ -static __inline__ poly128_t __attribute__((__always_inline__)) -vmull_p64(poly64_t __a, poly64_t __b) -{ - /* Carry-less multiply of two 64-bit polynomials -> 128-bit result */ - __uint128_t __r = 0; - for (int __j = 0; __j < 64; __j++) { - if (__b & (1ULL << __j)) - __r ^= (__uint128_t)__a << __j; - } - return (poly128_t)__r; -} - -static __inline__ poly128_t __attribute__((__always_inline__)) -vmull_high_p64(poly64x2_t __a, poly64x2_t __b) -{ - return vmull_p64(__a.__val[1], __b.__val[1]); -} - -/* ================================================================== */ -/* REVERSE OPERATIONS */ -/* ================================================================== */ - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vrev32q_u16(uint16x8_t __a) -{ - /* Reverse 16-bit elements within each 32-bit word */ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i += 2) { - __ret.__val[__i] = __a.__val[__i + 1]; - __ret.__val[__i + 1] = __a.__val[__i]; - } - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vrbitq_u8(uint8x16_t __a) -{ - /* Reverse bits within each byte */ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __v = __a.__val[__i]; - unsigned char __r = 0; - for (int __j = 0; __j < 8; __j++) - __r |= ((__v >> __j) & 1) << (7 - __j); - __ret.__val[__i] = __r; - } - return __ret; -} - -/* ================================================================== */ -/* TABLE LOOKUP OPERATIONS */ -/* ================================================================== */ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbl1q_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - __ret.__val[__i] = (__idx < 16) ? 
__a.__val[__idx] : 0; - } - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbx1q_u8(uint8x16_t __def, uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - __ret.__val[__i] = (__idx < 16) ? __a.__val[__idx] : __def.__val[__i]; - } - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbl4q_u8(uint8x16x4_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 16) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 32) - __ret.__val[__i] = __a.val[1].__val[__idx - 16]; - else if (__idx < 48) - __ret.__val[__i] = __a.val[2].__val[__idx - 32]; - else if (__idx < 64) - __ret.__val[__i] = __a.val[3].__val[__idx - 48]; - else - __ret.__val[__i] = 0; - } - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbx4q_u8(uint8x16_t __def, uint8x16x4_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 16) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 32) - __ret.__val[__i] = __a.val[1].__val[__idx - 16]; - else if (__idx < 48) - __ret.__val[__i] = __a.val[2].__val[__idx - 32]; - else if (__idx < 64) - __ret.__val[__i] = __a.val[3].__val[__idx - 48]; - else - __ret.__val[__i] = __def.__val[__i]; - } - return __ret; -} - -/* 64-bit table lookup (AArch32 compat) */ - -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vtbl2_u8(uint8x8x2_t __a, uint8x8_t __b) -{ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 8) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 16) - __ret.__val[__i] = __a.val[1].__val[__idx - 8]; - else - __ret.__val[__i] = 0; - } - return __ret; -} - -static __inline__ uint8x8_t 
__attribute__((__always_inline__)) -vtbx2_u8(uint8x8_t __def, uint8x8x2_t __a, uint8x8_t __b) -{ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 8) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 16) - __ret.__val[__i] = __a.val[1].__val[__idx - 8]; - else - __ret.__val[__i] = __def.__val[__i]; - } - return __ret; -} - -/* ================================================================== */ -/* PAIRWISE / ACROSS-LANE */ -/* ================================================================== */ - -static __inline__ int8x8_t __attribute__((__always_inline__)) -vpmin_s8(int8x8_t __a, int8x8_t __b) -{ - int8x8_t __ret; - for (int __i = 0; __i < 4; __i++) { - signed char __x = __a.__val[2 * __i]; - signed char __y = __a.__val[2 * __i + 1]; - __ret.__val[__i] = (__x < __y) ? __x : __y; - } - for (int __i = 0; __i < 4; __i++) { - signed char __x = __b.__val[2 * __i]; - signed char __y = __b.__val[2 * __i + 1]; - __ret.__val[4 + __i] = (__x < __y) ? __x : __y; - } - return __ret; -} - -static __inline__ signed char __attribute__((__always_inline__)) -vminvq_s8(int8x16_t __a) -{ - signed char __min = __a.__val[0]; - for (int __i = 1; __i < 16; __i++) { - if (__a.__val[__i] < __min) - __min = __a.__val[__i]; - } - return __min; -} - -/* vminvq_u32: horizontal minimum across uint32x4_t */ -static __inline__ unsigned int __attribute__((__always_inline__)) -vminvq_u32(uint32x4_t __a) -{ - unsigned int __min = __a.__val[0]; - for (int __i = 1; __i < 4; __i++) { - if (__a.__val[__i] < __min) - __min = __a.__val[__i]; - } - return __min; -} - -/* ================================================================== */ -/* AES INTRINSICS */ -/* ================================================================== */ -/* ARM Crypto Extension AES intrinsics using inline assembly. 
*/ - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vaeseq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - __asm__ __volatile__( - "ldr q0, [%[a]]\n\t" - "ldr q1, [%[b]]\n\t" - "aese v0.16b, v1.16b\n\t" - "str q0, [%[ret]]\n\t" - : - : [a] "r" (&__a), [b] "r" (&__b), [ret] "r" (&__ret) - : "v0", "v1", "memory" - ); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vaesmcq_u8(uint8x16_t __a) -{ - uint8x16_t __ret; - __asm__ __volatile__( - "ldr q0, [%[a]]\n\t" - "aesmc v0.16b, v0.16b\n\t" - "str q0, [%[ret]]\n\t" - : - : [a] "r" (&__a), [ret] "r" (&__ret) - : "v0", "memory" - ); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vaesdq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - __asm__ __volatile__( - "ldr q0, [%[a]]\n\t" - "ldr q1, [%[b]]\n\t" - "aesd v0.16b, v1.16b\n\t" - "str q0, [%[ret]]\n\t" - : - : [a] "r" (&__a), [b] "r" (&__b), [ret] "r" (&__ret) - : "v0", "v1", "memory" - ); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vaesimcq_u8(uint8x16_t __a) -{ - uint8x16_t __ret; - __asm__ __volatile__( - "ldr q0, [%[a]]\n\t" - "aesimc v0.16b, v0.16b\n\t" - "str q0, [%[ret]]\n\t" - : - : [a] "r" (&__a), [ret] "r" (&__ret) - : "v0", "memory" - ); - return __ret; -} - -/* ================================================================== */ -/* TYPE REINTERPRET CASTS */ -/* ================================================================== */ - -static __inline__ int8x16_t __attribute__((__always_inline__)) -vreinterpretq_s8_u8(uint8x16_t __a) -{ - int8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_s8(int8x16_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_u16(uint16x8_t __a) -{ - 
uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vreinterpretq_u16_u8(uint8x16_t __a) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_u32(uint32x4_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vreinterpretq_u32_u8(uint8x16_t __a) -{ - uint32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_u64(uint64x2_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vreinterpretq_u64_u8(uint8x16_t __a) -{ - uint64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_p8(poly8x16_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ poly8x16_t __attribute__((__always_inline__)) -vreinterpretq_p8_u8(uint8x16_t __a) -{ - poly8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ poly64x2_t __attribute__((__always_inline__)) -vreinterpretq_p64_u8(uint8x16_t __a) -{ - poly64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vreinterpretq_u64_p64(poly64x2_t __a) -{ - uint64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_p128(poly128_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* ================================================================ - * 
Additional u32/u64 intrinsics for mbedtls, redis, libsodium - * ================================================================ */ - -/* === Create / Duplicate === */ - -/* vcreate_u32: create uint32x2_t from a raw uint64 value */ -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vcreate_u32(unsigned long long __a) -{ - uint32x2_t __ret; - __builtin_memcpy(&__ret, &__a, 8); - return __ret; -} - -/* vcreate_u64: create uint64x1_t from a raw uint64 value */ -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vcreate_u64(unsigned long long __a) -{ - uint64x1_t __ret; - __ret.__val[0] = __a; - return __ret; -} - -/* vdup_n_u64: duplicate scalar u64 into single lane of uint64x1_t */ -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vdup_n_u64(unsigned long long __a) -{ - uint64x1_t __ret; - __ret.__val[0] = __a; - return __ret; -} - -/* vdup_n_u32: duplicate scalar u32 into both lanes of uint32x2_t */ -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vdup_n_u32(unsigned int __a) -{ - uint32x2_t __ret; - __ret.__val[0] = __a; - __ret.__val[1] = __a; - return __ret; -} - -/* vdupq_n_u64: duplicate scalar u64 into both lanes of uint64x2_t */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vdupq_n_u64(unsigned long long __a) -{ - uint64x2_t __ret; - __ret.__val[0] = __a; - __ret.__val[1] = __a; - return __ret; -} - -/* vdupq_n_s32: duplicate scalar s32 into all 4 lanes of int32x4_t */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vdupq_n_s32(int __a) -{ - int32x4_t __ret; - __ret.__val[0] = __a; - __ret.__val[1] = __a; - __ret.__val[2] = __a; - __ret.__val[3] = __a; - return __ret; -} - -/* === Combine (two D-registers -> one Q-register) === */ - -/* vcombine_u32: combine two uint32x2_t into uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vcombine_u32(uint32x2_t __lo, uint32x2_t __hi) -{ - uint32x4_t __ret; - __ret.__val[0] = __lo.__val[0]; - 
__ret.__val[1] = __lo.__val[1]; - __ret.__val[2] = __hi.__val[0]; - __ret.__val[3] = __hi.__val[1]; - return __ret; -} - -/* === Get low/high halves === */ - -/* vget_low_u32: get low 64-bit half of uint32x4_t */ -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vget_low_u32(uint32x4_t __a) -{ - uint32x2_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[1]; - return __ret; -} - -/* vget_high_u32: get high 64-bit half of uint32x4_t */ -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vget_high_u32(uint32x4_t __a) -{ - uint32x2_t __ret; - __ret.__val[0] = __a.__val[2]; - __ret.__val[1] = __a.__val[3]; - return __ret; -} - -/* === Lane access === */ - -/* vgetq_lane_u64: extract a single lane from uint64x2_t */ -#define vgetq_lane_u64(__a, __lane) ((__a).__val[(__lane)]) - -/* === Arithmetic: u32 === */ - -/* vaddq_u32: add uint32x4_t element-wise */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vaddq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0] + __b.__val[0]; - __ret.__val[1] = __a.__val[1] + __b.__val[1]; - __ret.__val[2] = __a.__val[2] + __b.__val[2]; - __ret.__val[3] = __a.__val[3] + __b.__val[3]; - return __ret; -} - -/* vmulq_u32: multiply uint32x4_t element-wise */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vmulq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0] * __b.__val[0]; - __ret.__val[1] = __a.__val[1] * __b.__val[1]; - __ret.__val[2] = __a.__val[2] * __b.__val[2]; - __ret.__val[3] = __a.__val[3] * __b.__val[3]; - return __ret; -} - -/* === Arithmetic: u64 === */ - -/* vaddq_u64: add uint64x2_t element-wise */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vaddq_u64(uint64x2_t __a, uint64x2_t __b) -{ - uint64x2_t __ret; - __ret.__val[0] = __a.__val[0] + __b.__val[0]; - __ret.__val[1] = __a.__val[1] + __b.__val[1]; - return __ret; -} - -/* === Bitwise: u32 
=== */ - -/* vandq_u32: bitwise AND uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vandq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0] & __b.__val[0]; - __ret.__val[1] = __a.__val[1] & __b.__val[1]; - __ret.__val[2] = __a.__val[2] & __b.__val[2]; - __ret.__val[3] = __a.__val[3] & __b.__val[3]; - return __ret; -} - -/* vorrq_u64: bitwise OR uint64x2_t */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vorrq_u64(uint64x2_t __a, uint64x2_t __b) -{ - uint64x2_t __ret; - __ret.__val[0] = __a.__val[0] | __b.__val[0]; - __ret.__val[1] = __a.__val[1] | __b.__val[1]; - return __ret; -} - -/* vorrq_u32: bitwise OR uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vorrq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0] | __b.__val[0]; - __ret.__val[1] = __a.__val[1] | __b.__val[1]; - __ret.__val[2] = __a.__val[2] | __b.__val[2]; - __ret.__val[3] = __a.__val[3] | __b.__val[3]; - return __ret; -} - -/* veorq_u32: bitwise XOR uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -veorq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0] ^ __b.__val[0]; - __ret.__val[1] = __a.__val[1] ^ __b.__val[1]; - __ret.__val[2] = __a.__val[2] ^ __b.__val[2]; - __ret.__val[3] = __a.__val[3] ^ __b.__val[3]; - return __ret; -} - -/* === Shift: u32 === */ - -/* vshlq_n_u32: shift left uint32x4_t by immediate */ -#define vshlq_n_u32(__a, __n) __extension__ ({ \ - uint32x4_t __r; \ - __r.__val[0] = (__a).__val[0] << (__n); \ - __r.__val[1] = (__a).__val[1] << (__n); \ - __r.__val[2] = (__a).__val[2] << (__n); \ - __r.__val[3] = (__a).__val[3] << (__n); \ - __r; \ -}) - -/* vshrq_n_u32: shift right uint32x4_t by immediate */ -#define vshrq_n_u32(__a, __n) __extension__ ({ \ - uint32x4_t __r; \ - __r.__val[0] = (__a).__val[0] >> (__n); \ - __r.__val[1] = (__a).__val[1] >> 
(__n); \ - __r.__val[2] = (__a).__val[2] >> (__n); \ - __r.__val[3] = (__a).__val[3] >> (__n); \ - __r; \ -}) - -/* vsriq_n_u32: shift right and insert - for each lane, - * shift __b right by __n, and insert into __a preserving the top __n bits of __a - * result[i] = (a[i] & ~((1u<<(32-n))-1)) | (b[i] >> n) */ -#define vsriq_n_u32(__a, __b, __n) __extension__ ({ \ - uint32x4_t __r; \ - unsigned int __mask = ~((1u << (32 - (__n))) - 1u); \ - __r.__val[0] = ((__a).__val[0] & __mask) | ((__b).__val[0] >> (__n)); \ - __r.__val[1] = ((__a).__val[1] & __mask) | ((__b).__val[1] >> (__n)); \ - __r.__val[2] = ((__a).__val[2] & __mask) | ((__b).__val[2] >> (__n)); \ - __r.__val[3] = ((__a).__val[3] & __mask) | ((__b).__val[3] >> (__n)); \ - __r; \ -}) - -/* vsliq_n_u32: shift left and insert - for each lane, - * shift __b left by __n, and insert into __a preserving the low __n bits of __a */ -#define vsliq_n_u32(__a, __b, __n) __extension__ ({ \ - uint32x4_t __r; \ - unsigned int __mask = (1u << (__n)) - 1u; \ - __r.__val[0] = ((__a).__val[0] & __mask) | ((__b).__val[0] << (__n)); \ - __r.__val[1] = ((__a).__val[1] & __mask) | ((__b).__val[1] << (__n)); \ - __r.__val[2] = ((__a).__val[2] & __mask) | ((__b).__val[2] << (__n)); \ - __r.__val[3] = ((__a).__val[3] & __mask) | ((__b).__val[3] << (__n)); \ - __r; \ -}) - -/* === Compare: u8 === */ - -/* vmaxq_u8: element-wise maximum of uint8x16_t */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vmaxq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] > __b.__val[__i] ? 
__a.__val[__i] : __b.__val[__i]; - return __ret; -} - -/* === Population count === */ - -/* vcntq_u8: population count per byte */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vcntq_u8(uint8x16_t __a) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __v = __a.__val[__i]; - unsigned char __c = 0; - while (__v) { __c += __v & 1; __v >>= 1; } - __ret.__val[__i] = __c; - } - return __ret; -} - -/* === Pairwise add long === */ - -/* vpaddlq_u8: pairwise add adjacent u8 pairs, result as u16 */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vpaddlq_u8(uint8x16_t __a) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (unsigned short)__a.__val[__i * 2] + (unsigned short)__a.__val[__i * 2 + 1]; - return __ret; -} - -/* vpaddlq_u16: pairwise add adjacent u16 pairs, result as u32 */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vpaddlq_u16(uint16x8_t __a) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = (unsigned int)__a.__val[__i * 2] + (unsigned int)__a.__val[__i * 2 + 1]; - return __ret; -} - -/* vpaddlq_u32: pairwise add adjacent u32 pairs, result as u64 */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vpaddlq_u32(uint32x4_t __a) -{ - uint64x2_t __ret; - __ret.__val[0] = (unsigned long long)__a.__val[0] + (unsigned long long)__a.__val[1]; - __ret.__val[1] = (unsigned long long)__a.__val[2] + (unsigned long long)__a.__val[3]; - return __ret; -} - -/* vpadalq_u8: pairwise add and accumulate long u8 -> u16 */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vpadalq_u8(uint16x8_t __acc, uint8x16_t __a) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __acc.__val[__i] + (unsigned short)__a.__val[__i * 2] + (unsigned short)__a.__val[__i * 2 + 1]; - return __ret; -} - -/* === Extract / Rotate === */ - -/* vextq_u64: extract from pair of uint64x2_t */ -#define 
vextq_u64(__a, __b, __n) __extension__ ({ \ - uint64x2_t __r; \ - if ((__n) == 0) { \ - __r = (__a); \ - } else { \ - __r.__val[0] = (__a).__val[1]; \ - __r.__val[1] = (__b).__val[0]; \ - } \ - __r; \ -}) - -/* vextq_u32: extract from pair of uint32x4_t */ -#define vextq_u32(__a, __b, __n) __extension__ ({ \ - uint32x4_t __r; \ - unsigned int __tmp[8]; \ - __tmp[0] = (__a).__val[0]; __tmp[1] = (__a).__val[1]; \ - __tmp[2] = (__a).__val[2]; __tmp[3] = (__a).__val[3]; \ - __tmp[4] = (__b).__val[0]; __tmp[5] = (__b).__val[1]; \ - __tmp[6] = (__b).__val[2]; __tmp[7] = (__b).__val[3]; \ - __r.__val[0] = __tmp[(__n)]; __r.__val[1] = __tmp[(__n) + 1]; \ - __r.__val[2] = __tmp[(__n) + 2]; __r.__val[3] = __tmp[(__n) + 3]; \ - __r; \ -}) - -/* === Narrowing === */ - -/* vmovn_u64: narrow uint64x2_t to uint32x2_t (take low 32 bits of each lane) */ -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vmovn_u64(uint64x2_t __a) -{ - uint32x2_t __ret; - __ret.__val[0] = (unsigned int)__a.__val[0]; - __ret.__val[1] = (unsigned int)__a.__val[1]; - return __ret; -} - -/* vmovn_u32: narrow uint32x4_t to uint16x4_t (take low 16 bits of each lane) */ -static __inline__ uint16x4_t __attribute__((__always_inline__)) -vmovn_u32(uint32x4_t __a) -{ - uint16x4_t __ret; - __ret.__val[0] = (unsigned short)__a.__val[0]; - __ret.__val[1] = (unsigned short)__a.__val[1]; - __ret.__val[2] = (unsigned short)__a.__val[2]; - __ret.__val[3] = (unsigned short)__a.__val[3]; - return __ret; -} - -/* vshrn_n_u64: shift right and narrow uint64x2_t to uint32x2_t */ -#define vshrn_n_u64(__a, __n) __extension__ ({ \ - uint32x2_t __r; \ - __r.__val[0] = (unsigned int)((__a).__val[0] >> (__n)); \ - __r.__val[1] = (unsigned int)((__a).__val[1] >> (__n)); \ - __r; \ -}) - -/* vshrn_n_u32: shift right and narrow uint32x4_t to uint16x4_t */ -#define vshrn_n_u32(__a, __n) __extension__ ({ \ - uint16x4_t __r; \ - __r.__val[0] = (unsigned short)((__a).__val[0] >> (__n)); \ - __r.__val[1] = (unsigned 
short)((__a).__val[1] >> (__n)); \ - __r.__val[2] = (unsigned short)((__a).__val[2] >> (__n)); \ - __r.__val[3] = (unsigned short)((__a).__val[3] >> (__n)); \ - __r; \ -}) - -/* === Widening multiply-accumulate === */ - -/* vmlal_u32: widening multiply-accumulate u32 -> u64 - * result[i] = acc[i] + (u64)a[i] * (u64)b[i] */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vmlal_u32(uint64x2_t __acc, uint32x2_t __a, uint32x2_t __b) -{ - uint64x2_t __ret; - __ret.__val[0] = __acc.__val[0] + (unsigned long long)__a.__val[0] * (unsigned long long)__b.__val[0]; - __ret.__val[1] = __acc.__val[1] + (unsigned long long)__a.__val[1] * (unsigned long long)__b.__val[1]; - return __ret; -} - -/* vmlal_high_u32: widening multiply-accumulate of high halves */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vmlal_high_u32(uint64x2_t __acc, uint32x4_t __a, uint32x4_t __b) -{ - uint64x2_t __ret; - __ret.__val[0] = __acc.__val[0] + (unsigned long long)__a.__val[2] * (unsigned long long)__b.__val[2]; - __ret.__val[1] = __acc.__val[1] + (unsigned long long)__a.__val[3] * (unsigned long long)__b.__val[3]; - return __ret; -} - -/* vmlal_low_u32: widening multiply-accumulate of low halves */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vmlal_low_u32(uint64x2_t __acc, uint32x4_t __a, uint32x4_t __b) -{ - uint64x2_t __ret; - __ret.__val[0] = __acc.__val[0] + (unsigned long long)__a.__val[0] * (unsigned long long)__b.__val[0]; - __ret.__val[1] = __acc.__val[1] + (unsigned long long)__a.__val[1] * (unsigned long long)__b.__val[1]; - return __ret; -} - -/* === Unzip / De-interleave === */ - -/* vuzpq_u32: unzip (de-interleave) two uint32x4_t vectors - * result.val[0] = {a[0], a[2], b[0], b[2]} (even elements) - * result.val[1] = {a[1], a[3], b[1], b[3]} (odd elements) */ -static __inline__ uint32x4x2_t __attribute__((__always_inline__)) -vuzpq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4x2_t __ret; - __ret.val[0].__val[0] = 
__a.__val[0]; - __ret.val[0].__val[1] = __a.__val[2]; - __ret.val[0].__val[2] = __b.__val[0]; - __ret.val[0].__val[3] = __b.__val[2]; - __ret.val[1].__val[0] = __a.__val[1]; - __ret.val[1].__val[1] = __a.__val[3]; - __ret.val[1].__val[2] = __b.__val[1]; - __ret.val[1].__val[3] = __b.__val[3]; - return __ret; -} - -/* === Reinterpret casts: u32 <-> u64 === */ - -/* vreinterpretq_u32_u64: reinterpret uint64x2_t as uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vreinterpretq_u32_u64(uint64x2_t __a) -{ - uint32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u64_u32: reinterpret uint32x4_t as uint64x2_t */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vreinterpretq_u64_u32(uint32x4_t __a) -{ - uint64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s32_u32: reinterpret uint32x4_t as int32x4_t */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vreinterpretq_s32_u32(uint32x4_t __a) -{ - int32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u32_s32: reinterpret int32x4_t as uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vreinterpretq_u32_s32(int32x4_t __a) -{ - uint32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u16_u32: reinterpret uint32x4_t as uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vreinterpretq_u16_u32(uint32x4_t __a) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u32_u16: reinterpret uint16x8_t as uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vreinterpretq_u32_u16(uint16x8_t __a) -{ - uint32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* === SHA-256 crypto intrinsics (software implementation) === */ -/* Used by mbedtls SHA-256 hardware acceleration. 
*/ - -/* vsha256su0q_u32: SHA-256 schedule update 0 */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vsha256su0q_u32(uint32x4_t __w0_3, uint32x4_t __w4_7) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) { - unsigned int __w = (__i < 3) ? __w0_3.__val[__i + 1] : __w4_7.__val[0]; - /* sigma0: ROTR(7) ^ ROTR(18) ^ SHR(3) */ - unsigned int __s0 = ((__w >> 7) | (__w << 25)) ^ ((__w >> 18) | (__w << 14)) ^ (__w >> 3); - __ret.__val[__i] = __w0_3.__val[__i] + __s0; - } - return __ret; -} - -/* vsha256su1q_u32: SHA-256 schedule update 1 */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vsha256su1q_u32(uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15) -{ - uint32x4_t __ret; - unsigned int __wm2[4]; - __wm2[0] = __w12_15.__val[2]; - __wm2[1] = __w12_15.__val[3]; - __wm2[2] = __tw0_3.__val[0]; - __wm2[3] = __tw0_3.__val[1]; - for (int __i = 0; __i < 4; __i++) { - unsigned int __w = __wm2[__i]; - /* sigma1: ROTR(17) ^ ROTR(19) ^ SHR(10) */ - unsigned int __s1 = ((__w >> 17) | (__w << 15)) ^ ((__w >> 19) | (__w << 13)) ^ (__w >> 10); - unsigned int __w9; - if (__i < 2) __w9 = __w8_11.__val[__i + 2]; - else __w9 = __w12_15.__val[__i - 2]; - __ret.__val[__i] = __tw0_3.__val[__i] + __s1 + __w9; - } - return __ret; -} - -/* vsha256hq_u32: SHA-256 hash update (part 1) */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vsha256hq_u32(uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) -{ - unsigned int __a = __hash_abcd.__val[0], __b = __hash_abcd.__val[1]; - unsigned int __c = __hash_abcd.__val[2], __d = __hash_abcd.__val[3]; - unsigned int __e = __hash_efgh.__val[0], __f = __hash_efgh.__val[1]; - unsigned int __g = __hash_efgh.__val[2], __h = __hash_efgh.__val[3]; - for (int __i = 0; __i < 4; __i++) { - unsigned int __S1 = ((__e >> 6) | (__e << 26)) ^ ((__e >> 11) | (__e << 21)) ^ ((__e >> 25) | (__e << 7)); - unsigned int __ch = (__e & __f) ^ (~__e & __g); - unsigned int __temp1 = __h + __S1 
+ __ch + __wk.__val[__i]; - unsigned int __S0 = ((__a >> 2) | (__a << 30)) ^ ((__a >> 13) | (__a << 19)) ^ ((__a >> 22) | (__a << 10)); - unsigned int __maj = (__a & __b) ^ (__a & __c) ^ (__b & __c); - unsigned int __temp2 = __S0 + __maj; - __h = __g; __g = __f; __f = __e; __e = __d + __temp1; - __d = __c; __c = __b; __b = __a; __a = __temp1 + __temp2; - } - uint32x4_t __ret; - __ret.__val[0] = __a; __ret.__val[1] = __b; - __ret.__val[2] = __c; __ret.__val[3] = __d; - return __ret; -} - -/* vsha256h2q_u32: SHA-256 hash update (part 2) */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vsha256h2q_u32(uint32x4_t __hash_efgh, uint32x4_t __hash_abcd, uint32x4_t __wk) -{ - unsigned int __a = __hash_abcd.__val[0], __b = __hash_abcd.__val[1]; - unsigned int __c = __hash_abcd.__val[2], __d = __hash_abcd.__val[3]; - unsigned int __e = __hash_efgh.__val[0], __f = __hash_efgh.__val[1]; - unsigned int __g = __hash_efgh.__val[2], __h = __hash_efgh.__val[3]; - for (int __i = 0; __i < 4; __i++) { - unsigned int __S1 = ((__e >> 6) | (__e << 26)) ^ ((__e >> 11) | (__e << 21)) ^ ((__e >> 25) | (__e << 7)); - unsigned int __ch = (__e & __f) ^ (~__e & __g); - unsigned int __temp1 = __h + __S1 + __ch + __wk.__val[__i]; - unsigned int __S0 = ((__a >> 2) | (__a << 30)) ^ ((__a >> 13) | (__a << 19)) ^ ((__a >> 22) | (__a << 10)); - unsigned int __maj = (__a & __b) ^ (__a & __c) ^ (__b & __c); - unsigned int __temp2 = __S0 + __maj; - __h = __g; __g = __f; __f = __e; __e = __d + __temp1; - __d = __c; __c = __b; __b = __a; __a = __temp1 + __temp2; - } - uint32x4_t __ret; - __ret.__val[0] = __e; __ret.__val[1] = __f; - __ret.__val[2] = __g; __ret.__val[3] = __h; - return __ret; -} - -/* === Load/Store u64 (64-bit / D-register) === */ - -/* vld1_u64: load 1 x u64 (64-bit) */ -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vld1_u64(const unsigned long long *__p) -{ - uint64x1_t __ret; - __builtin_memcpy(&__ret, __p, 8); - return __ret; -} - -/* === Load/Store 
u32 (64-bit / D-register) === */ - -/* vld1_u32: load 2 x u32 (64-bit) */ -static __inline__ uint32x2_t __attribute__((__always_inline__)) -vld1_u32(unsigned int const *__p) -{ - uint32x2_t __ret; - __builtin_memcpy(&__ret, __p, 8); - return __ret; -} - -/* vst1_u32: store 2 x u32 (64-bit) */ -static __inline__ void __attribute__((__always_inline__)) -vst1_u32(unsigned int *__p, uint32x2_t __a) -{ - __builtin_memcpy(__p, &__a, 8); -} - -/* === Store 8x8 (64-bit / D-register) === */ - -/* vst1_u8: store 8 x u8 (64-bit) */ -static __inline__ void __attribute__((__always_inline__)) -vst1_u8(unsigned char *__p, uint8x8_t __a) -{ - __builtin_memcpy(__p, &__a, 8); -} - -/* === Reinterpret casts === */ - -/* vreinterpret_u64_u8: reinterpret uint8x8_t as uint64x1_t (no code) */ -static __inline__ uint64x1_t __attribute__((__always_inline__)) -vreinterpret_u64_u8(uint8x8_t __a) -{ - uint64x1_t __ret; - __builtin_memcpy(&__ret, &__a, 8); - return __ret; -} - -/* === Get lane === */ - -/* vget_lane_u64: extract lane from uint64x1_t */ -static __inline__ unsigned long long __attribute__((__always_inline__)) -vget_lane_u64(uint64x1_t __a, int __lane) -{ - (void)__lane; - return __a.__val[0]; -} - -/* === Narrowing shifts === */ - -/* vshrn_n_u16: shift right narrow (u16 -> u8) */ -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vshrn_n_u16(uint16x8_t __a, int __n) -{ - uint8x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - __ret.__val[__i] = (unsigned char)(__a.__val[__i] >> __n); - } - return __ret; -} - -/* === Shift and insert === */ - -/* vsli_n_u8: shift left and insert (64-bit, per-byte) */ -static __inline__ uint8x8_t __attribute__((__always_inline__)) -vsli_n_u8(uint8x8_t __a, uint8x8_t __b, int __n) -{ - uint8x8_t __ret; - unsigned char __mask = (unsigned char)((0xFF << __n) & 0xFF); - for (int __i = 0; __i < 8; __i++) { - __ret.__val[__i] = (__a.__val[__i] & ~__mask) | ((unsigned char)(__b.__val[__i] << __n) & __mask); - } - return __ret; -} - -/* 
vsriq_n_u8: shift right and insert (128-bit, per-byte) */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vsriq_n_u8(uint8x16_t __a, uint8x16_t __b, int __n) -{ - uint8x16_t __ret; - unsigned char __mask = (unsigned char)(0xFF >> __n); - for (int __i = 0; __i < 16; __i++) { - __ret.__val[__i] = (__a.__val[__i] & ~__mask) | ((__b.__val[__i] >> __n) & __mask); - } - return __ret; -} - -/* === Multi-element structure loads === */ - -/* vld2q_u16: load 2-element interleaved u16 (128-bit x 2) */ -static __inline__ uint16x8x2_t __attribute__((__always_inline__)) -vld2q_u16(unsigned short const *__p) -{ - uint16x8x2_t __ret; - for (int __i = 0; __i < 8; __i++) { - __ret.val[0].__val[__i] = __p[__i * 2]; - __ret.val[1].__val[__i] = __p[__i * 2 + 1]; - } - return __ret; -} - -/* vld4q_u8: load 4-element interleaved u8 (128-bit x 4) */ -static __inline__ uint8x16x4_t __attribute__((__always_inline__)) -vld4q_u8(unsigned char const *__p) -{ - uint8x16x4_t __ret; - for (int __i = 0; __i < 16; __i++) { - __ret.val[0].__val[__i] = __p[__i * 4]; - __ret.val[1].__val[__i] = __p[__i * 4 + 1]; - __ret.val[2].__val[__i] = __p[__i * 4 + 2]; - __ret.val[3].__val[__i] = __p[__i * 4 + 3]; - } - return __ret; -} - -/* ================================================================== */ -/* REINTERPRET CASTS: signed <-> unsigned */ -/* ================================================================== */ - -/* vreinterpretq_u8_s16: reinterpret int16x8_t as uint8x16_t */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_s16(int16x8_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s16_u8: reinterpret uint8x16_t as int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vreinterpretq_s16_u8(uint8x16_t __a) -{ - int16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u8_s32: reinterpret int32x4_t as uint8x16_t */ -static 
__inline__ uint8x16_t __attribute__((__always_inline__)) -vreinterpretq_u8_s32(int32x4_t __a) -{ - uint8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* ================================================================== */ -/* DUPLICATE (BROADCAST): signed types */ -/* ================================================================== */ - -/* vdupq_n_s16: broadcast signed 16-bit value into all 8 lanes */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vdupq_n_s16(short __a) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a; - return __ret; -} - -/* ================================================================== */ -/* HORIZONTAL REDUCTION OPERATIONS */ -/* ================================================================== */ - -/* vmaxvq_u8: horizontal max of all uint8x16_t lanes -> scalar uint8_t */ -static __inline__ unsigned char __attribute__((__always_inline__)) -vmaxvq_u8(uint8x16_t __a) -{ - unsigned char __ret = __a.__val[0]; - for (int __i = 1; __i < 16; __i++) - if (__a.__val[__i] > __ret) __ret = __a.__val[__i]; - return __ret; -} - -/* vminvq_u8: horizontal min of all uint8x16_t lanes -> scalar uint8_t */ -static __inline__ unsigned char __attribute__((__always_inline__)) -vminvq_u8(uint8x16_t __a) -{ - unsigned char __ret = __a.__val[0]; - for (int __i = 1; __i < 16; __i++) - if (__a.__val[__i] < __ret) __ret = __a.__val[__i]; - return __ret; -} - -/* vmaxvq_u32: horizontal max of all uint32x4_t lanes -> scalar uint32_t */ -static __inline__ unsigned int __attribute__((__always_inline__)) -vmaxvq_u32(uint32x4_t __a) -{ - unsigned int __ret = __a.__val[0]; - for (int __i = 1; __i < 4; __i++) - if (__a.__val[__i] > __ret) __ret = __a.__val[__i]; - return __ret; -} - -/* vmaxvq_s16: horizontal max across int16x8_t */ -static __inline__ short __attribute__((__always_inline__)) -vmaxvq_s16(int16x8_t __a) -{ - short __max = __a.__val[0]; - for (int __i = 1; __i < 8; __i++) { - 
if (__a.__val[__i] > __max) - __max = __a.__val[__i]; - } - return __max; -} - -/* vmaxvq_u16: horizontal max across uint16x8_t */ -static __inline__ unsigned short __attribute__((__always_inline__)) -vmaxvq_u16(uint16x8_t __a) -{ - unsigned short __max = __a.__val[0]; - for (int __i = 1; __i < 8; __i++) { - if (__a.__val[__i] > __max) - __max = __a.__val[__i]; - } - return __max; -} - -/* ================================================================== */ -/* SATURATING ARITHMETIC */ -/* ================================================================== */ - -/* vqsubq_u8: saturating subtract uint8x16_t (clamp at 0) */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqsubq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = __a.__val[__i] > __b.__val[__i] ? __a.__val[__i] - __b.__val[__i] : 0; - return __ret; -} - -/* vqaddq_u8: saturating add uint8x16_t (clamp at 255) */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqaddq_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned int __sum = (unsigned int)__a.__val[__i] + (unsigned int)__b.__val[__i]; - __ret.__val[__i] = __sum > 255 ? 255 : (unsigned char)__sum; - } - return __ret; -} - -/* ================================================================== */ -/* ADDITIONAL COMPARISON OPERATIONS */ -/* ================================================================== */ - -/* vceqq_u32: element-wise equality compare uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vceqq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = (__a.__val[__i] == __b.__val[__i]) ? 
0xFFFFFFFF : 0x00000000; - return __ret; -} - -/* vceqq_u16: element-wise equality compare uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vceqq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__a.__val[__i] == __b.__val[__i]) ? 0xFFFF : 0x0000; - return __ret; -} - -/* vqaddq_s16: saturating add int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vqaddq_s16(int16x8_t __a, int16x8_t __b) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - int __sum = (int)__a.__val[__i] + (int)__b.__val[__i]; - if (__sum > 32767) __sum = 32767; - else if (__sum < -32768) __sum = -32768; - __ret.__val[__i] = (short)__sum; - } - return __ret; -} - -/* vqsubq_u16: saturating subtract uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vqsubq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) { - int __diff = (int)__a.__val[__i] - (int)__b.__val[__i]; - __ret.__val[__i] = __diff < 0 ? 0 : (unsigned short)__diff; - } - return __ret; -} - -/* vcgtq_s16: compare greater-than int16x8_t -> uint16x8_t mask */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vcgtq_s16(int16x8_t __a, int16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__a.__val[__i] > __b.__val[__i]) ? 0xFFFF : 0x0000; - return __ret; -} - -/* vmaxq_s16: element-wise maximum of int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vmaxq_s16(int16x8_t __a, int16x8_t __b) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] > __b.__val[__i] ? 
__a.__val[__i] : __b.__val[__i]; - return __ret; -} - -/* ===== Float32 NEON intrinsics ===== */ - -/* vld1q_f32: load 4 floats into float32x4_t */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vld1q_f32(const float *__p) -{ - float32x4_t __ret; - __ret.__val[0] = __p[0]; - __ret.__val[1] = __p[1]; - __ret.__val[2] = __p[2]; - __ret.__val[3] = __p[3]; - return __ret; -} - -/* vst1q_f32: store float32x4_t to memory */ -static __inline__ void __attribute__((__always_inline__)) -vst1q_f32(float *__p, float32x4_t __a) -{ - __p[0] = __a.__val[0]; - __p[1] = __a.__val[1]; - __p[2] = __a.__val[2]; - __p[3] = __a.__val[3]; -} - -/* vst1_f32: store float32x2_t (64-bit) to memory */ -static __inline__ void __attribute__((__always_inline__)) -vst1_f32(float *__p, float32x2_t __a) -{ - __p[0] = __a.__val[0]; - __p[1] = __a.__val[1]; -} - -/* vaddq_f32: add float32x4_t element-wise */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vaddq_f32(float32x4_t __a, float32x4_t __b) -{ - float32x4_t __ret; - __ret.__val[0] = __a.__val[0] + __b.__val[0]; - __ret.__val[1] = __a.__val[1] + __b.__val[1]; - __ret.__val[2] = __a.__val[2] + __b.__val[2]; - __ret.__val[3] = __a.__val[3] + __b.__val[3]; - return __ret; -} - -/* vsubq_f32: subtract float32x4_t element-wise */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vsubq_f32(float32x4_t __a, float32x4_t __b) -{ - float32x4_t __ret; - __ret.__val[0] = __a.__val[0] - __b.__val[0]; - __ret.__val[1] = __a.__val[1] - __b.__val[1]; - __ret.__val[2] = __a.__val[2] - __b.__val[2]; - __ret.__val[3] = __a.__val[3] - __b.__val[3]; - return __ret; -} - -/* vmulq_f32: multiply float32x4_t element-wise */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vmulq_f32(float32x4_t __a, float32x4_t __b) -{ - float32x4_t __ret; - __ret.__val[0] = __a.__val[0] * __b.__val[0]; - __ret.__val[1] = __a.__val[1] * __b.__val[1]; - __ret.__val[2] = __a.__val[2] * __b.__val[2]; - 
__ret.__val[3] = __a.__val[3] * __b.__val[3]; - return __ret; -} - -/* vmovq_n_f32: broadcast a float to all lanes of float32x4_t */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vmovq_n_f32(float __a) -{ - float32x4_t __ret; - __ret.__val[0] = __a; - __ret.__val[1] = __a; - __ret.__val[2] = __a; - __ret.__val[3] = __a; - return __ret; -} - -/* vget_low_f32: get low 64-bit half of float32x4_t as float32x2_t */ -static __inline__ float32x2_t __attribute__((__always_inline__)) -vget_low_f32(float32x4_t __a) -{ - float32x2_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[1]; - return __ret; -} - -/* vget_high_f32: get high 64-bit half of float32x4_t as float32x2_t */ -static __inline__ float32x2_t __attribute__((__always_inline__)) -vget_high_f32(float32x4_t __a) -{ - float32x2_t __ret; - __ret.__val[0] = __a.__val[2]; - __ret.__val[1] = __a.__val[3]; - return __ret; -} - -/* vcombine_f32: combine two float32x2_t into float32x4_t */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vcombine_f32(float32x2_t __lo, float32x2_t __hi) -{ - float32x4_t __ret; - __ret.__val[0] = __lo.__val[0]; - __ret.__val[1] = __lo.__val[1]; - __ret.__val[2] = __hi.__val[0]; - __ret.__val[3] = __hi.__val[1]; - return __ret; -} - -/* vrev64q_f32: reverse floats within each 64-bit lane */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vrev64q_f32(float32x4_t __a) -{ - float32x4_t __ret; - __ret.__val[0] = __a.__val[1]; - __ret.__val[1] = __a.__val[0]; - __ret.__val[2] = __a.__val[3]; - __ret.__val[3] = __a.__val[2]; - return __ret; -} - -/* vcltq_f32: compare less-than, returns uint32x4_t mask */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vcltq_f32(float32x4_t __a, float32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0] < __b.__val[0] ? 0xFFFFFFFFu : 0; - __ret.__val[1] = __a.__val[1] < __b.__val[1] ? 0xFFFFFFFFu : 0; - __ret.__val[2] = __a.__val[2] < __b.__val[2] ? 
0xFFFFFFFFu : 0; - __ret.__val[3] = __a.__val[3] < __b.__val[3] ? 0xFFFFFFFFu : 0; - return __ret; -} - -/* vcvtq_s32_f32: convert float32x4_t to int32x4_t (round toward zero) */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vcvtq_s32_f32(float32x4_t __a) -{ - int32x4_t __ret; - __ret.__val[0] = (int)__a.__val[0]; - __ret.__val[1] = (int)__a.__val[1]; - __ret.__val[2] = (int)__a.__val[2]; - __ret.__val[3] = (int)__a.__val[3]; - return __ret; -} - -/* vqaddq_s32: saturating add int32x4_t */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vqaddq_s32(int32x4_t __a, int32x4_t __b) -{ - int32x4_t __ret; - for (int __i = 0; __i < 4; __i++) { - long long __sum = (long long)__a.__val[__i] + (long long)__b.__val[__i]; - if (__sum > 2147483647LL) __sum = 2147483647LL; - if (__sum < -2147483648LL) __sum = -2147483648LL; - __ret.__val[__i] = (int)__sum; - } - return __ret; -} - -/* vqmovn_s32: saturating narrow int32x4_t to int16x4_t */ -static __inline__ int16x4_t __attribute__((__always_inline__)) -vqmovn_s32(int32x4_t __a) -{ - int16x4_t __ret; - for (int __i = 0; __i < 4; __i++) { - int __v = __a.__val[__i]; - if (__v > 32767) __v = 32767; - if (__v < -32768) __v = -32768; - __ret.__val[__i] = (short)__v; - } - return __ret; -} - -/* vst1_lane_s16: store one lane from int16x4_t */ -static __inline__ void __attribute__((__always_inline__)) -vst1_lane_s16(short *__p, int16x4_t __a, int __lane) -{ - *__p = __a.__val[__lane]; -} - -/* ================================================================== */ -/* u16 ARITHMETIC / COMPARE / BITWISE */ -/* ================================================================== */ - -/* vaddq_u16: element-wise add uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vaddq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __ret; -} - -/* vaddq_s16: element-wise add 
int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vaddq_s16(int16x8_t __a, int16x8_t __b) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __ret; -} - -/* vsubq_u16: element-wise subtract uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vsubq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] - __b.__val[__i]; - return __ret; -} - -/* vsubq_s16: element-wise subtract int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vsubq_s16(int16x8_t __a, int16x8_t __b) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] - __b.__val[__i]; - return __ret; -} - -/* vsubq_u32: element-wise subtract uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vsubq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = __a.__val[__i] - __b.__val[__i]; - return __ret; -} - -/* vmaxq_u16: element-wise unsigned max uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vmaxq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] > __b.__val[__i] ? __a.__val[__i] : __b.__val[__i]; - return __ret; -} - -/* vminq_u16: element-wise unsigned min uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vminq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] < __b.__val[__i] ? 
__a.__val[__i] : __b.__val[__i]; - return __ret; -} - -/* vminq_s16: element-wise signed min int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vminq_s16(int16x8_t __a, int16x8_t __b) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] < __b.__val[__i] ? __a.__val[__i] : __b.__val[__i]; - return __ret; -} - -/* vmulq_u16: element-wise multiply uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vmulq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i] * __b.__val[__i]; - return __ret; -} - -/* ================================================================== */ -/* u16 COMPARE INTRINSICS */ -/* ================================================================== */ - -/* vcgtzq_s16: compare greater than zero, int16x8_t -> uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vcgtzq_s16(int16x8_t __a) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__a.__val[__i] > 0) ? 0xFFFF : 0x0000; - return __ret; -} - -/* vcgtzq_s32: compare greater than zero, int32x4_t -> uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vcgtzq_s32(int32x4_t __a) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = (__a.__val[__i] > 0) ? 0xFFFFFFFF : 0x00000000; - return __ret; -} - -/* vcltzq_s16: compare less than zero, int16x8_t -> uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vcltzq_s16(int16x8_t __a) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__a.__val[__i] < 0) ? 
0xFFFF : 0x0000; - return __ret; -} - -/* vtstq_u16: test bits, uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vtstq_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__a.__val[__i] & __b.__val[__i]) ? 0xFFFF : 0x0000; - return __ret; -} - -/* vtstq_u32: test bits, uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vtstq_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = (__a.__val[__i] & __b.__val[__i]) ? 0xFFFFFFFF : 0x00000000; - return __ret; -} - -/* ================================================================== */ -/* BITWISE SELECT (vbslq) */ -/* ================================================================== */ - -/* vbslq_u8: bitwise select, uint8x16_t */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vbslq_u8(uint8x16_t __sel, uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = (__sel.__val[__i] & __a.__val[__i]) | - (~__sel.__val[__i] & __b.__val[__i]); - return __ret; -} - -/* vbslq_u16: bitwise select, uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vbslq_u16(uint16x8_t __sel, uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__sel.__val[__i] & __a.__val[__i]) | - (~__sel.__val[__i] & __b.__val[__i]); - return __ret; -} - -/* vbslq_u32: bitwise select, uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vbslq_u32(uint32x4_t __sel, uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = (__sel.__val[__i] & __a.__val[__i]) | - (~__sel.__val[__i] & __b.__val[__i]); - return __ret; -} - -/* vbslq_u64: bitwise select, uint64x2_t */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) 
-vbslq_u64(uint64x2_t __sel, uint64x2_t __a, uint64x2_t __b) -{ - uint64x2_t __ret; - for (int __i = 0; __i < 2; __i++) - __ret.__val[__i] = (__sel.__val[__i] & __a.__val[__i]) | - (~__sel.__val[__i] & __b.__val[__i]); - return __ret; -} - -/* vbslq_s8: bitwise select, int8x16_t (using u8 bitwise ops) */ -static __inline__ int8x16_t __attribute__((__always_inline__)) -vbslq_s8(uint8x16_t __sel, int8x16_t __a, int8x16_t __b) -{ - int8x16_t __ret; - for (int __i = 0; __i < 16; __i++) - __ret.__val[__i] = (__sel.__val[__i] & (unsigned char)__a.__val[__i]) | - (~__sel.__val[__i] & (unsigned char)__b.__val[__i]); - return __ret; -} - -/* vbslq_s16: bitwise select, int16x8_t (using u16 bitwise ops) */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vbslq_s16(uint16x8_t __sel, int16x8_t __a, int16x8_t __b) -{ - int16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = (__sel.__val[__i] & (unsigned short)__a.__val[__i]) | - (~__sel.__val[__i] & (unsigned short)__b.__val[__i]); - return __ret; -} - -/* vbslq_s32: bitwise select, int32x4_t (using u32 bitwise ops) */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vbslq_s32(uint32x4_t __sel, int32x4_t __a, int32x4_t __b) -{ - int32x4_t __ret; - for (int __i = 0; __i < 4; __i++) - __ret.__val[__i] = (__sel.__val[__i] & (unsigned int)__a.__val[__i]) | - (~__sel.__val[__i] & (unsigned int)__b.__val[__i]); - return __ret; -} - -/* ================================================================== */ -/* EXTRACT (vextq_u16) */ -/* ================================================================== */ - -/* vextq_u16: extract from pair of uint16x8_t */ -#define vextq_u16(__a, __b, __n) __extension__ ({ \ - uint16x8_t __r; \ - unsigned short __tmp[16]; \ - __tmp[0] = (__a).__val[0]; __tmp[1] = (__a).__val[1]; \ - __tmp[2] = (__a).__val[2]; __tmp[3] = (__a).__val[3]; \ - __tmp[4] = (__a).__val[4]; __tmp[5] = (__a).__val[5]; \ - __tmp[6] = (__a).__val[6]; __tmp[7] = (__a).__val[7]; \ - 
__tmp[8] = (__b).__val[0]; __tmp[9] = (__b).__val[1]; \ - __tmp[10] = (__b).__val[2]; __tmp[11] = (__b).__val[3]; \ - __tmp[12] = (__b).__val[4]; __tmp[13] = (__b).__val[5]; \ - __tmp[14] = (__b).__val[6]; __tmp[15] = (__b).__val[7]; \ - __r.__val[0] = __tmp[(__n)]; __r.__val[1] = __tmp[(__n) + 1]; \ - __r.__val[2] = __tmp[(__n) + 2]; __r.__val[3] = __tmp[(__n) + 3]; \ - __r.__val[4] = __tmp[(__n) + 4]; __r.__val[5] = __tmp[(__n) + 5]; \ - __r.__val[6] = __tmp[(__n) + 6]; __r.__val[7] = __tmp[(__n) + 7]; \ - __r; \ -}) - -/* vextq_s16: extract from pair of int16x8_t */ -#define vextq_s16(__a, __b, __n) __extension__ ({ \ - int16x8_t __r; \ - short __tmp[16]; \ - __tmp[0] = (__a).__val[0]; __tmp[1] = (__a).__val[1]; \ - __tmp[2] = (__a).__val[2]; __tmp[3] = (__a).__val[3]; \ - __tmp[4] = (__a).__val[4]; __tmp[5] = (__a).__val[5]; \ - __tmp[6] = (__a).__val[6]; __tmp[7] = (__a).__val[7]; \ - __tmp[8] = (__b).__val[0]; __tmp[9] = (__b).__val[1]; \ - __tmp[10] = (__b).__val[2]; __tmp[11] = (__b).__val[3]; \ - __tmp[12] = (__b).__val[4]; __tmp[13] = (__b).__val[5]; \ - __tmp[14] = (__b).__val[6]; __tmp[15] = (__b).__val[7]; \ - __r.__val[0] = __tmp[(__n)]; __r.__val[1] = __tmp[(__n) + 1]; \ - __r.__val[2] = __tmp[(__n) + 2]; __r.__val[3] = __tmp[(__n) + 3]; \ - __r.__val[4] = __tmp[(__n) + 4]; __r.__val[5] = __tmp[(__n) + 5]; \ - __r.__val[6] = __tmp[(__n) + 6]; __r.__val[7] = __tmp[(__n) + 7]; \ - __r; \ -}) - -/* ================================================================== */ -/* TABLE LOOKUP (vqtbl2q) */ -/* ================================================================== */ - -/* vqtbl2q_u8: table lookup across 2 registers (32 bytes) */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbl2q_u8(uint8x16x2_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 16) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 32) - __ret.__val[__i] = 
__a.val[1].__val[__idx - 16]; - else - __ret.__val[__i] = 0; - } - return __ret; -} - -/* vqtbx2q_u8: table lookup extend across 2 registers */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbx2q_u8(uint8x16_t __def, uint8x16x2_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 16) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 32) - __ret.__val[__i] = __a.val[1].__val[__idx - 16]; - else - __ret.__val[__i] = __def.__val[__i]; - } - return __ret; -} - -/* vqtbl3q_u8: table lookup across 3 registers (48 bytes) */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vqtbl3q_u8(uint8x16x3_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 16; __i++) { - unsigned char __idx = __b.__val[__i]; - if (__idx < 16) - __ret.__val[__i] = __a.val[0].__val[__idx]; - else if (__idx < 32) - __ret.__val[__i] = __a.val[1].__val[__idx - 16]; - else if (__idx < 48) - __ret.__val[__i] = __a.val[2].__val[__idx - 32]; - else - __ret.__val[__i] = 0; - } - return __ret; -} - -/* ================================================================== */ -/* UNZIP / DE-INTERLEAVE (AArch64 vuzp1/vuzp2) */ -/* ================================================================== */ - -/* vuzp1q_u16: unzip even elements from two uint16x8_t - * result = {a[0], a[2], a[4], a[6], b[0], b[2], b[4], b[6]} */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vuzp1q_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[2]; - __ret.__val[2] = __a.__val[4]; - __ret.__val[3] = __a.__val[6]; - __ret.__val[4] = __b.__val[0]; - __ret.__val[5] = __b.__val[2]; - __ret.__val[6] = __b.__val[4]; - __ret.__val[7] = __b.__val[6]; - return __ret; -} - -/* vuzp2q_u16: unzip odd elements from two uint16x8_t - * result = {a[1], a[3], a[5], a[7], b[1], b[3], b[5], b[7]} */ -static 
__inline__ uint16x8_t __attribute__((__always_inline__)) -vuzp2q_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - __ret.__val[0] = __a.__val[1]; - __ret.__val[1] = __a.__val[3]; - __ret.__val[2] = __a.__val[5]; - __ret.__val[3] = __a.__val[7]; - __ret.__val[4] = __b.__val[1]; - __ret.__val[5] = __b.__val[3]; - __ret.__val[6] = __b.__val[5]; - __ret.__val[7] = __b.__val[7]; - return __ret; -} - -/* vuzp1q_u8: unzip even elements from two uint8x16_t */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vuzp1q_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 8; __i++) { - __ret.__val[__i] = __a.__val[__i * 2]; - __ret.__val[__i + 8] = __b.__val[__i * 2]; - } - return __ret; -} - -/* vuzp2q_u8: unzip odd elements from two uint8x16_t */ -static __inline__ uint8x16_t __attribute__((__always_inline__)) -vuzp2q_u8(uint8x16_t __a, uint8x16_t __b) -{ - uint8x16_t __ret; - for (int __i = 0; __i < 8; __i++) { - __ret.__val[__i] = __a.__val[__i * 2 + 1]; - __ret.__val[__i + 8] = __b.__val[__i * 2 + 1]; - } - return __ret; -} - -/* vuzp1q_u32: unzip even elements from two uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vuzp1q_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[2]; - __ret.__val[2] = __b.__val[0]; - __ret.__val[3] = __b.__val[2]; - return __ret; -} - -/* vuzp2q_u32: unzip odd elements from two uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vuzp2q_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[1]; - __ret.__val[1] = __a.__val[3]; - __ret.__val[2] = __b.__val[1]; - __ret.__val[3] = __b.__val[3]; - return __ret; -} - -/* vzip1q_u16: zip/interleave low halves of two uint16x8_t - * result = {a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]} */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vzip1q_u16(uint16x8_t __a, 
uint16x8_t __b) -{ - uint16x8_t __ret; - __ret.__val[0] = __a.__val[0]; __ret.__val[1] = __b.__val[0]; - __ret.__val[2] = __a.__val[1]; __ret.__val[3] = __b.__val[1]; - __ret.__val[4] = __a.__val[2]; __ret.__val[5] = __b.__val[2]; - __ret.__val[6] = __a.__val[3]; __ret.__val[7] = __b.__val[3]; - return __ret; -} - -/* vzip2q_u16: zip/interleave high halves of two uint16x8_t - * result = {a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7]} */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vzip2q_u16(uint16x8_t __a, uint16x8_t __b) -{ - uint16x8_t __ret; - __ret.__val[0] = __a.__val[4]; __ret.__val[1] = __b.__val[4]; - __ret.__val[2] = __a.__val[5]; __ret.__val[3] = __b.__val[5]; - __ret.__val[4] = __a.__val[6]; __ret.__val[5] = __b.__val[6]; - __ret.__val[6] = __a.__val[7]; __ret.__val[7] = __b.__val[7]; - return __ret; -} - -/* vzip1q_u32: zip/interleave low halves of two uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vzip1q_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0]; __ret.__val[1] = __b.__val[0]; - __ret.__val[2] = __a.__val[1]; __ret.__val[3] = __b.__val[1]; - return __ret; -} - -/* vzip2q_u32: zip/interleave high halves of two uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vzip2q_u32(uint32x4_t __a, uint32x4_t __b) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[2]; __ret.__val[1] = __b.__val[2]; - __ret.__val[2] = __a.__val[3]; __ret.__val[3] = __b.__val[3]; - return __ret; -} - -/* ================================================================== */ -/* ADDITIONAL REINTERPRET CASTS */ -/* ================================================================== */ - -/* vreinterpretq_s16_u16: reinterpret uint16x8_t as int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vreinterpretq_s16_u16(uint16x8_t __a) -{ - int16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* 
vreinterpretq_u16_s16: reinterpret int16x8_t as uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vreinterpretq_u16_s16(int16x8_t __a) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u64_u16: reinterpret uint16x8_t as uint64x2_t */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vreinterpretq_u64_u16(uint16x8_t __a) -{ - uint64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u16_u64: reinterpret uint64x2_t as uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vreinterpretq_u16_u64(uint64x2_t __a) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s32_s16: reinterpret int16x8_t as int32x4_t */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vreinterpretq_s32_s16(int16x8_t __a) -{ - int32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s16_s32: reinterpret int32x4_t as int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vreinterpretq_s16_s32(int32x4_t __a) -{ - int16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u16_s8: reinterpret int8x16_t as uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vreinterpretq_u16_s8(int8x16_t __a) -{ - uint16x8_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s8_s16: reinterpret int16x8_t as int8x16_t */ -static __inline__ int8x16_t __attribute__((__always_inline__)) -vreinterpretq_s8_s16(int16x8_t __a) -{ - int8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s64_u64: reinterpret uint64x2_t as int64x2_t */ -static __inline__ int64x2_t __attribute__((__always_inline__)) -vreinterpretq_s64_u64(uint64x2_t __a) -{ - int64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* 
vreinterpretq_u64_s64: reinterpret int64x2_t as uint64x2_t */ -static __inline__ uint64x2_t __attribute__((__always_inline__)) -vreinterpretq_u64_s64(int64x2_t __a) -{ - uint64x2_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_s8_u16: reinterpret uint16x8_t as int8x16_t */ -static __inline__ int8x16_t __attribute__((__always_inline__)) -vreinterpretq_s8_u16(uint16x8_t __a) -{ - int8x16_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_f32_u32: reinterpret uint32x4_t as float32x4_t */ -static __inline__ float32x4_t __attribute__((__always_inline__)) -vreinterpretq_f32_u32(uint32x4_t __a) -{ - float32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* vreinterpretq_u32_f32: reinterpret float32x4_t as uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vreinterpretq_u32_f32(float32x4_t __a) -{ - uint32x4_t __ret; - __builtin_memcpy(&__ret, &__a, 16); - return __ret; -} - -/* ================================================================== */ -/* ADDITIONAL SHIFT INTRINSICS */ -/* ================================================================== */ - -/* vshlq_n_u16: shift left by immediate, uint16x8_t */ -#define vshlq_n_u16(__a, __n) __extension__ ({ \ - uint16x8_t __r; \ - for (int __i = 0; __i < 8; __i++) \ - __r.__val[__i] = (__a).__val[__i] << (__n); \ - __r; \ -}) - -/* vshrq_n_u16: shift right by immediate, uint16x8_t */ -#define vshrq_n_u16(__a, __n) __extension__ ({ \ - uint16x8_t __r; \ - for (int __i = 0; __i < 8; __i++) \ - __r.__val[__i] = (__a).__val[__i] >> (__n); \ - __r; \ -}) - -/* vshlq_n_s16: shift left by immediate, int16x8_t */ -#define vshlq_n_s16(__a, __n) __extension__ ({ \ - int16x8_t __r; \ - for (int __i = 0; __i < 8; __i++) \ - __r.__val[__i] = (__a).__val[__i] << (__n); \ - __r; \ -}) - -/* vshrq_n_s16: shift right by immediate, int16x8_t */ -#define vshrq_n_s16(__a, __n) __extension__ ({ \ - int16x8_t __r; \ - 
for (int __i = 0; __i < 8; __i++) \ - __r.__val[__i] = (__a).__val[__i] >> (__n); \ - __r; \ -}) - -/* ================================================================== */ -/* ADDITIONAL WIDENING / NARROWING INTRINSICS */ -/* ================================================================== */ - -/* vmovl_u16: widen uint16x4_t to uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vmovl_u16(uint16x4_t __a) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[1]; - __ret.__val[2] = __a.__val[2]; - __ret.__val[3] = __a.__val[3]; - return __ret; -} - -/* vmovl_s16: widen int16x4_t to int32x4_t */ -static __inline__ int32x4_t __attribute__((__always_inline__)) -vmovl_s16(int16x4_t __a) -{ - int32x4_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[1]; - __ret.__val[2] = __a.__val[2]; - __ret.__val[3] = __a.__val[3]; - return __ret; -} - -/* vmovl_high_u16: widen high half of uint16x8_t to uint32x4_t */ -static __inline__ uint32x4_t __attribute__((__always_inline__)) -vmovl_high_u16(uint16x8_t __a) -{ - uint32x4_t __ret; - __ret.__val[0] = __a.__val[4]; - __ret.__val[1] = __a.__val[5]; - __ret.__val[2] = __a.__val[6]; - __ret.__val[3] = __a.__val[7]; - return __ret; -} - -/* vmovl_high_u8: widen high half of uint8x16_t to uint16x8_t */ -static __inline__ uint16x8_t __attribute__((__always_inline__)) -vmovl_high_u8(uint8x16_t __a) -{ - uint16x8_t __ret; - for (int __i = 0; __i < 8; __i++) - __ret.__val[__i] = __a.__val[__i + 8]; - return __ret; -} - -/* ================================================================== */ -/* LANE / HALF / COMBINE INTRINSICS (s16) */ -/* ================================================================== */ - -/* vsetq_lane_s16: set one lane in int16x8_t */ -static __inline__ int16x8_t __attribute__((__always_inline__)) -vsetq_lane_s16(short __val, int16x8_t __a, int __lane) -{ - __a.__val[__lane] = __val; - return __a; -} - -/* vget_low_u16: get low half 
of uint16x8_t as uint16x4_t */ -static __inline__ uint16x4_t __attribute__((__always_inline__)) -vget_low_u16(uint16x8_t __a) -{ - uint16x4_t __ret; - __ret.__val[0] = __a.__val[0]; - __ret.__val[1] = __a.__val[1]; - __ret.__val[2] = __a.__val[2]; - __ret.__val[3] = __a.__val[3]; - return __ret; -} - -/* vget_high_u16: get high half of uint16x8_t as uint16x4_t */ -static __inline__ uint16x4_t __attribute__((__always_inline__)) -vget_high_u16(uint16x8_t __a) -{ - uint16x4_t __ret; - __ret.__val[0] = __a.__val[4]; - __ret.__val[1] = __a.__val[5]; - __ret.__val[2] = __a.__val[6]; - __ret.__val[3] = __a.__val[7]; - return __ret; -} - - -#endif /* _ARM_NEON_H_INCLUDED */ diff --git a/include/avx2intrin.h b/include/avx2intrin.h deleted file mode 100644 index 72573d24ca..0000000000 --- a/include/avx2intrin.h +++ /dev/null @@ -1,1195 +0,0 @@ -/* CCC compiler bundled avx2intrin.h - AVX2 integer intrinsics */ -#ifndef _AVX2INTRIN_H_INCLUDED -#define _AVX2INTRIN_H_INCLUDED - -#include - -/* === Set === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set1_epi8(char __b) -{ - unsigned char __ub = (unsigned char)__b; - long long __q = (long long)__ub; - __q |= __q << 8; - __q |= __q << 16; - __q |= __q << 32; - return (__m256i){ { __q, __q, __q, __q } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set1_epi16(short __w) -{ - unsigned short __uw = (unsigned short)__w; - long long __q = (long long)__uw; - __q |= __q << 16; - __q |= __q << 32; - return (__m256i){ { __q, __q, __q, __q } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set1_epi32(int __i) -{ - long long __q = (long long)(unsigned int)__i - | ((long long)(unsigned int)__i << 32); - return (__m256i){ { __q, __q, __q, __q } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set1_epi64x(long long __q) -{ - return (__m256i){ { __q, __q, __q, __q } }; -} - -static __inline__ __m256i 
__attribute__((__always_inline__)) -_mm256_setr_epi8( - char __b0, char __b1, char __b2, char __b3, - char __b4, char __b5, char __b6, char __b7, - char __b8, char __b9, char __b10, char __b11, - char __b12, char __b13, char __b14, char __b15, - char __b16, char __b17, char __b18, char __b19, - char __b20, char __b21, char __b22, char __b23, - char __b24, char __b25, char __b26, char __b27, - char __b28, char __b29, char __b30, char __b31) -{ - __m256i __r; - unsigned char *__p = (unsigned char *)&__r; - __p[0] = (unsigned char)__b0; __p[1] = (unsigned char)__b1; - __p[2] = (unsigned char)__b2; __p[3] = (unsigned char)__b3; - __p[4] = (unsigned char)__b4; __p[5] = (unsigned char)__b5; - __p[6] = (unsigned char)__b6; __p[7] = (unsigned char)__b7; - __p[8] = (unsigned char)__b8; __p[9] = (unsigned char)__b9; - __p[10] = (unsigned char)__b10; __p[11] = (unsigned char)__b11; - __p[12] = (unsigned char)__b12; __p[13] = (unsigned char)__b13; - __p[14] = (unsigned char)__b14; __p[15] = (unsigned char)__b15; - __p[16] = (unsigned char)__b16; __p[17] = (unsigned char)__b17; - __p[18] = (unsigned char)__b18; __p[19] = (unsigned char)__b19; - __p[20] = (unsigned char)__b20; __p[21] = (unsigned char)__b21; - __p[22] = (unsigned char)__b22; __p[23] = (unsigned char)__b23; - __p[24] = (unsigned char)__b24; __p[25] = (unsigned char)__b25; - __p[26] = (unsigned char)__b26; __p[27] = (unsigned char)__b27; - __p[28] = (unsigned char)__b28; __p[29] = (unsigned char)__b29; - __p[30] = (unsigned char)__b30; __p[31] = (unsigned char)__b31; - return __r; -} - -/* === Bitwise === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_and_si256(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[0] & __b.__val[0], - __a.__val[1] & __b.__val[1], - __a.__val[2] & __b.__val[2], - __a.__val[3] & __b.__val[3] } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_andnot_si256(__m256i __a, __m256i __b) -{ - return (__m256i){ { ~__a.__val[0] & 
__b.__val[0], - ~__a.__val[1] & __b.__val[1], - ~__a.__val[2] & __b.__val[2], - ~__a.__val[3] & __b.__val[3] } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_or_si256(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[0] | __b.__val[0], - __a.__val[1] | __b.__val[1], - __a.__val[2] | __b.__val[2], - __a.__val[3] | __b.__val[3] } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_xor_si256(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[0] ^ __b.__val[0], - __a.__val[1] ^ __b.__val[1], - __a.__val[2] ^ __b.__val[2], - __a.__val[3] ^ __b.__val[3] } }; -} - -/* === Shift === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_slli_epi32(__m256i __a, int __count) -{ - if (__count < 0 || __count > 31) - return _mm256_setzero_si256(); - unsigned int *__pa = (unsigned int *)&__a; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] << __count; - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_srli_epi32(__m256i __a, int __count) -{ - if (__count < 0 || __count > 31) - return _mm256_setzero_si256(); - unsigned int *__pa = (unsigned int *)&__a; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_slli_epi64(__m256i __a, int __count) -{ - if (__count < 0 || __count > 63) - return _mm256_setzero_si256(); - unsigned long long *__pa = (unsigned long long *)&__a; - __m256i __r; - unsigned long long *__pr = (unsigned long long *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] << __count; - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_srli_epi64(__m256i __a, int __count) -{ - if (__count < 0 || __count > 63) - return _mm256_setzero_si256(); - unsigned 
long long *__pa = (unsigned long long *)&__a; - __m256i __r; - unsigned long long *__pr = (unsigned long long *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -/* === Compare / Min / Max === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_max_epu8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 32; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* === Shuffle === */ - -/* _mm256_shuffle_epi8: VPSHUFB - byte shuffle within 128-bit lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_shuffle_epi8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - /* Low 128-bit lane */ - for (int __i = 0; __i < 16; __i++) { - if (__pb[__i] & 0x80) - __pr[__i] = 0; - else - __pr[__i] = __pa[__pb[__i] & 0x0F]; - } - /* High 128-bit lane */ - for (int __i = 16; __i < 32; __i++) { - if (__pb[__i] & 0x80) - __pr[__i] = 0; - else - __pr[__i] = __pa[16 + (__pb[__i] & 0x0F)]; - } - return __r; -} - -/* === Extract === */ - -/* _mm256_extracti128_si256: extract 128-bit lane (imm must be 0 or 1) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm256_extracti128_si256(__m256i __a, int __imm) -{ - if (__imm & 1) - return (__m128i){ { __a.__val[2], __a.__val[3] } }; - else - return (__m128i){ { __a.__val[0], __a.__val[1] } }; -} - -/* === Add / Sub === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_add_epi8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 
32; __i++) - __pr[__i] = __pa[__i] + __pb[__i]; - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_add_epi32(__m256i __a, __m256i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] + __pb[__i]; - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_add_epi64(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[0] + __b.__val[0], - __a.__val[1] + __b.__val[1], - __a.__val[2] + __b.__val[2], - __a.__val[3] + __b.__val[3] } }; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_sub_epi32(__m256i __a, __m256i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] - __pb[__i]; - return __r; -} - -/* === Compare === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cmpeq_epi8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 32; __i++) - __pr[__i] = (__pa[__i] == __pb[__i]) ? 0xFF : 0x00; - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cmpeq_epi32(__m256i __a, __m256i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (__pa[__i] == __pb[__i]) ? 
0xFFFFFFFFu : 0; - return __r; -} - -/* === Permute === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_permute2x128_si256(__m256i __a, __m256i __b, int __imm) -{ - int __sel_lo = __imm & 0x3; - int __sel_hi = (__imm >> 4) & 0x3; - long long __result[4]; - - /* Select low 128 bits */ - switch (__sel_lo) { - case 0: __result[0] = __a.__val[0]; __result[1] = __a.__val[1]; break; - case 1: __result[0] = __a.__val[2]; __result[1] = __a.__val[3]; break; - case 2: __result[0] = __b.__val[0]; __result[1] = __b.__val[1]; break; - case 3: __result[0] = __b.__val[2]; __result[1] = __b.__val[3]; break; - } - if (__imm & 0x08) { __result[0] = 0; __result[1] = 0; } - - /* Select high 128 bits */ - switch (__sel_hi) { - case 0: __result[2] = __a.__val[0]; __result[3] = __a.__val[1]; break; - case 1: __result[2] = __a.__val[2]; __result[3] = __a.__val[3]; break; - case 2: __result[2] = __b.__val[0]; __result[3] = __b.__val[1]; break; - case 3: __result[2] = __b.__val[2]; __result[3] = __b.__val[3]; break; - } - if (__imm & 0x80) { __result[2] = 0; __result[3] = 0; } - - return (__m256i){ { __result[0], __result[1], __result[2], __result[3] } }; -} - -/* === Set (non-broadcast) === */ - -/* _mm256_set_epi32: set 8 ints (high-to-low order like Intel) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set_epi32(int __e7, int __e6, int __e5, int __e4, - int __e3, int __e2, int __e1, int __e0) -{ - __m256i __r; - int *__p = (int *)&__r; - __p[0] = __e0; __p[1] = __e1; __p[2] = __e2; __p[3] = __e3; - __p[4] = __e4; __p[5] = __e5; __p[6] = __e6; __p[7] = __e7; - return __r; -} - -/* _mm256_set_epi8: set 32 bytes (high-to-low order like Intel) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set_epi8( - char __b31, char __b30, char __b29, char __b28, - char __b27, char __b26, char __b25, char __b24, - char __b23, char __b22, char __b21, char __b20, - char __b19, char __b18, char __b17, char __b16, - char __b15, char 
__b14, char __b13, char __b12, - char __b11, char __b10, char __b9, char __b8, - char __b7, char __b6, char __b5, char __b4, - char __b3, char __b2, char __b1, char __b0) -{ - __m256i __r; - unsigned char *__p = (unsigned char *)&__r; - __p[0] = (unsigned char)__b0; __p[1] = (unsigned char)__b1; - __p[2] = (unsigned char)__b2; __p[3] = (unsigned char)__b3; - __p[4] = (unsigned char)__b4; __p[5] = (unsigned char)__b5; - __p[6] = (unsigned char)__b6; __p[7] = (unsigned char)__b7; - __p[8] = (unsigned char)__b8; __p[9] = (unsigned char)__b9; - __p[10] = (unsigned char)__b10; __p[11] = (unsigned char)__b11; - __p[12] = (unsigned char)__b12; __p[13] = (unsigned char)__b13; - __p[14] = (unsigned char)__b14; __p[15] = (unsigned char)__b15; - __p[16] = (unsigned char)__b16; __p[17] = (unsigned char)__b17; - __p[18] = (unsigned char)__b18; __p[19] = (unsigned char)__b19; - __p[20] = (unsigned char)__b20; __p[21] = (unsigned char)__b21; - __p[22] = (unsigned char)__b22; __p[23] = (unsigned char)__b23; - __p[24] = (unsigned char)__b24; __p[25] = (unsigned char)__b25; - __p[26] = (unsigned char)__b26; __p[27] = (unsigned char)__b27; - __p[28] = (unsigned char)__b28; __p[29] = (unsigned char)__b29; - __p[30] = (unsigned char)__b30; __p[31] = (unsigned char)__b31; - return __r; -} - -/* === Insert === */ - -/* _mm256_inserti128_si256: insert 128-bit lane into 256-bit register */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_inserti128_si256(__m256i __a, __m128i __b, int __imm) -{ - __m256i __r = __a; - if (__imm & 1) { - __r.__val[2] = __b.__val[0]; - __r.__val[3] = __b.__val[1]; - } else { - __r.__val[0] = __b.__val[0]; - __r.__val[1] = __b.__val[1]; - } - return __r; -} - -/* === Blend === */ - -/* _mm256_blend_epi32: VPBLENDD - blend 32-bit elements by immediate mask */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_blend_epi32(__m256i __a, __m256i __b, int __imm) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m256i 
__r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (__imm & (1 << __i)) ? __pb[__i] : __pa[__i]; - return __r; -} - -/* === Shuffle (32-bit) === */ - -/* _mm256_shuffle_epi32: VPSHUFD - shuffle 32-bit ints within 128-bit lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_shuffle_epi32(__m256i __a, int __imm) -{ - unsigned int *__pa = (unsigned int *)&__a; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - /* Low 128-bit lane */ - __pr[0] = __pa[(__imm >> 0) & 3]; - __pr[1] = __pa[(__imm >> 2) & 3]; - __pr[2] = __pa[(__imm >> 4) & 3]; - __pr[3] = __pa[(__imm >> 6) & 3]; - /* High 128-bit lane */ - __pr[4] = __pa[4 + ((__imm >> 0) & 3)]; - __pr[5] = __pa[4 + ((__imm >> 2) & 3)]; - __pr[6] = __pa[4 + ((__imm >> 4) & 3)]; - __pr[7] = __pa[4 + ((__imm >> 6) & 3)]; - return __r; -} - -/* === Shift (16-bit) === */ - -/* _mm256_slli_epi16: VPSLLW - shift 16-bit ints left by immediate */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_slli_epi16(__m256i __a, int __count) -{ - if (__count < 0 || __count > 15) - return _mm256_setzero_si256(); - unsigned short *__pa = (unsigned short *)&__a; - __m256i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (unsigned short)(__pa[__i] << __count); - return __r; -} - -/* _mm256_srli_epi16: VPSRLW - shift 16-bit ints right by immediate */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_srli_epi16(__m256i __a, int __count) -{ - if (__count < 0 || __count > 15) - return _mm256_setzero_si256(); - unsigned short *__pa = (unsigned short *)&__a; - __m256i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -/* === Movemask === */ - -/* _mm256_movemask_epi8: VPMOVMSKB - create 32-bit mask from MSBs of bytes */ -static __inline__ int __attribute__((__always_inline__)) 
-_mm256_movemask_epi8(__m256i __a) -{ - unsigned char *__pa = (unsigned char *)&__a; - int __mask = 0; - for (int __i = 0; __i < 32; __i++) { - if (__pa[__i] & 0x80) - __mask |= (1 << __i); - } - return __mask; -} - -/* === Permute (64-bit) === */ - -/* _mm256_permute4x64_epi64: VPERMQ - permute 64-bit elements across lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_permute4x64_epi64(__m256i __a, int __imm) -{ - __m256i __r; - __r.__val[0] = __a.__val[(__imm >> 0) & 3]; - __r.__val[1] = __a.__val[(__imm >> 2) & 3]; - __r.__val[2] = __a.__val[(__imm >> 4) & 3]; - __r.__val[3] = __a.__val[(__imm >> 6) & 3]; - return __r; -} - -/* _mm256_permutevar8x32_epi32: VPERMD - permute 32-bit ints by variable index */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_permutevar8x32_epi32(__m256i __a, __m256i __idx) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pi = (unsigned int *)&__idx; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__pi[__i] & 7]; - return __r; -} - -/* === Unpack / Interleave === */ - -/* _mm256_unpacklo_epi8: VPUNPCKLBW - interleave low bytes within 128-bit lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpacklo_epi8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - /* Low 128-bit lane: interleave bytes 0..7 */ - for (int __i = 0; __i < 8; __i++) { - __pr[__i * 2] = __pa[__i]; - __pr[__i * 2 + 1] = __pb[__i]; - } - /* High 128-bit lane: interleave bytes 16..23 */ - for (int __i = 0; __i < 8; __i++) { - __pr[16 + __i * 2] = __pa[16 + __i]; - __pr[16 + __i * 2 + 1] = __pb[16 + __i]; - } - return __r; -} - -/* _mm256_unpackhi_epi8: VPUNPCKHBW - interleave high bytes within 128-bit lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) 
-_mm256_unpackhi_epi8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - /* Low 128-bit lane: interleave bytes 8..15 */ - for (int __i = 0; __i < 8; __i++) { - __pr[__i * 2] = __pa[8 + __i]; - __pr[__i * 2 + 1] = __pb[8 + __i]; - } - /* High 128-bit lane: interleave bytes 24..31 */ - for (int __i = 0; __i < 8; __i++) { - __pr[16 + __i * 2] = __pa[24 + __i]; - __pr[16 + __i * 2 + 1] = __pb[24 + __i]; - } - return __r; -} - -/* _mm256_unpacklo_epi16: VPUNPCKLWD - interleave low 16-bit ints within lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpacklo_epi16(__m256i __a, __m256i __b) -{ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m256i __r; - unsigned short *__pr = (unsigned short *)&__r; - /* Low 128-bit lane: interleave words 0..3 */ - for (int __i = 0; __i < 4; __i++) { - __pr[__i * 2] = __pa[__i]; - __pr[__i * 2 + 1] = __pb[__i]; - } - /* High 128-bit lane: interleave words 8..11 */ - for (int __i = 0; __i < 4; __i++) { - __pr[8 + __i * 2] = __pa[8 + __i]; - __pr[8 + __i * 2 + 1] = __pb[8 + __i]; - } - return __r; -} - -/* _mm256_unpackhi_epi16: VPUNPCKHWD - interleave high 16-bit ints within lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpackhi_epi16(__m256i __a, __m256i __b) -{ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m256i __r; - unsigned short *__pr = (unsigned short *)&__r; - /* Low 128-bit lane: interleave words 4..7 */ - for (int __i = 0; __i < 4; __i++) { - __pr[__i * 2] = __pa[4 + __i]; - __pr[__i * 2 + 1] = __pb[4 + __i]; - } - /* High 128-bit lane: interleave words 12..15 */ - for (int __i = 0; __i < 4; __i++) { - __pr[8 + __i * 2] = __pa[12 + __i]; - __pr[8 + __i * 2 + 1] = __pb[12 + __i]; - } - return __r; -} - -/* 
_mm256_unpacklo_epi32: VPUNPCKLDQ - interleave low 32-bit ints within lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpacklo_epi32(__m256i __a, __m256i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - /* Low 128-bit lane */ - __pr[0] = __pa[0]; __pr[1] = __pb[0]; - __pr[2] = __pa[1]; __pr[3] = __pb[1]; - /* High 128-bit lane */ - __pr[4] = __pa[4]; __pr[5] = __pb[4]; - __pr[6] = __pa[5]; __pr[7] = __pb[5]; - return __r; -} - -/* _mm256_unpackhi_epi32: VPUNPCKHDQ - interleave high 32-bit ints within lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpackhi_epi32(__m256i __a, __m256i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m256i __r; - unsigned int *__pr = (unsigned int *)&__r; - /* Low 128-bit lane */ - __pr[0] = __pa[2]; __pr[1] = __pb[2]; - __pr[2] = __pa[3]; __pr[3] = __pb[3]; - /* High 128-bit lane */ - __pr[4] = __pa[6]; __pr[5] = __pb[6]; - __pr[6] = __pa[7]; __pr[7] = __pb[7]; - return __r; -} - -/* _mm256_unpacklo_epi64: VPUNPCKLQDQ - interleave low 64-bit ints within lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpacklo_epi64(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[0], __b.__val[0], __a.__val[2], __b.__val[2] } }; -} - -/* _mm256_unpackhi_epi64: VPUNPCKHQDQ - interleave high 64-bit ints within lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_unpackhi_epi64(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[1], __b.__val[1], __a.__val[3], __b.__val[3] } }; -} - -/* === Additional intrinsics for CPython HACL Blake2b SIMD256 === */ - -/* _mm256_set_epi64x: set 4 x 64-bit integers (high-to-low) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set_epi64x(long long __e3, long long __e2, long long __e1, long long 
__e0) -{ - return (__m256i){ { __e0, __e1, __e2, __e3 } }; -} - -/* _mm256_set_m128i: combine two __m128i into __m256i (lo, hi) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_set_m128i(__m128i __hi, __m128i __lo) -{ - __m256i __r; - long long *__p = (long long *)&__r; - long long *__lo_p = (long long *)&__lo; - long long *__hi_p = (long long *)&__hi; - __p[0] = __lo_p[0]; __p[1] = __lo_p[1]; - __p[2] = __hi_p[0]; __p[3] = __hi_p[1]; - return __r; -} - -/* _mm256_cmpeq_epi64: compare 64-bit integers for equality */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cmpeq_epi64(__m256i __a, __m256i __b) -{ - __m256i __r; - long long *__pa = (long long *)&__a; - long long *__pb = (long long *)&__b; - long long *__pr = (long long *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = (__pa[__i] == __pb[__i]) ? -1LL : 0LL; - return __r; -} - -/* _mm256_cmpgt_epi32: compare signed 32-bit integers for greater-than */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cmpgt_epi32(__m256i __a, __m256i __b) -{ - __m256i __r; - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (__pa[__i] > __pb[__i]) ? -1 : 0; - return __r; -} - -/* _mm256_cmpgt_epi64: compare signed 64-bit integers for greater-than */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cmpgt_epi64(__m256i __a, __m256i __b) -{ - __m256i __r; - long long *__pa = (long long *)&__a; - long long *__pb = (long long *)&__b; - long long *__pr = (long long *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = (__pa[__i] > __pb[__i]) ? 
-1LL : 0LL; - return __r; -} - -/* _mm256_extract_epi8: extract 8-bit integer */ -static __inline__ int __attribute__((__always_inline__)) -_mm256_extract_epi8(__m256i __a, int __imm) -{ - unsigned char *__p = (unsigned char *)&__a; - return (int)__p[__imm & 31]; -} - -/* _mm256_extract_epi32: extract 32-bit integer */ -static __inline__ int __attribute__((__always_inline__)) -_mm256_extract_epi32(__m256i __a, int __imm) -{ - int *__p = (int *)&__a; - return __p[__imm & 7]; -} - -/* _mm256_extract_epi64: extract 64-bit integer */ -static __inline__ long long __attribute__((__always_inline__)) -_mm256_extract_epi64(__m256i __a, int __imm) -{ - long long *__p = (long long *)&__a; - return __p[__imm & 3]; -} - -/* _mm256_insert_epi8: insert 8-bit integer */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_insert_epi8(__m256i __a, int __val, int __imm) -{ - unsigned char *__p = (unsigned char *)&__a; - __p[__imm & 31] = (unsigned char)__val; - return __a; -} - -/* _mm256_insert_epi32: insert 32-bit integer */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_insert_epi32(__m256i __a, int __val, int __imm) -{ - int *__p = (int *)&__a; - __p[__imm & 7] = __val; - return __a; -} - -/* _mm256_insert_epi64: insert 64-bit integer */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_insert_epi64(__m256i __a, long long __val, int __imm) -{ - long long *__p = (long long *)&__a; - __p[__imm & 3] = __val; - return __a; -} - -/* _mm256_sub_epi64: subtract 64-bit integers */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_sub_epi64(__m256i __a, __m256i __b) -{ - return (__m256i){ { __a.__val[0] - __b.__val[0], __a.__val[1] - __b.__val[1], - __a.__val[2] - __b.__val[2], __a.__val[3] - __b.__val[3] } }; -} - -/* _mm256_mul_epu32: multiply unsigned 32-bit integers, produce 64-bit results */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_mul_epu32(__m256i __a, __m256i __b) -{ - 
__m256i __r; - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - unsigned long long *__pr = (unsigned long long *)&__r; - __pr[0] = (unsigned long long)__pa[0] * __pb[0]; - __pr[1] = (unsigned long long)__pa[2] * __pb[2]; - __pr[2] = (unsigned long long)__pa[4] * __pb[4]; - __pr[3] = (unsigned long long)__pa[6] * __pb[6]; - return __r; -} - -/* _mm256_mullo_epi32: multiply signed 32-bit integers, keep low 32 bits */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_mullo_epi32(__m256i __a, __m256i __b) -{ - __m256i __r; - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] * __pb[__i]; - return __r; -} - -/* _mm256_slli_si256: byte shift left within 128-bit lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_slli_si256(__m256i __a, int __imm) -{ - __m256i __r; - unsigned char *__src = (unsigned char *)&__a; - unsigned char *__dst = (unsigned char *)&__r; - int __shift = __imm & 0xff; - if (__shift > 16) __shift = 16; - /* Lane 0 (bytes 0-15) */ - for (int __i = 0; __i < 16; __i++) - __dst[__i] = (__i >= __shift) ? __src[__i - __shift] : 0; - /* Lane 1 (bytes 16-31) */ - for (int __i = 0; __i < 16; __i++) - __dst[16 + __i] = (__i >= __shift) ? __src[16 + __i - __shift] : 0; - return __r; -} - -/* _mm256_srli_si256: byte shift right within 128-bit lanes */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_srli_si256(__m256i __a, int __imm) -{ - __m256i __r; - unsigned char *__src = (unsigned char *)&__a; - unsigned char *__dst = (unsigned char *)&__r; - int __shift = __imm & 0xff; - if (__shift > 16) __shift = 16; - /* Lane 0 (bytes 0-15) */ - for (int __i = 0; __i < 16; __i++) - __dst[__i] = (__i + __shift < 16) ? __src[__i + __shift] : 0; - /* Lane 1 (bytes 16-31) */ - for (int __i = 0; __i < 16; __i++) - __dst[16 + __i] = (__i + __shift < 16) ? 
__src[16 + __i + __shift] : 0; - return __r; -} - -/* === Conversion / Sign Extension === */ - -/* _mm256_cvtepi8_epi16: sign-extend 16 packed 8-bit to 16 packed 16-bit (VPMOVSXBW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cvtepi8_epi16(__m128i __a) -{ - signed char *__pa = (signed char *)&__a; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (short)__pa[__i]; - return __r; -} - -/* _mm256_cvtepi8_epi32: sign-extend 8 packed 8-bit to 8 packed 32-bit (VPMOVSXBD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cvtepi8_epi32(__m128i __a) -{ - signed char *__pa = (signed char *)&__a; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (int)__pa[__i]; - return __r; -} - -/* _mm256_cvtepi16_epi32: sign-extend 8 packed 16-bit to 8 packed 32-bit (VPMOVSXWD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cvtepi16_epi32(__m128i __a) -{ - short *__pa = (short *)&__a; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (int)__pa[__i]; - return __r; -} - -/* _mm256_cvtepu8_epi16: zero-extend 16 packed 8-bit to 16 packed 16-bit (VPMOVZXBW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cvtepu8_epi16(__m128i __a) -{ - unsigned char *__pa = (unsigned char *)&__a; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (short)(unsigned short)__pa[__i]; - return __r; -} - -/* _mm256_cvtepu8_epi32: zero-extend 8 packed 8-bit to 8 packed 32-bit (VPMOVZXBD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cvtepu8_epi32(__m128i __a) -{ - unsigned char *__pa = (unsigned char *)&__a; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (int)(unsigned int)__pa[__i]; - return __r; -} - -/* _mm256_cvtepu16_epi32: zero-extend 8 packed 16-bit to 8 packed 
32-bit (VPMOVZXWD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_cvtepu16_epi32(__m128i __a) -{ - unsigned short *__pa = (unsigned short *)&__a; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (int)(unsigned int)__pa[__i]; - return __r; -} - -/* === Multiply-Add === */ - -/* _mm256_madd_epi16: multiply signed 16-bit, horizontally add adjacent pairs -> 32-bit (VPMADDWD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_madd_epi16(__m256i __a, __m256i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (int)__pa[__i * 2] * (int)__pb[__i * 2] - + (int)__pa[__i * 2 + 1] * (int)__pb[__i * 2 + 1]; - return __r; -} - -/* _mm256_maddubs_epi16: multiply unsigned*signed 8-bit, hadd pairs -> 16-bit with saturation (VPMADDUBSW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_maddubs_epi16(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - signed char *__pb = (signed char *)&__b; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) { - int __s = (int)__pa[__i * 2] * (int)__pb[__i * 2] - + (int)__pa[__i * 2 + 1] * (int)__pb[__i * 2 + 1]; - if (__s > 32767) __s = 32767; - if (__s < -32768) __s = -32768; - __pr[__i] = (short)__s; - } - return __r; -} - -/* === Horizontal Add === */ - -/* _mm256_hadd_epi16: horizontal add adjacent pairs of 16-bit (VPHADDW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_hadd_epi16(__m256i __a, __m256i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m256i __r; - short *__pr = (short *)&__r; - /* Low 128-bit lane: from __a low lane + __b low lane */ - __pr[0] = (short)(__pa[0] + __pa[1]); - __pr[1] = (short)(__pa[2] + __pa[3]); - __pr[2] = (short)(__pa[4] + __pa[5]); - __pr[3] = (short)(__pa[6] + __pa[7]); - __pr[4] = 
(short)(__pb[0] + __pb[1]); - __pr[5] = (short)(__pb[2] + __pb[3]); - __pr[6] = (short)(__pb[4] + __pb[5]); - __pr[7] = (short)(__pb[6] + __pb[7]); - /* High 128-bit lane: from __a high lane + __b high lane */ - __pr[8] = (short)(__pa[8] + __pa[9]); - __pr[9] = (short)(__pa[10] + __pa[11]); - __pr[10] = (short)(__pa[12] + __pa[13]); - __pr[11] = (short)(__pa[14] + __pa[15]); - __pr[12] = (short)(__pb[8] + __pb[9]); - __pr[13] = (short)(__pb[10] + __pb[11]); - __pr[14] = (short)(__pb[12] + __pb[13]); - __pr[15] = (short)(__pb[14] + __pb[15]); - return __r; -} - -/* _mm256_hadd_epi32: horizontal add adjacent pairs of 32-bit (VPHADDD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_hadd_epi32(__m256i __a, __m256i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m256i __r; - int *__pr = (int *)&__r; - /* Low 128-bit lane */ - __pr[0] = __pa[0] + __pa[1]; - __pr[1] = __pa[2] + __pa[3]; - __pr[2] = __pb[0] + __pb[1]; - __pr[3] = __pb[2] + __pb[3]; - /* High 128-bit lane */ - __pr[4] = __pa[4] + __pa[5]; - __pr[5] = __pa[6] + __pa[7]; - __pr[6] = __pb[4] + __pb[5]; - __pr[7] = __pb[6] + __pb[7]; - return __r; -} - -/* === Multiply (16-bit) === */ - -/* _mm256_mullo_epi16: multiply 16-bit ints, keep low 16 bits (VPMULLW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_mullo_epi16(__m256i __a, __m256i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (short)(__pa[__i] * __pb[__i]); - return __r; -} - -/* _mm256_mulhi_epi16: multiply signed 16-bit ints, keep high 16 bits (VPMULHW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_mulhi_epi16(__m256i __a, __m256i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (short)(((int)__pa[__i] * (int)__pb[__i]) >> 
16); - return __r; -} - -/* === Absolute value === */ - -/* _mm256_abs_epi8: VPABSB */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_abs_epi8(__m256i __a) -{ - signed char *__pa = (signed char *)&__a; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 32; __i++) - __pr[__i] = (unsigned char)(__pa[__i] < 0 ? -__pa[__i] : __pa[__i]); - return __r; -} - -/* _mm256_abs_epi16: VPABSW */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_abs_epi16(__m256i __a) -{ - short *__pa = (short *)&__a; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] < 0 ? (short)-__pa[__i] : __pa[__i]; - return __r; -} - -/* _mm256_abs_epi32: VPABSD */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_abs_epi32(__m256i __a) -{ - int *__pa = (int *)&__a; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] < 0 ? -__pa[__i] : __pa[__i]; - return __r; -} - -/* === Min / Max (additional) === */ - -/* _mm256_min_epu8: minimum of unsigned 8-bit ints (VPMINUB) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_min_epu8(__m256i __a, __m256i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 32; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* _mm256_max_epi32: maximum of signed 32-bit ints (VPMAXSD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_max_epi32(__m256i __a, __m256i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? 
__pa[__i] : __pb[__i]; - return __r; -} - -/* _mm256_min_epi32: minimum of signed 32-bit ints (VPMINSD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_min_epi32(__m256i __a, __m256i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* === Pack === */ - -/* _mm256_packs_epi32: pack 32-bit to 16-bit with signed saturation within lanes (VPACKSSDW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_packs_epi32(__m256i __a, __m256i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m256i __r; - short *__pr = (short *)&__r; - /* Low lane: a[0..3] then b[0..3] */ - for (int __i = 0; __i < 4; __i++) { - int __v = __pa[__i]; - if (__v > 32767) __v = 32767; if (__v < -32768) __v = -32768; - __pr[__i] = (short)__v; - } - for (int __i = 0; __i < 4; __i++) { - int __v = __pb[__i]; - if (__v > 32767) __v = 32767; if (__v < -32768) __v = -32768; - __pr[4 + __i] = (short)__v; - } - /* High lane: a[4..7] then b[4..7] */ - for (int __i = 0; __i < 4; __i++) { - int __v = __pa[4 + __i]; - if (__v > 32767) __v = 32767; if (__v < -32768) __v = -32768; - __pr[8 + __i] = (short)__v; - } - for (int __i = 0; __i < 4; __i++) { - int __v = __pb[4 + __i]; - if (__v > 32767) __v = 32767; if (__v < -32768) __v = -32768; - __pr[12 + __i] = (short)__v; - } - return __r; -} - -/* _mm256_packus_epi16: pack 16-bit to 8-bit with unsigned saturation within lanes (VPACKUSWB) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_packus_epi16(__m256i __a, __m256i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m256i __r; - unsigned char *__pr = (unsigned char *)&__r; - /* Low lane */ - for (int __i = 0; __i < 8; __i++) { - int __v = __pa[__i]; - if (__v > 255) __v = 255; if (__v < 0) __v = 0; - __pr[__i] = (unsigned char)__v; - } - 
for (int __i = 0; __i < 8; __i++) { - int __v = __pb[__i]; - if (__v > 255) __v = 255; if (__v < 0) __v = 0; - __pr[8 + __i] = (unsigned char)__v; - } - /* High lane */ - for (int __i = 0; __i < 8; __i++) { - int __v = __pa[8 + __i]; - if (__v > 255) __v = 255; if (__v < 0) __v = 0; - __pr[16 + __i] = (unsigned char)__v; - } - for (int __i = 0; __i < 8; __i++) { - int __v = __pb[8 + __i]; - if (__v > 255) __v = 255; if (__v < 0) __v = 0; - __pr[24 + __i] = (unsigned char)__v; - } - return __r; -} - -/* === Arithmetic shift === */ - -/* _mm256_srai_epi32: arithmetic shift right 32-bit (VPSRAD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_srai_epi32(__m256i __a, int __count) -{ - if (__count < 0) __count = 0; - if (__count > 31) __count = 31; - int *__pa = (int *)&__a; - __m256i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -/* _mm256_srai_epi16: arithmetic shift right 16-bit (VPSRAW) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_srai_epi16(__m256i __a, int __count) -{ - if (__count < 0) __count = 0; - if (__count > 15) __count = 15; - short *__pa = (short *)&__a; - __m256i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -/* === Gather === */ - -/* _mm256_i32gather_epi32: gather 32-bit ints using 32-bit indices (VPGATHERDD) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_i32gather_epi32(int const *__base, __m256i __index, int __scale) -{ - int *__pi = (int *)&__index; - __m256i __r; - int *__pr = (int *)&__r; - const unsigned char *__b = (const unsigned char *)__base; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = *(int *)(__b + (long long)__pi[__i] * __scale); - return __r; -} - -#endif /* _AVX2INTRIN_H_INCLUDED */ diff --git a/include/avx512fintrin.h b/include/avx512fintrin.h deleted file mode 100644 index 8e4af0f55b..0000000000 
--- a/include/avx512fintrin.h +++ /dev/null @@ -1,499 +0,0 @@ -/* CCC compiler bundled avx512fintrin.h - AVX-512 Foundation intrinsics */ -#ifndef _AVX512FINTRIN_H_INCLUDED -#define _AVX512FINTRIN_H_INCLUDED - -#include - -/* AVX-512 512-bit vector types */ -typedef struct __attribute__((__aligned__(64))) { - long long __val[8]; -} __m512i; - -typedef struct __attribute__((__aligned__(64))) { - double __val[8]; -} __m512d; - -typedef struct __attribute__((__aligned__(64))) { - float __val[16]; -} __m512; - -/* Unaligned variants */ -typedef struct __attribute__((__aligned__(1))) { - long long __val[8]; -} __m512i_u; - -/* AVX-512 mask types */ -typedef unsigned char __mmask8; -typedef unsigned short __mmask16; - -/* === Load / Store === */ - -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_loadu_si512(void const *__p) -{ - __m512i __r; - __builtin_memcpy(&__r, __p, sizeof(__r)); - return __r; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm512_storeu_si512(void *__p, __m512i __a) -{ - __builtin_memcpy(__p, &__a, sizeof(__a)); -} - -/* === Set === */ - -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_setzero_si512(void) -{ - return (__m512i){ { 0LL, 0LL, 0LL, 0LL, 0LL, 0LL, 0LL, 0LL } }; -} - -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_set1_epi64(long long __q) -{ - return (__m512i){ { __q, __q, __q, __q, __q, __q, __q, __q } }; -} - -/* === Arithmetic === */ - -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_add_epi64(__m512i __a, __m512i __b) -{ - return (__m512i){ { __a.__val[0] + __b.__val[0], - __a.__val[1] + __b.__val[1], - __a.__val[2] + __b.__val[2], - __a.__val[3] + __b.__val[3], - __a.__val[4] + __b.__val[4], - __a.__val[5] + __b.__val[5], - __a.__val[6] + __b.__val[6], - __a.__val[7] + __b.__val[7] } }; -} - -/* === Population count === */ - -/* _mm512_popcnt_epi64: population count for each 64-bit element */ -static __inline__ __m512i 
__attribute__((__always_inline__)) -_mm512_popcnt_epi64(__m512i __a) -{ - __m512i __r; - for (int __i = 0; __i < 8; __i++) { - unsigned long long __v = (unsigned long long)__a.__val[__i]; - int __cnt = 0; - while (__v) { - __cnt++; - __v &= __v - 1; - } - __r.__val[__i] = __cnt; - } - return __r; -} - -/* === Reduce === */ - -/* _mm512_reduce_add_epi64: horizontal sum of all 64-bit elements */ -static __inline__ long long __attribute__((__always_inline__)) -_mm512_reduce_add_epi64(__m512i __a) -{ - return __a.__val[0] + __a.__val[1] + __a.__val[2] + __a.__val[3] - + __a.__val[4] + __a.__val[5] + __a.__val[6] + __a.__val[7]; -} - -/* === Float Load / Store === */ - -static __inline__ __m512 __attribute__((__always_inline__)) -_mm512_loadu_ps(void const *__p) -{ - __m512 __r; - const float *__fp = (const float *)__p; - for (int __i = 0; __i < 16; __i++) - __r.__val[__i] = __fp[__i]; - return __r; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm512_storeu_ps(void *__p, __m512 __a) -{ - float *__fp = (float *)__p; - for (int __i = 0; __i < 16; __i++) - __fp[__i] = __a.__val[__i]; -} - -/* === Float Set === */ - -static __inline__ __m512 __attribute__((__always_inline__)) -_mm512_setzero_ps(void) -{ - __m512 __r; - for (int __i = 0; __i < 16; __i++) - __r.__val[__i] = 0.0f; - return __r; -} - -/* === Float Arithmetic === */ - -static __inline__ __m512 __attribute__((__always_inline__)) -_mm512_add_ps(__m512 __a, __m512 __b) -{ - __m512 __r; - for (int __i = 0; __i < 16; __i++) - __r.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __r; -} - -static __inline__ __m512 __attribute__((__always_inline__)) -_mm512_mul_ps(__m512 __a, __m512 __b) -{ - __m512 __r; - for (int __i = 0; __i < 16; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i]; - return __r; -} - -/* _mm512_fmadd_ps: a*b + c (single-precision, 512-bit) */ -static __inline__ __m512 __attribute__((__always_inline__)) -_mm512_fmadd_ps(__m512 __a, __m512 __b, __m512 __c) -{ - __m512 
__r; - for (int __i = 0; __i < 16; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i] + __c.__val[__i]; - return __r; -} - -/* === Float Reduce === */ - -/* _mm512_reduce_add_ps: horizontal sum of all 16 float elements */ -static __inline__ float __attribute__((__always_inline__)) -_mm512_reduce_add_ps(__m512 __a) -{ - float __sum = 0.0f; - for (int __i = 0; __i < 16; __i++) - __sum += __a.__val[__i]; - return __sum; -} - -/* === Integer Arithmetic (32-bit) === */ - -/* _mm512_add_epi32: add packed 32-bit integers */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_add_epi32(__m512i __a, __m512i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m512i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] + __pb[__i]; - return __r; -} - -/* _mm512_sub_epi32: subtract packed 32-bit integers */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_sub_epi32(__m512i __a, __m512i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m512i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] - __pb[__i]; - return __r; -} - -/* _mm512_mullo_epi32: multiply 32-bit ints, keep low 32 bits */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_mullo_epi32(__m512i __a, __m512i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m512i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] * __pb[__i]; - return __r; -} - -/* === Bitwise === */ - -/* _mm512_and_si512: bitwise AND */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_and_si512(__m512i __a, __m512i __b) -{ - return (__m512i){ { __a.__val[0] & __b.__val[0], - __a.__val[1] & __b.__val[1], - __a.__val[2] & __b.__val[2], - 
__a.__val[3] & __b.__val[3], - __a.__val[4] & __b.__val[4], - __a.__val[5] & __b.__val[5], - __a.__val[6] & __b.__val[6], - __a.__val[7] & __b.__val[7] } }; -} - -/* _mm512_or_si512: bitwise OR */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_or_si512(__m512i __a, __m512i __b) -{ - return (__m512i){ { __a.__val[0] | __b.__val[0], - __a.__val[1] | __b.__val[1], - __a.__val[2] | __b.__val[2], - __a.__val[3] | __b.__val[3], - __a.__val[4] | __b.__val[4], - __a.__val[5] | __b.__val[5], - __a.__val[6] | __b.__val[6], - __a.__val[7] | __b.__val[7] } }; -} - -/* _mm512_xor_si512: bitwise XOR */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_xor_si512(__m512i __a, __m512i __b) -{ - return (__m512i){ { __a.__val[0] ^ __b.__val[0], - __a.__val[1] ^ __b.__val[1], - __a.__val[2] ^ __b.__val[2], - __a.__val[3] ^ __b.__val[3], - __a.__val[4] ^ __b.__val[4], - __a.__val[5] ^ __b.__val[5], - __a.__val[6] ^ __b.__val[6], - __a.__val[7] ^ __b.__val[7] } }; -} - -/* _mm512_andnot_si512: bitwise AND-NOT (~a & b) */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_andnot_si512(__m512i __a, __m512i __b) -{ - return (__m512i){ { ~__a.__val[0] & __b.__val[0], - ~__a.__val[1] & __b.__val[1], - ~__a.__val[2] & __b.__val[2], - ~__a.__val[3] & __b.__val[3], - ~__a.__val[4] & __b.__val[4], - ~__a.__val[5] & __b.__val[5], - ~__a.__val[6] & __b.__val[6], - ~__a.__val[7] & __b.__val[7] } }; -} - -/* === Set (additional) === */ - -/* _mm512_set1_epi32: broadcast 32-bit integer */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_set1_epi32(int __i) -{ - long long __q = (long long)(unsigned int)__i - | ((long long)(unsigned int)__i << 32); - return (__m512i){ { __q, __q, __q, __q, __q, __q, __q, __q } }; -} - -/* _mm512_set1_epi8: broadcast 8-bit integer */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_set1_epi8(char __b) -{ - unsigned char __ub = (unsigned char)__b; - long long __q = 
(long long)__ub; - __q |= __q << 8; - __q |= __q << 16; - __q |= __q << 32; - return (__m512i){ { __q, __q, __q, __q, __q, __q, __q, __q } }; -} - -/* === Extract === */ - -/* _mm512_extracti64x4_epi64: extract 256-bit lane from 512-bit register */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm512_extracti64x4_epi64(__m512i __a, int __imm) -{ - if (__imm & 1) - return (__m256i){ { __a.__val[4], __a.__val[5], __a.__val[6], __a.__val[7] } }; - else - return (__m256i){ { __a.__val[0], __a.__val[1], __a.__val[2], __a.__val[3] } }; -} - -/* _mm512_extracti32x4_epi32: extract 128-bit lane from 512-bit register */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm512_extracti32x4_epi32(__m512i __a, int __imm) -{ - int __lane = __imm & 3; - return (__m128i){ { __a.__val[__lane * 2], __a.__val[__lane * 2 + 1] } }; -} - -/* === Conversion / Sign Extension (512-bit) === */ - -/* _mm512_cvtepi8_epi16: sign-extend 32 packed 8-bit to 32 packed 16-bit (AVX-512BW) */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_cvtepi8_epi16(__m256i __a) -{ - signed char *__pa = (signed char *)&__a; - __m512i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 32; __i++) - __pr[__i] = (short)__pa[__i]; - return __r; -} - -/* _mm512_cvtepi8_epi32: sign-extend 16 packed 8-bit to 16 packed 32-bit */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_cvtepi8_epi32(__m128i __a) -{ - signed char *__pa = (signed char *)&__a; - __m512i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (int)__pa[__i]; - return __r; -} - -/* _mm512_cvtepi16_epi32: sign-extend 16 packed 16-bit to 16 packed 32-bit */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_cvtepi16_epi32(__m256i __a) -{ - short *__pa = (short *)&__a; - __m512i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (int)__pa[__i]; - return __r; -} - -/* === Multiply-Add (512-bit) 
=== */ - -/* _mm512_madd_epi16: multiply signed 16-bit, hadd adjacent pairs -> 32-bit (AVX-512BW) */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_madd_epi16(__m512i __a, __m512i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m512i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (int)__pa[__i * 2] * (int)__pb[__i * 2] - + (int)__pa[__i * 2 + 1] * (int)__pb[__i * 2 + 1]; - return __r; -} - -/* === Reduce (32-bit) === */ - -/* _mm512_reduce_add_epi32: horizontal sum of all 32-bit elements */ -static __inline__ int __attribute__((__always_inline__)) -_mm512_reduce_add_epi32(__m512i __a) -{ - int *__p = (int *)&__a; - int __sum = 0; - for (int __i = 0; __i < 16; __i++) - __sum += __p[__i]; - return __sum; -} - -/* === Subtract (64-bit) === */ - -/* _mm512_sub_epi64: subtract packed 64-bit integers */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_sub_epi64(__m512i __a, __m512i __b) -{ - return (__m512i){ { __a.__val[0] - __b.__val[0], - __a.__val[1] - __b.__val[1], - __a.__val[2] - __b.__val[2], - __a.__val[3] - __b.__val[3], - __a.__val[4] - __b.__val[4], - __a.__val[5] - __b.__val[5], - __a.__val[6] - __b.__val[6], - __a.__val[7] - __b.__val[7] } }; -} - -/* === Shift === */ - -/* _mm512_slli_epi32: shift 32-bit integers left */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_slli_epi32(__m512i __a, unsigned int __count) -{ - if (__count > 31) return _mm512_setzero_si512(); - unsigned int *__pa = (unsigned int *)&__a; - __m512i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] << __count; - return __r; -} - -/* _mm512_srli_epi32: shift 32-bit integers right */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_srli_epi32(__m512i __a, unsigned int __count) -{ - if (__count > 31) return _mm512_setzero_si512(); - unsigned int *__pa = (unsigned int *)&__a; - 
__m512i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -/* _mm512_slli_epi64: shift 64-bit integers left */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_slli_epi64(__m512i __a, unsigned int __count) -{ - if (__count > 63) return _mm512_setzero_si512(); - unsigned long long *__pa = (unsigned long long *)&__a; - __m512i __r; - unsigned long long *__pr = (unsigned long long *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] << __count; - return __r; -} - -/* _mm512_srli_epi64: shift 64-bit integers right */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_srli_epi64(__m512i __a, unsigned int __count) -{ - if (__count > 63) return _mm512_setzero_si512(); - unsigned long long *__pa = (unsigned long long *)&__a; - __m512i __r; - unsigned long long *__pr = (unsigned long long *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] >> __count; - return __r; -} - -/* === Compare === */ - -/* _mm512_cmpeq_epi32_mask: compare 32-bit ints for equality, return mask */ -static __inline__ __mmask16 __attribute__((__always_inline__)) -_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __mmask16 __mask = 0; - for (int __i = 0; __i < 16; __i++) - if (__pa[__i] == __pb[__i]) - __mask |= (1u << __i); - return __mask; -} - -/* === Insert === */ - -/* _mm512_inserti64x4: insert 256-bit lane into 512-bit register */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_inserti64x4(__m512i __a, __m256i __b, int __imm) -{ - __m512i __r = __a; - if (__imm & 1) { - __r.__val[4] = __b.__val[0]; __r.__val[5] = __b.__val[1]; - __r.__val[6] = __b.__val[2]; __r.__val[7] = __b.__val[3]; - } else { - __r.__val[0] = __b.__val[0]; __r.__val[1] = __b.__val[1]; - __r.__val[2] = __b.__val[2]; __r.__val[3] = __b.__val[3]; - } - 
return __r; -} - -/* === Broadcast === */ - -/* _mm512_broadcastsi128_si512: broadcast 128-bit to all 4 lanes */ -static __inline__ __m512i __attribute__((__always_inline__)) -_mm512_broadcastsi128_si512(__m128i __a) -{ - return (__m512i){ { __a.__val[0], __a.__val[1], - __a.__val[0], __a.__val[1], - __a.__val[0], __a.__val[1], - __a.__val[0], __a.__val[1] } }; -} - -#endif /* _AVX512FINTRIN_H_INCLUDED */ diff --git a/include/avxintrin.h b/include/avxintrin.h deleted file mode 100644 index 04ccf9f169..0000000000 --- a/include/avxintrin.h +++ /dev/null @@ -1,460 +0,0 @@ -/* CCC compiler bundled avxintrin.h - AVX intrinsics */ -#ifndef _AVXINTRIN_H_INCLUDED -#define _AVXINTRIN_H_INCLUDED - -#include - -/* AVX 256-bit vector types */ -typedef struct __attribute__((__aligned__(32))) { - float __val[8]; -} __m256; - -typedef struct __attribute__((__aligned__(32))) { - double __val[4]; -} __m256d; - -typedef struct __attribute__((__aligned__(32))) { - long long __val[4]; -} __m256i; - -/* Unaligned variants */ -typedef struct __attribute__((__aligned__(1))) { - float __val[8]; -} __m256_u; - -typedef struct __attribute__((__aligned__(1))) { - double __val[4]; -} __m256d_u; - -typedef struct __attribute__((__aligned__(1))) { - long long __val[4]; -} __m256i_u; - -/* === Load / Store === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_loadu_si256(__m256i_u const *__p) -{ - __m256i __r; - __builtin_memcpy(&__r, __p, sizeof(__r)); - return __r; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_load_si256(__m256i const *__p) -{ - return *__p; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm256_storeu_si256(__m256i_u *__p, __m256i __a) -{ - __builtin_memcpy(__p, &__a, sizeof(__a)); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm256_store_si256(__m256i *__p, __m256i __a) -{ - *__p = __a; -} - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_lddqu_si256(__m256i_u 
const *__p) -{ - __m256i __r; - __builtin_memcpy(&__r, __p, sizeof(__r)); - return __r; -} - -/* Float load/store */ -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_loadu_ps(float const *__p) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = __p[__i]; - return __r; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm256_storeu_ps(float *__p, __m256 __a) -{ - for (int __i = 0; __i < 8; __i++) - __p[__i] = __a.__val[__i]; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_loadu_pd(double const *__p) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __p[__i]; - return __r; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm256_storeu_pd(double *__p, __m256d __a) -{ - for (int __i = 0; __i < 4; __i++) - __p[__i] = __a.__val[__i]; -} - -/* === Set === */ - -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_setzero_si256(void) -{ - return (__m256i){ { 0LL, 0LL, 0LL, 0LL } }; -} - -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_setzero_ps(void) -{ - return (__m256){ { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f } }; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_setzero_pd(void) -{ - return (__m256d){ { 0.0, 0.0, 0.0, 0.0 } }; -} - -/* === Cast between 256-bit and 128-bit === */ - -/* Extract low 128 bits of __m256i as __m128i */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm256_castsi256_si128(__m256i __a) -{ - return (__m128i){ { __a.__val[0], __a.__val[1] } }; -} - -/* Extract low 128 bits of __m256 as __m128 */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm256_castps256_ps128(__m256 __a) -{ - return (__m128){ { __a.__val[0], __a.__val[1], __a.__val[2], __a.__val[3] } }; -} - -/* Zero-extend __m128i to __m256i (upper 128 bits undefined/zero) */ -static __inline__ __m256i __attribute__((__always_inline__)) -_mm256_castsi128_si256(__m128i 
__a) -{ - return (__m256i){ { __a.__val[0], __a.__val[1], 0LL, 0LL } }; -} - -/* Extract 128-bit lane from __m256 (imm must be 0 or 1) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm256_extractf128_ps(__m256 __a, int __imm) -{ - if (__imm & 1) - return (__m128){ { __a.__val[4], __a.__val[5], __a.__val[6], __a.__val[7] } }; - else - return (__m128){ { __a.__val[0], __a.__val[1], __a.__val[2], __a.__val[3] } }; -} - -/* === Float Arithmetic === */ - -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_add_ps(__m256 __a, __m256 __b) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __r; -} - -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_sub_ps(__m256 __a, __m256 __b) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = __a.__val[__i] - __b.__val[__i]; - return __r; -} - -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_mul_ps(__m256 __a, __m256 __b) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i]; - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_add_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] + __b.__val[__i]; - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_sub_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] - __b.__val[__i]; - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_mul_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i]; - return __r; -} - -/* === Double Arithmetic (continued) === */ - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_div_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - 
for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] / __b.__val[__i]; - return __r; -} - -/* === Set (broadcast) === */ - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_set1_pd(double __w) -{ - return (__m256d){ { __w, __w, __w, __w } }; -} - -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_set1_ps(float __w) -{ - return (__m256){ { __w, __w, __w, __w, __w, __w, __w, __w } }; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_set_pd(double __d3, double __d2, double __d1, double __d0) -{ - return (__m256d){ { __d0, __d1, __d2, __d3 } }; -} - -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_set_ps(float __f7, float __f6, float __f5, float __f4, - float __f3, float __f2, float __f1, float __f0) -{ - return (__m256){ { __f0, __f1, __f2, __f3, __f4, __f5, __f6, __f7 } }; -} - -/* === Cast 256->128 for pd === */ - -/* Extract low 128 bits of __m256d as __m128d */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm256_castpd256_pd128(__m256d __a) -{ - return (__m128d){ { __a.__val[0], __a.__val[1] } }; -} - -/* Zero-extend __m128d to __m256d (upper 128 bits undefined/zero) */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_castpd128_pd256(__m128d __a) -{ - return (__m256d){ { __a.__val[0], __a.__val[1], 0.0, 0.0 } }; -} - -/* === Shuffle / Permute (double) === */ - -/* Unpack and interleave low doubles from each 128-bit lane */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_unpacklo_pd(__m256d __a, __m256d __b) -{ - /* Within each 128-bit lane: select element 0 from each source */ - return (__m256d){ { __a.__val[0], __b.__val[0], __a.__val[2], __b.__val[2] } }; -} - -/* Unpack and interleave high doubles from each 128-bit lane */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_unpackhi_pd(__m256d __a, __m256d __b) -{ - /* Within each 128-bit lane: select element 1 from each source */ - 
return (__m256d){ { __a.__val[1], __b.__val[1], __a.__val[3], __b.__val[3] } }; -} - -/* Shuffle doubles within each 128-bit lane based on imm8 control */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_shuffle_pd(__m256d __a, __m256d __b, int __imm) -{ - return (__m256d){ { - (__imm & 0x1) ? __a.__val[1] : __a.__val[0], - (__imm & 0x2) ? __b.__val[1] : __b.__val[0], - (__imm & 0x4) ? __a.__val[3] : __a.__val[2], - (__imm & 0x8) ? __b.__val[3] : __b.__val[2] - } }; -} - -/* Permute 128-bit lanes from two 256-bit sources */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_permute2f128_pd(__m256d __a, __m256d __b, int __imm) -{ - __m256d __r; - /* Select low 128-bit lane of result */ - switch (__imm & 0x3) { - case 0: __r.__val[0] = __a.__val[0]; __r.__val[1] = __a.__val[1]; break; - case 1: __r.__val[0] = __a.__val[2]; __r.__val[1] = __a.__val[3]; break; - case 2: __r.__val[0] = __b.__val[0]; __r.__val[1] = __b.__val[1]; break; - case 3: __r.__val[0] = __b.__val[2]; __r.__val[1] = __b.__val[3]; break; - } - if (__imm & 0x8) { __r.__val[0] = 0.0; __r.__val[1] = 0.0; } - /* Select high 128-bit lane of result */ - switch ((__imm >> 4) & 0x3) { - case 0: __r.__val[2] = __a.__val[0]; __r.__val[3] = __a.__val[1]; break; - case 1: __r.__val[2] = __a.__val[2]; __r.__val[3] = __a.__val[3]; break; - case 2: __r.__val[2] = __b.__val[0]; __r.__val[3] = __b.__val[1]; break; - case 3: __r.__val[2] = __b.__val[2]; __r.__val[3] = __b.__val[3]; break; - } - if (__imm & 0x80) { __r.__val[2] = 0.0; __r.__val[3] = 0.0; } - return __r; -} - -/* === Horizontal operations === */ - -/* Horizontal add: add adjacent pairs of doubles within each 128-bit lane */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_hadd_pd(__m256d __a, __m256d __b) -{ - return (__m256d){ { - __a.__val[0] + __a.__val[1], - __b.__val[0] + __b.__val[1], - __a.__val[2] + __a.__val[3], - __b.__val[2] + __b.__val[3] - } }; -} - -/* Horizontal subtract: 
subtract adjacent pairs of doubles within each 128-bit lane */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_hsub_pd(__m256d __a, __m256d __b) -{ - return (__m256d){ { - __a.__val[0] - __a.__val[1], - __b.__val[0] - __b.__val[1], - __a.__val[2] - __a.__val[3], - __b.__val[2] - __b.__val[3] - } }; -} - -/* === Extract 128-bit lane from __m256d === */ - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm256_extractf128_pd(__m256d __a, int __imm) -{ - if (__imm & 1) - return (__m128d){ { __a.__val[2], __a.__val[3] } }; - else - return (__m128d){ { __a.__val[0], __a.__val[1] } }; -} - -/* === Insert 128-bit lane into __m256d === */ - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_insertf128_pd(__m256d __a, __m128d __b, int __imm) -{ - __m256d __r = __a; - if (__imm & 1) { - __r.__val[2] = __b.__val[0]; - __r.__val[3] = __b.__val[1]; - } else { - __r.__val[0] = __b.__val[0]; - __r.__val[1] = __b.__val[1]; - } - return __r; -} - -/* === Comparison === */ - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_min_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] < __b.__val[__i] ? __a.__val[__i] : __b.__val[__i]; - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_max_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] > __b.__val[__i] ? 
__a.__val[__i] : __b.__val[__i]; - return __r; -} - -/* === Bitwise operations (pd) === */ - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_and_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) { - union { double d; long long ll; } __ua, __ub, __ur; - __ua.d = __a.__val[__i]; __ub.d = __b.__val[__i]; - __ur.ll = __ua.ll & __ub.ll; - __r.__val[__i] = __ur.d; - } - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_or_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) { - union { double d; long long ll; } __ua, __ub, __ur; - __ua.d = __a.__val[__i]; __ub.d = __b.__val[__i]; - __ur.ll = __ua.ll | __ub.ll; - __r.__val[__i] = __ur.d; - } - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_xor_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) { - union { double d; long long ll; } __ua, __ub, __ur; - __ua.d = __a.__val[__i]; __ub.d = __b.__val[__i]; - __ur.ll = __ua.ll ^ __ub.ll; - __r.__val[__i] = __ur.d; - } - return __r; -} - -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_andnot_pd(__m256d __a, __m256d __b) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) { - union { double d; long long ll; } __ua, __ub, __ur; - __ua.d = __a.__val[__i]; __ub.d = __b.__val[__i]; - __ur.ll = (~__ua.ll) & __ub.ll; - __r.__val[__i] = __ur.d; - } - return __r; -} - -/* === Movemask === */ - -static __inline__ int __attribute__((__always_inline__)) -_mm256_movemask_pd(__m256d __a) -{ - int __r = 0; - for (int __i = 0; __i < 4; __i++) { - union { double d; long long ll; } __u; - __u.d = __a.__val[__i]; - if (__u.ll < 0) __r |= (1 << __i); - } - return __r; -} - -#endif /* _AVXINTRIN_H_INCLUDED */ diff --git a/include/bmi2intrin.h b/include/bmi2intrin.h deleted file mode 100644 index 024b2cdff7..0000000000 --- a/include/bmi2intrin.h +++ /dev/null @@ -1,93 +0,0 @@ -/* CCC 
compiler bundled bmi2intrin.h - BMI2 intrinsics */ -#ifndef _BMI2INTRIN_H_INCLUDED -#define _BMI2INTRIN_H_INCLUDED - -/* _bzhi_u32: zero high bits starting from specified bit position */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_bzhi_u32(unsigned int __src, unsigned int __index) -{ - __index &= 0xFF; - if (__index >= 32) - return __src; - return __src & ((1U << __index) - 1U); -} - -#ifdef __x86_64__ -/* _bzhi_u64: zero high bits starting from specified bit position (64-bit) */ -static __inline__ unsigned long long __attribute__((__always_inline__)) -_bzhi_u64(unsigned long long __src, unsigned long long __index) -{ - __index &= 0xFF; - if (__index >= 64) - return __src; - return __src & ((1ULL << __index) - 1ULL); -} -#endif - -/* _pdep_u32: parallel bit deposit */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_pdep_u32(unsigned int __src, unsigned int __mask) -{ - unsigned int __result = 0; - unsigned int __k = 0; - for (unsigned int __i = 0; __i < 32; __i++) { - if (__mask & (1U << __i)) { - if (__src & (1U << __k)) - __result |= (1U << __i); - __k++; - } - } - return __result; -} - -/* _pext_u32: parallel bit extract */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_pext_u32(unsigned int __src, unsigned int __mask) -{ - unsigned int __result = 0; - unsigned int __k = 0; - for (unsigned int __i = 0; __i < 32; __i++) { - if (__mask & (1U << __i)) { - if (__src & (1U << __i)) - __result |= (1U << __k); - __k++; - } - } - return __result; -} - -#ifdef __x86_64__ -/* _pdep_u64: parallel bit deposit (64-bit) */ -static __inline__ unsigned long long __attribute__((__always_inline__)) -_pdep_u64(unsigned long long __src, unsigned long long __mask) -{ - unsigned long long __result = 0; - unsigned long long __k = 0; - for (unsigned long long __i = 0; __i < 64; __i++) { - if (__mask & (1ULL << __i)) { - if (__src & (1ULL << __k)) - __result |= (1ULL << __i); - __k++; - } - } - return __result; -} - -/* 
_pext_u64: parallel bit extract (64-bit) */ -static __inline__ unsigned long long __attribute__((__always_inline__)) -_pext_u64(unsigned long long __src, unsigned long long __mask) -{ - unsigned long long __result = 0; - unsigned long long __k = 0; - for (unsigned long long __i = 0; __i < 64; __i++) { - if (__mask & (1ULL << __i)) { - if (__src & (1ULL << __i)) - __result |= (1ULL << __k); - __k++; - } - } - return __result; -} -#endif - -#endif /* _BMI2INTRIN_H_INCLUDED */ diff --git a/include/emmintrin.h b/include/emmintrin.h deleted file mode 100644 index 82fa2b6358..0000000000 --- a/include/emmintrin.h +++ /dev/null @@ -1,1653 +0,0 @@ -/* CCC compiler bundled emmintrin.h - SSE2 intrinsics */ -#ifndef _EMMINTRIN_H_INCLUDED -#define _EMMINTRIN_H_INCLUDED - -/* SSE2 intrinsics are only available on x86/x86-64 targets */ -#if !defined(__x86_64__) && !defined(__i386__) && !defined(__i686__) -#error "SSE2 intrinsics (emmintrin.h) require an x86 target" -#endif - -#include - -typedef struct __attribute__((__aligned__(16))) { - long long __val[2]; -} __m128i; - -typedef struct __attribute__((__aligned__(1))) { - long long __val[2]; -} __m128i_u; - -typedef struct __attribute__((__aligned__(16))) { - double __val[2]; -} __m128d; - -typedef struct __attribute__((__aligned__(1))) { - double __val[2]; -} __m128d_u; - -/* Internal vector types referenced by GCC system headers (wmmintrin.h, etc.). - * These enable parsing of system headers that use (__v2di)expr casts. - * Note: vector_size attribute is parsed but vectors are lowered as aggregates. 
*/ -typedef double __v2df __attribute__ ((__vector_size__ (16))); -typedef long long __v2di __attribute__ ((__vector_size__ (16))); -typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); -typedef int __v4si __attribute__ ((__vector_size__ (16))); -typedef unsigned int __v4su __attribute__ ((__vector_size__ (16))); -typedef short __v8hi __attribute__ ((__vector_size__ (16))); -typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16))); -typedef char __v16qi __attribute__ ((__vector_size__ (16))); -typedef signed char __v16qs __attribute__ ((__vector_size__ (16))); -typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16))); - -/* Helper to convert intrinsic result pointer to __m128i value. - * Our SSE builtins return a pointer to 16-byte result data. - * This macro dereferences that pointer to get the __m128i struct value. */ -#define __CCC_M128I_FROM_BUILTIN(expr) (*(__m128i *)(expr)) - -/* === Load / Store === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_loadu_si128(__m128i_u const *__p) -{ - return *__p; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_load_si128(__m128i const *__p) -{ - return *__p; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_storeu_si128(__m128i_u *__p, __m128i __b) -{ - *__p = __b; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_store_si128(__m128i *__p, __m128i __b) -{ - *__p = __b; -} - -/* === Set === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_set1_epi8(char __b) -{ - unsigned char __ub = (unsigned char)__b; - long long __q = (long long)__ub; - __q |= __q << 8; - __q |= __q << 16; - __q |= __q << 32; - return (__m128i){ { __q, __q } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_set1_epi32(int __i) -{ - long long __q = (long long)(unsigned int)__i - | ((long long)(unsigned int)__i << 32); - return (__m128i){ { __q, __q } }; -} - -static __inline__ 
__m128i __attribute__((__always_inline__)) -_mm_setzero_si128(void) -{ - return (__m128i){ { 0LL, 0LL } }; -} - -/* === Compare === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmpeq_epi8(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pcmpeqb128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmpeq_epi32(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pcmpeqd128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmpeq_epi16(__m128i __a, __m128i __b) -{ - /* Compare 8 x 16-bit elements for equality. Returns 0xFFFF for equal, 0x0000 otherwise. */ - unsigned long long __r0 = 0, __r1 = 0; - unsigned long long __a0 = (unsigned long long)__a.__val[0]; - unsigned long long __b0 = (unsigned long long)__b.__val[0]; - unsigned long long __a1 = (unsigned long long)__a.__val[1]; - unsigned long long __b1 = (unsigned long long)__b.__val[1]; - for (int __i = 0; __i < 4; __i++) { - unsigned short __va = (unsigned short)(__a0 >> (__i * 16)); - unsigned short __vb = (unsigned short)(__b0 >> (__i * 16)); - if (__va == __vb) __r0 |= (0xFFFFULL << (__i * 16)); - } - for (int __i = 0; __i < 4; __i++) { - unsigned short __va = (unsigned short)(__a1 >> (__i * 16)); - unsigned short __vb = (unsigned short)(__b1 >> (__i * 16)); - if (__va == __vb) __r1 |= (0xFFFFULL << (__i * 16)); - } - return (__m128i){ { (long long)__r0, (long long)__r1 } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmplt_epi16(__m128i __a, __m128i __b) -{ - /* Returns 0xFFFF for lanes where a < b (signed), 0 otherwise. - * Equivalent to _mm_cmpgt_epi16(b, a). 
*/ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pcmpgtw128(__b, __a)); -} - -/* === Unsigned Saturating Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_adds_epu8(__m128i __a, __m128i __b) -{ - /* Unsigned saturating add of 16 x 8-bit elements. */ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) { - unsigned int __s = (unsigned int)__pa[__i] + (unsigned int)__pb[__i]; - __pr[__i] = (unsigned char)(__s > 255 ? 255 : __s); - } - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_adds_epu16(__m128i __a, __m128i __b) -{ - /* Unsigned saturating add of 8 x 16-bit elements. */ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m128i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 8; __i++) { - unsigned int __s = (unsigned int)__pa[__i] + (unsigned int)__pb[__i]; - __pr[__i] = (unsigned short)(__s > 65535 ? 65535 : __s); - } - return __r; -} - -/* === Signed Saturating Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_adds_epi8(__m128i __a, __m128i __b) -{ - /* Signed saturating add of 16 x 8-bit elements. */ - signed char *__pa = (signed char *)&__a; - signed char *__pb = (signed char *)&__b; - __m128i __r; - signed char *__pr = (signed char *)&__r; - for (int __i = 0; __i < 16; __i++) { - int __s = (int)__pa[__i] + (int)__pb[__i]; - if (__s > 127) __s = 127; - if (__s < -128) __s = -128; - __pr[__i] = (signed char)__s; - } - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_adds_epi16(__m128i __a, __m128i __b) -{ - /* Signed saturating add of 8 x 16-bit elements. 
*/ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) { - int __s = (int)__pa[__i] + (int)__pb[__i]; - if (__s > 32767) __s = 32767; - if (__s < -32768) __s = -32768; - __pr[__i] = (short)__s; - } - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_subs_epi16(__m128i __a, __m128i __b) -{ - /* Signed saturating subtract of 8 x 16-bit elements. */ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) { - int __d = (int)__pa[__i] - (int)__pb[__i]; - if (__d > 32767) __d = 32767; - if (__d < -32768) __d = -32768; - __pr[__i] = (short)__d; - } - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_subs_epu16(__m128i __a, __m128i __b) -{ - /* Unsigned saturating subtract of 8 x 16-bit elements. */ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m128i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 8; __i++) { - int __d = (int)__pa[__i] - (int)__pb[__i]; - __pr[__i] = (unsigned short)(__d < 0 ? 0 : __d); - } - return __r; -} - -/* === Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_subs_epu8(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psubusb128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_subs_epi8(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psubsb128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_avg_epu8(__m128i __a, __m128i __b) -{ - /* Unsigned byte average with rounding: (a + b + 1) >> 1 for each byte. 
*/ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (unsigned char)(((unsigned int)__pa[__i] + (unsigned int)__pb[__i] + 1) >> 1); - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_min_epi16(__m128i __a, __m128i __b) -{ - /* Signed 16-bit minimum for each of 8 lanes. */ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_max_epi16(__m128i __a, __m128i __b) -{ - /* Signed 16-bit maximum for each of 8 lanes. */ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_min_epu8(__m128i __a, __m128i __b) -{ - /* Unsigned byte minimum for each of 16 lanes. */ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_max_epu8(__m128i __a, __m128i __b) -{ - /* Unsigned byte maximum for each of 16 lanes. */ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? 
__pa[__i] : __pb[__i]; - return __r; -} - -/* === Bitwise === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_or_si128(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[0] | __b.__val[0], - __a.__val[1] | __b.__val[1] } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_and_si128(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[0] & __b.__val[0], - __a.__val[1] & __b.__val[1] } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_andnot_si128(__m128i __a, __m128i __b) -{ - return (__m128i){ { ~__a.__val[0] & __b.__val[0], - ~__a.__val[1] & __b.__val[1] } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_xor_si128(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[0] ^ __b.__val[0], - __a.__val[1] ^ __b.__val[1] } }; -} - -/* === 8-bit Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_add_epi8(__m128i __a, __m128i __b) -{ - /* Byte-level add using carry-free byte addition trick: - * For each byte, (a+b) mod 256. We use a mask to isolate the low - * bit of each byte pair to propagate carries correctly within bytes - * but not across byte boundaries. 
*/ - unsigned long long __mask = 0x7f7f7f7f7f7f7f7fULL; - unsigned long long __a0 = (unsigned long long)__a.__val[0]; - unsigned long long __b0 = (unsigned long long)__b.__val[0]; - unsigned long long __a1 = (unsigned long long)__a.__val[1]; - unsigned long long __b1 = (unsigned long long)__b.__val[1]; - unsigned long long __lo = ((__a0 & __mask) + (__b0 & __mask)) ^ ((__a0 ^ __b0) & ~__mask); - unsigned long long __hi = ((__a1 & __mask) + (__b1 & __mask)) ^ ((__a1 ^ __b1) & ~__mask); - return (__m128i){ { (long long)__lo, (long long)__hi } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sub_epi8(__m128i __a, __m128i __b) -{ - unsigned long long __mask = 0x8080808080808080ULL; - unsigned long long __a0 = (unsigned long long)__a.__val[0]; - unsigned long long __b0 = (unsigned long long)__b.__val[0]; - unsigned long long __a1 = (unsigned long long)__a.__val[1]; - unsigned long long __b1 = (unsigned long long)__b.__val[1]; - unsigned long long __lo = ((__a0 | __mask) - (__b0 & ~__mask)) ^ ((__a0 ^ ~__b0) & __mask); - unsigned long long __hi = ((__a1 | __mask) - (__b1 & ~__mask)) ^ ((__a1 ^ ~__b1) & __mask); - return (__m128i){ { (long long)__lo, (long long)__hi } }; -} - -/* === 16-bit Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_add_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_paddw128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sub_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psubw128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_mullo_epi16(__m128i __a, __m128i __b) -{ - /* Multiply 8 x 16-bit signed integers, return low 16 bits of each 32-bit result. 
*/ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m128i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (unsigned short)((unsigned int)__pa[__i] * (unsigned int)__pb[__i]); - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_mulhi_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pmulhw128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_mulhi_epu16(__m128i __a, __m128i __b) -{ - /* Multiply 8 x 16-bit unsigned integers, return high 16 bits of each 32-bit result. */ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m128i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = (unsigned short)(((unsigned int)__pa[__i] * (unsigned int)__pb[__i]) >> 16); - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_madd_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pmaddwd128(__a, __b)); -} - -/* === 32-bit Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_add_epi32(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_paddd128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sub_epi32(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psubd128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmpgt_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pcmpgtw128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmpgt_epi8(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pcmpgtb128(__a, __b)); -} - -static __inline__ __m128i 
__attribute__((__always_inline__)) -_mm_cmpgt_epi32(__m128i __a, __m128i __b) -{ - /* Compare 4 x 32-bit signed integers: returns 0xFFFFFFFF where a > b, 0 otherwise. */ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? -1 : 0; - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmplt_epi8(__m128i __a, __m128i __b) -{ - /* Returns 0xFF for lanes where a < b (signed), 0 otherwise. - * Equivalent to _mm_cmpgt_epi8(b, a). */ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pcmpgtb128(__b, __a)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmplt_epi32(__m128i __a, __m128i __b) -{ - /* Returns 0xFFFFFFFF for lanes where a < b (signed), 0 otherwise. - * Equivalent to _mm_cmpgt_epi32(b, a). */ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? 
-1 : 0; - return __r; -} - -/* _mm_mul_epu32: unsigned 32x32->64 multiply (PMULUDQ) - * Multiplies the low 32-bit unsigned integers from each 64-bit lane: - * result[0] = (u32)a[0] * (u32)b[0], result[1] = (u32)a[2] * (u32)b[2] */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_mul_epu32(__m128i __a, __m128i __b) -{ - unsigned long long __a0 = (unsigned long long)(unsigned int)__a.__val[0]; - unsigned long long __b0 = (unsigned long long)(unsigned int)__b.__val[0]; - unsigned long long __a1 = (unsigned long long)(unsigned int)__a.__val[1]; - unsigned long long __b1 = (unsigned long long)(unsigned int)__b.__val[1]; - return (__m128i){ { (long long)(__a0 * __b0), (long long)(__a1 * __b1) } }; -} - -/* NOTE: _mm_mullo_epi16 is already defined above */ - -/* === 64-bit Arithmetic === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_add_epi64(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[0] + __b.__val[0], - __a.__val[1] + __b.__val[1] } }; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sub_epi64(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[0] - __b.__val[0], - __a.__val[1] - __b.__val[1] } }; -} - -/* === Pack / Unpack === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_packs_epi32(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_packssdw128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_packs_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_packsswb128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_packus_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_packuswb128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_unpacklo_epi8(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_punpcklbw128(__a, __b)); -} - -static 
__inline__ __m128i __attribute__((__always_inline__)) -_mm_unpackhi_epi8(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_punpckhbw128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_unpacklo_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_punpcklwd128(__a, __b)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_unpackhi_epi16(__m128i __a, __m128i __b) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_punpckhwd128(__a, __b)); -} - -/* Interleave low 32-bit integers: a0, b0, a1, b1 */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_unpacklo_epi32(__m128i __a, __m128i __b) -{ - unsigned int __a0 = (unsigned int)__a.__val[0]; - unsigned int __a1 = (unsigned int)(__a.__val[0] >> 32); - unsigned int __b0 = (unsigned int)__b.__val[0]; - unsigned int __b1 = (unsigned int)(__b.__val[0] >> 32); - long long __lo = (long long)__a0 | ((long long)__b0 << 32); - long long __hi = (long long)__a1 | ((long long)__b1 << 32); - return (__m128i){ { __lo, __hi } }; -} - -/* Interleave high 32-bit integers: a2, b2, a3, b3 */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_unpackhi_epi32(__m128i __a, __m128i __b) -{ - unsigned int __a2 = (unsigned int)__a.__val[1]; - unsigned int __a3 = (unsigned int)(__a.__val[1] >> 32); - unsigned int __b2 = (unsigned int)__b.__val[1]; - unsigned int __b3 = (unsigned int)(__b.__val[1] >> 32); - long long __lo = (long long)__a2 | ((long long)__b2 << 32); - long long __hi = (long long)__a3 | ((long long)__b3 << 32); - return (__m128i){ { __lo, __hi } }; -} - -/* Interleave low 64-bit integers: a0, b0 */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_unpacklo_epi64(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[0], __b.__val[0] } }; -} - -/* Interleave high 64-bit integers: a1, b1 */ -static __inline__ __m128i __attribute__((__always_inline__)) 
-_mm_unpackhi_epi64(__m128i __a, __m128i __b) -{ - return (__m128i){ { __a.__val[1], __b.__val[1] } }; -} - -/* === Set / Broadcast === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_set1_epi16(short __w) -{ - unsigned short __uw = (unsigned short)__w; - long long __q = (long long)__uw | ((long long)__uw << 16) - | ((long long)__uw << 32) | ((long long)__uw << 48); - return (__m128i){ { __q, __q } }; -} - -/* _mm_setr_epi16: set 8 x 16-bit lanes in natural (low-to-high) order */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, - short __w4, short __w5, short __w6, short __w7) -{ - long long __lo = (long long)(unsigned short)__w0 - | ((long long)(unsigned short)__w1 << 16) - | ((long long)(unsigned short)__w2 << 32) - | ((long long)(unsigned short)__w3 << 48); - long long __hi = (long long)(unsigned short)__w4 - | ((long long)(unsigned short)__w5 << 16) - | ((long long)(unsigned short)__w6 << 32) - | ((long long)(unsigned short)__w7 << 48); - return (__m128i){ { __lo, __hi } }; -} - -/* _mm_set_epi16: set 8 x 16-bit lanes in reverse (high-to-low) order */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, - short __w3, short __w2, short __w1, short __w0) -{ - return _mm_setr_epi16(__w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7); -} - -/* _mm_setr_epi32: set 4 x 32-bit lanes in natural (low-to-high) order */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) -{ - long long __lo = (long long)(unsigned int)__i0 - | ((long long)(unsigned int)__i1 << 32); - long long __hi = (long long)(unsigned int)__i2 - | ((long long)(unsigned int)__i3 << 32); - return (__m128i){ { __lo, __hi } }; -} - -/* _mm_set_epi32: set 4 x 32-bit lanes in reverse (high-to-low) order */ -static __inline__ __m128i __attribute__((__always_inline__)) 
-_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) -{ - return _mm_setr_epi32(__i0, __i1, __i2, __i3); -} - -/* _mm_set_epi8: set 16 x 8-bit lanes in reverse (high-to-low) order */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, - char __b11, char __b10, char __b9, char __b8, - char __b7, char __b6, char __b5, char __b4, - char __b3, char __b2, char __b1, char __b0) -{ - long long __lo = (long long)(unsigned char)__b0 - | ((long long)(unsigned char)__b1 << 8) - | ((long long)(unsigned char)__b2 << 16) - | ((long long)(unsigned char)__b3 << 24) - | ((long long)(unsigned char)__b4 << 32) - | ((long long)(unsigned char)__b5 << 40) - | ((long long)(unsigned char)__b6 << 48) - | ((long long)(unsigned char)__b7 << 56); - long long __hi = (long long)(unsigned char)__b8 - | ((long long)(unsigned char)__b9 << 8) - | ((long long)(unsigned char)__b10 << 16) - | ((long long)(unsigned char)__b11 << 24) - | ((long long)(unsigned char)__b12 << 32) - | ((long long)(unsigned char)__b13 << 40) - | ((long long)(unsigned char)__b14 << 48) - | ((long long)(unsigned char)__b15 << 56); - return (__m128i){ { __lo, __hi } }; -} - -/* _mm_setr_epi8: set 16 x 8-bit lanes in natural (low-to-high) order */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, - char __b4, char __b5, char __b6, char __b7, - char __b8, char __b9, char __b10, char __b11, - char __b12, char __b13, char __b14, char __b15) -{ - return _mm_set_epi8(__b15, __b14, __b13, __b12, - __b11, __b10, __b9, __b8, - __b7, __b6, __b5, __b4, - __b3, __b2, __b1, __b0); -} - -/* _mm_set_epi64x: set two 64-bit integers (high, low) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_set_epi64x(long long __hi, long long __lo) -{ - return (__m128i){ { __lo, __hi } }; -} - -/* _mm_set1_epi64x: broadcast 64-bit integer to both lanes */ -static __inline__ __m128i 
__attribute__((__always_inline__)) -_mm_set1_epi64x(long long __q) -{ - return (__m128i){ { __q, __q } }; -} - -/* === Insert / Extract === */ - -#define _mm_insert_epi16(a, i, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pinsrw128((a), (i), (imm))) - -#define _mm_extract_epi16(a, imm) \ - __builtin_ia32_pextrw128((a), (imm)) - -/* === Convert / Move === */ - -static __inline__ int __attribute__((__always_inline__)) -_mm_cvtsi128_si32(__m128i __a) -{ - return __builtin_ia32_cvtsi128si32(__a); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtsi32_si128(int __a) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_cvtsi32si128(__a)); -} - -static __inline__ long long __attribute__((__always_inline__)) -_mm_cvtsi128_si64(__m128i __a) -{ - return __builtin_ia32_cvtsi128si64(__a); -} - -#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) - -/* _mm_cvtsi64_si128: convert 64-bit integer to __m128i (MOVQ, zero upper) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtsi64_si128(long long __a) -{ - return _mm_set_epi64x(0, __a); -} - -#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) - -/* === Store low 64 bits === */ - -static __inline__ void __attribute__((__always_inline__)) -_mm_storel_epi64(__m128i *__p, __m128i __a) -{ - __builtin_ia32_storeldi128(__p, __a); -} - -/* === Shuffle 16-bit === */ - -#define _mm_shufflelo_epi16(a, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pshuflw128((a), (imm))) - -#define _mm_shufflehi_epi16(a, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pshufhw128((a), (imm))) - -/* === Shift operations === */ - -/* Bit-level shift left on each 16-bit element (PSLLW) */ -#define _mm_slli_epi16(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psllwi128((a), (count))) - -/* Bit-level shift right logical on each 16-bit element (PSRLW) */ -#define _mm_srli_epi16(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psrlwi128((a), (count))) - -/* Bit-level shift right arithmetic on each 
16-bit element (PSRAW) */ -#define _mm_srai_epi16(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psrawi128((a), (count))) - -/* Bit-level shift right arithmetic on each 32-bit element (PSRAD) */ -#define _mm_srai_epi32(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psradi128((a), (count))) - -/* Bit-level shift left on each 32-bit element (PSLLD) */ -#define _mm_slli_epi32(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pslldi128((a), (count))) - -/* Bit-level shift right logical on each 32-bit element (PSRLD) */ -#define _mm_srli_epi32(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psrldi128((a), (count))) - -/* Byte-level shift left (PSLLDQ): shift __a left by __N bytes, zero-fill */ -#define _mm_slli_si128(a, N) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pslldqi128((a), (N))) - -/* Byte-level shift right (PSRLDQ): shift __a right by __N bytes, zero-fill */ -#define _mm_srli_si128(a, N) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psrldqi128((a), (N))) - -/* Bit-level shift left on each 64-bit element (PSLLQ) */ -#define _mm_slli_epi64(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psllqi128((a), (count))) - -/* Bit-level shift right on each 64-bit element (PSRLQ) */ -#define _mm_srli_epi64(a, count) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_psrlqi128((a), (count))) - -/* Shuffle 32-bit integers (PSHUFD) */ -#define _mm_shuffle_epi32(a, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pshufd128((a), (imm))) - -/* Load low 64 bits into lower half, zero upper half (MOVQ) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_loadl_epi64(__m128i const *__p) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_loadldi128(__p)); -} - -/* === Float/Int Conversion (SSE2) === */ - -/* Convert packed 32-bit integers to packed single-precision floats */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtepi32_ps(__m128i __a) -{ - int __i0 = (int)__a.__val[0]; - int __i1 = (int)(__a.__val[0] >> 32); - int 
__i2 = (int)__a.__val[1]; - int __i3 = (int)(__a.__val[1] >> 32); - return (__m128){ { (float)__i0, (float)__i1, (float)__i2, (float)__i3 } }; -} - -/* Convert packed single-precision floats to packed 32-bit integers (round to nearest) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtps_epi32(__m128 __a) -{ - /* Round to nearest integer. Use (int)(x + copysignf(0.5f, x)) as a - * portable approximation of round-to-nearest-even. This doesn't perfectly - * match banker's rounding for .5 cases, but works for typical audio/video usage. */ - int __i0 = (int)(__a.__val[0] >= 0.0f ? __a.__val[0] + 0.5f : __a.__val[0] - 0.5f); - int __i1 = (int)(__a.__val[1] >= 0.0f ? __a.__val[1] + 0.5f : __a.__val[1] - 0.5f); - int __i2 = (int)(__a.__val[2] >= 0.0f ? __a.__val[2] + 0.5f : __a.__val[2] - 0.5f); - int __i3 = (int)(__a.__val[3] >= 0.0f ? __a.__val[3] + 0.5f : __a.__val[3] - 0.5f); - long long __lo = (long long)(unsigned int)__i0 | ((long long)(unsigned int)__i1 << 32); - long long __hi = (long long)(unsigned int)__i2 | ((long long)(unsigned int)__i3 << 32); - return (__m128i){ { __lo, __hi } }; -} - -/* Convert packed single-precision floats to packed 32-bit integers (truncate) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvttps_epi32(__m128 __a) -{ - int __i0 = (int)__a.__val[0]; - int __i1 = (int)__a.__val[1]; - int __i2 = (int)__a.__val[2]; - int __i3 = (int)__a.__val[3]; - long long __lo = (long long)(unsigned int)__i0 | ((long long)(unsigned int)__i1 << 32); - long long __hi = (long long)(unsigned int)__i2 | ((long long)(unsigned int)__i3 << 32); - return (__m128i){ { __lo, __hi } }; -} - -/* === Miscellaneous === */ - -static __inline__ int __attribute__((__always_inline__)) -_mm_movemask_epi8(__m128i __a) -{ - return __builtin_ia32_pmovmskb128(__a); -} - -/* === Streaming / Non-temporal stores === */ - -static __inline__ void __attribute__((__always_inline__)) -_mm_stream_si128(__m128i *__p, __m128i __a) -{ - 
__builtin_ia32_movntdq(__p, __a); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_stream_si64(long long *__p, long long __a) -{ - __builtin_ia32_movnti64(__p, __a); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_stream_si32(int *__p, int __a) -{ - __builtin_ia32_movnti(__p, __a); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_stream_pd(double *__p, __m128d __a) -{ - __builtin_ia32_movntpd(__p, __a); -} - -/* === Type Cast (reinterpret) === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_castps_si128(__m128 __a) -{ - __m128i __r; - __builtin_memcpy(&__r, &__a, sizeof(__r)); - return __r; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_castsi128_ps(__m128i __a) -{ - __m128 __r; - __builtin_memcpy(&__r, &__a, sizeof(__r)); - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_castps_pd(__m128 __a) -{ - __m128d __r; - __builtin_memcpy(&__r, &__a, sizeof(__r)); - return __r; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_castpd_ps(__m128d __a) -{ - __m128 __r; - __builtin_memcpy(&__r, &__a, sizeof(__r)); - return __r; -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_castpd_si128(__m128d __a) -{ - __m128i __r; - __builtin_memcpy(&__r, &__a, sizeof(__r)); - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_castsi128_pd(__m128i __a) -{ - __m128d __r; - __builtin_memcpy(&__r, &__a, sizeof(__r)); - return __r; -} - -/* ==================================================================== - * SSE2 Double-Precision Floating-Point Intrinsics (__m128d) - * ==================================================================== */ - -/* === Double Set / Broadcast === */ - -/* _mm_set_pd: set two doubles (high, low) - note parameter order */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_set_pd(double __hi, double __lo) -{ - return (__m128d){ { 
__lo, __hi } }; -} - -/* _mm_set1_pd: broadcast one double to both lanes */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_set1_pd(double __d) -{ - return (__m128d){ { __d, __d } }; -} - -#define _mm_set_pd1(d) _mm_set1_pd(d) - -/* _mm_set_sd: set low double, zero high */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_set_sd(double __d) -{ - return (__m128d){ { __d, 0.0 } }; -} - -/* _mm_setr_pd: set in natural order (low, high) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_setr_pd(double __lo, double __hi) -{ - return (__m128d){ { __lo, __hi } }; -} - -/* _mm_setzero_pd: zero both lanes */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_setzero_pd(void) -{ - return (__m128d){ { 0.0, 0.0 } }; -} - -/* _mm_undefined_pd: uninitialized (returns zero for safety) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_undefined_pd(void) -{ - return (__m128d){ { 0.0, 0.0 } }; -} - -/* === Double Load === */ - -/* _mm_load_pd: aligned load of 2 doubles */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_load_pd(double const *__p) -{ - return *(__m128d const *)__p; -} - -/* _mm_loadu_pd: unaligned load of 2 doubles */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_loadu_pd(double const *__p) -{ - return *(__m128d_u const *)__p; -} - -/* _mm_load_sd: load one double into low lane, zero high */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_load_sd(double const *__p) -{ - return (__m128d){ { *__p, 0.0 } }; -} - -/* _mm_load1_pd: load one double, broadcast to both lanes */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_load1_pd(double const *__p) -{ - double __d = *__p; - return (__m128d){ { __d, __d } }; -} - -#define _mm_load_pd1(p) _mm_load1_pd(p) - -/* _mm_loadr_pd: load 2 doubles in reverse order */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_loadr_pd(double const *__p) -{ - 
return (__m128d){ { __p[1], __p[0] } }; -} - -/* _mm_loadl_pd: load low double, keep high from __a */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_loadl_pd(__m128d __a, double const *__p) -{ - __a.__val[0] = *__p; - return __a; -} - -/* _mm_loadh_pd: load high double, keep low from __a */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_loadh_pd(__m128d __a, double const *__p) -{ - __a.__val[1] = *__p; - return __a; -} - -/* === Double Store === */ - -/* _mm_store_pd: aligned store of 2 doubles */ -static __inline__ void __attribute__((__always_inline__)) -_mm_store_pd(double *__p, __m128d __a) -{ - *(__m128d *)__p = __a; -} - -/* _mm_storeu_pd: unaligned store of 2 doubles */ -static __inline__ void __attribute__((__always_inline__)) -_mm_storeu_pd(double *__p, __m128d __a) -{ - *(__m128d_u *)__p = __a; -} - -/* _mm_store_sd: store low double */ -static __inline__ void __attribute__((__always_inline__)) -_mm_store_sd(double *__p, __m128d __a) -{ - *__p = __a.__val[0]; -} - -/* _mm_store1_pd: store low double to both positions */ -static __inline__ void __attribute__((__always_inline__)) -_mm_store1_pd(double *__p, __m128d __a) -{ - __p[0] = __a.__val[0]; - __p[1] = __a.__val[0]; -} - -#define _mm_store_pd1(p, a) _mm_store1_pd(p, a) - -/* _mm_storer_pd: store 2 doubles in reverse order */ -static __inline__ void __attribute__((__always_inline__)) -_mm_storer_pd(double *__p, __m128d __a) -{ - __p[0] = __a.__val[1]; - __p[1] = __a.__val[0]; -} - -/* _mm_storel_pd: store low double to memory */ -static __inline__ void __attribute__((__always_inline__)) -_mm_storel_pd(double *__p, __m128d __a) -{ - *__p = __a.__val[0]; -} - -/* _mm_storeh_pd: store high double to memory */ -static __inline__ void __attribute__((__always_inline__)) -_mm_storeh_pd(double *__p, __m128d __a) -{ - *__p = __a.__val[1]; -} - -/* === Double Arithmetic === */ - -/* _mm_add_pd: packed double add */ -static __inline__ __m128d 
__attribute__((__always_inline__)) -_mm_add_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0] + __b.__val[0], - __a.__val[1] + __b.__val[1] } }; -} - -/* _mm_sub_pd: packed double subtract */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_sub_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0] - __b.__val[0], - __a.__val[1] - __b.__val[1] } }; -} - -/* _mm_mul_pd: packed double multiply */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_mul_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0] * __b.__val[0], - __a.__val[1] * __b.__val[1] } }; -} - -/* _mm_div_pd: packed double divide */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_div_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0] / __b.__val[0], - __a.__val[1] / __b.__val[1] } }; -} - -/* _mm_add_sd: scalar double add (low lane only, high unchanged) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_add_sd(__m128d __a, __m128d __b) -{ - __a.__val[0] += __b.__val[0]; - return __a; -} - -/* _mm_sub_sd: scalar double subtract (low lane only) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_sub_sd(__m128d __a, __m128d __b) -{ - __a.__val[0] -= __b.__val[0]; - return __a; -} - -/* _mm_mul_sd: scalar double multiply (low lane only) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_mul_sd(__m128d __a, __m128d __b) -{ - __a.__val[0] *= __b.__val[0]; - return __a; -} - -/* _mm_div_sd: scalar double divide (low lane only) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_div_sd(__m128d __a, __m128d __b) -{ - __a.__val[0] /= __b.__val[0]; - return __a; -} - -/* _mm_sqrt_pd: packed double square root */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_sqrt_pd(__m128d __a) -{ - return (__m128d){ { __builtin_sqrt(__a.__val[0]), __builtin_sqrt(__a.__val[1]) } }; -} - -/* _mm_sqrt_sd: scalar double square 
root (low lane only, high from __a) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_sqrt_sd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __builtin_sqrt(__b.__val[0]), __a.__val[1] } }; -} - -/* _mm_min_pd: packed double minimum - * Note: NaN handling may differ from hardware SSE2 (which returns second - * operand when one input is NaN). This uses C comparison semantics. */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_min_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0] < __b.__val[0] ? __a.__val[0] : __b.__val[0], - __a.__val[1] < __b.__val[1] ? __a.__val[1] : __b.__val[1] } }; -} - -/* _mm_max_pd: packed double maximum - * Note: NaN handling may differ from hardware SSE2. */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_max_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0] > __b.__val[0] ? __a.__val[0] : __b.__val[0], - __a.__val[1] > __b.__val[1] ? __a.__val[1] : __b.__val[1] } }; -} - -/* _mm_min_sd: scalar double minimum */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_min_sd(__m128d __a, __m128d __b) -{ - __a.__val[0] = __a.__val[0] < __b.__val[0] ? __a.__val[0] : __b.__val[0]; - return __a; -} - -/* _mm_max_sd: scalar double maximum */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_max_sd(__m128d __a, __m128d __b) -{ - __a.__val[0] = __a.__val[0] > __b.__val[0] ? 
__a.__val[0] : __b.__val[0]; - return __a; -} - -/* === Double Bitwise Operations === */ - -/* _mm_and_pd: bitwise AND */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_and_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__ra = (long long *)&__a; - long long *__rb = (long long *)&__b; - long long *__rr = (long long *)&__r; - __rr[0] = __ra[0] & __rb[0]; - __rr[1] = __ra[1] & __rb[1]; - return __r; -} - -/* _mm_or_pd: bitwise OR */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_or_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__ra = (long long *)&__a; - long long *__rb = (long long *)&__b; - long long *__rr = (long long *)&__r; - __rr[0] = __ra[0] | __rb[0]; - __rr[1] = __ra[1] | __rb[1]; - return __r; -} - -/* _mm_xor_pd: bitwise XOR */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_xor_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__ra = (long long *)&__a; - long long *__rb = (long long *)&__b; - long long *__rr = (long long *)&__r; - __rr[0] = __ra[0] ^ __rb[0]; - __rr[1] = __ra[1] ^ __rb[1]; - return __r; -} - -/* _mm_andnot_pd: bitwise AND NOT (~a & b) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_andnot_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__ra = (long long *)&__a; - long long *__rb = (long long *)&__b; - long long *__rr = (long long *)&__r; - __rr[0] = ~__ra[0] & __rb[0]; - __rr[1] = ~__ra[1] & __rb[1]; - return __r; -} - -/* === Double Compare (returns mask: all 1s for true, all 0s for false) === */ - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpeq_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - __rr[0] = __a.__val[0] == __b.__val[0] ? ~0LL : 0LL; - __rr[1] = __a.__val[1] == __b.__val[1] ? 
~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmplt_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - __rr[0] = __a.__val[0] < __b.__val[0] ? ~0LL : 0LL; - __rr[1] = __a.__val[1] < __b.__val[1] ? ~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmple_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - __rr[0] = __a.__val[0] <= __b.__val[0] ? ~0LL : 0LL; - __rr[1] = __a.__val[1] <= __b.__val[1] ? ~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpgt_pd(__m128d __a, __m128d __b) -{ - return _mm_cmplt_pd(__b, __a); -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpge_pd(__m128d __a, __m128d __b) -{ - return _mm_cmple_pd(__b, __a); -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpneq_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - __rr[0] = __a.__val[0] != __b.__val[0] ? ~0LL : 0LL; - __rr[1] = __a.__val[1] != __b.__val[1] ? ~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpnlt_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - __rr[0] = !(__a.__val[0] < __b.__val[0]) ? ~0LL : 0LL; - __rr[1] = !(__a.__val[1] < __b.__val[1]) ? ~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpnle_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - __rr[0] = !(__a.__val[0] <= __b.__val[0]) ? ~0LL : 0LL; - __rr[1] = !(__a.__val[1] <= __b.__val[1]) ? 
~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpngt_pd(__m128d __a, __m128d __b) -{ - return _mm_cmpnlt_pd(__b, __a); -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpnge_pd(__m128d __a, __m128d __b) -{ - return _mm_cmpnle_pd(__b, __a); -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpord_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - /* ordered: both operands are not NaN */ - __rr[0] = (__a.__val[0] == __a.__val[0] && __b.__val[0] == __b.__val[0]) ? ~0LL : 0LL; - __rr[1] = (__a.__val[1] == __a.__val[1] && __b.__val[1] == __b.__val[1]) ? ~0LL : 0LL; - return __r; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpunord_pd(__m128d __a, __m128d __b) -{ - __m128d __r; - long long *__rr = (long long *)&__r; - /* unordered: at least one operand is NaN */ - __rr[0] = (__a.__val[0] != __a.__val[0] || __b.__val[0] != __b.__val[0]) ? ~0LL : 0LL; - __rr[1] = (__a.__val[1] != __a.__val[1] || __b.__val[1] != __b.__val[1]) ? ~0LL : 0LL; - return __r; -} - -/* Scalar double compares (low lane only, high from __a) */ - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpeq_sd(__m128d __a, __m128d __b) -{ - long long *__ra = (long long *)&__a; - __ra[0] = __a.__val[0] == __b.__val[0] ? ~0LL : 0LL; - return __a; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmplt_sd(__m128d __a, __m128d __b) -{ - long long *__ra = (long long *)&__a; - __ra[0] = __a.__val[0] < __b.__val[0] ? ~0LL : 0LL; - return __a; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmple_sd(__m128d __a, __m128d __b) -{ - long long *__ra = (long long *)&__a; - __ra[0] = __a.__val[0] <= __b.__val[0] ? 
~0LL : 0LL; - return __a; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpgt_sd(__m128d __a, __m128d __b) -{ - long long *__ra = (long long *)&__a; - __ra[0] = __a.__val[0] > __b.__val[0] ? ~0LL : 0LL; - return __a; -} - -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cmpge_sd(__m128d __a, __m128d __b) -{ - long long *__ra = (long long *)&__a; - __ra[0] = __a.__val[0] >= __b.__val[0] ? ~0LL : 0LL; - return __a; -} - -/* Scalar compare predicates (return int) */ - -static __inline__ int __attribute__((__always_inline__)) -_mm_comieq_sd(__m128d __a, __m128d __b) -{ - return __a.__val[0] == __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comilt_sd(__m128d __a, __m128d __b) -{ - return __a.__val[0] < __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comile_sd(__m128d __a, __m128d __b) -{ - return __a.__val[0] <= __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comigt_sd(__m128d __a, __m128d __b) -{ - return __a.__val[0] > __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comige_sd(__m128d __a, __m128d __b) -{ - return __a.__val[0] >= __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comineq_sd(__m128d __a, __m128d __b) -{ - return __a.__val[0] != __b.__val[0]; -} - -#define _mm_ucomieq_sd(a, b) _mm_comieq_sd(a, b) -#define _mm_ucomilt_sd(a, b) _mm_comilt_sd(a, b) -#define _mm_ucomile_sd(a, b) _mm_comile_sd(a, b) -#define _mm_ucomigt_sd(a, b) _mm_comigt_sd(a, b) -#define _mm_ucomige_sd(a, b) _mm_comige_sd(a, b) -#define _mm_ucomineq_sd(a, b) _mm_comineq_sd(a, b) - -/* === Double Shuffle / Unpack === */ - -/* _mm_unpacklo_pd: interleave low doubles: {a[0], b[0]} */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_unpacklo_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[0], __b.__val[0] } }; -} - -/* _mm_unpackhi_pd: interleave high 
doubles: {a[1], b[1]} */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_unpackhi_pd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __a.__val[1], __b.__val[1] } }; -} - -/* _mm_shuffle_pd: shuffle two doubles based on immediate mask */ -#define _mm_shuffle_pd(__a, __b, __imm) \ - ((__m128d){ { (__a).__val[(__imm) & 1], (__b).__val[((__imm) >> 1) & 1] } }) - -/* _mm_move_sd: move low double from __b, keep high from __a */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_move_sd(__m128d __a, __m128d __b) -{ - return (__m128d){ { __b.__val[0], __a.__val[1] } }; -} - -/* _mm_movemask_pd: extract sign bits of two doubles */ -static __inline__ int __attribute__((__always_inline__)) -_mm_movemask_pd(__m128d __a) -{ - long long *__p = (long long *)&__a; - return ((__p[0] >> 63) & 1) | (((__p[1] >> 63) & 1) << 1); -} - -/* === Double Conversion === */ - -/* _mm_cvtsd_f64: extract low double as scalar */ -static __inline__ double __attribute__((__always_inline__)) -_mm_cvtsd_f64(__m128d __a) -{ - return __a.__val[0]; -} - -/* _mm_cvtsd_si32: convert low double to int (round to nearest) */ -static __inline__ int __attribute__((__always_inline__)) -_mm_cvtsd_si32(__m128d __a) -{ - double __d = __a.__val[0]; - return (int)(__d >= 0.0 ? __d + 0.5 : __d - 0.5); -} - -/* _mm_cvttsd_si32: convert low double to int (truncate) */ -static __inline__ int __attribute__((__always_inline__)) -_mm_cvttsd_si32(__m128d __a) -{ - return (int)__a.__val[0]; -} - -/* _mm_cvtsd_si64: convert low double to long long (round to nearest) */ -static __inline__ long long __attribute__((__always_inline__)) -_mm_cvtsd_si64(__m128d __a) -{ - double __d = __a.__val[0]; - return (long long)(__d >= 0.0 ? 
__d + 0.5 : __d - 0.5); -} - -#define _mm_cvtsd_si64x(a) _mm_cvtsd_si64(a) - -/* _mm_cvttsd_si64: convert low double to long long (truncate) */ -static __inline__ long long __attribute__((__always_inline__)) -_mm_cvttsd_si64(__m128d __a) -{ - return (long long)__a.__val[0]; -} - -#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) - -/* _mm_cvtsi32_sd: convert int to double, set low lane, keep high from __a */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cvtsi32_sd(__m128d __a, int __b) -{ - __a.__val[0] = (double)__b; - return __a; -} - -/* _mm_cvtsi64_sd: convert long long to double, set low lane */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cvtsi64_sd(__m128d __a, long long __b) -{ - __a.__val[0] = (double)__b; - return __a; -} - -#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) - -/* _mm_cvtpd_ps: convert 2 doubles to 2 floats (low half of __m128) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpd_ps(__m128d __a) -{ - return (__m128){ { (float)__a.__val[0], (float)__a.__val[1], 0.0f, 0.0f } }; -} - -/* _mm_cvtps_pd: convert 2 floats (low half of __m128) to 2 doubles */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cvtps_pd(__m128 __a) -{ - return (__m128d){ { (double)__a.__val[0], (double)__a.__val[1] } }; -} - -/* _mm_cvtsd_ss: convert low double to float, put in low lane of __a */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtsd_ss(__m128 __a, __m128d __b) -{ - __a.__val[0] = (float)__b.__val[0]; - return __a; -} - -/* _mm_cvtss_sd: convert low float to double, put in low lane of __a */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cvtss_sd(__m128d __a, __m128 __b) -{ - __a.__val[0] = (double)__b.__val[0]; - return __a; -} - -/* _mm_cvtpd_epi32: convert 2 doubles to 2 packed 32-bit integers (round to nearest) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtpd_epi32(__m128d __a) -{ - int __i0 = 
(int)(__a.__val[0] >= 0.0 ? __a.__val[0] + 0.5 : __a.__val[0] - 0.5); - int __i1 = (int)(__a.__val[1] >= 0.0 ? __a.__val[1] + 0.5 : __a.__val[1] - 0.5); - long long __lo = (long long)(unsigned int)__i0 | ((long long)(unsigned int)__i1 << 32); - return (__m128i){ { __lo, 0LL } }; -} - -/* _mm_cvttpd_epi32: convert 2 doubles to 2 packed 32-bit integers (truncate) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvttpd_epi32(__m128d __a) -{ - int __i0 = (int)__a.__val[0]; - int __i1 = (int)__a.__val[1]; - long long __lo = (long long)(unsigned int)__i0 | ((long long)(unsigned int)__i1 << 32); - return (__m128i){ { __lo, 0LL } }; -} - -/* _mm_cvtepi32_pd: convert 2 packed 32-bit integers (low half) to 2 doubles */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_cvtepi32_pd(__m128i __a) -{ - int __i0 = (int)__a.__val[0]; - int __i1 = (int)(__a.__val[0] >> 32); - return (__m128d){ { (double)__i0, (double)__i1 } }; -} - -/* === Fence / Cache === */ - -static __inline__ void __attribute__((__always_inline__)) -_mm_lfence(void) -{ - __builtin_ia32_lfence(); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_mfence(void) -{ - __builtin_ia32_mfence(); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_clflush(void const *__p) -{ - __builtin_ia32_clflush(__p); -} - -#endif /* _EMMINTRIN_H_INCLUDED */ diff --git a/include/fmaintrin.h b/include/fmaintrin.h deleted file mode 100644 index 9a42ba170c..0000000000 --- a/include/fmaintrin.h +++ /dev/null @@ -1,75 +0,0 @@ -/* CCC compiler bundled fmaintrin.h - FMA3 intrinsics */ -#ifndef _FMAINTRIN_H_INCLUDED -#define _FMAINTRIN_H_INCLUDED - -#include - -/* === 128-bit FMA === */ - -/* _mm_fmadd_ps: a*b + c (single-precision, 128-bit) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_fmadd_ps(__m128 __a, __m128 __b, __m128 __c) -{ - __m128 __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i] + 
__c.__val[__i]; - return __r; -} - -/* _mm_fmadd_pd: a*b + c (double-precision, 128-bit) */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_fmadd_pd(__m128d __a, __m128d __b, __m128d __c) -{ - double *__pa = (double *)&__a; - double *__pb = (double *)&__b; - double *__pc = (double *)&__c; - __m128d __r; - double *__pr = (double *)&__r; - __pr[0] = __pa[0] * __pb[0] + __pc[0]; - __pr[1] = __pa[1] * __pb[1] + __pc[1]; - return __r; -} - -/* === 256-bit FMA === */ - -/* _mm256_fmadd_ps: a*b + c (single-precision, 256-bit) */ -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_fmadd_ps(__m256 __a, __m256 __b, __m256 __c) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i] + __c.__val[__i]; - return __r; -} - -/* _mm256_fmadd_pd: a*b + c (double-precision, 256-bit) */ -static __inline__ __m256d __attribute__((__always_inline__)) -_mm256_fmadd_pd(__m256d __a, __m256d __b, __m256d __c) -{ - __m256d __r; - for (int __i = 0; __i < 4; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i] + __c.__val[__i]; - return __r; -} - -/* _mm256_fmsub_ps: a*b - c (single-precision, 256-bit) */ -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_fmsub_ps(__m256 __a, __m256 __b, __m256 __c) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = __a.__val[__i] * __b.__val[__i] - __c.__val[__i]; - return __r; -} - -/* _mm256_fnmadd_ps: -(a*b) + c (single-precision, 256-bit) */ -static __inline__ __m256 __attribute__((__always_inline__)) -_mm256_fnmadd_ps(__m256 __a, __m256 __b, __m256 __c) -{ - __m256 __r; - for (int __i = 0; __i < 8; __i++) - __r.__val[__i] = -(__a.__val[__i] * __b.__val[__i]) + __c.__val[__i]; - return __r; -} - -#endif /* _FMAINTRIN_H_INCLUDED */ diff --git a/include/immintrin.h b/include/immintrin.h deleted file mode 100644 index 1dacd61c5e..0000000000 --- a/include/immintrin.h +++ /dev/null @@ -1,73 +0,0 @@ -/* CCC compiler bundled immintrin.h 
- all x86 SIMD intrinsics */ -#ifndef _IMMINTRIN_H_INCLUDED -#define _IMMINTRIN_H_INCLUDED - -/* x86 SIMD intrinsics are only available on x86/x86-64 targets */ -#if !defined(__x86_64__) && !defined(__i386__) && !defined(__i686__) -#error "x86 SIMD intrinsics (immintrin.h) require an x86 target" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -/* === RDRAND / RDSEED intrinsics === */ - -static __inline__ int __attribute__((__always_inline__)) -_rdrand16_step(unsigned short *__p) -{ - unsigned char __ok; - __asm__ __volatile__("rdrand %0; setc %1" : "=r"(*__p), "=qm"(__ok)); - return (int)__ok; -} - -static __inline__ int __attribute__((__always_inline__)) -_rdrand32_step(unsigned int *__p) -{ - unsigned char __ok; - __asm__ __volatile__("rdrand %0; setc %1" : "=r"(*__p), "=qm"(__ok)); - return (int)__ok; -} - -#ifdef __x86_64__ -static __inline__ int __attribute__((__always_inline__)) -_rdrand64_step(unsigned long long *__p) -{ - unsigned char __ok; - __asm__ __volatile__("rdrand %0; setc %1" : "=r"(*__p), "=qm"(__ok)); - return (int)__ok; -} -#endif - -static __inline__ int __attribute__((__always_inline__)) -_rdseed16_step(unsigned short *__p) -{ - unsigned char __ok; - __asm__ __volatile__("rdseed %0; setc %1" : "=r"(*__p), "=qm"(__ok)); - return (int)__ok; -} - -static __inline__ int __attribute__((__always_inline__)) -_rdseed32_step(unsigned int *__p) -{ - unsigned char __ok; - __asm__ __volatile__("rdseed %0; setc %1" : "=r"(*__p), "=qm"(__ok)); - return (int)__ok; -} - -#ifdef __x86_64__ -static __inline__ int __attribute__((__always_inline__)) -_rdseed64_step(unsigned long long *__p) -{ - unsigned char __ok; - __asm__ __volatile__("rdseed %0; setc %1" : "=r"(*__p), "=qm"(__ok)); - return (int)__ok; -} -#endif - -#endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/include/mmintrin.h b/include/mmintrin.h deleted file mode 100644 index 42eaba32f4..0000000000 --- a/include/mmintrin.h +++ /dev/null @@ -1,852 +0,0 @@ -/* 
CCC compiler bundled mmintrin.h - MMX intrinsics */ -#ifndef _MMINTRIN_H_INCLUDED -#define _MMINTRIN_H_INCLUDED - -/* MMX intrinsics are only available on x86/x86-64 targets */ -#if !defined(__x86_64__) && !defined(__i386__) && !defined(__i686__) -#error "MMX intrinsics (mmintrin.h) require an x86 target" -#endif - -/* __m64: 64-bit MMX vector type. - * Represents 8 bytes / 4 shorts / 2 ints / 1 long long packed value. */ -typedef struct __attribute__((__aligned__(8))) { - long long __val; -} __m64; - -/* === Empty (EMMS) === */ - -/* Signal end of MMX state usage (EMMS). No-op in our scalar implementation - * since we don't use actual MMX registers. Included for API compatibility. */ -static __inline__ void __attribute__((__always_inline__)) -_mm_empty(void) -{ - /* no-op in our implementation since we use scalar code */ -} - -/* === Zero / Set === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_setzero_si64(void) -{ - __m64 __r; - __r.__val = 0; - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cvtsi32_si64(int __a) -{ - __m64 __r; - __r.__val = (unsigned int)__a; - return __r; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_cvtsi64_si32(__m64 __a) -{ - return (int)__a.__val; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_set_pi32(int __hi, int __lo) -{ - __m64 __r; - __r.__val = ((long long)(unsigned int)__hi << 32) | (unsigned int)__lo; - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_set_pi16(short __e3, short __e2, short __e1, short __e0) -{ - __m64 __r; - __r.__val = ((long long)(unsigned short)__e3 << 48) | - ((long long)(unsigned short)__e2 << 32) | - ((long long)(unsigned short)__e1 << 16) | - ((long long)(unsigned short)__e0); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_set_pi8(char __e7, char __e6, char __e5, char __e4, - char __e3, char __e2, char __e1, char __e0) -{ - __m64 __r; - 
__r.__val = ((long long)(unsigned char)__e7 << 56) | - ((long long)(unsigned char)__e6 << 48) | - ((long long)(unsigned char)__e5 << 40) | - ((long long)(unsigned char)__e4 << 32) | - ((long long)(unsigned char)__e3 << 24) | - ((long long)(unsigned char)__e2 << 16) | - ((long long)(unsigned char)__e1 << 8) | - ((long long)(unsigned char)__e0); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_set1_pi32(int __a) -{ - return _mm_set_pi32(__a, __a); -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_set1_pi16(short __a) -{ - return _mm_set_pi16(__a, __a, __a, __a); -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_set1_pi8(char __a) -{ - return _mm_set_pi8(__a, __a, __a, __a, __a, __a, __a, __a); -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_setr_pi32(int __lo, int __hi) -{ - return _mm_set_pi32(__hi, __lo); -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_setr_pi16(short __e0, short __e1, short __e2, short __e3) -{ - return _mm_set_pi16(__e3, __e2, __e1, __e0); -} - -/* === Bitwise === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_and_si64(__m64 __a, __m64 __b) -{ - __m64 __r; - __r.__val = __a.__val & __b.__val; - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_andnot_si64(__m64 __a, __m64 __b) -{ - __m64 __r; - __r.__val = (~__a.__val) & __b.__val; - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_or_si64(__m64 __a, __m64 __b) -{ - __m64 __r; - __r.__val = __a.__val | __b.__val; - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_xor_si64(__m64 __a, __m64 __b) -{ - __m64 __r; - __r.__val = __a.__val ^ __b.__val; - return __r; -} - -/* === Packed Add (8/16/32-bit) === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_add_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char 
*)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = __ra[__i] + __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_add_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] + __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_add_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int *__rb = (unsigned int *)&__b.__val; - unsigned int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = __ra[__i] + __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Packed Subtract (8/16/32-bit) === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_sub_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = __ra[__i] - __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_sub_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] - __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_sub_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int *__rb = (unsigned int *)&__b.__val; - 
unsigned int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = __ra[__i] - __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Packed Saturating Add (8/16-bit signed and unsigned) === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_adds_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - signed char *__ra = (signed char *)&__a.__val; - signed char *__rb = (signed char *)&__b.__val; - signed char __rr[8]; - for (int __i = 0; __i < 8; __i++) { - int __sum = (int)__ra[__i] + (int)__rb[__i]; - if (__sum > 127) __sum = 127; - if (__sum < -128) __sum = -128; - __rr[__i] = (signed char)__sum; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_adds_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) { - int __sum = (int)__ra[__i] + (int)__rb[__i]; - if (__sum > 32767) __sum = 32767; - if (__sum < -32768) __sum = -32768; - __rr[__i] = (short)__sum; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_adds_pu8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) { - unsigned int __sum = (unsigned int)__ra[__i] + (unsigned int)__rb[__i]; - if (__sum > 255) __sum = 255; - __rr[__i] = (unsigned char)__sum; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_adds_pu16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) { - unsigned int __sum = (unsigned int)__ra[__i] + (unsigned 
int)__rb[__i]; - if (__sum > 65535) __sum = 65535; - __rr[__i] = (unsigned short)__sum; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Packed Saturating Subtract (8/16-bit signed and unsigned) === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_subs_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - signed char *__ra = (signed char *)&__a.__val; - signed char *__rb = (signed char *)&__b.__val; - signed char __rr[8]; - for (int __i = 0; __i < 8; __i++) { - int __diff = (int)__ra[__i] - (int)__rb[__i]; - if (__diff > 127) __diff = 127; - if (__diff < -128) __diff = -128; - __rr[__i] = (signed char)__diff; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_subs_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) { - int __diff = (int)__ra[__i] - (int)__rb[__i]; - if (__diff > 32767) __diff = 32767; - if (__diff < -32768) __diff = -32768; - __rr[__i] = (short)__diff; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_subs_pu8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) { - int __diff = (int)__ra[__i] - (int)__rb[__i]; - if (__diff < 0) __diff = 0; - __rr[__i] = (unsigned char)__diff; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_subs_pu16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) { - int __diff = (int)__ra[__i] - (int)__rb[__i]; - if (__diff < 0) __diff = 
0; - __rr[__i] = (unsigned short)__diff; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Packed Multiply === */ - -/* Multiply packed 16-bit signed integers, return low 16 bits of each 32-bit result */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_mullo_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = (short)((int)__ra[__i] * (int)__rb[__i]); - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Multiply packed 16-bit signed integers, return high 16 bits of each 32-bit result */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_mulhi_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = (short)(((int)__ra[__i] * (int)__rb[__i]) >> 16); - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Multiply and add: (a0*b0 + a1*b1), (a2*b2 + a3*b3) */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_madd_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - int __rr[2]; - __rr[0] = (int)__ra[0] * (int)__rb[0] + (int)__ra[1] * (int)__rb[1]; - __rr[1] = (int)__ra[2] * (int)__rb[2] + (int)__ra[3] * (int)__rb[3]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Shift === */ - -/* Shift left logical: 16-bit packed */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_slli_pi16(__m64 __a, int __count) -{ - __m64 __r; - if (__count < 0 || __count > 15) { - __r.__val = 0; - return __r; - } - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] << __count; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Shift left logical: 32-bit 
packed */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_slli_pi32(__m64 __a, int __count) -{ - __m64 __r; - if (__count < 0 || __count > 31) { - __r.__val = 0; - return __r; - } - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = __ra[__i] << __count; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Shift left logical: 64-bit */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_slli_si64(__m64 __a, int __count) -{ - __m64 __r; - if (__count < 0 || __count > 63) { - __r.__val = 0; - return __r; - } - __r.__val = (long long)((unsigned long long)__a.__val << __count); - return __r; -} - -/* Shift right logical: 16-bit packed */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_srli_pi16(__m64 __a, int __count) -{ - __m64 __r; - if (__count < 0 || __count > 15) { - __r.__val = 0; - return __r; - } - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] >> __count; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Shift right logical: 32-bit packed */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_srli_pi32(__m64 __a, int __count) -{ - __m64 __r; - if (__count < 0 || __count > 31) { - __r.__val = 0; - return __r; - } - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = __ra[__i] >> __count; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Shift right logical: 64-bit */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_srli_si64(__m64 __a, int __count) -{ - __m64 __r; - if (__count < 0 || __count > 63) { - __r.__val = 0; - return __r; - } - __r.__val = (long long)((unsigned long long)__a.__val >> __count); - return __r; -} - -/* Shift right arithmetic: 16-bit packed */ -static __inline__ __m64 
__attribute__((__always_inline__)) -_mm_srai_pi16(__m64 __a, int __count) -{ - __m64 __r; - if (__count > 15) __count = 15; - if (__count < 0) __count = 0; - short *__ra = (short *)&__a.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] >> __count; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Shift right arithmetic: 32-bit packed */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_srai_pi32(__m64 __a, int __count) -{ - __m64 __r; - if (__count > 31) __count = 31; - if (__count < 0) __count = 0; - int *__ra = (int *)&__a.__val; - int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = __ra[__i] >> __count; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Pack === */ - -/* Pack 32-bit signed integers to 16-bit signed integers with saturation. - * _mm_packs_pi32(a, b): a contains low 2 words, b contains high 2 words. - * Result: [sat16(a0), sat16(a1), sat16(b0), sat16(b1)] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_packs_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - int *__ra = (int *)&__a.__val; - int *__rb = (int *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 2; __i++) { - int __v = __ra[__i]; - if (__v > 32767) __v = 32767; - if (__v < -32768) __v = -32768; - __rr[__i] = (short)__v; - } - for (int __i = 0; __i < 2; __i++) { - int __v = __rb[__i]; - if (__v > 32767) __v = 32767; - if (__v < -32768) __v = -32768; - __rr[__i + 2] = (short)__v; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Pack 16-bit signed integers to 8-bit signed integers with signed saturation. - * _mm_packs_pi16(a, b): pack a's 4 words and b's 4 words into 8 signed bytes. 
*/ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_packs_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - signed char __rr[8]; - for (int __i = 0; __i < 4; __i++) { - int __v = __ra[__i]; - if (__v > 127) __v = 127; - if (__v < -128) __v = -128; - __rr[__i] = (signed char)__v; - } - for (int __i = 0; __i < 4; __i++) { - int __v = __rb[__i]; - if (__v > 127) __v = 127; - if (__v < -128) __v = -128; - __rr[__i + 4] = (signed char)__v; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Pack 16-bit signed integers to 8-bit unsigned integers with unsigned saturation. - * _mm_packs_pu16(a, b): pack a's 4 words and b's 4 words into 8 unsigned bytes. */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_packs_pu16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 4; __i++) { - int __v = __ra[__i]; - if (__v > 255) __v = 255; - if (__v < 0) __v = 0; - __rr[__i] = (unsigned char)__v; - } - for (int __i = 0; __i < 4; __i++) { - int __v = __rb[__i]; - if (__v > 255) __v = 255; - if (__v < 0) __v = 0; - __rr[__i + 4] = (unsigned char)__v; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Compare === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cmpeq_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = (__ra[__i] == __rb[__i]) ? 
0xFF : 0x00; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cmpeq_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = (__ra[__i] == __rb[__i]) ? 0xFFFF : 0x0000; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cmpeq_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int *__rb = (unsigned int *)&__b.__val; - unsigned int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = (__ra[__i] == __rb[__i]) ? 0xFFFFFFFFU : 0x00000000U; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cmpgt_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - signed char *__ra = (signed char *)&__a.__val; - signed char *__rb = (signed char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = (__ra[__i] > __rb[__i]) ? 0xFF : 0x00; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cmpgt_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = (__ra[__i] > __rb[__i]) ? 0xFFFF : 0x0000; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cmpgt_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - int *__ra = (int *)&__a.__val; - int *__rb = (int *)&__b.__val; - unsigned int __rr[2]; - for (int __i = 0; __i < 2; __i++) - __rr[__i] = (__ra[__i] > __rb[__i]) ? 
0xFFFFFFFFU : 0x00000000U; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === Unpack / Interleave === */ - -/* Interleave low bytes: result = [a0, b0, a1, b1, a2, b2, a3, b3] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_unpacklo_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - __rr[0] = __ra[0]; __rr[1] = __rb[0]; - __rr[2] = __ra[1]; __rr[3] = __rb[1]; - __rr[4] = __ra[2]; __rr[5] = __rb[2]; - __rr[6] = __ra[3]; __rr[7] = __rb[3]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Interleave high bytes: result = [a4, b4, a5, b5, a6, b6, a7, b7] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_unpackhi_pi8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - __rr[0] = __ra[4]; __rr[1] = __rb[4]; - __rr[2] = __ra[5]; __rr[3] = __rb[5]; - __rr[4] = __ra[6]; __rr[5] = __rb[6]; - __rr[6] = __ra[7]; __rr[7] = __rb[7]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Interleave low words: result = [a0, b0, a1, b1] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_unpacklo_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - __rr[0] = __ra[0]; __rr[1] = __rb[0]; - __rr[2] = __ra[1]; __rr[3] = __rb[1]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Interleave high words: result = [a2, b2, a3, b3] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_unpackhi_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - __rr[0] = __ra[2]; __rr[1] = __rb[2]; - 
__rr[2] = __ra[3]; __rr[3] = __rb[3]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Interleave low dwords: result = [a0, b0] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_unpacklo_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int *__rb = (unsigned int *)&__b.__val; - unsigned int __rr[2]; - __rr[0] = __ra[0]; __rr[1] = __rb[0]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Interleave high dwords: result = [a1, b1] */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_unpackhi_pi32(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned int *__ra = (unsigned int *)&__a.__val; - unsigned int *__rb = (unsigned int *)&__b.__val; - unsigned int __rr[2]; - __rr[0] = __ra[1]; __rr[1] = __rb[1]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === 64-bit conversion === */ - -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cvtsi64_m64(long long __a) -{ - __m64 __r; - __r.__val = __a; - return __r; -} - -static __inline__ long long __attribute__((__always_inline__)) -_mm_cvtm64_si64(__m64 __a) -{ - return __a.__val; -} - -/* _m_from_int / _m_to_int aliases */ -#define _m_from_int(a) _mm_cvtsi32_si64(a) -#define _m_to_int(a) _mm_cvtsi64_si32(a) -#define _m_from_int64(a) _mm_cvtsi64_m64(a) -#define _m_to_int64(a) _mm_cvtm64_si64(a) - -/* _m_ prefix aliases (alternate names) */ -#define _m_paddb _mm_add_pi8 -#define _m_paddw _mm_add_pi16 -#define _m_paddd _mm_add_pi32 -#define _m_psubb _mm_sub_pi8 -#define _m_psubw _mm_sub_pi16 -#define _m_psubd _mm_sub_pi32 -#define _m_paddsb _mm_adds_pi8 -#define _m_paddsw _mm_adds_pi16 -#define _m_paddusb _mm_adds_pu8 -#define _m_paddusw _mm_adds_pu16 -#define _m_psubsb _mm_subs_pi8 -#define _m_psubsw _mm_subs_pi16 -#define _m_psubusb _mm_subs_pu8 -#define _m_psubusw _mm_subs_pu16 -#define _m_pmullw _mm_mullo_pi16 -#define _m_pmulhw _mm_mulhi_pi16 -#define _m_pmaddwd _mm_madd_pi16 -#define 
_m_pand _mm_and_si64 -#define _m_pandn _mm_andnot_si64 -#define _m_por _mm_or_si64 -#define _m_pxor _mm_xor_si64 -#define _m_psllwi _mm_slli_pi16 -#define _m_pslldi _mm_slli_pi32 -#define _m_psllqi _mm_slli_si64 -#define _m_psrlwi _mm_srli_pi16 -#define _m_psrldi _mm_srli_pi32 -#define _m_psrlqi _mm_srli_si64 -#define _m_psrawi _mm_srai_pi16 -#define _m_psradi _mm_srai_pi32 -#define _m_pcmpeqb _mm_cmpeq_pi8 -#define _m_pcmpeqw _mm_cmpeq_pi16 -#define _m_pcmpeqd _mm_cmpeq_pi32 -#define _m_pcmpgtb _mm_cmpgt_pi8 -#define _m_pcmpgtw _mm_cmpgt_pi16 -#define _m_pcmpgtd _mm_cmpgt_pi32 -#define _m_packsswb _mm_packs_pi16 -#define _m_packssdw _mm_packs_pi32 -#define _m_packuswb _mm_packs_pu16 -#define _m_punpcklbw _mm_unpacklo_pi8 -#define _m_punpcklwd _mm_unpacklo_pi16 -#define _m_punpckldq _mm_unpacklo_pi32 -#define _m_punpckhbw _mm_unpackhi_pi8 -#define _m_punpckhwd _mm_unpackhi_pi16 -#define _m_punpckhdq _mm_unpackhi_pi32 -#define _m_empty _mm_empty - -#endif /* _MMINTRIN_H_INCLUDED */ diff --git a/include/nmmintrin.h b/include/nmmintrin.h deleted file mode 100644 index 17c32a29f4..0000000000 --- a/include/nmmintrin.h +++ /dev/null @@ -1,7 +0,0 @@ -/* CCC compiler bundled nmmintrin.h - SSE4.2 intrinsics */ -#ifndef _NMMINTRIN_H_INCLUDED -#define _NMMINTRIN_H_INCLUDED - -#include - -#endif /* _NMMINTRIN_H_INCLUDED */ diff --git a/include/pmmintrin.h b/include/pmmintrin.h deleted file mode 100644 index c9e8666425..0000000000 --- a/include/pmmintrin.h +++ /dev/null @@ -1,61 +0,0 @@ -/* CCC compiler bundled pmmintrin.h - SSE3 intrinsics */ -#ifndef _PMMINTRIN_H_INCLUDED -#define _PMMINTRIN_H_INCLUDED - -/* SSE3 intrinsics are only available on x86/x86-64 targets */ -#if !defined(__x86_64__) && !defined(__i386__) && !defined(__i686__) -#error "SSE3 intrinsics (pmmintrin.h) require an x86 target" -#endif - -#include - -/* _mm_hadd_ps: horizontal add packed single-precision (HADDPS) - * Result: { a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] } */ -static __inline__ __m128 
__attribute__((__always_inline__)) -_mm_hadd_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] + __a.__val[1], - __a.__val[2] + __a.__val[3], - __b.__val[0] + __b.__val[1], - __b.__val[2] + __b.__val[3] } }; -} - -/* _mm_hadd_pd: horizontal add packed double-precision (HADDPD) - * Result: { a[0]+a[1], b[0]+b[1] } */ -static __inline__ __m128d __attribute__((__always_inline__)) -_mm_hadd_pd(__m128d __a, __m128d __b) -{ - double *__pa = (double *)&__a; - double *__pb = (double *)&__b; - __m128d __r; - double *__pr = (double *)&__r; - __pr[0] = __pa[0] + __pa[1]; - __pr[1] = __pb[0] + __pb[1]; - return __r; -} - -/* _mm_hsub_ps: horizontal subtract packed single-precision (HSUBPS) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_hsub_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] - __a.__val[1], - __a.__val[2] - __a.__val[3], - __b.__val[0] - __b.__val[1], - __b.__val[2] - __b.__val[3] } }; -} - -/* _mm_movehdup_ps: duplicate odd-indexed single-precision elements (MOVSHDUP) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_movehdup_ps(__m128 __a) -{ - return (__m128){ { __a.__val[1], __a.__val[1], __a.__val[3], __a.__val[3] } }; -} - -/* _mm_moveldup_ps: duplicate even-indexed single-precision elements (MOVSLDUP) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_moveldup_ps(__m128 __a) -{ - return (__m128){ { __a.__val[0], __a.__val[0], __a.__val[2], __a.__val[2] } }; -} - -#endif /* _PMMINTRIN_H_INCLUDED */ diff --git a/include/shaintrin.h b/include/shaintrin.h deleted file mode 100644 index 88c3de3495..0000000000 --- a/include/shaintrin.h +++ /dev/null @@ -1,232 +0,0 @@ -/* CCC compiler bundled shaintrin.h - SHA-NI intrinsics */ -#ifndef _SHAINTRIN_H_INCLUDED -#define _SHAINTRIN_H_INCLUDED - -#include - -/* ======================================================================== - * SHA-1 intrinsics - * ======================================================================== */ 
- -/* SHA1NEXTE: Calculate SHA1 state variable E after four rounds. - * Result[127:96] = ROL32(SRC1[127:96], 30) + SRC2[127:96] - * Result[95:0] = SRC2[95:0] - * Corresponds to x86 SHA1NEXTE instruction. */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sha1nexte_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - - unsigned int __rotated = (__pa[3] << 30) | (__pa[3] >> 2); - __pr[3] = __rotated + __pb[3]; - __pr[2] = __pb[2]; - __pr[1] = __pb[1]; - __pr[0] = __pb[0]; - - return __r; -} - -/* SHA1MSG1: Perform an intermediate calculation for four SHA1 message dwords. - * Result[i] = SRC1[i] ^ SRC1[i+1], with SRC2[0] used for the last element. - * Corresponds to x86 SHA1MSG1 instruction. */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sha1msg1_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - - __pr[0] = __pa[0] ^ __pa[1]; - __pr[1] = __pa[1] ^ __pa[2]; - __pr[2] = __pa[2] ^ __pa[3]; - __pr[3] = __pa[3] ^ __pb[0]; - - return __r; -} - -/* SHA1MSG2: Perform a final calculation for four SHA1 message dwords. - * Each dword is XOR'd with a previous result and then rotated left by 1. - * Corresponds to x86 SHA1MSG2 instruction. 
*/ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sha1msg2_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - -#define __SHA1_ROL1(x) (((x) << 1) | ((x) >> 31)) - - __pr[0] = __SHA1_ROL1(__pa[0] ^ __pb[2]); - __pr[1] = __SHA1_ROL1(__pa[1] ^ __pb[3]); - __pr[2] = __SHA1_ROL1(__pa[2] ^ __pr[0]); - __pr[3] = __SHA1_ROL1(__pa[3] ^ __pr[1]); - -#undef __SHA1_ROL1 - - return __r; -} - -/* SHA1RNDS4: Perform four rounds of SHA1 operation. - * __func selects the boolean function: - * 0: Ch(b,c,d) = (b & c) ^ (~b & d) - * 1: Parity(b,c,d) = b ^ c ^ d - * 2: Maj(b,c,d) = (b & c) ^ (b & d) ^ (c & d) - * 3: Parity(b,c,d) = b ^ c ^ d - * SRC1 = {A,B,C,D} in [127:96],[95:64],[63:32],[31:0] - * SRC2 = {WK0,WK1,WK2,WK3} (pre-added with E via SHA1NEXTE) - * Each round: T = ROL5(A) + f(B,C,D) + SRC2[round] - * E=D, D=C, C=ROL30(B), B=A, A=T - * Corresponds to x86 SHA1RNDS4 instruction. 
*/ -static __inline__ __m128i __attribute__((__always_inline__)) -__ccc_sha1rnds4(__m128i __a, __m128i __b, int __func) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - - unsigned int A = __pa[3], B = __pa[2], C = __pa[1], D = __pa[0]; - -#define __SHA1_ROL5(x) (((x) << 5) | ((x) >> 27)) -#define __SHA1_ROL30(x) (((x) << 30) | ((x) >> 2)) -#define __SHA1_ROUND(wk) do { \ - unsigned int f; \ - if (__func == 0) f = (B & C) ^ (~B & D); \ - else if (__func == 2) f = (B & C) ^ (B & D) ^ (C & D);\ - else f = B ^ C ^ D; \ - unsigned int T = __SHA1_ROL5(A) + f + (wk); \ - D = C; C = __SHA1_ROL30(B); B = A; A = T; \ - } while (0) - - __SHA1_ROUND(__pb[3]); - __SHA1_ROUND(__pb[2]); - __SHA1_ROUND(__pb[1]); - __SHA1_ROUND(__pb[0]); - -#undef __SHA1_ROUND -#undef __SHA1_ROL5 -#undef __SHA1_ROL30 - - __pr[3] = A; __pr[2] = B; __pr[1] = C; __pr[0] = D; - return __r; -} -/* __func must be a compile-time constant (0-3) per the Intel spec */ -#define _mm_sha1rnds4_epu32(a, b, func) \ - __ccc_sha1rnds4((a), (b), (func)) - -/* ======================================================================== - * SHA-256 intrinsics - * ======================================================================== */ - -/* SHA256 round function: perform 2 rounds of SHA-256 using state in __a, __b - * and message/constant sum in __c (only low 2 dwords of __c are used). - * Corresponds to x86 SHA256RNDS2 instruction. 
*/ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sha256rnds2_epu32(__m128i __a, __m128i __b, __m128i __c) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - unsigned int *__pc = (unsigned int *)&__c; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - - /* State mapping (SHA-256 working variables): - * __a = {C1, D1, G1, H1} (indices 0,1,2,3) - * __b = {A1, B1, E1, F1} (indices 0,1,2,3) - * __c low 2 dwords = WK0, WK1 */ - - /* ABEF layout: __b[3]=A, __b[2]=B, __b[1]=E, __b[0]=F - * CDGH layout: __a[3]=C, __a[2]=D, __a[1]=G, __a[0]=H */ - unsigned int A = __pb[3], B = __pb[2], E = __pb[1], F = __pb[0]; - unsigned int C = __pa[3], D = __pa[2], G = __pa[1], H = __pa[0]; - - unsigned int W0K = __pc[0], W1K = __pc[1]; - - /* Utility macros */ -#define __SHA256_CH(e, f, g) (((e) & (f)) ^ (~(e) & (g))) -#define __SHA256_MAJ(a, b, c) (((a) & (b)) ^ ((a) & (c)) ^ ((b) & (c))) -#define __SHA256_ROR(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#define __SHA256_SIGMA0(a) (__SHA256_ROR(a, 2) ^ __SHA256_ROR(a, 13) ^ __SHA256_ROR(a, 22)) -#define __SHA256_SIGMA1(e) (__SHA256_ROR(e, 6) ^ __SHA256_ROR(e, 11) ^ __SHA256_ROR(e, 25)) - - /* Round 0 (using W0K) */ - unsigned int T1_0 = H + __SHA256_SIGMA1(E) + __SHA256_CH(E, F, G) + W0K; - unsigned int T2_0 = __SHA256_SIGMA0(A) + __SHA256_MAJ(A, B, C); - unsigned int H2 = G, G2 = F, F2 = E, E2 = D + T1_0; - unsigned int D2 = C, C2 = B, B2 = A, A2 = T1_0 + T2_0; - - /* Round 1 (using W1K) */ - unsigned int T1_1 = H2 + __SHA256_SIGMA1(E2) + __SHA256_CH(E2, F2, G2) + W1K; - unsigned int T2_1 = __SHA256_SIGMA0(A2) + __SHA256_MAJ(A2, B2, C2); - /* Only A3, B3, E3 and F3(=E2) are needed for the output ABEF state */ - unsigned int E3 = D2 + T1_1; - unsigned int B3 = A2; - unsigned int A3 = T1_1 + T2_1; - - /* Output: new {A, B, E, F} */ - __pr[3] = A3; - __pr[2] = B3; - __pr[1] = E3; - __pr[0] = E2; /* F3 = E2 */ - -#undef __SHA256_CH -#undef __SHA256_MAJ -#undef 
__SHA256_ROR -#undef __SHA256_SIGMA0 -#undef __SHA256_SIGMA1 - - return __r; -} - -/* SHA256MSG1: perform an intermediate calculation for the next four - * SHA256 message dwords. Corresponds to SHA256MSG1 instruction. */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sha256msg1_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - -#define __SHA256_SIGMA0_MSG(x) \ - (((x) >> 7 | (x) << 25) ^ ((x) >> 18 | (x) << 14) ^ ((x) >> 3)) - - __pr[0] = __pa[0] + __SHA256_SIGMA0_MSG(__pa[1]); - __pr[1] = __pa[1] + __SHA256_SIGMA0_MSG(__pa[2]); - __pr[2] = __pa[2] + __SHA256_SIGMA0_MSG(__pa[3]); - __pr[3] = __pa[3] + __SHA256_SIGMA0_MSG(__pb[0]); - -#undef __SHA256_SIGMA0_MSG - - return __r; -} - -/* SHA256MSG2: perform the final calculation for the next four - * SHA256 message dwords. Corresponds to SHA256MSG2 instruction. */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sha256msg2_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - -#define __SHA256_SIGMA1_MSG(x) \ - (((x) >> 17 | (x) << 15) ^ ((x) >> 19 | (x) << 13) ^ ((x) >> 10)) - - __pr[0] = __pa[0] + __SHA256_SIGMA1_MSG(__pb[2]); - __pr[1] = __pa[1] + __SHA256_SIGMA1_MSG(__pb[3]); - __pr[2] = __pa[2] + __SHA256_SIGMA1_MSG(__pr[0]); - __pr[3] = __pa[3] + __SHA256_SIGMA1_MSG(__pr[1]); - -#undef __SHA256_SIGMA1_MSG - - return __r; -} - -#endif /* _SHAINTRIN_H_INCLUDED */ diff --git a/include/smmintrin.h b/include/smmintrin.h deleted file mode 100644 index 2b38a25aac..0000000000 --- a/include/smmintrin.h +++ /dev/null @@ -1,350 +0,0 @@ -/* CCC compiler bundled smmintrin.h - SSE4.1 / SSE4.2 intrinsics */ -#ifndef _SMMINTRIN_H_INCLUDED -#define _SMMINTRIN_H_INCLUDED - -#include - -/* === SSE4.1 insert/extract intrinsics 
=== */ - -/* _mm_extract_epi8: extract 8-bit int from lane (PEXTRB) */ -#define _mm_extract_epi8(a, imm) \ - ((int)(unsigned char)__builtin_ia32_pextrb128((a), (imm))) - -/* _mm_extract_epi32: extract 32-bit int from lane (PEXTRD) */ -#define _mm_extract_epi32(a, imm) \ - ((int)__builtin_ia32_pextrd128((a), (imm))) - -/* _mm_extract_epi64: extract 64-bit int from lane (PEXTRQ) */ -#define _mm_extract_epi64(a, imm) \ - ((long long)__builtin_ia32_pextrq128((a), (imm))) - -/* _mm_insert_epi8: insert 8-bit int at lane (PINSRB) */ -#define _mm_insert_epi8(a, i, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pinsrb128((a), (i), (imm))) - -/* _mm_insert_epi32: insert 32-bit int at lane (PINSRD) */ -#define _mm_insert_epi32(a, i, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pinsrd128((a), (i), (imm))) - -/* _mm_insert_epi64: insert 64-bit int at lane (PINSRQ) */ -#define _mm_insert_epi64(a, i, imm) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pinsrq128((a), (i), (imm))) - -/* === SSE4.1 comparison intrinsics === */ - -/* _mm_cmpeq_epi64: compare packed 64-bit integers for equality (PCMPEQQ) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cmpeq_epi64(__m128i __a, __m128i __b) -{ - long long *__pa = (long long *)&__a; - long long *__pb = (long long *)&__b; - __m128i __r; - long long *__pr = (long long *)&__r; - __pr[0] = (__pa[0] == __pb[0]) ? -1LL : 0LL; - __pr[1] = (__pa[1] == __pb[1]) ? -1LL : 0LL; - return __r; -} - -/* === SSE4.1 blending === */ - -/* _mm_blendv_epi8: byte-level blend using mask high bits (PBLENDVB) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_blendv_epi8(__m128i __a, __m128i __b, __m128i __mask) -{ - /* For each byte: result = (mask_byte & 0x80) ? 
b_byte : a_byte */ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - unsigned char *__pm = (unsigned char *)&__mask; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (__pm[__i] & 0x80) ? __pb[__i] : __pa[__i]; - return __r; -} - -/* === SSE4.1 min/max signed byte === */ - -/* _mm_max_epi8: packed signed 8-bit max (PMAXSB) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_max_epi8(__m128i __a, __m128i __b) -{ - signed char *__pa = (signed char *)&__a; - signed char *__pb = (signed char *)&__b; - __m128i __r; - signed char *__pr = (signed char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_min_epi8: packed signed 8-bit min (PMINSB) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_min_epi8(__m128i __a, __m128i __b) -{ - signed char *__pa = (signed char *)&__a; - signed char *__pb = (signed char *)&__b; - __m128i __r; - signed char *__pr = (signed char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* === SSE4.1 min/max unsigned 32-bit and signed 32-bit === */ - -/* _mm_max_epi32: packed signed 32-bit max (PMAXSD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_max_epi32(__m128i __a, __m128i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_min_epi32: packed signed 32-bit min (PMINSD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_min_epi32(__m128i __a, __m128i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? 
__pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_max_epu32: packed unsigned 32-bit max (PMAXUD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_max_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_min_epu32: packed unsigned 32-bit min (PMINUD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_min_epu32(__m128i __a, __m128i __b) -{ - unsigned int *__pa = (unsigned int *)&__a; - unsigned int *__pb = (unsigned int *)&__b; - __m128i __r; - unsigned int *__pr = (unsigned int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_max_epu16: packed unsigned 16-bit max (PMAXUW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_max_epu16(__m128i __a, __m128i __b) -{ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m128i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] > __pb[__i] ? __pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_min_epu16: packed unsigned 16-bit min (PMINUW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_min_epu16(__m128i __a, __m128i __b) -{ - unsigned short *__pa = (unsigned short *)&__a; - unsigned short *__pb = (unsigned short *)&__b; - __m128i __r; - unsigned short *__pr = (unsigned short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] < __pb[__i] ? 
__pa[__i] : __pb[__i]; - return __r; -} - -/* _mm_max_epi16: packed signed 16-bit max - already in SSE2 (PMAXSW) - * _mm_min_epi16: packed signed 16-bit min - already in SSE2 (PMINSW) */ - -/* === SSE4.1 zero/sign extension === */ - -/* _mm_cvtepi8_epi16: sign-extend 8 low bytes to 8 shorts (PMOVSXBW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepi8_epi16(__m128i __a) -{ - signed char *__pa = (signed char *)&__a; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepi8_epi32: sign-extend 4 low bytes to 4 ints (PMOVSXBD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepi8_epi32(__m128i __a) -{ - signed char *__pa = (signed char *)&__a; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepu8_epi16: zero-extend 8 low bytes to 8 shorts (PMOVZXBW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepu8_epi16(__m128i __a) -{ - unsigned char *__pa = (unsigned char *)&__a; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepu8_epi32: zero-extend 4 low bytes to 4 ints (PMOVZXBD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepu8_epi32(__m128i __a) -{ - unsigned char *__pa = (unsigned char *)&__a; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepi16_epi32: sign-extend 4 low shorts to 4 ints (PMOVSXWD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepi16_epi32(__m128i __a) -{ - short *__pa = (short *)&__a; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepu16_epi32: zero-extend 4 low shorts to 4 ints (PMOVZXWD) */ -static 
__inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepu16_epi32(__m128i __a) -{ - unsigned short *__pa = (unsigned short *)&__a; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepi32_epi64: sign-extend 2 low ints to 2 longs (PMOVSXDQ) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepi32_epi64(__m128i __a) -{ - int *__pa = (int *)&__a; - __m128i __r; - long long *__pr = (long long *)&__r; - for (int __i = 0; __i < 2; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* _mm_cvtepu32_epi64: zero-extend 2 low ints to 2 longs (PMOVZXDQ) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_cvtepu32_epi64(__m128i __a) -{ - unsigned int *__pa = (unsigned int *)&__a; - __m128i __r; - long long *__pr = (long long *)&__r; - for (int __i = 0; __i < 2; __i++) - __pr[__i] = __pa[__i]; - return __r; -} - -/* === SSE4.1 test === */ - -/* _mm_testz_si128: test all zeros (PTEST) - returns 1 if (a & b) == 0 */ -static __inline__ int __attribute__((__always_inline__)) -_mm_testz_si128(__m128i __a, __m128i __b) -{ - return (__a.__val[0] & __b.__val[0]) == 0 && (__a.__val[1] & __b.__val[1]) == 0; -} - -/* === SSE4.1 multiply === */ - -/* _mm_mullo_epi32: packed 32-bit multiply low (PMULLD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_mullo_epi32(__m128i __a, __m128i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] * __pb[__i]; - return __r; -} - -/* === CRC32 intrinsics (SSE4.2) === */ - -static __inline__ unsigned int __attribute__((__always_inline__)) -_mm_crc32_u8(unsigned int __crc, unsigned char __v) -{ - return __builtin_ia32_crc32qi(__crc, __v); -} - -static __inline__ unsigned int __attribute__((__always_inline__)) -_mm_crc32_u16(unsigned int __crc, unsigned short __v) -{ - return 
__builtin_ia32_crc32hi(__crc, __v); -} - -static __inline__ unsigned int __attribute__((__always_inline__)) -_mm_crc32_u32(unsigned int __crc, unsigned int __v) -{ - return __builtin_ia32_crc32si(__crc, __v); -} - -static __inline__ unsigned long long __attribute__((__always_inline__)) -_mm_crc32_u64(unsigned long long __crc, unsigned long long __v) -{ - return __builtin_ia32_crc32di(__crc, __v); -} - -/* === Pack with Unsigned Saturation (SSE4.1) === */ - -/* Pack 32-bit signed integers from a and b to 16-bit unsigned integers with unsigned saturation */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_packus_epi32(__m128i __a, __m128i __b) -{ - __m128i __r; - int *__ai = (int *)&__a.__val; - int *__bi = (int *)&__b.__val; - unsigned short __rr[8]; - for (int __i = 0; __i < 4; __i++) { - int __v = __ai[__i]; - if (__v > 65535) __v = 65535; - if (__v < 0) __v = 0; - __rr[__i] = (unsigned short)__v; - } - for (int __i = 0; __i < 4; __i++) { - int __v = __bi[__i]; - if (__v > 65535) __v = 65535; - if (__v < 0) __v = 0; - __rr[__i + 4] = (unsigned short)__v; - } - __builtin_memcpy(&__r.__val, __rr, 16); - return __r; -} - -#endif /* _SMMINTRIN_H_INCLUDED */ diff --git a/include/tmmintrin.h b/include/tmmintrin.h deleted file mode 100644 index d9f19abe11..0000000000 --- a/include/tmmintrin.h +++ /dev/null @@ -1,279 +0,0 @@ -/* CCC compiler bundled tmmintrin.h - SSSE3 intrinsics */ -#ifndef _TMMINTRIN_H_INCLUDED -#define _TMMINTRIN_H_INCLUDED - -#include - -/* _mm_abs_epi16: absolute value of signed 16-bit integers (PABSW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_abs_epi16(__m128i __a) -{ - short *__pa = (short *)&__a; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) - __pr[__i] = __pa[__i] < 0 ? 
(short)-__pa[__i] : __pa[__i]; - return __r; -} - -/* _mm_abs_epi8: absolute value of signed 8-bit integers (PABSB) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_abs_epi8(__m128i __a) -{ - signed char *__pa = (signed char *)&__a; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) - __pr[__i] = (unsigned char)(__pa[__i] < 0 ? -__pa[__i] : __pa[__i]); - return __r; -} - -/* _mm_abs_epi32: absolute value of signed 32-bit integers (PABSD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_abs_epi32(__m128i __a) -{ - int *__pa = (int *)&__a; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) - __pr[__i] = __pa[__i] < 0 ? -__pa[__i] : __pa[__i]; - return __r; -} - -/* _mm_maddubs_epi16: multiply unsigned 8-bit * signed 8-bit, horizontally add - * adjacent pairs to produce 8 x 16-bit results with saturation (PMADDUBSW). - * __a is treated as unsigned, __b as signed. */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_maddubs_epi16(__m128i __a, __m128i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - signed char *__pb = (signed char *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) { - int __s = (int)__pa[__i * 2] * (int)__pb[__i * 2] - + (int)__pa[__i * 2 + 1] * (int)__pb[__i * 2 + 1]; - /* Saturate to [-32768, 32767] */ - if (__s > 32767) __s = 32767; - if (__s < -32768) __s = -32768; - __pr[__i] = (short)__s; - } - return __r; -} - -/* _mm_shuffle_epi8: shuffle bytes according to control mask (PSHUFB) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_shuffle_epi8(__m128i __a, __m128i __b) -{ - unsigned char *__pa = (unsigned char *)&__a; - unsigned char *__pb = (unsigned char *)&__b; - __m128i __r; - unsigned char *__pr = (unsigned char *)&__r; - for (int __i = 0; __i < 16; __i++) { - if (__pb[__i] & 0x80) - __pr[__i] = 0; - else - __pr[__i] = __pa[__pb[__i] & 0x0F]; - 
} - return __r; -} - -/* _mm_alignr_epi8: concatenate __a (high) and __b (low) into a 32-byte - * intermediate, shift right by __n bytes, return the low 16 bytes (PALIGNR) */ -#define _mm_alignr_epi8(__a, __b, __n) __extension__ ({ \ - __m128i __r_alignr; \ - unsigned char *__pa_alignr = (unsigned char *)&(__a); \ - unsigned char *__pb_alignr = (unsigned char *)&(__b); \ - unsigned char *__pr_alignr = (unsigned char *)&__r_alignr; \ - unsigned char __tmp_alignr[32]; \ - for (int __i = 0; __i < 16; __i++) \ - __tmp_alignr[__i] = __pb_alignr[__i]; \ - for (int __i = 0; __i < 16; __i++) \ - __tmp_alignr[16 + __i] = __pa_alignr[__i]; \ - if ((__n) >= 32) { \ - for (int __i = 0; __i < 16; __i++) \ - __pr_alignr[__i] = 0; \ - } else { \ - for (int __i = 0; __i < 16; __i++) \ - __pr_alignr[__i] = ((__n) + __i < 32) ? \ - __tmp_alignr[(__n) + __i] : 0; \ - } \ - __r_alignr; \ -}) - -/* _mm_hadd_epi16: horizontal add adjacent pairs of 16-bit integers (PHADDW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_hadd_epi16(__m128i __a, __m128i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - /* From __a: add adjacent pairs */ - __pr[0] = (short)(__pa[0] + __pa[1]); - __pr[1] = (short)(__pa[2] + __pa[3]); - __pr[2] = (short)(__pa[4] + __pa[5]); - __pr[3] = (short)(__pa[6] + __pa[7]); - /* From __b: add adjacent pairs */ - __pr[4] = (short)(__pb[0] + __pb[1]); - __pr[5] = (short)(__pb[2] + __pb[3]); - __pr[6] = (short)(__pb[4] + __pb[5]); - __pr[7] = (short)(__pb[6] + __pb[7]); - return __r; -} - -/* _mm_hadd_epi32: horizontal add adjacent pairs of 32-bit integers (PHADDD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_hadd_epi32(__m128i __a, __m128i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - __pr[0] = __pa[0] + __pa[1]; - __pr[1] = __pa[2] + __pa[3]; - __pr[2] = __pb[0] + __pb[1]; - __pr[3] = __pb[2] + 
__pb[3]; - return __r; -} - -/* _mm_hsub_epi16: horizontal subtract adjacent pairs of 16-bit integers (PHSUBW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_hsub_epi16(__m128i __a, __m128i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - __pr[0] = (short)(__pa[0] - __pa[1]); - __pr[1] = (short)(__pa[2] - __pa[3]); - __pr[2] = (short)(__pa[4] - __pa[5]); - __pr[3] = (short)(__pa[6] - __pa[7]); - __pr[4] = (short)(__pb[0] - __pb[1]); - __pr[5] = (short)(__pb[2] - __pb[3]); - __pr[6] = (short)(__pb[4] - __pb[5]); - __pr[7] = (short)(__pb[6] - __pb[7]); - return __r; -} - -/* _mm_hsub_epi32: horizontal subtract adjacent pairs of 32-bit integers (PHSUBD) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_hsub_epi32(__m128i __a, __m128i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - __pr[0] = __pa[0] - __pa[1]; - __pr[1] = __pa[2] - __pa[3]; - __pr[2] = __pb[0] - __pb[1]; - __pr[3] = __pb[2] - __pb[3]; - return __r; -} - -/* _mm_sign_epi8: negate/zero/keep bytes based on sign of __b (PSIGNB) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sign_epi8(__m128i __a, __m128i __b) -{ - signed char *__pa = (signed char *)&__a; - signed char *__pb = (signed char *)&__b; - __m128i __r; - signed char *__pr = (signed char *)&__r; - for (int __i = 0; __i < 16; __i++) { - if (__pb[__i] < 0) __pr[__i] = (signed char)-__pa[__i]; - else if (__pb[__i] == 0) __pr[__i] = 0; - else __pr[__i] = __pa[__i]; - } - return __r; -} - -/* _mm_sign_epi16: negate/zero/keep 16-bit ints based on sign (PSIGNW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sign_epi16(__m128i __a, __m128i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) { - if (__pb[__i] < 0) __pr[__i] = (short)-__pa[__i]; - 
else if (__pb[__i] == 0) __pr[__i] = 0; - else __pr[__i] = __pa[__i]; - } - return __r; -} - -/* _mm_sign_epi32: negate/zero/keep 32-bit ints based on sign (PSIGND) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_sign_epi32(__m128i __a, __m128i __b) -{ - int *__pa = (int *)&__a; - int *__pb = (int *)&__b; - __m128i __r; - int *__pr = (int *)&__r; - for (int __i = 0; __i < 4; __i++) { - if (__pb[__i] < 0) __pr[__i] = -__pa[__i]; - else if (__pb[__i] == 0) __pr[__i] = 0; - else __pr[__i] = __pa[__i]; - } - return __r; -} - -/* _mm_mulhrs_epi16: multiply high with round and scale (PMULHRSW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_mulhrs_epi16(__m128i __a, __m128i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 8; __i++) { - int __t = ((int)__pa[__i] * (int)__pb[__i] + 0x4000) >> 15; - __pr[__i] = (short)__t; - } - return __r; -} - -/* _mm_hadds_epi16: horizontal add with saturation (PHADDSW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_hadds_epi16(__m128i __a, __m128i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 4; __i++) { - int __s = (int)__pa[__i * 2] + (int)__pa[__i * 2 + 1]; - if (__s > 32767) __s = 32767; if (__s < -32768) __s = -32768; - __pr[__i] = (short)__s; - } - for (int __i = 0; __i < 4; __i++) { - int __s = (int)__pb[__i * 2] + (int)__pb[__i * 2 + 1]; - if (__s > 32767) __s = 32767; if (__s < -32768) __s = -32768; - __pr[4 + __i] = (short)__s; - } - return __r; -} - -/* _mm_hsubs_epi16: horizontal subtract with saturation (PHSUBSW) */ -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_hsubs_epi16(__m128i __a, __m128i __b) -{ - short *__pa = (short *)&__a; - short *__pb = (short *)&__b; - __m128i __r; - short *__pr = (short *)&__r; - for (int __i = 0; __i < 4; __i++) { - int 
__s = (int)__pa[__i * 2] - (int)__pa[__i * 2 + 1]; - if (__s > 32767) __s = 32767; if (__s < -32768) __s = -32768; - __pr[__i] = (short)__s; - } - for (int __i = 0; __i < 4; __i++) { - int __s = (int)__pb[__i * 2] - (int)__pb[__i * 2 + 1]; - if (__s > 32767) __s = 32767; if (__s < -32768) __s = -32768; - __pr[4 + __i] = (short)__s; - } - return __r; -} - -#endif /* _TMMINTRIN_H_INCLUDED */ diff --git a/include/wmmintrin.h b/include/wmmintrin.h deleted file mode 100644 index 1ac6c5008a..0000000000 --- a/include/wmmintrin.h +++ /dev/null @@ -1,49 +0,0 @@ -/* CCC compiler bundled wmmintrin.h - AES-NI and CLMUL intrinsics */ -#ifndef _WMMINTRIN_H_INCLUDED -#define _WMMINTRIN_H_INCLUDED - -#include - -/* === AES-NI intrinsics === */ - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_aesenc_si128(__m128i __V, __m128i __R) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_aesenc128(__V, __R)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_aesenclast_si128(__m128i __V, __m128i __R) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_aesenclast128(__V, __R)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_aesdec_si128(__m128i __V, __m128i __R) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_aesdec128(__V, __R)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_aesdeclast_si128(__m128i __V, __m128i __R) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_aesdeclast128(__V, __R)); -} - -static __inline__ __m128i __attribute__((__always_inline__)) -_mm_aesimc_si128(__m128i __V) -{ - return __CCC_M128I_FROM_BUILTIN(__builtin_ia32_aesimc128(__V)); -} - -/* _mm_aeskeygenassist_si128 requires a compile-time constant imm8 */ -#define _mm_aeskeygenassist_si128(V, I) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_aeskeygenassist128((V), (I))) - -/* === CLMUL (carry-less multiplication) === */ - -/* _mm_clmulepi64_si128 requires a compile-time constant imm8 */ -#define 
_mm_clmulepi64_si128(X, Y, I) \ - __CCC_M128I_FROM_BUILTIN(__builtin_ia32_pclmulqdq128((X), (Y), (I))) - -#endif /* _WMMINTRIN_H_INCLUDED */ diff --git a/include/x86intrin.h b/include/x86intrin.h deleted file mode 100644 index 1b554fde93..0000000000 --- a/include/x86intrin.h +++ /dev/null @@ -1,158 +0,0 @@ -/* CCC compiler bundled x86intrin.h - x86 intrinsics umbrella header */ -#ifndef _X86INTRIN_H_INCLUDED -#define _X86INTRIN_H_INCLUDED - -/* x86 intrinsics are only available on x86/x86-64 targets */ -#if !defined(__x86_64__) && !defined(__i386__) && !defined(__i686__) -#error "x86 intrinsics (x86intrin.h) require an x86 target" -#endif - -/* Include all SIMD intrinsics */ -#include - -/* rdtsc - Read Time-Stamp Counter */ -static __inline__ unsigned long long -__attribute__((__always_inline__)) -__rdtsc(void) -{ - unsigned int __lo, __hi; - __asm__ __volatile__("rdtsc" : "=a"(__lo), "=d"(__hi)); - return ((unsigned long long)__hi << 32) | __lo; -} - -/* rdtscp - Read Time-Stamp Counter and Processor ID */ -static __inline__ unsigned long long -__attribute__((__always_inline__)) -__rdtscp(unsigned int *__aux) -{ - unsigned int __lo, __hi; - __asm__ __volatile__("rdtscp" : "=a"(__lo), "=d"(__hi), "=c"(*__aux)); - return ((unsigned long long)__hi << 32) | __lo; -} - -/* Compatibility aliases */ -#define _rdtsc() __rdtsc() -#define _rdtscp(a) __rdtscp(a) - -/* Bit-scan intrinsics */ -static __inline__ int -__attribute__((__always_inline__)) -__bsfd(int __a) -{ - int __r; - __asm__("bsfl %1, %0" : "=r"(__r) : "rm"(__a)); - return __r; -} - -static __inline__ int -__attribute__((__always_inline__)) -__bsrd(int __a) -{ - int __r; - __asm__("bsrl %1, %0" : "=r"(__r) : "rm"(__a)); - return __r; -} - -/* Byte-swap intrinsics */ -static __inline__ int -__attribute__((__always_inline__)) -__bswapd(int __a) -{ - return __builtin_bswap32(__a); -} - -#ifdef __x86_64__ -static __inline__ long long -__attribute__((__always_inline__)) -__bswapq(long long __a) -{ - return 
__builtin_bswap64(__a); -} - -static __inline__ int -__attribute__((__always_inline__)) -__bsfq(long long __a) -{ - long long __r; - __asm__("bsfq %1, %0" : "=r"(__r) : "rm"(__a)); - return (int)__r; -} - -static __inline__ int -__attribute__((__always_inline__)) -__bsrq(long long __a) -{ - long long __r; - __asm__("bsrq %1, %0" : "=r"(__r) : "rm"(__a)); - return (int)__r; -} -#endif /* __x86_64__ */ - -/* Pause instruction - hint for spin-wait loops */ -static __inline__ void -__attribute__((__always_inline__)) -__pause(void) -{ - __asm__ __volatile__("pause"); -} - -/* Rotation intrinsics */ -static __inline__ unsigned char -__attribute__((__always_inline__)) -__rolb(unsigned char __a, int __n) -{ - return (unsigned char)((__a << (__n & 7)) | (__a >> (8 - (__n & 7)))); -} - -static __inline__ unsigned short -__attribute__((__always_inline__)) -__rolw(unsigned short __a, int __n) -{ - return (unsigned short)((__a << (__n & 15)) | (__a >> (16 - (__n & 15)))); -} - -static __inline__ unsigned int -__attribute__((__always_inline__)) -__rold(unsigned int __a, int __n) -{ - return (__a << (__n & 31)) | (__a >> (32 - (__n & 31))); -} - -static __inline__ unsigned char -__attribute__((__always_inline__)) -__rorb(unsigned char __a, int __n) -{ - return (unsigned char)((__a >> (__n & 7)) | (__a << (8 - (__n & 7)))); -} - -static __inline__ unsigned short -__attribute__((__always_inline__)) -__rorw(unsigned short __a, int __n) -{ - return (unsigned short)((__a >> (__n & 15)) | (__a << (16 - (__n & 15)))); -} - -static __inline__ unsigned int -__attribute__((__always_inline__)) -__rord(unsigned int __a, int __n) -{ - return (__a >> (__n & 31)) | (__a << (32 - (__n & 31))); -} - -#ifdef __x86_64__ -static __inline__ unsigned long long -__attribute__((__always_inline__)) -__rolq(unsigned long long __a, int __n) -{ - return (__a << (__n & 63)) | (__a >> (64 - (__n & 63))); -} - -static __inline__ unsigned long long -__attribute__((__always_inline__)) -__rorq(unsigned long long 
__a, int __n) -{ - return (__a >> (__n & 63)) | (__a << (64 - (__n & 63))); -} -#endif /* __x86_64__ */ - -#endif /* _X86INTRIN_H_INCLUDED */ diff --git a/include/xmmintrin.h b/include/xmmintrin.h deleted file mode 100644 index fcc3546c92..0000000000 --- a/include/xmmintrin.h +++ /dev/null @@ -1,1287 +0,0 @@ -/* CCC compiler bundled xmmintrin.h - SSE intrinsics */ -#ifndef _XMMINTRIN_H_INCLUDED -#define _XMMINTRIN_H_INCLUDED - -/* SSE intrinsics are only available on x86/x86-64 targets */ -#if !defined(__x86_64__) && !defined(__i386__) && !defined(__i686__) -#error "SSE intrinsics (xmmintrin.h) require an x86 target" -#endif - -#include - -typedef struct __attribute__((__aligned__(16))) { - float __val[4]; -} __m128; - -/* Internal vector type referenced by GCC system headers. - * Note: vector_size attribute is parsed but vectors are lowered as aggregates. */ -typedef float __v4sf __attribute__ ((__vector_size__ (16))); - -/* _MM_SHUFFLE: build an immediate for _mm_shuffle_ps / _mm_shuffle_epi32. - * The result encodes four 2-bit lane selectors as (z<<6|y<<4|x<<2|w). 
*/ -#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) - -/* === Set / Broadcast === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_setzero_ps(void) -{ - return (__m128){ { 0.0f, 0.0f, 0.0f, 0.0f } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_set1_ps(float __w) -{ - return (__m128){ { __w, __w, __w, __w } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_set_ps(float __z, float __y, float __x, float __w) -{ - return (__m128){ { __w, __x, __y, __z } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_setr_ps(float __w, float __x, float __y, float __z) -{ - return (__m128){ { __w, __x, __y, __z } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_set_ss(float __w) -{ - return (__m128){ { __w, 0.0f, 0.0f, 0.0f } }; -} - -/* _mm_set_ps1 is a standard alias for _mm_set1_ps */ -#define _mm_set_ps1(w) _mm_set1_ps(w) - -/* === Load === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_loadu_ps(const float *__p) -{ - __m128 __r; - __builtin_memcpy(&__r, __p, 16); - return __r; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_load_ps(const float *__p) -{ - return *(const __m128 *)__p; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_load_ss(const float *__p) -{ - return (__m128){ { *__p, 0.0f, 0.0f, 0.0f } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_load1_ps(const float *__p) -{ - float __v = *__p; - return (__m128){ { __v, __v, __v, __v } }; -} - -#define _mm_load_ps1(p) _mm_load1_ps(p) - -/* === Store === */ - -static __inline__ void __attribute__((__always_inline__)) -_mm_storeu_ps(float *__p, __m128 __a) -{ - __builtin_memcpy(__p, &__a, 16); -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_store_ps(float *__p, __m128 __a) -{ - *((__m128 *)__p) = __a; -} - -static __inline__ void __attribute__((__always_inline__)) 
-_mm_store_ss(float *__p, __m128 __a) -{ - *__p = __a.__val[0]; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_store1_ps(float *__p, __m128 __a) -{ - __p[0] = __a.__val[0]; __p[1] = __a.__val[0]; - __p[2] = __a.__val[0]; __p[3] = __a.__val[0]; -} - -#define _mm_store_ps1(p, a) _mm_store1_ps(p, a) - -/* Non-temporal store of 128-bit float vector (MOVNTPS). - * Implemented as a regular aligned store (non-temporal hint is optimization only). */ -static __inline__ void __attribute__((__always_inline__)) -_mm_stream_ps(float *__p, __m128 __a) -{ - *((__m128 *)__p) = __a; -} - -/* Store the lower 2 floats of __m128 to __m64* memory location. */ -static __inline__ void __attribute__((__always_inline__)) -_mm_storel_pi(__m64 *__p, __m128 __a) -{ - __builtin_memcpy(__p, &__a, 8); -} - -/* Store the upper 2 floats of __m128 to __m64* memory location. */ -static __inline__ void __attribute__((__always_inline__)) -_mm_storeh_pi(__m64 *__p, __m128 __a) -{ - __builtin_memcpy(__p, (const char *)&__a + 8, 8); -} - -/* === Arithmetic === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_add_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] + __b.__val[0], __a.__val[1] + __b.__val[1], - __a.__val[2] + __b.__val[2], __a.__val[3] + __b.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_sub_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] - __b.__val[0], __a.__val[1] - __b.__val[1], - __a.__val[2] - __b.__val[2], __a.__val[3] - __b.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_mul_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] * __b.__val[0], __a.__val[1] * __b.__val[1], - __a.__val[2] * __b.__val[2], __a.__val[3] * __b.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_div_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] / __b.__val[0], __a.__val[1] / __b.__val[1], - 
__a.__val[2] / __b.__val[2], __a.__val[3] / __b.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_min_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] < __b.__val[0] ? __a.__val[0] : __b.__val[0], - __a.__val[1] < __b.__val[1] ? __a.__val[1] : __b.__val[1], - __a.__val[2] < __b.__val[2] ? __a.__val[2] : __b.__val[2], - __a.__val[3] < __b.__val[3] ? __a.__val[3] : __b.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_max_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0] > __b.__val[0] ? __a.__val[0] : __b.__val[0], - __a.__val[1] > __b.__val[1] ? __a.__val[1] : __b.__val[1], - __a.__val[2] > __b.__val[2] ? __a.__val[2] : __b.__val[2], - __a.__val[3] > __b.__val[3] ? __a.__val[3] : __b.__val[3] } }; -} - -/* Scalar operations (lowest element only, rest pass through __a) */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_add_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] += __b.__val[0]; - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_sub_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] -= __b.__val[0]; - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_mul_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] *= __b.__val[0]; - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_div_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] /= __b.__val[0]; - return __a; -} - -/* === Bitwise (float domain) === */ -/* These operate on the bitwise representation of float values, - using memcpy to type-pun between float and unsigned int. 
*/ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_and_ps(__m128 __a, __m128 __b) -{ - unsigned int __ai[4], __bi[4]; - __builtin_memcpy(__ai, &__a, 16); - __builtin_memcpy(__bi, &__b, 16); - __ai[0] &= __bi[0]; __ai[1] &= __bi[1]; - __ai[2] &= __bi[2]; __ai[3] &= __bi[3]; - __m128 __r; - __builtin_memcpy(&__r, __ai, 16); - return __r; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_andnot_ps(__m128 __a, __m128 __b) -{ - unsigned int __ai[4], __bi[4]; - __builtin_memcpy(__ai, &__a, 16); - __builtin_memcpy(__bi, &__b, 16); - __ai[0] = ~__ai[0] & __bi[0]; __ai[1] = ~__ai[1] & __bi[1]; - __ai[2] = ~__ai[2] & __bi[2]; __ai[3] = ~__ai[3] & __bi[3]; - __m128 __r; - __builtin_memcpy(&__r, __ai, 16); - return __r; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_or_ps(__m128 __a, __m128 __b) -{ - unsigned int __ai[4], __bi[4]; - __builtin_memcpy(__ai, &__a, 16); - __builtin_memcpy(__bi, &__b, 16); - __ai[0] |= __bi[0]; __ai[1] |= __bi[1]; - __ai[2] |= __bi[2]; __ai[3] |= __bi[3]; - __m128 __r; - __builtin_memcpy(&__r, __ai, 16); - return __r; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_xor_ps(__m128 __a, __m128 __b) -{ - unsigned int __ai[4], __bi[4]; - __builtin_memcpy(__ai, &__a, 16); - __builtin_memcpy(__bi, &__b, 16); - __ai[0] ^= __bi[0]; __ai[1] ^= __bi[1]; - __ai[2] ^= __bi[2]; __ai[3] ^= __bi[3]; - __m128 __r; - __builtin_memcpy(&__r, __ai, 16); - return __r; -} - -/* === Square root, Reciprocal, Reciprocal square root === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_sqrt_ps(__m128 __a) -{ - return (__m128){ { __builtin_sqrtf(__a.__val[0]), __builtin_sqrtf(__a.__val[1]), - __builtin_sqrtf(__a.__val[2]), __builtin_sqrtf(__a.__val[3]) } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_sqrt_ss(__m128 __a) -{ - __a.__val[0] = __builtin_sqrtf(__a.__val[0]); - return __a; -} - -static __inline__ __m128 
__attribute__((__always_inline__)) -_mm_rcp_ps(__m128 __a) -{ - return (__m128){ { 1.0f / __a.__val[0], 1.0f / __a.__val[1], - 1.0f / __a.__val[2], 1.0f / __a.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_rcp_ss(__m128 __a) -{ - __a.__val[0] = 1.0f / __a.__val[0]; - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_rsqrt_ps(__m128 __a) -{ - return (__m128){ { 1.0f / __builtin_sqrtf(__a.__val[0]), 1.0f / __builtin_sqrtf(__a.__val[1]), - 1.0f / __builtin_sqrtf(__a.__val[2]), 1.0f / __builtin_sqrtf(__a.__val[3]) } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_rsqrt_ss(__m128 __a) -{ - __a.__val[0] = 1.0f / __builtin_sqrtf(__a.__val[0]); - return __a; -} - -/* === Comparison (packed) - return all-ones or all-zeros per lane === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpeq_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = __a.__val[0] == __b.__val[0] ? 0xFFFFFFFFu : 0; - __r[1] = __a.__val[1] == __b.__val[1] ? 0xFFFFFFFFu : 0; - __r[2] = __a.__val[2] == __b.__val[2] ? 0xFFFFFFFFu : 0; - __r[3] = __a.__val[3] == __b.__val[3] ? 0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmplt_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = __a.__val[0] < __b.__val[0] ? 0xFFFFFFFFu : 0; - __r[1] = __a.__val[1] < __b.__val[1] ? 0xFFFFFFFFu : 0; - __r[2] = __a.__val[2] < __b.__val[2] ? 0xFFFFFFFFu : 0; - __r[3] = __a.__val[3] < __b.__val[3] ? 0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmple_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = __a.__val[0] <= __b.__val[0] ? 0xFFFFFFFFu : 0; - __r[1] = __a.__val[1] <= __b.__val[1] ? 0xFFFFFFFFu : 0; - __r[2] = __a.__val[2] <= __b.__val[2] ? 
0xFFFFFFFFu : 0; - __r[3] = __a.__val[3] <= __b.__val[3] ? 0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpgt_ps(__m128 __a, __m128 __b) -{ - return _mm_cmplt_ps(__b, __a); -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpge_ps(__m128 __a, __m128 __b) -{ - return _mm_cmple_ps(__b, __a); -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpneq_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = __a.__val[0] != __b.__val[0] ? 0xFFFFFFFFu : 0; - __r[1] = __a.__val[1] != __b.__val[1] ? 0xFFFFFFFFu : 0; - __r[2] = __a.__val[2] != __b.__val[2] ? 0xFFFFFFFFu : 0; - __r[3] = __a.__val[3] != __b.__val[3] ? 0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpord_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = (__a.__val[0] == __a.__val[0] && __b.__val[0] == __b.__val[0]) ? 0xFFFFFFFFu : 0; - __r[1] = (__a.__val[1] == __a.__val[1] && __b.__val[1] == __b.__val[1]) ? 0xFFFFFFFFu : 0; - __r[2] = (__a.__val[2] == __a.__val[2] && __b.__val[2] == __b.__val[2]) ? 0xFFFFFFFFu : 0; - __r[3] = (__a.__val[3] == __a.__val[3] && __b.__val[3] == __b.__val[3]) ? 0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpunord_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = (__a.__val[0] != __a.__val[0] || __b.__val[0] != __b.__val[0]) ? 0xFFFFFFFFu : 0; - __r[1] = (__a.__val[1] != __a.__val[1] || __b.__val[1] != __b.__val[1]) ? 0xFFFFFFFFu : 0; - __r[2] = (__a.__val[2] != __a.__val[2] || __b.__val[2] != __b.__val[2]) ? 0xFFFFFFFFu : 0; - __r[3] = (__a.__val[3] != __a.__val[3] || __b.__val[3] != __b.__val[3]) ? 
0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -/* Scalar comparison intrinsics (operate on element 0 only, rest pass through __a) */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpeq_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] == __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmplt_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] < __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmple_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] <= __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpgt_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] > __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpge_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] >= __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpneq_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] != __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -/* === Integer conversion === */ - -/* TODO: _mm_cvtss_si32 should use current MXCSR rounding mode (round-to-nearest - by default), but we use C cast truncation for simplicity. This matches - _mm_cvttss_si32 behavior. 
*/ -static __inline__ int __attribute__((__always_inline__)) -_mm_cvtss_si32(__m128 __a) -{ - return (int)__a.__val[0]; -} - -/* Alias: _mm_cvt_ss2si is standard alias for _mm_cvtss_si32 */ -#define _mm_cvt_ss2si(a) _mm_cvtss_si32(a) - -static __inline__ int __attribute__((__always_inline__)) -_mm_cvttss_si32(__m128 __a) -{ - return (int)__a.__val[0]; -} - -/* Alias: _mm_cvtt_ss2si */ -#define _mm_cvtt_ss2si(a) _mm_cvttss_si32(a) - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtsi32_ss(__m128 __a, int __b) -{ - __a.__val[0] = (float)__b; - return __a; -} - -/* Alias: _mm_cvt_si2ss */ -#define _mm_cvt_si2ss(a, b) _mm_cvtsi32_ss(a, b) - -/* === Shuffle === */ - -/* _mm_shuffle_ps: shuffle floats from __a and __b using immediate mask. - * Bits [1:0] select from __a for element 0, [3:2] for element 1, - * [5:4] select from __b for element 2, [7:6] for element 3. */ -#define _mm_shuffle_ps(__a, __b, __imm) __extension__ ({ \ - __m128 __r; \ - __r.__val[0] = (__a).__val[(__imm) & 3]; \ - __r.__val[1] = (__a).__val[((__imm) >> 2) & 3]; \ - __r.__val[2] = (__b).__val[((__imm) >> 4) & 3]; \ - __r.__val[3] = (__b).__val[((__imm) >> 6) & 3]; \ - __r; \ -}) - -/* === Unpack / Interleave === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_unpacklo_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0], __b.__val[0], __a.__val[1], __b.__val[1] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_unpackhi_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[2], __b.__val[2], __a.__val[3], __b.__val[3] } }; -} - -/* === Move === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_movehl_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __b.__val[2], __b.__val[3], __a.__val[2], __a.__val[3] } }; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_movelh_ps(__m128 __a, __m128 __b) -{ - return (__m128){ { __a.__val[0], __a.__val[1], __b.__val[0], __b.__val[1] } 
}; -} - -static __inline__ float __attribute__((__always_inline__)) -_mm_cvtss_f32(__m128 __a) -{ - return __a.__val[0]; -} - -/* === Compare (packed) - return all-ones or all-zeros per lane === */ - -static __inline__ int __attribute__((__always_inline__)) -_mm_movemask_ps(__m128 __a) -{ - int __r = 0; - unsigned int __u; - __builtin_memcpy(&__u, &__a.__val[0], 4); __r |= (__u >> 31); - __builtin_memcpy(&__u, &__a.__val[1], 4); __r |= ((__u >> 31) << 1); - __builtin_memcpy(&__u, &__a.__val[2], 4); __r |= ((__u >> 31) << 2); - __builtin_memcpy(&__u, &__a.__val[3], 4); __r |= ((__u >> 31) << 3); - return __r; -} - -/* === Prefetch === */ - -/* Prefetch hint constants */ -#define _MM_HINT_T0 3 -#define _MM_HINT_T1 2 -#define _MM_HINT_T2 1 -#define _MM_HINT_NTA 0 - -/* _mm_prefetch: hint to prefetch data into cache. - * In our implementation this is a no-op since we don't emit prefetch - * instructions, but it must be defined for source compatibility. */ -#define _mm_prefetch(P, I) ((void)(P), (void)(I)) - -/* === Aligned memory allocation === */ - -static __inline__ void *__attribute__((__always_inline__)) -_mm_malloc(unsigned long __size, unsigned long __align) -{ - void *__ptr; - if (__align <= sizeof(void *)) - return __builtin_malloc(__size); - /* Use posix_memalign for aligned allocation */ - if (__size == 0) - return (void *)0; - /* Manually align: allocate extra space for alignment and store original pointer */ - void *__raw = __builtin_malloc(__size + __align + sizeof(void *)); - if (!__raw) - return (void *)0; - __ptr = (void *)(((unsigned long)((char *)__raw + sizeof(void *) + __align - 1)) & ~(__align - 1)); - ((void **)__ptr)[-1] = __raw; - return __ptr; -} - -static __inline__ void __attribute__((__always_inline__)) -_mm_free(void *__ptr) -{ - if (__ptr) - __builtin_free(((void **)__ptr)[-1]); -} - -/* === Fence === */ - -static __inline__ void __attribute__((__always_inline__)) -_mm_sfence(void) -{ - __builtin_ia32_sfence(); -} - -static __inline__ void 
__attribute__((__always_inline__)) -_mm_pause(void) -{ - __builtin_ia32_pause(); -} - -/* === MXCSR control/status register === */ - -/* Read the MXCSR register */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_mm_getcsr(void) -{ - unsigned int __csr; - __asm__ __volatile__("stmxcsr %0" : "=m" (__csr)); - return __csr; -} - -/* Write the MXCSR register */ -static __inline__ void __attribute__((__always_inline__)) -_mm_setcsr(unsigned int __csr) -{ - __asm__ __volatile__("ldmxcsr %0" : : "m" (__csr)); -} - -/* Exception state bits (bits 0-5 of MXCSR) */ -#define _MM_EXCEPT_INVALID 0x0001 -#define _MM_EXCEPT_DENORM 0x0002 -#define _MM_EXCEPT_DIV_ZERO 0x0004 -#define _MM_EXCEPT_OVERFLOW 0x0008 -#define _MM_EXCEPT_UNDERFLOW 0x0010 -#define _MM_EXCEPT_INEXACT 0x0020 -#define _MM_EXCEPT_MASK 0x003f - -/* Exception mask bits (bits 7-12 of MXCSR) */ -#define _MM_MASK_INVALID 0x0080 -#define _MM_MASK_DENORM 0x0100 -#define _MM_MASK_DIV_ZERO 0x0200 -#define _MM_MASK_OVERFLOW 0x0400 -#define _MM_MASK_UNDERFLOW 0x0800 -#define _MM_MASK_INEXACT 0x1000 -#define _MM_MASK_MASK 0x1f80 - -/* Rounding mode bits (bits 13-14 of MXCSR) */ -#define _MM_ROUND_NEAREST 0x0000 -#define _MM_ROUND_DOWN 0x2000 -#define _MM_ROUND_UP 0x4000 -#define _MM_ROUND_TOWARD_ZERO 0x6000 -#define _MM_ROUND_MASK 0x6000 - -/* Flush-to-zero bit (bit 15 of MXCSR) */ -#define _MM_FLUSH_ZERO_MASK 0x8000 -#define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_FLUSH_ZERO_OFF 0x0000 - -/* Get/set exception state */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_MM_GET_EXCEPTION_STATE(void) -{ - return _mm_getcsr() & _MM_EXCEPT_MASK; -} - -static __inline__ void __attribute__((__always_inline__)) -_MM_SET_EXCEPTION_STATE(unsigned int __mask) -{ - _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); -} - -/* Get/set exception mask */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_MM_GET_EXCEPTION_MASK(void) -{ - return _mm_getcsr() & _MM_MASK_MASK; -} - 
-static __inline__ void __attribute__((__always_inline__)) -_MM_SET_EXCEPTION_MASK(unsigned int __mask) -{ - _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); -} - -/* Get/set rounding mode */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_MM_GET_ROUNDING_MODE(void) -{ - return _mm_getcsr() & _MM_ROUND_MASK; -} - -static __inline__ void __attribute__((__always_inline__)) -_MM_SET_ROUNDING_MODE(unsigned int __mode) -{ - _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); -} - -/* Get/set flush-to-zero mode */ -static __inline__ unsigned int __attribute__((__always_inline__)) -_MM_GET_FLUSH_ZERO_MODE(void) -{ - return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; -} - -static __inline__ void __attribute__((__always_inline__)) -_MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) -{ - _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); -} - -/* === Scalar min/max === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_min_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] = __a.__val[0] < __b.__val[0] ? __a.__val[0] : __b.__val[0]; - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_max_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] = __a.__val[0] > __b.__val[0] ? __a.__val[0] : __b.__val[0]; - return __a; -} - -/* === Move === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_move_ss(__m128 __a, __m128 __b) -{ - __a.__val[0] = __b.__val[0]; - return __a; -} - -/* === Negated comparisons (packed) === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpnlt_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = !(__a.__val[0] < __b.__val[0]) ? 0xFFFFFFFFu : 0; - __r[1] = !(__a.__val[1] < __b.__val[1]) ? 0xFFFFFFFFu : 0; - __r[2] = !(__a.__val[2] < __b.__val[2]) ? 0xFFFFFFFFu : 0; - __r[3] = !(__a.__val[3] < __b.__val[3]) ? 
0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpnle_ps(__m128 __a, __m128 __b) -{ - unsigned int __r[4]; - __r[0] = !(__a.__val[0] <= __b.__val[0]) ? 0xFFFFFFFFu : 0; - __r[1] = !(__a.__val[1] <= __b.__val[1]) ? 0xFFFFFFFFu : 0; - __r[2] = !(__a.__val[2] <= __b.__val[2]) ? 0xFFFFFFFFu : 0; - __r[3] = !(__a.__val[3] <= __b.__val[3]) ? 0xFFFFFFFFu : 0; - __m128 __rv; - __builtin_memcpy(&__rv, __r, 16); - return __rv; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpngt_ps(__m128 __a, __m128 __b) -{ - return _mm_cmpnlt_ps(__b, __a); -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpnge_ps(__m128 __a, __m128 __b) -{ - return _mm_cmpnle_ps(__b, __a); -} - -/* === Negated scalar comparisons === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpnlt_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = !(__a.__val[0] < __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpnle_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = !(__a.__val[0] <= __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpngt_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = !(__a.__val[0] > __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpnge_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = !(__a.__val[0] >= __b.__val[0]) ? 
0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -/* === Scalar ordered/unordered comparisons === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpord_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] == __a.__val[0] && __b.__val[0] == __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cmpunord_ss(__m128 __a, __m128 __b) -{ - unsigned int __u = (__a.__val[0] != __a.__val[0] || __b.__val[0] != __b.__val[0]) ? 0xFFFFFFFFu : 0; - __builtin_memcpy(&__a.__val[0], &__u, 4); - return __a; -} - -/* === Ordered scalar comparison returning int (COMISS) === */ - -static __inline__ int __attribute__((__always_inline__)) -_mm_comieq_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] == __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comilt_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] < __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comile_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] <= __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comigt_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] > __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comige_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] >= __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_comineq_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] != __b.__val[0]; -} - -/* === Unordered scalar comparison returning int (UCOMISS) === */ - -static __inline__ int __attribute__((__always_inline__)) -_mm_ucomieq_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] == __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_ucomilt_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] < __b.__val[0]; -} - -static __inline__ int 
__attribute__((__always_inline__)) -_mm_ucomile_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] <= __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_ucomigt_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] > __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_ucomige_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] >= __b.__val[0]; -} - -static __inline__ int __attribute__((__always_inline__)) -_mm_ucomineq_ss(__m128 __a, __m128 __b) -{ - return __a.__val[0] != __b.__val[0]; -} - -/* === 64-bit integer conversion (SSE) === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtsi64_ss(__m128 __a, long long __b) -{ - __a.__val[0] = (float)__b; - return __a; -} - -#define _mm_cvtsi64x_ss(a, b) _mm_cvtsi64_ss(a, b) - -static __inline__ long long __attribute__((__always_inline__)) -_mm_cvtss_si64(__m128 __a) -{ - return (long long)__a.__val[0]; -} - -#define _mm_cvtss_si64x(a) _mm_cvtss_si64(a) - -static __inline__ long long __attribute__((__always_inline__)) -_mm_cvttss_si64(__m128 __a) -{ - return (long long)__a.__val[0]; -} - -#define _mm_cvttss_si64x(a) _mm_cvttss_si64(a) - -/* === Load (additional) === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_loadh_pi(__m128 __a, const __m64 *__p) -{ - __builtin_memcpy((char *)&__a + 8, __p, 8); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_loadl_pi(__m128 __a, const __m64 *__p) -{ - __builtin_memcpy(&__a, __p, 8); - return __a; -} - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_loadr_ps(const float *__p) -{ - return (__m128){ { __p[3], __p[2], __p[1], __p[0] } }; -} - -/* === Store (additional) === */ - -static __inline__ void __attribute__((__always_inline__)) -_mm_storer_ps(float *__p, __m128 __a) -{ - __p[0] = __a.__val[3]; __p[1] = __a.__val[2]; - __p[2] = __a.__val[1]; __p[3] = __a.__val[0]; -} - -/* === MMX<->float conversion (SSE additions that 
operate on __m64) === */ - -/* Convert packed 32-bit integers in __m64 to packed floats (low 2 elements) */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpi32_ps(__m128 __a, __m64 __b) -{ - int *__bi = (int *)&__b.__val; - __a.__val[0] = (float)__bi[0]; - __a.__val[1] = (float)__bi[1]; - return __a; -} - -#define _mm_cvt_pi2ps(a, b) _mm_cvtpi32_ps(a, b) - -/* Convert packed floats (low 2 elements) to packed 32-bit integers in __m64 */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cvtps_pi32(__m128 __a) -{ - __m64 __r; - int __rr[2]; - __rr[0] = (int)__a.__val[0]; - __rr[1] = (int)__a.__val[1]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _mm_cvt_ps2pi(a) _mm_cvtps_pi32(a) - -/* Convert packed floats (low 2 elements) to packed 32-bit integers with truncation */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cvttps_pi32(__m128 __a) -{ - __m64 __r; - int __rr[2]; - __rr[0] = (int)__a.__val[0]; - __rr[1] = (int)__a.__val[1]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _mm_cvtt_ps2pi(a) _mm_cvttps_pi32(a) - -/* Convert packed 16-bit signed integers to packed floats */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpi16_ps(__m64 __a) -{ - short *__as = (short *)&__a.__val; - return (__m128){ { (float)__as[0], (float)__as[1], (float)__as[2], (float)__as[3] } }; -} - -/* Convert packed 16-bit unsigned integers to packed floats */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpu16_ps(__m64 __a) -{ - unsigned short *__as = (unsigned short *)&__a.__val; - return (__m128){ { (float)__as[0], (float)__as[1], (float)__as[2], (float)__as[3] } }; -} - -/* Convert packed 8-bit signed integers (low 4 bytes) to packed floats */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpi8_ps(__m64 __a) -{ - signed char *__ab = (signed char *)&__a.__val; - return (__m128){ { (float)__ab[0], (float)__ab[1], (float)__ab[2], 
(float)__ab[3] } }; -} - -/* Convert packed 8-bit unsigned integers (low 4 bytes) to packed floats */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpu8_ps(__m64 __a) -{ - unsigned char *__ab = (unsigned char *)&__a.__val; - return (__m128){ { (float)__ab[0], (float)__ab[1], (float)__ab[2], (float)__ab[3] } }; -} - -/* Convert two __m64 packed 32-bit integers to __m128 packed floats */ -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) -{ - int *__ai = (int *)&__a.__val; - int *__bi = (int *)&__b.__val; - return (__m128){ { (float)__ai[0], (float)__ai[1], (float)__bi[0], (float)__bi[1] } }; -} - -/* Convert packed floats to packed 16-bit signed integers (with saturation) in __m64 */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cvtps_pi16(__m128 __a) -{ - __m64 __r; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) { - int __v = (int)__a.__val[__i]; - if (__v > 32767) __v = 32767; - if (__v < -32768) __v = -32768; - __rr[__i] = (short)__v; - } - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* Convert packed floats to packed 8-bit signed integers (low 4 bytes, high 4 zero) */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_cvtps_pi8(__m128 __a) -{ - __m64 __r; - signed char __rr[8]; - for (int __i = 0; __i < 4; __i++) { - int __v = (int)__a.__val[__i]; - if (__v > 127) __v = 127; - if (__v < -128) __v = -128; - __rr[__i] = (signed char)__v; - } - __rr[4] = __rr[5] = __rr[6] = __rr[7] = 0; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -/* === SSE intrinsics operating on __m64 (SSE additions to MMX) === */ - -/* Extract a 16-bit integer from __m64 at position N */ -#define _mm_extract_pi16(A, N) \ - ((int)(unsigned short)(((unsigned short *)&(A).__val)[(N) & 3])) - -#define _m_pextrw(A, N) _mm_extract_pi16(A, N) - -/* Insert a 16-bit integer into __m64 at position N */ -#define _mm_insert_pi16(A, D, N) __extension__ ({ \ - __m64 
__tmp = (A); \ - ((unsigned short *)&__tmp.__val)[(N) & 3] = (unsigned short)(D); \ - __tmp; \ -}) - -#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N) - -/* Packed maximum of signed 16-bit integers */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_max_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] > __rb[__i] ? __ra[__i] : __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) - -/* Packed maximum of unsigned 8-bit integers */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_max_pu8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = __ra[__i] > __rb[__i] ? __ra[__i] : __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pmaxub(a, b) _mm_max_pu8(a, b) - -/* Packed minimum of signed 16-bit integers */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_min_pi16(__m64 __a, __m64 __b) -{ - __m64 __r; - short *__ra = (short *)&__a.__val; - short *__rb = (short *)&__b.__val; - short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = __ra[__i] < __rb[__i] ? __ra[__i] : __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pminsw(a, b) _mm_min_pi16(a, b) - -/* Packed minimum of unsigned 8-bit integers */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_min_pu8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = __ra[__i] < __rb[__i] ? 
__ra[__i] : __rb[__i]; - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pminub(a, b) _mm_min_pu8(a, b) - -/* Create mask from most significant bit of each byte in __m64 */ -static __inline__ int __attribute__((__always_inline__)) -_mm_movemask_pi8(__m64 __a) -{ - int __r = 0; - unsigned char *__ab = (unsigned char *)&__a.__val; - for (int __i = 0; __i < 8; __i++) - __r |= ((__ab[__i] >> 7) & 1) << __i; - return __r; -} - -#define _m_pmovmskb(a) _mm_movemask_pi8(a) - -/* Multiply packed unsigned 16-bit integers, return high 16 bits */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_mulhi_pu16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = (unsigned short)(((unsigned int)__ra[__i] * (unsigned int)__rb[__i]) >> 16); - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) - -/* Shuffle 16-bit integers in __m64 using immediate selector */ -#define _mm_shuffle_pi16(A, N) __extension__ ({ \ - __m64 __tmp_a = (A); \ - unsigned short *__src = (unsigned short *)&__tmp_a.__val; \ - __m64 __tmp_r; \ - unsigned short *__dst = (unsigned short *)&__tmp_r.__val; \ - __dst[0] = __src[(N) & 3]; \ - __dst[1] = __src[((N) >> 2) & 3]; \ - __dst[2] = __src[((N) >> 4) & 3]; \ - __dst[3] = __src[((N) >> 6) & 3]; \ - __tmp_r; \ -}) - -#define _m_pshufw(A, N) _mm_shuffle_pi16(A, N) - -/* Average of packed unsigned 8-bit integers (rounded) */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_avg_pu8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned char __rr[8]; - for (int __i = 0; __i < 8; __i++) - __rr[__i] = (unsigned char)(((unsigned int)__ra[__i] + (unsigned int)__rb[__i] + 1) >> 1); - 
__builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pavgb(a, b) _mm_avg_pu8(a, b) - -/* Average of packed unsigned 16-bit integers (rounded) */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_avg_pu16(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned short *__ra = (unsigned short *)&__a.__val; - unsigned short *__rb = (unsigned short *)&__b.__val; - unsigned short __rr[4]; - for (int __i = 0; __i < 4; __i++) - __rr[__i] = (unsigned short)(((unsigned int)__ra[__i] + (unsigned int)__rb[__i] + 1) >> 1); - __builtin_memcpy(&__r.__val, __rr, 8); - return __r; -} - -#define _m_pavgw(a, b) _mm_avg_pu16(a, b) - -/* Sum of absolute differences of packed unsigned 8-bit integers */ -static __inline__ __m64 __attribute__((__always_inline__)) -_mm_sad_pu8(__m64 __a, __m64 __b) -{ - __m64 __r; - unsigned char *__ra = (unsigned char *)&__a.__val; - unsigned char *__rb = (unsigned char *)&__b.__val; - unsigned int __sum = 0; - for (int __i = 0; __i < 8; __i++) { - int __diff = (int)__ra[__i] - (int)__rb[__i]; - __sum += __diff < 0 ? 
-__diff : __diff; - } - __r.__val = __sum; - return __r; -} - -#define _m_psadbw(a, b) _mm_sad_pu8(a, b) - -/* Non-temporal store of __m64 */ -static __inline__ void __attribute__((__always_inline__)) -_mm_stream_pi(__m64 *__p, __m64 __a) -{ - *__p = __a; -} - -/* Conditional byte store from __m64 (MASKMOVQ) */ -static __inline__ void __attribute__((__always_inline__)) -_mm_maskmove_si64(__m64 __a, __m64 __n, char *__p) -{ - unsigned char *__da = (unsigned char *)&__a.__val; - unsigned char *__dn = (unsigned char *)&__n.__val; - for (int __i = 0; __i < 8; __i++) { - if (__dn[__i] & 0x80) - __p[__i] = __da[__i]; - } -} - -#define _m_maskmovq(a, n, p) _mm_maskmove_si64(a, n, p) - -/* === Undefined value === */ - -static __inline__ __m128 __attribute__((__always_inline__)) -_mm_undefined_ps(void) -{ - __m128 __r; - return __r; -} - -/* GCC's xmmintrin.h includes emmintrin.h so that code which only includes - * still gets access to __m128i and SSE2 intrinsics. - * Match that behavior here. */ -#include - -#endif /* _XMMINTRIN_H_INCLUDED */ diff --git a/projects/cleanup_code_quality.txt b/projects/cleanup_code_quality.txt deleted file mode 100644 index 380a86448a..0000000000 --- a/projects/cleanup_code_quality.txt +++ /dev/null @@ -1,80 +0,0 @@ -Code Quality Cleanup Backlog -============================ - -Items a Rust expert would flag, roughly ordered by impact: - -1. [DONE] GVN pass parameter threading -> GvnState struct - - gvn_dfs had 14 params, process_block had 10 params - - Fixed by introducing GvnState struct with save_scope/restore_scope - -2. [DONE] Functions with 10-20+ parameters - - parse_declaration_rest: 22 params -> DeclContext struct (DeclAttributes + alignment info) - - parse_function_def: 11 params -> accepts DeclAttributes directly - - resolve_type_flags: 16 boolean params -> TypeSpecFlags struct - - collect_trailing_specifiers: 10 params -> TypeSpecFlags struct - - gvn_dfs: DONE (was 14 params, now GvnState struct) - -3. 
[PARTIAL] Visibility: was 836 pub + 18 pub(crate), now improved at module level - - All 22 mod.rs files updated: pub mod -> pub(crate) mod throughout - - Only lib.rs::backend and lib.rs::driver remain pub (needed by main.rs) - - backend::CodegenOptions and its fields changed to pub(crate) - - backend::Target methods (except triple()) changed to pub(crate) - - Unused re-exports removed (ir::*, ast::*, sema builtins, codegen types) - - Driver struct: all 38 pub fields changed to private. Only 1 was accessed - externally (input_files.is_empty() in main.rs), replaced by has_input_files(). - parse_cli_args() now returns Result instead of using - eprintln!+process::exit(), and real_main() returns Result<(), String>. - - Remaining: individual pub fn/struct/enum/const within pub(crate) modules - are semantically capped at pub(crate) by Rust's visibility rules, but - could be explicitly marked pub(crate) for consistency. Low priority since - the module-level changes already enforce the correct boundary. - - The tighter visibility now exposes ~32 dead code warnings (previously - hidden behind pub), which could be cleaned up separately. - -4. [PARTIAL] String allocations from literals (medium effort) - - Was 420 .to_string() calls, now 334 (86 eliminated) - - Register name functions (reg_to_32, reg_to_16, reg_to_64, reg_to_8l, - reg_to_8h) in x86 and i686 backends now return Cow<'_, str> instead - of String, avoiding heap allocations for known register names. - format_x86_reg, format_i686_reg, src_reg_for_type, dest_reg_for_type - also updated. x86_normalize_reg_to_64bit returns Cow<'static, str>. - - Remaining: builtin_macros.rs (41 calls), preprocessor includes (33), - other frontend/IR files (~260 remaining calls) - -5. [PARTIAL] format! in codegen hot paths (medium effort) - - Was 320 format! 
calls, improved via emit_fmt and static &str replacements - - emit(&format!(...)) anti-pattern eliminated (variadic.rs: 3 calls fixed) - - RISC-V alu.rs: format!("add{}", w) mnemonic construction replaced with - static &str table (13 format! allocations eliminated) - - 32-bit .long splitting pattern extracted to emit_u64_as_long_pair helper - - Remaining: ~28 format!("%{}", reg) in x86 asm_emitter.rs (inline asm), - ARM codegen register name allocations, label generation format! calls - -6. Deep nesting (16 levels) in global_init.rs (small-medium effort) - - Nested match/if-let chains at src/ir/lowering/global_init.rs:490-522 - - Extract compound literal handling into helper functions - -7. Lowerer split across 37 files (large effort, low priority) - - Valid Rust pattern but extreme; makes API surface hard to understand - - Consider grouping related impl blocks or adding a module-level doc - -8. [DONE] x86/i686 code duplication -> x86_common module - - gcc_cc_to_x86: exact duplicate extracted to shared free function - - reg_to_32, reg_to_16, reg_to_8l, reg_to_8h: merged to accept both - 64-bit and 32-bit register names in a single match - - substitute_*_asm_operands: ~150 lines of identical template parsing - extracted to shared function parameterized by operand emission callback - - emit_operand_common: shared logic for %n/%c/%P/memory/$symbol/$imm - - Both backends now delegate through thin wrappers to x86_common - - ~400 lines removed, zero functional changes - -9. 
[DONE] Wildcard imports replaced with explicit imports - - Was 124 `use ...::*` statements across 90 files - - Replaced all `use crate::ir::ir::*` (72 files) with explicit imports - listing only the types/enums/functions actually used in each file - - Replaced all `use crate::frontend::parser::ast::*` (42 files) likewise - - Removed 87 false-positive imports that the initial pass included - - 114 files updated total, zero functional changes - - Remaining: 10 module-internal wildcards (parser `super::ast::*`, - peephole `super::types::*`, lowering `super::definitions::*`) are - standard Rust patterns for closely-coupled module internals diff --git a/src/backend/README.md b/src/backend/README.md deleted file mode 100644 index 9f8ea38a2d..0000000000 --- a/src/backend/README.md +++ /dev/null @@ -1,1271 +0,0 @@ -# Backend Subsystem Design - -The backend transforms SSA IR produced by the compiler's middle-end into -target-specific assembly text, then assembles and links it into ELF -executables. Four architectures are supported: x86-64, i686, AArch64, and -RISC-V 64. Each architecture has a builtin assembler (instruction encoder + -ELF object file writer) and a builtin linker (symbol resolution + relocation -application + ELF executable writer), enabling fully self-contained -compilation with no external toolchain. An external GCC toolchain can be used -as a fallback. 
- ---- - -## Code Generation Pipeline - -``` - +------------------+ - | IR Module | - | (SSA, per-func) | - +--------+---------+ - | - +------------v-----------+ - | Pre-scan Analysis | - | - GEP fold map | - | - Cmp-branch fusion | - | - GlobalAddr folding | - | - Use counts | - +------------+-----------+ - | - +------------v-----------+ - | Stack Layout / RegAlloc | - | - Three-tier slot | - | allocation | - | - Liveness analysis | - | - Linear scan regalloc | - +------------+-----------+ - | - +------------v-----------+ - | Instruction Selection | - | (ArchCodegen trait) | - | - Per-instruction | - | dispatch | - | - Prologue / epilogue | - | - ABI parameter stores | - +------------+-----------+ - | - +------------v-----------+ - | Raw Assembly Text | - +------------+-----------+ - | - +------------v-----------+ - | Peephole Optimizer | - | - Local passes (x8) | - | - Global passes (x1) | - | - Local cleanup (x4) | - | - Tail call, callee- | - | save, frame compact | - +------------+-----------+ - | - +------------v-----------+ - | Optimized Assembly | - +------------+-----------+ - | - +------------v-----------+ - | Builtin Assembler | - | parse asm text | - | encode instructions | - | write ELF .o | - +------------+-----------+ - | - +------------v-----------+ - | Builtin Linker | - | read .o + CRT + libs | - | resolve symbols | - | apply relocations | - | write ELF executable | - +------------+-----------+ - | - +--------v---------+ - | ELF Executable | - +------------------+ -``` - -The pipeline is driven from `Target::generate_assembly_with_opts_and_debug` in -`mod.rs`. For each target, it: - -1. Instantiates the architecture-specific codegen struct (e.g., `X86Codegen`). -2. Applies CLI-driven `CodegenOptions` (PIC, retpoline, CET, patchable entry, - etc.). -3. Calls `generation::generate_module_with_debug`, which emits data sections, - iterates over functions, and dispatches each IR instruction through the - `ArchCodegen` trait. -4. 
Passes the resulting assembly text through the architecture's peephole - optimizer. -5. Returns the final assembly string, which is then assembled (via the - builtin assembler or an external toolchain) and linked (via the builtin - linker or an external toolchain) into an ELF executable. - ---- - -## Directory Layout - -The backend is split into shared modules at the top level (some as directory -modules with submodules) and 4 architecture-specific subdirectories: - -``` -src/backend/ - mod.rs Target enum, CodegenOptions, top-level dispatch - elf/ Shared ELF constants, StringTable, read/write helpers, archive parsing - mod.rs Core ELF types, struct definitions, re-exports - constants.rs ELF format constants (section types, flags, relocations) - string_table.rs StringTable builder for ELF string sections - section_flags.rs Section flag parsing from GAS directives - archive.rs AR archive (.a) parsing - io.rs Binary read/write helpers (little-endian, big-endian) - parse_string.rs GAS string literal parsing with escape sequences - linker_symbols.rs Linker-generated symbols (_GLOBAL_OFFSET_TABLE_, etc.) 
- symbol_table.rs ELF symbol table emission - numeric_labels.rs GAS numeric label (1:, 2f, 3b) support - object_writer.rs High-level ELF object file writer - writer_base.rs Low-level ELF writer (headers, sections, relocations) - linker_common/ Shared linker infrastructure (17 files, see linker_common/README.md) - mod.rs Re-exports, linker-defined symbol detection - types.rs Core ELF64 types: Elf64Section, Elf64Symbol, Elf64Object, DynSymbol - parse_object.rs Parse ELF64 relocatable objects (.o) - parse_shared.rs Extract dynamic symbols and SONAME from shared libraries (.so) - symbols.rs GlobalSymbolOps trait, InputSection/OutputSection - merge.rs Merge input sections into output sections, COMMON symbols - dynamic.rs Match undefined globals against shared library exports - archive.rs Load archives (.a, thin archives), iterative resolution - resolve_lib.rs Resolve -l library names to filesystem paths - args.rs Parse -Wl, linker flags into structured LinkerArgs - check.rs Post-link undefined symbol validation - section_map.rs Section ordering and address assignment - write.rs Shared ELF executable writing helpers - dynstr.rs Dynamic string table builder - hash.rs GNU hash table and SysV hash table generation - eh_frame.rs .eh_frame FDE counting and .eh_frame_hdr builder - gc_sections.rs --gc-sections BFS reachability analysis - peephole_common.rs Shared peephole optimizer utilities: word matching, register replacement, LineStore - asm_expr.rs Shared integer expression evaluator (all 4 assembler parsers) - asm_preprocess.rs Shared GAS preprocessing: comments, macros, .rept, .if/.elseif/.else/.endif - elf_writer_common.rs Shared generic ELF object writer for x86-64 and i686 assemblers - traits.rs ArchCodegen trait (~185 methods, ~64 default impls) - generation.rs Module/function/instruction dispatch (arch-independent) - state.rs CodegenState, StackSlot, SlotAddr, RegCache - stack_layout/ Three-tier stack slot allocation (7 files, see stack_layout/README.md) - mod.rs 
Entry point, calculate_stack_space_common driver - analysis.rs Use counting, immediately-consumed detection, block analysis - alloca_coalescing.rs Escape analysis for non-escaping single-block allocas - copy_coalescing.rs Copy alias tracking (share source slot) - slot_assignment.rs Tier 2 liveness packing, Tier 3 block-local reuse - inline_asm.rs Inline asm callee-saved register scanning - regalloc_helpers.rs Register allocation setup and parameter alloca lookup - call_abi.rs Unified ABI classification (CallArgClass, ParamClass) - cast.rs CastKind classification, FloatOp classification - liveness.rs Live interval computation (backward dataflow) - regalloc.rs Linear scan register allocator - common.rs Assembler/linker invocation, data emission - f128_softfloat.rs Shared F128 soft-float orchestration (ARM + RISC-V) - inline_asm.rs InlineAsmEmitter trait and 4-phase framework - x86_common.rs Shared x86/i686 register names, condition codes - - x86/ x86-64 backend (SysV AMD64 ABI) - codegen/ Code generation (18 files) + peephole optimizer subdirectory - assembler/ Builtin assembler (parser, encoder, ELF writer) - linker/ Builtin linker (dynamic linking, PLT/GOT, TLS) - i686/ i686 backend (cdecl, ILP32) - codegen/ Code generation (18 files) + peephole optimizer - assembler/ Builtin assembler (reuses x86 parser, 32-bit encoder) - linker/ Builtin linker (32-bit ELF, R_386 relocations) - arm/ AArch64 backend (AAPCS64) - codegen/ Code generation (19 files) + peephole optimizer - assembler/ Builtin assembler (parser, encoder, ELF writer) - linker/ Builtin linker (dynamic linking, IFUNC/TLS) - riscv/ RISC-V 64 backend (LP64D) - codegen/ Code generation (19 files) + peephole optimizer - assembler/ Builtin assembler (parser, encoder, RV64C compress) - linker/ Builtin linker (dynamic linking) -``` - -Each architecture subdirectory contains 18-19 codegen files -(including `mod.rs`) that implement the `ArchCodegen` trait methods. 
The -x86 backend's peephole optimizer is a subdirectory (`peephole/`) rather -than a single file, containing its own module structure for the multi-stage -pass pipeline: - -| File | Responsibility | -|------|---------------| -| `emit.rs` | Struct definition, `ArchCodegen` impl, `delegate_to_impl!` | -| `alu.rs` | Integer arithmetic and bitwise operations | -| `atomics.rs` | Atomic load/store/RMW/cmpxchg | -| `calls.rs` | Function call emission, argument marshalling | -| `cast_ops.rs` (`casts.rs` on i686) | Type casts (int widening/narrowing, int-float conversions) | -| `comparison.rs` | Comparison and fused compare-and-branch | -| `f128.rs` | F128 (long double) operations (absent on i686, which uses x87) | -| `float_ops.rs` | Floating-point arithmetic | -| `globals.rs` | Global address materialization, TLS access | -| `i128_ops.rs` | 128-bit integer operations | -| `inline_asm.rs` | Architecture-specific inline assembly emission | -| `intrinsics.rs` | Compiler builtins (popcount, bswap, clz, etc.) 
| -| `memory.rs` | Load, store, memcpy, GEP | -| `peephole.rs` or `peephole/` | Post-generation assembly optimization (x86 uses a subdirectory) | -| `prologue.rs` | Function prologue and epilogue | -| `returns.rs` | Return value emission | -| `variadic.rs` | Variadic function support (va_start, va_arg, va_copy) | -| `asm_emitter.rs` | `InlineAsmEmitter` trait implementation | - -For architecture-specific details, see: - -| Architecture | Overview | Code Generation | Assembler | Linker | -|-------------|----------|----------------|-----------|--------| -| x86-64 | [`x86/README.md`](x86/README.md) | [`x86/codegen/README.md`](x86/codegen/README.md) | [`x86/assembler/README.md`](x86/assembler/README.md) | [`x86/linker/README.md`](x86/linker/README.md) | -| i686 | [`i686/README.md`](i686/README.md) | [`i686/codegen/README.md`](i686/codegen/README.md) | [`i686/assembler/README.md`](i686/assembler/README.md) | [`i686/linker/README.md`](i686/linker/README.md) | -| AArch64 | [`arm/README.md`](arm/README.md) | [`arm/codegen/README.md`](arm/codegen/README.md) | [`arm/assembler/README.md`](arm/assembler/README.md) | [`arm/linker/README.md`](arm/linker/README.md) | -| RISC-V 64 | [`riscv/README.md`](riscv/README.md) | [`riscv/codegen/README.md`](riscv/codegen/README.md) | [`riscv/assembler/README.md`](riscv/assembler/README.md) | [`riscv/linker/README.md`](riscv/linker/README.md) | - -The x86-64 peephole optimizer has its own detailed documentation: [`x86/codegen/peephole/README.md`](x86/codegen/peephole/README.md). - ---- - -## The ArchCodegen Trait - -The `ArchCodegen` trait (defined in `traits.rs`) is the central abstraction -that decouples the shared code generation framework from architecture-specific -instruction emission. It defines approximately 185 methods organized into -several categories: - -- **State access**: `state()` and `state_ref()` provide mutable and immutable - access to the shared `CodegenState`. 
-- **Prologue/epilogue**: `emit_prologue`, `emit_epilogue`, - `calculate_stack_space`, `aligned_frame_size`. -- **Operand handling**: `emit_load_operand`, `emit_store_result`, - `emit_copy_value`. -- **Memory operations**: `emit_store`, `emit_load`, - `emit_load_with_const_offset`, `emit_store_with_const_offset`, - `emit_seg_load`, `emit_seg_store`, `emit_global_load_rip_rel`, - `emit_global_store_rip_rel`. -- **Arithmetic and logic**: `emit_binop`, `emit_unaryop`, - `emit_float_binop`. -- **Comparisons**: `emit_cmp`, `emit_fused_cmp_branch_blocks`. -- **Casts**: `emit_cast`, `emit_cast_instrs`. -- **Control flow**: `emit_branch`, `emit_cond_branch_blocks`, `emit_switch`, - `emit_indirect_branch`. -- **Function calls**: `emit_call` (handles both direct and indirect calls - via `direct_name: Option<&str>` and `func_ptr: Option<&Operand>`), plus - the 8-phase hook methods (`emit_call_compute_stack_space`, - `emit_call_f128_pre_convert`, `emit_call_spill_fptr`, - `emit_call_stack_args`, `emit_call_sret_setup`, `emit_call_reg_args`, - `emit_call_instruction`, `emit_call_cleanup`, `emit_call_store_result`). -- **Atomics**: `emit_atomic_load`, `emit_atomic_store`, `emit_atomic_rmw`, - `emit_atomic_cmpxchg`. -- **128-bit**: `emit_i128_binop`, `emit_i128_cmp`, - `emit_i128_store_result`, `emit_store_acc_pair`, - `emit_load_acc_pair`. -- **Register allocation**: `get_phys_reg_for_value`, `emit_reg_to_reg_move`, - `emit_acc_to_phys_reg`. - -### Default Implementations and Primitive Composition - -Approximately 64 methods have default implementations that capture shared -codegen patterns. These defaults are built from small "primitive" methods that -each backend overrides with 1--4 line architecture-specific implementations. -The design lets the shared framework express an algorithm once while backends -only provide instruction-level differences. 
- -Key default implementations include: - -- **`emit_store_default` / `emit_load_default`**: Resolve a value's `SlotAddr` - (Direct, Indirect, or OverAligned) and dispatch to the appropriate primitive: - `emit_typed_store_to_slot`, `emit_typed_store_indirect`, or - `emit_alloca_aligned_addr` plus `emit_typed_store_indirect`. Each primitive - is a backend-supplied one-liner. -- **`emit_copy_value`**: Checks whether source and destination have physical - register assignments (from the register allocator) and emits direct - register-to-register moves when possible, falling back to the accumulator - load/store path otherwise. Backends needing special handling (e.g., x86 F128 - x87 copies) override this for the special case and delegate the rest to the - default. -- **`emit_binop`**: Classifies the operation as i128, float, or integer and - delegates to the corresponding specialized method. -- **`emit_call`**: Orchestrates an 8-phase call sequence (Phase 0: classify - arguments and compute stack space, Phase 1: F128 pre-conversion, Phase 2: - spill function pointer, Phase 3: push stack arguments, Phase 3.5: sret - pointer setup, Phase 4: load register arguments, Phase 5: emit the call - instruction, Phase 6: clean up the stack and store the result). Each phase - is a backend-supplied hook method. -- **`emit_load_with_const_offset` / `emit_store_with_const_offset`**: Handle - GEP-folded memory accesses by dispatching on `SlotAddr` and folding the - constant offset into the appropriate addressing mode. -- **`build_jump_table`**: Shared jump table construction. All 64-bit backends - use relative 32-bit offsets (`.long target - table_base`) to avoid - unresolved `R_*_ABS64` relocations; i686 uses absolute 4-byte entries. 
- -The `traits.rs` module also provides free functions (`emit_store_default`, -`emit_load_default`, `emit_cast_default`, `emit_unaryop_default`, -`emit_return_default`) that backends overriding a trait method for special -types (e.g., x86 F128) can call for the non-special cases, avoiding code -duplication. - -### The `delegate_to_impl!` Macro - -Most `impl ArchCodegen for XxxCodegen` blocks consist of one-liner -delegations to `_impl` methods defined on the codegen struct itself. The -`delegate_to_impl!` macro eliminates this boilerplate: - -```rust -delegate_to_impl! { - fn calculate_stack_space(&mut self, func: &IrFunction) -> i64 - => calculate_stack_space_impl; - fn emit_prologue(&mut self, func: &IrFunction, frame_size: i64) - => emit_prologue_impl; - fn store_instr_for_type(&self, ty: IrType) -> &'static str - => store_instr_for_type_impl; -} -``` - -Each line maps a trait method to its corresponding `_impl` counterpart. The -macro handles `&self`/`&mut self` receivers and optional return types, -generating the forwarding body automatically. - ---- - -## Code Generation Dispatch (generation.rs) - -The `generation.rs` module contains the arch-independent driver that -orchestrates code generation through the `ArchCodegen` trait. Its entry -points are: - -### `generate_module` - -1. Pre-sizes the output buffer based on total IR instruction count (each - instruction generates roughly 40 bytes of assembly text; buffer is - clamped to 256 KB--64 MB). -2. Collects symbol sets (local, TLS, weak extern) for PIC and GOT decisions. -3. Builds and emits the DWARF file table (`.file` directives) when debug info - is enabled. -4. Emits data sections (`.data`, `.bss`, `.rodata`, string literals) via - `common::emit_data_sections`. -5. Emits top-level `asm("...")` directives verbatim. -6. Emits extern visibility directives for referenced symbols. -7. Iterates over functions, calling `generate_function` for each. -8. 
Emits aliases, symbol attributes, `.init_array`/`.fini_array` entries. -9. Emits architecture-specific runtime helper stubs (e.g., i686 `__divdi3`). -10. Emits `.note.GNU-stack` section for non-executable stack. - -### `generate_function` - -1. Resets per-function state and emits linkage directives (`.globl`, - `.local`), visibility, and type directives. -2. Emits patchable function entry NOP padding when configured - (`-fpatchable-function-entry`), along with the - `__patchable_function_entries` section pointer for ftrace. Inline - functions are excluded to avoid overwhelming the kernel's ftrace - initialization. -3. For naked functions, emits only inline asm blocks with no - prologue/epilogue. -4. Pre-scans for `DynAlloca`/`StackRestore` (triggers frame-pointer-based SP - restore in the epilogue). -5. Calls `calculate_stack_space` -- which internally runs the three-tier - allocator and register allocator -- then aligns the frame and emits the - prologue. -6. Emits parameter stores from argument registers to stack slots. -7. Builds several pre-scan maps used during instruction dispatch: - - **Value use counts**: for compare-branch fusion eligibility and GEP - fold analysis. - - **GEP fold map**: identifies GEPs with constant offsets foldable into - Load/Store (uses value use counts to verify single-use). - - **Global address map**: maps GlobalAddr values to symbol names for - RIP-relative folding. - - **Global address pointer set**: distinguishes pointer vs. integer uses - of GlobalAddr in kernel code model. - - **Foldable GlobalAddr set**: GlobalAddr values whose `leaq` can be - skipped entirely (all uses are foldable Load/Store pointers). -8. Iterates over basic blocks. At each block boundary, invalidates the - register cache. For each instruction, calls `generate_instruction`; for - the terminator, either emits a fused compare-and-branch or calls - `generate_terminator`. -9. Emits `.size` directive. 
- -### `generate_instruction` - -A large `match` statement dispatches each IR instruction variant to the -appropriate `ArchCodegen` method. It also manages the register value cache: -instructions that follow the load-compute-store pattern leave the accumulator -holding the destination value, so subsequent instructions can skip reloading. -Instructions with unpredictable clobbers (calls, inline asm, atomics, complex -operations) invalidate the cache. - ---- - -## Three-Tier Stack Slot Allocation (stack_layout/) - -All four backends share a unified stack layout algorithm that assigns stack -slots to IR values. The algorithm is implemented in -`calculate_stack_space_common` and uses three tiers to minimize frame size. - -### Tier 1: Alloca Slots - -Allocas represent addressable local variables. They receive permanent, -non-shared stack slots because their addresses may escape (be passed to -other functions, stored in pointers, etc.). The exception is -**non-escaping single-block allocas**: an escape analysis pass -(`compute_coalescable_allocas`) identifies allocas whose addresses never -leave their defining block (no address taken in calls, no address stored to -memory, no address passed through phi nodes or across block boundaries), and -these are demoted to Tier 3 for block-local sharing. - -Dead non-parameter allocas (those with no uses at all) are detected and -skipped entirely -- no stack slot is allocated. - -### Tier 2: Multi-Block SSA Temporaries - -Non-alloca SSA values that are live across multiple basic blocks use -**liveness-based packing**. The liveness analysis (from `liveness.rs`) -computes live intervals for each value. A greedy interval coloring -algorithm uses a min-heap to assign values with non-overlapping live -intervals to the same stack slot. This is particularly effective for -switch-heavy code (dispatch tables, state machines) where many values are -defined in different case handlers that never execute simultaneously. 
- -Multi-definition values (from phi elimination, which creates Copy -instructions in multiple predecessor blocks) are always routed to Tier 2, -since their definition spans multiple blocks and they cannot safely share -block-local pools. - -### Tier 3: Single-Block Values - -Non-alloca values that are defined and used entirely within a single basic -block use **block-local slot reuse**. Each block maintains its own slot -pool; pools from different blocks overlap in the frame since only one block -executes at a time. Within a block, slots are reused greedily: once a -value's last use has been passed, its slot becomes available for the next -value of the same size. - -Non-escaping single-block allocas (identified by the escape analysis in -Tier 1) are included in the Tier 3 pools, sharing slots with regular -single-block values. - -### Additional Optimizations - -- **Copy alias tracking**: `Copy` instructions that simply move a value - between SSA names share the same stack slot as their source, avoiding - redundant allocation and copies. -- **Immediately-consumed value elimination**: Values that are produced and - immediately consumed by the very next instruction as the first operand do - not need a stack slot at all -- the accumulator register cache keeps them - alive. The `immediately_consumed` set in `StackLayoutContext` tracks - these. -- **Dead parameter alloca elimination**: Unused parameter allocas are - detected and skipped entirely (no stack slot needed). -- **Small slot tracking**: The `small_slot_values` set in `CodegenState` - tracks values eligible for 4-byte slots (I32, U32, F32, and smaller on - 64-bit targets), but slot allocation currently always uses 8 bytes. Using - 4-byte movl store/load is unsafe because it zero-extends on reload, losing - sign information when 32-bit values are widened to 64 bits (see - `ideas/reduce_stack_frame_size_for_postgres.txt`). 
-- **Deferred slot finalization**: Block-local slots use `DeferredSlot` - entries whose final frame offset is computed only after all tiers have - determined their space requirements, so Tier 3 slots are placed after the - Tier 1 and Tier 2 regions. - ---- - -## Linear Scan Register Allocator (regalloc.rs) - -The register allocator assigns physical registers to IR values based on -their live intervals, prioritizing values with the most uses and longest -lifetimes. Values that do not receive a register remain on the stack and -are accessed through the accumulator load/store path. - -### Three-Phase Allocation - -**Phase 1 -- Callee-saved registers for call-spanning values.** -Values whose live ranges span function calls are assigned callee-saved -registers (x86: `rbx`, `r12`--`r15`; ARM: `x20`--`x28`; RISC-V: `s1`, -`s7`--`s11`). These registers are preserved across calls by the ABI, so no -per-call save/restore is needed -- only the prologue and epilogue must save -and restore them. The linear scan walks sorted candidate intervals and -assigns each to the callee-saved register with the earliest free time. - -**Phase 2 -- Caller-saved registers for non-call-spanning values.** -Values whose live ranges do not cross any call are assigned caller-saved -registers (x86: `r11`, `r10`, `r8`, `r9`; ARM: `x13`, `x14`). Since -these values are not live across calls, the registers do not need to be -saved or restored at all -- neither at call sites nor in the -prologue/epilogue. - -**Phase 3 -- Callee-saved spillover.** -After Phases 1 and 2, any remaining callee-saved registers are assigned to -the highest-priority non-call-spanning values that did not fit in the -caller-saved pool. This is critical for call-free hot loops (hash -functions, matrix multiply, sorting kernels) where all values compete for -only a few caller-saved registers. The one-time prologue/epilogue -save/restore cost is amortized over many loop iterations. 
- -### Priority Scoring - -Candidates are sorted by a priority score that combines live range length -and use count. Uses inside loops are weighted exponentially by nesting -depth: a use at loop depth D contributes 10^D to the weighted count -(depth 1 = 10x, depth 2 = 100x, depth 3 = 1000x, capped at 10,000 for -very deep nesting). This ensures inner-loop temporaries receive registers -ahead of straight-line code values, which is critical for compute-heavy -loops like zlib's `deflate_slow`, `longest_match`, and `slide_hash`. - -### Eligibility Filtering - -The allocator uses a whitelist approach: only values produced by simple, -well-understood instructions are eligible. Specifically: - -- **Eligible**: `BinOp`, `UnaryOp`, `Cmp`, `Cast` (GPR types only), - `Load`, `GetElementPtr`, `Copy`, `Call`/`CallIndirect` (integer result), - `Select`, `GlobalAddr`, `LabelAddr`, `AtomicLoad`, `AtomicRmw`, - `AtomicCmpxchg`. -- **Excluded**: Alloca values (they represent stack addresses, not data); - float and F128 values (they use dedicated FP register paths); i128 - values (they require register pairs); I64/U64 on 32-bit targets (they - require `eax:edx` pairs); values used only once immediately after - definition (no benefit from a register); values used as memory pointers - in instructions whose codegen paths access `resolve_slot_addr` directly. - -The allocator does not split live intervals: a value either gets a register -for its entire lifetime or remains on the stack. - -### Integration with Stack Layout - -The register allocator runs during `calculate_stack_space` as part of the -`run_regalloc_and_merge_clobbers` helper. Its liveness analysis result is -cached in `RegAllocResult::liveness` so the Tier 2 liveness-based slot -packing can reuse it, avoiding a redundant dataflow computation. 
Inline -assembly clobber registers are collected separately -(`collect_inline_asm_callee_saved`) and merged with the register allocator's -used-register set to produce the final set of callee-saved registers -requiring prologue/epilogue save/restore. - ---- - -## Liveness Analysis (liveness.rs) - -The liveness module computes live intervals for each IR value. A live -interval `[start, end]` represents the program point range where a value -must be preserved (either in a register or a stack slot). - -### Backward Dataflow - -The analysis proceeds in three steps: - -1. **Numbering**: Assign sequential program points to all instructions and - terminators across all basic blocks. -2. **Dataflow**: Run backward dataflow iteration to compute `live_in` and - `live_out` sets for each block. This correctly handles values live across - loop back-edges by iterating until a fixed point is reached. The dataflow - uses compact bitsets (packed `u64` words) instead of hash sets, with - value IDs remapped to a dense `[0..N)` range. Operations are - word-level: union = bitwise OR, difference = AND-NOT, equality = `==`. - This eliminates per-iteration heap allocation and replaces hash-table - operations with fast word-level bitwise ops. -3. **Interval construction**: Build intervals by taking the union of - definition/use points and live-through blocks. - -### Auxiliary Results - -The liveness result includes: - -- **Call points** (`call_points`): program points corresponding to - `Call`/`CallIndirect` instructions, used by the register allocator to - identify which values span calls and therefore need callee-saved - registers. -- **Loop nesting depth** (`block_loop_depth`): per-block depth computed via - DFS-based back-edge detection. Depth 0 = not in any loop, depth 1 = one - loop, etc. Used for priority weighting in the register allocator. 
- -### Canonical Operand Iterators - -The module also provides the canonical instruction/terminator operand -iterators (`for_each_operand_in_instruction`, -`for_each_value_use_in_instruction`, `for_each_operand_in_terminator`) -used by code generation, register allocation, stack layout, and liveness -analysis itself. This ensures that new IR instruction variants only need -operand traversal updates in one place. - ---- - -## Call ABI Classification (call_abi.rs) - -The call ABI module provides a unified classification system for function -call arguments and callee-side parameters. The core insight is that callers -and callees must agree exactly on where each argument lives (which register -or which stack offset), so the classification algorithm is implemented once -in `classify_args_core` and wrapped by two thin entry points: - -- **`classify_call_args`**: caller-side classification (returns - `CallArgClass`), used by call emission to place arguments into registers - and stack slots. -- **`classify_params_full`**: callee-side classification (returns - `ParamClass` with concrete stack offsets), used by `emit_store_params` to - load incoming parameters from their ABI-defined locations. 
- -### Classification Categories - -The classifier walks the argument list and assigns each to one of these -categories, consuming GP and FP register slots in order: - -| Category | Description | -|----------|-------------| -| `IntReg` | Integer/pointer in a GP register | -| `FloatReg` | Float/double in an FP register | -| `I128RegPair` | 128-bit integer in an aligned GP register pair | -| `F128Reg` / `F128GpPair` / `F128AlwaysStack` | Long double (arch-specific) | -| `StructByValReg` | Small struct (<=16 bytes) in 1--2 GP registers | -| `StructSseReg` | Small struct with all-float eightbytes in XMM registers | -| `StructMixedIntSseReg` | Mixed struct: INTEGER first, SSE second | -| `StructMixedSseIntReg` | Mixed struct: SSE first, INTEGER second | -| `StructSplitRegStack` | Struct split across last GP register and stack | -| `LargeStructStack` / `LargeStructByRefReg` / `LargeStructByRefStack` | Large struct (>16 bytes) | -| `Stack` / `F128Stack` / `I128Stack` / `StructByValStack` | Overflow to stack | -| `ZeroSizeSkip` | Zero-size struct, consumes nothing | - -### Architecture Parameterization via `CallAbiConfig` - -The classification is parameterized by a `CallAbiConfig` struct that each -backend provides via its `call_abi_config()` method. This captures ABI -differences without duplicating the core algorithm: - -- Number of available GP and FP argument registers (x86: 6 GP + 8 FP; - ARM: 8 GP + 8 FP; RISC-V: 8 GP + 8 FP; i686: 0 GP by default, up to 3 - with `-mregparm`). -- Whether variadic float arguments are promoted to GP registers (ARM, - RISC-V) or remain in FP registers (x86). -- Whether SysV struct classification (per-eightbyte GP/SSE analysis) is - used (x86-64 only, via `classify_sysv_struct`). -- Whether i128 arguments require even-aligned register pairs (ARM, RISC-V). -- Whether large structs are passed by hidden reference in a GP register - (AAPCS64) or by value on the stack. 
-- F128 handling: x87 stack convention (x86), Q-register (ARM), or GP pair - (RISC-V). -- Whether sret uses a dedicated register (ARM: x8) or consumes a regular GP - slot (x86: rdi, RISC-V: a0). When a dedicated register is used, the callee - classification promotes the first stack-overflow GP arg to the freed register - slot so that caller and callee agree on argument locations. - ---- - -## Cast Classification (cast.rs) - -The `cast.rs` module provides a shared `CastKind` enum and -`classify_cast_with_f128` function that all four backends use to determine -what conversion to emit. After pointer normalization (`Ptr` treated as -`U64`) and F128 reduction (on x86, `F128` is approximated as `F64` for -x87), the classifier returns one of approximately 20 `CastKind` variants: - -- `Noop` -- no conversion needed (same type, or Ptr to I64/U64). -- `FloatToSigned` / `FloatToUnsigned` -- FP to integer. -- `SignedToFloat` / `UnsignedToFloat` -- integer to FP. -- `FloatToFloat` -- F32 to F64 or vice versa. -- `IntWiden` / `IntNarrow` -- integer widening (sign/zero-extend) or - truncation. -- `SignedToUnsignedSameSize` / `UnsignedToSignedSameSize` -- same-width - reinterpretation (the RISC-V case requires sign-extension for U32 to I32 - due to the ABI requiring sign-extended 32-bit values in 64-bit - registers). -- `SignedToF128` / `UnsignedToF128` / `F128ToSigned` / `F128ToUnsigned` / - `FloatToF128` / `F128ToFloat` -- IEEE binary128 softfloat conversions on - ARM/RISC-V. - -The `f128_is_native` parameter distinguishes ARM/RISC-V (IEEE binary128, -requiring `__floatsitf`, `__fixtfdi`, `__extendsftf2`, etc.) from x86 -(x87 80-bit, approximated as F64 for computation). - -The module also provides `FloatOp` classification and `classify_float_binop` -to map IR binary operations to their floating-point categories, and -`F128CmpKind` / `f128_cmp_libcall` for F128 comparison library call -mapping. 
- ---- - -## Peephole Optimizer - -Each backend has a text-based peephole optimizer that operates on the -generated assembly string after instruction selection and before the -external assembler. The optimizer works on the full assembly output of a -module, processing it line by line. - -### Line Classification - -Each line of assembly text is pre-parsed into a compact enum/struct -representation. On x86-64, a `LineInfo` struct captures the `LineKind` -(store-to-rbp, load-from-rbp, self-move, label, jump, call, push, pop, -etc.), destination register, stack offset, extension kind, whether the line -contains indirect memory access, and a bitmask of referenced register -families. On ARM, a `LineKind` enum captures stores/loads to sp, register -moves, branches, and ALU instructions. This pre-parsing avoids repeated -string comparisons in hot optimization loops. - -### NOP Marking Strategy - -Lines to be eliminated are marked as `Nop` (their kind set to the Nop -variant and their text replaced with an empty string) rather than removed -from the line array. This preserves array indices for multi-line pattern -matching and adjacency checks. A final compaction pass filters out all -Nop-marked lines before returning the optimized text. - -### Iterative Convergence - -Local passes run iteratively (up to 8 rounds on all four architectures, -up to 4 rounds for cleanup) until no further changes are made. Each round makes a -single pass over all lines applying multiple pattern matchers -simultaneously. This handles cascading opportunities where one optimization -(e.g., removing a redundant load) exposes another (e.g., making a store -dead). - -### x86-64 Pass Structure (Seven Phases) - -The x86-64 peephole (in `x86/codegen/peephole/`) is the most -comprehensive, organized into seven phases: - -**Phase 1 -- Local passes** (iterative, up to 8 rounds): -`combined_local_pass` merges several single-scan patterns into one: - -1. 
**Adjacent store/load elimination**: `movq %rax, -8(%rbp)` followed by - `movq -8(%rbp), %rax` -- the load is redundant since the value is - already in the register. -2. **Redundant jump elimination**: `jmp .LBB0_1` where `.LBB0_1:` is the - immediately next non-empty line. -3. **Self-move elimination**: `movq %rax, %rax` is a no-op. -4. **Redundant `cltq` elimination**: sign-extension when the value is - already sign-extended. -5. **Redundant zero/sign extension elimination**: eliminates unnecessary - `movzbl`, `movsbl`, etc. - -Additionally: push/pop elimination and binary-op push/pop rewriting -(replacing push-op-pop sequences with direct register operations). - -**Phase 2 -- Global passes** (single execution): - -- **Global store forwarding** across fallthrough labels. -- **Register copy propagation**. -- **Dead register move elimination**. -- **Dead store elimination** (stores to stack slots never subsequently - loaded). -- **Compare-and-branch fusion** at the assembly level. -- **Memory operand folding** (combining separate load + operation into a - single memory-operand instruction). - -**Phase 3 -- Post-global local cleanup** (up to 4 rounds): -Re-runs local passes to clean up new opportunities exposed by global -passes (e.g., a global pass may make a store dead, which makes a preceding -load dead). - -**Phase 4 -- Loop trampoline elimination**: Removes unnecessary jump -trampolines created during code generation, followed by additional local -cleanup if changes were made. - -**Phase 5 -- Tail call optimization + never-read store elimination**: -Converts `call` + `ret` sequences into `jmp` (tail calls), then performs -whole-function analysis removing stores to stack slots that are never -subsequently loaded. - -**Phase 6 -- Unused callee-save elimination**: Removes prologue -push/epilogue pop pairs for callee-saved registers that are never actually -referenced in the function body. 
- -**Phase 7 -- Frame compaction**: Reassigns stack slot offsets to eliminate -gaps left by eliminated stores, reducing total frame size. - -### Other Architectures - -- **AArch64**: Three-phase structure (8 rounds local, global passes once, - 4 rounds cleanup). Local passes cover store/load elimination on - `[sp, #off]` pairs, redundant branch elimination, self-move elimination - (64-bit `mov xN, xN` only -- 32-bit `mov wN, wN` zeros upper bits and - is not safe to eliminate), move chain optimization (`mov A, B; mov C, A` - becomes `mov C, B`), branch-over-branch fusion (`b.cc .Lskip; b .target; - .Lskip:` becomes `b.!cc .target`), and move-immediate chain optimization. - Global passes include register copy propagation and dead store - elimination. -- **RISC-V**: Follows the same three-phase structure (8/1/4 rounds) - adapted to its instruction set. -- **i686**: Four-phase structure (8/1/4 rounds plus never-read store - elimination as a final phase) adapted to the 32-bit x86 instruction set. - ---- - -## GEP Constant Offset Folding - -A pre-scan pass in `generation.rs` (`build_gep_fold_map`) identifies -`GetElementPtr` instructions with constant offsets that can be folded -directly into subsequent `Load`/`Store` addressing modes. This eliminates -the GEP instruction entirely and avoids materializing the computed pointer -to a stack slot. - -### Eligibility Criteria - -A GEP is foldable when all three conditions hold: - -1. Its offset is a compile-time constant (`Operand::Const`). -2. The constant fits in a 32-bit signed displacement (the safe common limit - across x86 disp32, ARM signed 9-bit unscaled / 12-bit scaled, and - RISC-V signed 12-bit). Unsigned constants that exceed `i32::MAX` but fit - in `u32` are sign-narrowed. -3. The GEP result is used *only* as the pointer operand of `Load`/`Store` - instructions -- not as a value operand, call argument, terminator - operand, or the base of another GEP. 
- -Phase 1 of the map construction collects all GEPs with constant offsets. -Phase 2 verifies that each candidate's result is used exclusively in -foldable positions by scanning all instructions and terminators for -non-pointer uses. GEPs with any non-pointer use, or whose Load/Store uses -involve i128 types or segment overrides, are removed from the map. - -### Folding Mechanism - -When a GEP is foldable, `generate_function` skips it during instruction -iteration. Each `Load`/`Store` that references the GEP result receives the -`(base, offset)` pair through the `GepFoldInfo` struct and calls -`emit_load_with_const_offset` or `emit_store_with_const_offset`. These -trait methods handle all three `SlotAddr` variants: - -- **Direct** (alloca base): the offset is folded directly into the - frame-pointer-relative slot address. For example, an alloca at `rbp-24` - with a GEP offset of `+8` becomes a single `movl -16(%rbp), %eax`, - avoiding a separate `leaq` instruction. -- **OverAligned** (runtime-aligned alloca): the aligned address is computed - first, then the offset is added to the address register before the - load/store. -- **Indirect** (non-alloca pointer base): the base pointer is loaded from - its stack slot into the address register, the constant offset is added - (if non-zero), and the load/store uses the resulting address. - -This optimization yields approximately 5% assembly size reduction on -real-world code like zlib. - -### Global Address Folding (x86-64) - -A related optimization on x86-64 folds `GlobalAddr` instructions whose only -uses are as `Load`/`Store` pointers into direct `symbol(%rip)` memory -accesses: - -- `build_global_addr_map` maps GlobalAddr values (and GEPs derived from - them) to symbol name strings (e.g., `"myvar"`, `"myvar+8"`). -- `build_foldable_global_addr_set` identifies GlobalAddr values where ALL - uses are foldable Load/Store pointers, allowing the `leaq symbol(%rip), - %rax` to be eliminated entirely. 
-- In kernel code model (`-mcmodel=kernel`), `build_global_addr_ptr_set` - distinguishes pointer uses (which need RIP-relative addressing) from - integer uses (which need absolute `R_X86_64_32S` addressing for the - linked virtual address). - ---- - -## Compare-and-Branch Fusion - -When the last instruction in a basic block is a `Cmp` whose boolean result -is used only by the block's `CondBranch` terminator (exactly one use, -detected via `count_value_uses`), the comparison and conditional branch are -fused into a single instruction sequence: `cmp` + `jCC` (x86) / `cmp` + -`b.cc` (ARM) / `bCC` (RISC-V). This avoids materializing the boolean -result to a register or stack slot and then re-testing it. - -The fusion is conservative: it excludes i128 and float comparisons (which -have multi-instruction codegen paths) and, on 32-bit targets, I64/U64 -comparisons (which require two-word comparison sequences that the fused -path does not support). - ---- - -## SlotAddr: Three-Way Memory Access Dispatch - -The `SlotAddr` enum (defined in `state.rs`) captures the three distinct -memory access patterns that appear throughout the codegen: - -```rust -pub enum SlotAddr { - OverAligned(StackSlot, u32), // Runtime-aligned alloca (alignment > 16) - Direct(StackSlot), // Normal alloca: slot IS the data - Indirect(StackSlot), // Non-alloca: slot holds a pointer -} -``` - -`CodegenState::resolve_slot_addr` classifies a value by checking whether it -is an alloca, whether it has over-alignment, and whether it is -register-assigned. Every load, store, GEP, memcpy, and address computation -dispatches on this enum to emit the correct instruction sequence, ensuring -the three patterns are handled uniformly and consistently across the entire -codebase. This eliminates the risk of one codegen path handling allocas -correctly while another forgets the OverAligned case. 
- ---- - -## Register Value Cache (state.rs) - -The `RegCache` tracks which IR value is currently known to reside in the -primary accumulator register (x86: `%rax`, ARM: `x0`, RISC-V: `t0`). When -an instruction produces a result via the accumulator (the common -load-compute-store pattern), the cache records the value ID. If the next -instruction needs that same value as its first operand, `emit_load_operand` -can skip the redundant stack load entirely. - -The cache follows a conservative invalidation policy: - -- **Invalidated after**: function calls, inline asm, stores through - pointers, atomic operations, complex multi-register operations, and any - instruction that might clobber the accumulator in ways the cache cannot - track. -- **Invalidated at**: basic block boundaries, since a value in a register - from a predecessor's fall-through is not guaranteed valid when control - arrives from a different predecessor. -- **Safety property**: a stale entry causes only a redundant load (the same - behavior as without the cache), while a missing invalidation would produce - incorrect code by skipping a needed load. - -The architecture mapping is: -- x86: accumulator = `%rax` -- ARM64: accumulator = `x0` -- RISC-V: accumulator = `t0` - ---- - -## F128 Soft-Float Framework (f128_softfloat.rs) - -ARM and RISC-V lack hardware quad-precision floating point. All F128 -operations on these targets go through compiler-rt/libgcc soft-float library -calls (`__addtf3`, `__multf3`, `__fixtfsi`, `__extendsftf2`, etc.). The -orchestration logic -- loading operands to argument positions, calling the -library function, and storing the result -- is identical between the two -architectures; only the register names, instruction mnemonics, and F128 -register representation differ. 
- -The `F128SoftFloat` trait captures approximately 48 architecture-specific -primitive methods (each 1--5 instructions), organized into categories: - -- **State access**: `f128_get_slot`, `f128_get_source`, - `f128_resolve_slot_addr`, `f128_is_alloca`, `f128_track_self`, - `f128_set_acc_cache`, `f128_set_dyn_alloca`. -- **Loading constants**: `f128_load_const_to_arg1`. -- **Loading from memory**: `f128_load_16b_from_addr_reg_to_arg1`, - `f128_load_from_frame_offset_to_arg1`, `f128_load_operand_and_extend`, - `f128_load_operand_to_acc`, `f128_load_indirect_ptr_to_addr_reg`, - `f128_load_from_addr_reg_to_acc`, `f128_load_from_direct_slot_to_acc`. -- **Address computation**: `f128_load_ptr_to_addr_reg`, - `f128_add_offset_to_addr_reg`, `f128_alloca_aligned_addr`, - `f128_move_callee_reg_to_addr_reg`, `f128_move_aligned_to_addr_reg`. -- **Storing**: `f128_store_const_halves_to_slot`, `f128_store_arg1_to_slot`, - `f128_copy_slot_to_slot`, `f128_copy_addr_reg_to_slot`, - `f128_store_const_halves_to_addr`, `f128_store_acc_to_dest`, - `f128_store_result_and_truncate`. -- **Argument marshalling**: `f128_move_arg1_to_arg2`, - `f128_save_arg1_to_sp`, `f128_reload_arg1_from_sp`, - `f128_move_acc_to_arg0`, `f128_move_arg0_to_acc`. -- **Conversions and comparison**: `f128_truncate_result_to_acc`, - `f128_cmp_result_to_bool`, `f128_sign_extend_acc`, - `f128_zero_extend_acc`, `f128_narrow_acc`, - `f128_extend_float_to_f128`, `f128_truncate_to_float_acc`. - -Shared orchestration functions build on these primitives: - -- `f128_operand_to_arg1`: load an F128 operand (value or constant) to the - first argument position, handling all SlotAddr variants. -- `f128_emit_store` / `f128_emit_load`: SlotAddr 4-way dispatch for F128 - store/load. -- `f128_emit_cast`: integer-to-F128 and float-to-F128 casts via libcalls. -- `f128_emit_binop`: F128 arithmetic via libcalls (`__addtf3`, etc.). -- `f128_cmp`: F128 comparison via libcalls. -- `f128_neg`: sign bit flip. 
- -The key architecture difference: -- **ARM**: F128 lives in a single NEON Q register (`q0`/`q1`). Moving - between argument positions is `mov v1.16b, v0.16b`. Sign bit flip uses - `mov` + `eor` + `mov` on the high lane. -- **RISC-V**: F128 lives in a GP register pair (`a0:a1` / `a2:a3`). Moving - between argument positions is `mv a2, a0; mv a3, a1`. Sign bit flip uses - `li` + `slli` + `xor` on the high register. - -On x86, F128 corresponds to x87 80-bit extended precision and is handled -differently: values are loaded/stored via `fldt`/`fstpt`, and arithmetic -uses x87 instructions directly rather than soft-float library calls. The -`CodegenState` tracks F128 load sources (`f128_load_sources`) to enable -full-precision reloading from the original memory location, and -`f128_direct_slots` to identify slots containing full x87 80-bit data. - ---- - -## Inline Assembly Framework (inline_asm.rs) - -All four backends share a common 4-phase inline assembly processing -pipeline, orchestrated by `emit_inline_asm_common`: - -1. **Classify constraints**: Parse GCC-style constraint strings and assign - register or memory operands. Specific register constraints (e.g., `"a"` - for `%rax`, RISC-V specific register names) are assigned first, then - general constraints (`"r"`) are assigned from a scratch register pool. - Tied operands (`"0"`, `"1"`) inherit their tied partner's register. -2. **Load inputs**: Load input values from stack slots into their assigned - registers. Read-write outputs (`"+r"`) are pre-loaded as well. -3. **Template substitution**: Replace `%0`, `%1`, `%[name]` operand - references in the template string with the assigned register names. - GCC modifiers are handled: `%b` (byte register), `%w` (word), `%h` - (high byte), `%P` (raw symbol name), `%c` (constant). Dialect - alternatives (`{att|intel}`) select the AT&T variant. The x86_common - module provides `resolve_dialect_alternatives` for this. -4. 
**Store outputs**: Store output registers back to their destination - stack slots. - -Each backend implements the `InlineAsmEmitter` trait to provide -architecture-specific register classification, constraint-to-register -mapping, sized register naming, and store/load logic. - -The `AsmOperandKind` enum covers: `GpReg`, `FpReg`, `Memory`, -`Specific(name)`, `Tied(index)`, `Immediate`, `Address`, `ZeroOrReg`, -`ConditionCode(suffix)`, `X87St0`, `X87St1`, and `QReg` (x86 registers -with accessible high-byte forms). - ---- - -## Codegen Options - -The `CodegenOptions` struct (in `mod.rs`) captures all CLI-driven flags -that affect code generation. These are propagated to the backends via -`apply_options` before code generation begins. - -| Option | Flag | Description | -|--------|------|-------------| -| `pic` | `-fPIC` | Position-independent code | -| `function_return_thunk` | `-mfunction-return=thunk-extern` | Replace `ret` with retpoline thunk (Spectre v2 / retbleed) | -| `indirect_branch_thunk` | `-mindirect-branch=thunk-extern` | Retpoline indirect calls/jumps | -| `patchable_function_entry` | `-fpatchable-function-entry=N[,M]` | NOP padding + `__patchable_function_entries` for ftrace | -| `cf_protection_branch` | `-fcf-protection=branch` | Intel CET/IBT `endbr64` emission | -| `no_sse` | `-mno-sse` | Avoid SSE in variadic prologues (Linux kernel) | -| `general_regs_only` | `-mgeneral-regs-only` | Avoid FP/SIMD registers (Linux kernel, AArch64) | -| `code_model_kernel` | `-mcmodel=kernel` | Kernel code model, `R_X86_64_32S` relocations | -| `no_jump_tables` | `-fno-jump-tables` | Force compare-and-branch for all switches | -| `no_relax` | `-mno-relax` | Suppress RISC-V linker relaxation | -| `debug_info` | `-g` | Emit DWARF `.file`/`.loc` directives | -| `function_sections` | `-ffunction-sections` | Each function in its own `.text.name` section | -| `data_sections` | `-fdata-sections` | Each global in its own data section | -| `code16gcc` | `-m16` | Prepend 
`.code16gcc` for 16-bit real mode (Linux boot) | - | `regparm` | `-mregparm=N` | Integer args in registers (i686, 0--3) | - | `omit_frame_pointer` | `-fomit-frame-pointer` | Free EBP as general register (i686) | - | `emit_cfi` | `-f[no-]asynchronous-unwind-tables` | Emit `.cfi_*` directives for `.eh_frame` unwind tables (default: on) | - ---- - -## Builtin Assembler - -Each architecture has a native assembler that parses the generated assembly -text into instructions, encodes them into machine code, and writes ELF -object files directly. The builtin assembler is the default. When the -`gcc_assembler` Cargo feature is enabled at compile time, GCC is used -instead. - -### Architecture - -Each assembler follows a three-stage pipeline: - -``` - Assembly text (String) - | - | Parser (parser.rs) - v - Vec<AsmStatement> (parsed instructions, directives, labels) - | - | Encoder (encoder.rs) - v - Vec (encoded machine code bytes + relocation entries) - | - | ELF Writer (elf_writer.rs) - v - ELF object file (.o) -``` - -**Parser**: Tokenizes and parses the assembly text into structured -`AsmStatement` items. Each statement is either an instruction (with -opcode, operands, and optional size suffix), a directive (`.section`, -`.globl`, `.byte`, `.long`, `.ascii`, `.align`, etc.), or a label -definition. The x86 and i686 backends share the same AT&T syntax parser -(i686 reuses `x86::assembler::parser`). AArch64 and RISC-V have -architecture-specific parsers. - -**Encoder**: Translates parsed instructions into machine code bytes. For -x86-64, this involves REX prefix generation, ModR/M and SIB byte -construction, and displacement/immediate encoding. For i686, REX prefixes -are omitted and the default operand size is 32-bit. For AArch64, -instructions are encoded as fixed-width 32-bit words. For RISC-V, -instructions are 32-bit words with an optional compression pass -(`compress.rs`) that converts eligible instructions to 16-bit RV64C -compact form. 
- -**ELF Writer**: Collects encoded sections (`.text`, `.data`, `.rodata`, -`.bss`, etc.), builds the symbol table and relocation entries, and writes -a complete ELF object file. x86-64 produces ELFCLASS64 with `EM_X86_64`; -i686 produces ELFCLASS32 with `EM_386` using `Elf32_Sym` and `Elf32_Rel`; -AArch64 produces ELFCLASS64 with `EM_AARCH64`; RISC-V produces ELFCLASS64 -with `EM_RISCV`. - -### Per-Architecture Assembler Details - -| Architecture | Parser | Encoder | Extra Features | -|-------------|--------|---------|---------------| -| x86-64 | AT&T syntax, shared | REX prefixes, ModR/M, SIB | SSE/AES-NI encoding | -| i686 | Reuses x86 parser | No REX, 32-bit operands | ELFCLASS32, Elf32_Rel | -| AArch64 | ARM assembly syntax | Fixed 32-bit encoding | imm12 auto-shift | -| RISC-V | RV assembly syntax | Fixed 32-bit encoding | RV64C compression | - -### Assembler Files - -Each backend's `assembler/` directory contains: - -| File | Purpose | -|------|---------| -| `mod.rs` | Entry point: `assemble(asm_text, output_path)` | -| `parser.rs` | Tokenize + parse assembly text (absent in i686; reuses x86) | -| `encoder/` | Instruction-to-bytes encoding (directory with 7--8 submodules per arch) | -| `elf_writer.rs` | ELF object file generation | -| `compress.rs` | RV64C instruction compression (RISC-V only) | - -The `encoder/` directory splits encoding by instruction category (e.g., GP -integer, SSE/NEON, FP, system, atomics) to keep each file focused. See -each architecture's assembler README for details. - ---- - -## Builtin Linker - -Each architecture has a native ELF linker that reads object files and -static archives, resolves symbols, applies relocations, and writes a -complete ELF executable. The builtin linker is the default. When the -`gcc_linker` Cargo feature is enabled at compile time, GCC is used instead. 
- -### Architecture - -The linker pipeline processes input files in this order: - -``` - Input .o files + CRT objects + static archives (.a) - | - | Read ELF headers and sections - v - Collected sections, symbols, and relocations - | - | Symbol resolution (strong/weak, local/global) - v - Resolved symbol table - | - | Apply relocations (per-architecture relocation types) - v - Linked sections with resolved addresses - | - | Write ELF executable (program headers, dynamic section if needed) - v - ELF executable -``` - -### Common Linker Responsibilities - -All four linker implementations handle: - -- **CRT object discovery**: Locates and includes `crt1.o`, `crti.o`, - `crtbegin.o`, `crtend.o`, and `crtn.o` from standard system paths - (probing both native and cross-compilation directories). -- **Static archive processing** (`.a` files): Reads ar-format archives and - selectively includes object files that define needed symbols. -- **Symbol resolution**: Handles strong vs. weak symbols, local vs. global - visibility, COMMON symbols, and undefined symbol diagnostics. -- **Section merging**: Merges `.text`, `.data`, `.rodata`, `.bss`, and - custom sections from multiple input objects. -- **Relocation application**: Applies all architecture-specific relocation - types to produce position-correct machine code. 
- -### Per-Architecture Linker Details - -| Architecture | Link Mode | Key Relocations | Special Features | -|-------------|-----------|-----------------|-----------------| -| x86-64 | Dynamic | R_X86_64_64, PC32, PLT32, GOTPCREL, GOTTPOFF, TPOFF32 | PLT/GOT, TLS (IE-to-LE relaxation), copy relocations | -| i686 | Dynamic | R_386_32, PC32, PLT32, GOTPC, GOTOFF, GOT32X, GOT32 | 32-bit ELF, `.rel` (not `.rela`) | -| AArch64 | Dynamic | ADR_PREL_PG_HI21, ADD_ABS_LO12_NC, CALL26, JUMP26, LDST*, ADR_GOT_PAGE | PLT/GOT, shared library output, IFUNC/IPLT, GLOB_DAT, copy relocations, TLS | -| RISC-V | Dynamic | HI20, LO12_I, LO12_S, CALL, PCREL_HI20, GOT_HI20, BRANCH | Linker relaxation markers | - -### Linker Files - -Each backend's `linker/` directory has a similar structure. Common files -include `mod.rs` (entry point), `link.rs` (main link driver), and -`input.rs` (input object/archive loading). See each architecture's -linker README for file-level details. - -- **x86-64** (8 files): `mod.rs`, `link.rs`, `input.rs`, `types.rs`, - `elf.rs` (ELF helpers), `plt_got.rs` (PLT/GOT construction), - `emit_exec.rs`, `emit_shared.rs` -- **i686** (12 files): `mod.rs`, `link.rs`, `input.rs`, `parse.rs`, - `types.rs`, `reloc.rs`, `emit.rs`, `sections.rs`, `symbols.rs`, - `shared.rs`, `dynsym.rs`, `gnu_hash.rs` -- **AArch64** (10 files): `mod.rs`, `link.rs`, `input.rs`, `types.rs`, - `elf.rs`, `reloc.rs`, `plt_got.rs`, `emit_dynamic.rs`, - `emit_shared.rs`, `emit_static.rs` -- **RISC-V** (10 files): `mod.rs`, `link.rs`, `input.rs`, `elf_read.rs`, - `relocations.rs`, `reloc.rs`, `sections.rs`, `symbols.rs`, - `emit_exec.rs`, `emit_shared.rs` - ---- - -## Assembler and Linker Selection (common.rs + mod.rs) - -Assembler and linker selection is controlled at **compile time** via Cargo -features, not by environment variables: - -| Feature | Effect | -|---------|--------| -| (default, no features) | **Builtin assembler and linker** for all architectures | -| `gcc_assembler` | Use GCC as 
the assembler instead of the builtin | -| `gcc_linker` | Use GCC as the linker instead of the builtin | -| `gcc_m16` | Use GCC for `-m16` (16-bit real mode boot code) | - -When using the GCC fallback, each target provides an `AssemblerConfig` -and `LinkerConfig` that specify the toolchain command, static flags, -and expected ELF `e_machine` value for input validation. - -The `CCC_KEEP_ASM` environment variable preserves the intermediate `.s` -file next to the output for debugging. - -### Usage Examples - -```bash -# Default build: builtin assembler and linker (fully self-contained) -cargo build --release -./target/release/ccc -o output input.c - -# Build with GCC assembler and linker fallback -cargo build --release --features gcc_assembler,gcc_linker - -# Static linking with builtin linker -ccc -static file.c -o file -``` - ---- - -## Switch Statement Compilation - -Switch statements use a density-based heuristic (defined in `traits.rs`) -to choose between jump tables and compare-and-branch chains: - -| Parameter | Value | Rationale | -|-----------|-------|-----------| -| `MIN_JUMP_TABLE_CASES` | 4 | Below this, a linear chain is always faster | -| `MAX_JUMP_TABLE_RANGE` | 4,096 | Larger tables waste memory for sparse switches | -| `MIN_JUMP_TABLE_DENSITY_PERCENT` | 40% | Below this, the table is mostly empty default entries | - -When `-fno-jump-tables` is set (required by the Linux kernel with retpoline -to avoid indirect jumps that objtool would reject), all switches use -compare-and-branch chains regardless of density. - ---- - -## Data Section Emission (common.rs) - -Global variables are classified into sections via `classify_global` which -returns a `GlobalSection` enum: `Extern`, `Custom`, `Rodata`, `Tdata`, -`Data`, `Common`, `Tbss`, or `Bss`. Each section group is emitted by -internal helpers: - -- `emit_section_group`: groups globals by section and emits section - directives. 
-- `emit_init_data`: recursively emits all `GlobalInit` variants (integers, - floats, strings, global addresses, address offsets, label differences, - compound initializers, zero-fill). -- `emit_symbol_directives`: emits linkage (`.globl`/`.local`) and - visibility (`.hidden`/`.protected`/`.internal`) directives. -- `emit_zero_global`: emits the zero-init BSS pattern (`.zero N`). - -The 64-bit data directive varies by architecture: `.quad` on x86/i686, -`.xword` on AArch64, `.dword` on RISC-V. This is parameterized through the -`PtrDirective` type returned by each backend's `ptr_directive()` method. diff --git a/src/backend/arm/README.md b/src/backend/arm/README.md deleted file mode 100644 index f1e1375de4..0000000000 --- a/src/backend/arm/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# AArch64 (ARM64) Backend - -The AArch64 backend targets ARM64 with the AAPCS64 calling convention. It -covers the full pipeline from IR to ELF executable: code generation -(instruction selection, register allocation, peephole optimization), a builtin -assembler (GNU assembly syntax parser, fixed-width encoder, ELF object writer), -and a builtin linker (static and dynamic linking, shared library output, -IFUNC/IPLT, TLS support). 
- -## Directory Structure - -``` -arm/ - codegen/ Code generation and peephole optimizer - assembler/ Builtin AArch64 assembler (parser, encoder, ELF writer) - linker/ Builtin AArch64 linker (static/dynamic linking, IFUNC/TLS) -``` - -## Sub-Module Documentation - -| Module | README | -|--------|--------| -| Code generation | [`codegen/README.md`](codegen/README.md) | -| Assembler | [`assembler/README.md`](assembler/README.md) | -| Linker | [`linker/README.md`](linker/README.md) | - -## Key Characteristics - -- **ABI**: AAPCS64 -- 8 GP argument registers, 8 FP/SIMD argument registers, - F128 in Q registers, I128 in aligned register pairs -- **Accumulator model**: Values flow through `x0`; callee-saved registers - (`x20`-`x28`) and two caller-saved registers (`x13`, `x14`) reduce stack - traffic -- **F128 (long double)**: IEEE binary128 via soft-float library calls - (`__addtf3`, `__multf3`, etc.) through NEON Q registers -- **NEON intrinsics**: SSE-equivalent 128-bit vector operations via NEON - instructions -- **Atomics**: ARMv8 exclusive monitor (LDXR/STXR) loops -- **Peephole optimizer**: 3-phase pipeline (iterative local, global copy - propagation + dead store elimination, local cleanup) -- **Assembler**: GNU assembly syntax, fixed 32-bit encoding, macro/conditional - preprocessor, ~400 base mnemonics -- **Linker**: Static and dynamic linking, shared library (`.so`) output, - IFUNC/IPLT with IRELATIVE relocations, TLS (LE, IE via GOT, TLSDESC and - GD relaxation to LE) diff --git a/src/backend/arm/asm_stub.sh b/src/backend/arm/asm_stub.sh deleted file mode 100755 index eb0f74ee7a..0000000000 --- a/src/backend/arm/asm_stub.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -# Placeholder assembler for AArch64 backend. -# Set MY_ASM to point to this script to test the custom assembler integration. -# TODO: Replace this stub with a real assembler implementation. -echo "ERROR: AArch64 custom assembler stub called but not yet implemented." 
>&2 -echo "Arguments: $@" >&2 -exit 1 diff --git a/src/backend/arm/assembler/README.md b/src/backend/arm/assembler/README.md deleted file mode 100644 index f7f4314ed7..0000000000 --- a/src/backend/arm/assembler/README.md +++ /dev/null @@ -1,512 +0,0 @@ -# AArch64 Built-in Assembler -- Design Document - -## Overview - -The built-in AArch64 assembler is a self-contained subsystem that translates -GNU-style assembly text (`.s` files), as emitted by the compiler's AArch64 -codegen, into ELF64 relocatable object files (`.o`). Its purpose is to -eliminate the external dependency on `aarch64-linux-gnu-gcc` for assembling, -making the compiler fully self-hosting on AArch64 Linux targets. - -The assembler is active by default (when the `gcc_assembler` Cargo feature is -not enabled). It accepts the same textual assembly that GCC's gas would consume -and produces ABI-compatible `.o` files that any standard AArch64 ELF linker (or -the companion built-in linker) can link. - -The implementation spans roughly 9,600 lines of Rust across four files and is -organized as a clean three-stage pipeline. Shared ELF infrastructure (section -management, symbol tables, ELF serialization) lives in `ElfWriterBase` in -`elf.rs`; the files here contain only AArch64-specific logic. 
- -``` - AArch64 Built-in Assembler - ================================================================ - - .s assembly text - | - v - +------------------+ - | parser.rs | Stage 1: Preprocess + Parse - | (~2,600 lines) | Macros, .rept, .if -> AsmStatement[] - +------------------+ - | - | Vec<AsmStatement> - v - +------------------+ - | elf_writer.rs | Stage 2: Process + Encode + Emit - | (~580 lines) | AArch64-specific: branch resolution, - | | sym diffs (uses ElfWriterBase) - +------------------+ - | ^ - | | encode_instruction() - v | - +------------------+ - | encoder/ | Instruction Encoding Library - | (~6,190 lines) | Mnemonic + Operands -> u32 words - +------------------+ - | - v - ELF64 .o file on disk -``` - -The single public entry point is: - -```rust -// mod.rs (~220 lines) -pub fn assemble(asm_text: &str, output_path: &str) -> Result<(), String> -``` - -It calls `parse_asm()`, expands literal pools (`ldr Xn, =symbol` → `ldr Xn, .Llpool_N` -+ `.quad` pool entries), resolves GNU numeric labels (`1f`/`1b`), creates an -`ElfWriter`, feeds it the parsed statements, and writes the final `.o` file. - - ---- - -## Stage 1: Parser (`parser.rs`) - -### Purpose - -Convert raw assembly text into a structured, typed intermediate -representation -- a `Vec<AsmStatement>`. Every subsequent stage works on this -IR; no raw text parsing happens after this point. - -### Key Data Structures - -| Type | Role | -|------|------| -| `AsmStatement` | Top-level IR node: `Label`, `Directive`, `Instruction`, or `Empty`. | -| `AsmDirective` | Fully-typed directive variant (28 kinds, from `.section` to `.cfi_*`). | -| `Operand` | Operand of an instruction (20 variants covering every AArch64 addressing mode). | -| `SectionDirective` | Parsed `.section name, "flags", @type` triple. | -| `DataValue` | Data that can be an integer, a symbol, a symbol+offset, or a symbol difference. | -| `SizeExpr` | The expression in `.size sym, expr` -- either a constant or `.- sym`. 
| -| `SymbolKind` | From `.type`: `Function`, `Object`, `TlsObject`, `NoType`. | - -### Operand Variants - -The `Operand` enum models every AArch64 operand shape the codegen can emit: - -``` -Reg("x0") -- general / FP / SIMD register -Imm(42) -- immediate value (#42) -Symbol("printf") -- bare symbol reference -SymbolOffset("arr", 16) -- symbol + constant -Mem { base, offset } -- [base, #offset] -MemPreIndex { base, offset } -- [base, #offset]! -MemPostIndex { base, offset } -- [base], #offset -MemRegOffset { base, index, .. } -- [base, Xm, extend #shift] -Modifier { kind, symbol } -- :lo12:symbol, :got_lo12:symbol -ModifierOffset { kind, sym, off } -- :lo12:symbol+offset -Shift { kind, amount } -- lsl #N -Extend { kind, amount } -- sxtw #N -Cond("eq") -- condition code -Barrier("ish") -- barrier option -Label(".LBB0_4") -- branch target -Expr("complex_expr") -- raw expression fallback -RegArrangement { reg, arr } -- v0.16b (NEON arrangement) -RegLane { reg, elem_size, index } -- v0.d[1] (NEON lane) -RegList(Vec) -- {v0.16b, v1.16b} -RegListIndexed { regs, index } -- {v0.s, v1.s}[0] (NEON single-element) -``` - -### Parsing Algorithm - -``` -parse_asm(text) - 0. Strip C-style /* ... */ block comments - 1. expand_macros() -- collect .macro/.endm definitions, expand invocations - - Supports default parameters, varargs (\()), .purgem - - Nested macro definitions tracked with depth counter - - Recursive expansion with 64-level depth limit - - \@ counter substitution for unique label generation - 2. expand_rept_blocks() -- flatten .rept/.endr and .irp/.irpc/.endr - 3. resolve_set_constants() -- substitute .set/.equ symbol values - 4. resolve_register_aliases() -- process .req/.unreq register aliases - 5. 
Conditional assembly (evaluated during line processing): - - .if/.elseif/.elsif/.else/.endif - - .ifdef/.ifndef (symbol existence) - - .ifc/.ifnc (string comparison) - - .ifb/.ifnb (blank argument test) - - .ifeq/.ifne (numeric comparison) - - Supports ==, !=, >, >=, <, <= comparisons - - Arithmetic expressions via shared asm_expr evaluator - 6. ldr =symbol pseudo-instruction → LdrLiteralPool (expanded to literal pool later) - for each line: - a. Trim whitespace, strip comments (// and @ style) - b. Split on ';' (GAS multi-statement separator) - c. For each sub-statement: - - Try to match "name:" -> Label(name) - - Try to match "." prefix -> parse_directive() - - Otherwise -> parse_instruction() - - parse_operands() splits on ',' respecting [] and {} nesting - - parse_single_operand() handles all operand shapes - - Post-pass: merge [base], #offset into MemPostIndex -``` - -### Supported Directives - -| Category | Directives | -|----------|-----------| -| Sections | `.section`, `.text`, `.data`, `.bss`, `.rodata`, `.pushsection`, `.popsection`/`.previous` | -| Symbols | `.globl`/`.global`, `.weak`, `.hidden`, `.protected`, `.internal`, `.type`, `.size`, `.local`, `.comm`, `.set`/`.equ` | -| Alignment | `.align`, `.p2align` (power-of-2), `.balign` (byte count) | -| Data emission | `.byte`, `.short`/`.hword`/`.2byte`, `.long`/`.4byte`/`.word`, `.quad`/`.8byte`/`.xword`, `.zero`/`.space`, `.ascii`, `.asciz`/`.string`, `.float`/`.single`, `.double`, `.inst` | -| Macros | `.macro`/`.endm` (with default params, varargs), `.purgem`, `.req`/`.unreq` (register aliases) | -| Repetition | `.rept`/`.endr`, `.irp`/`.endr`, `.irpc`/`.endr` | -| Conditionals | `.if`/`.elseif`/`.elsif`/`.else`/`.endif`, `.ifdef`/`.ifndef`, `.ifc`/`.ifnc`, `.ifb`/`.ifnb`, `.ifeq`/`.ifne` | -| Includes | `.incbin` (binary file inclusion) | -| CFI | `.cfi_startproc`, `.cfi_endproc`, `.cfi_def_cfa_offset`, `.cfi_offset`, and 12 more (all passed through as no-ops) | -| Literal pool | `.ltorg`/`.pool` 
(flushes pending literal pool entries from `ldr Xn, =symbol`) | -| Ignored | `.file`, `.loc`, `.ident`, `.addrsig`, `.addrsig_sym`, `.build_attributes`, `.eabi_attribute`, `.arch`, `.arch_extension`, `.cpu` | - -### Design Decisions (Parser) - -- **Eager parsing**: Directives are fully parsed at parse time (not deferred). - The `.section` flags string is decomposed into `SectionDirective`; `.type` - maps to a `SymbolKind` enum; `.align` values are converted from power-of-2 - to byte counts immediately. - -- **Comment stripping guards**: Both `//` and `@` are handled, but `@` is only - treated as a comment character when it does not prefix known GAS type tags - (`@object`, `@function`, `@progbits`, `@nobits`, `@tls_object`, `@note`). - -- **Raw operand preservation**: Each `Instruction` stores both the parsed - `Vec` and the raw operand text string, allowing the encoder to fall - back to text-level heuristics for unusual operand patterns. - - ---- - -## Stage 2: Instruction Encoder (`encoder/`) - -### Purpose - -Given a mnemonic string and a `Vec`, produce the 4-byte (32-bit) -little-endian machine code word. AArch64 has a fixed 32-bit instruction -width, which makes encoding straightforward compared to variable-length ISAs. - -### Key Data Structures - -| Type | Role | -|------|------| -| `EncodeResult` | Outcome of encoding one instruction. | -| `RelocType` | AArch64 ELF relocation types (21 variants). | -| `Relocation` | A relocation request: type + symbol + addend. | - -The `EncodeResult` enum has four variants: - -``` -Word(u32) -- single fully-resolved instruction -WordWithReloc { word, reloc } -- instruction needing a linker relocation -Words(Vec) -- multi-word sequence (e.g., movz+movk) -Skip -- pseudo-instruction; no code emitted -``` - -### Supported Instruction Categories - -The encoder handles a comprehensive set of AArch64 instructions. 
-The dispatch table in `encode_instruction()` maps ~400 base mnemonics -(~440 including condition code variants): - -| Category | Mnemonics | -|----------|-----------| -| **Data Processing** | `mov`, `movz`, `movk`, `movn`, `add`, `adds`, `sub`, `subs`, `and`, `orr`, `eor`, `ands`, `orn`, `eon`, `bics`, `mul`, `madd`, `msub`, `smull`, `umull`, `smaddl`, `umaddl`, `mneg`, `udiv`, `sdiv`, `umulh`, `smulh`, `neg`, `negs`, `mvn`, `adc`, `adcs`, `sbc`, `sbcs` | -| **Shifts** | `lsl`, `lsr`, `asr`, `ror` | -| **Bit fields** | `ubfm`, `sbfm`, `ubfx`, `sbfx`, `ubfiz`, `sbfiz`, `bfm`, `bfi`, `bfxil`, `extr` | -| **Extensions** | `sxtw`, `sxth`, `sxtb`, `uxtw`, `uxth`, `uxtb` | -| **Compare** | `cmp`, `cmn`, `tst`, `ccmp`, `fccmp` | -| **Conditional select** | `csel`, `csinc`, `csinv`, `csneg`, `cset`, `csetm`, `fcsel` | -| **Branches** | `b`, `bl`, `br`, `blr`, `ret`, `cbz`, `cbnz`, `tbz`, `tbnz`, `b.eq`/`beq`, ... (all 16 conditions) | -| **Loads/Stores** | `ldr`, `str`, `ldrb`, `strb`, `ldrh`, `strh`, `ldrsw`, `ldrsb`, `ldrsh`, `ldur`, `stur`, `ldp`, `stp`, `ldnp`, `stnp`, `ldxr`, `stxr`, `ldxrb`, `stxrb`, `ldxrh`, `stxrh`, `ldaxr`, `stlxr`, `ldaxrb`, `stlxrb`, `ldaxrh`, `stlxrh`, `ldar`, `stlr`, `ldarb`, `stlrb`, `ldarh`, `stlrh` | -| **Address** | `adrp`, `adr` | -| **Floating point** | `fmov`, `fadd`, `fsub`, `fmul`, `fdiv`, `fmax`, `fmin`, `fmaxnm`, `fminnm`, `fneg`, `fabs`, `fsqrt`, `fcmp`, `fmadd`, `fmsub`, `fnmadd`, `fnmsub`, `frintn`/`p`/`m`/`z`/`a`/`x`/`i`, `fcvtzs`, `fcvtzu`, `fcvtas`/`au`/`ns`/`nu`/`ms`/`mu`/`ps`/`pu`, `ucvtf`, `scvtf`, `fcvt` | -| **NEON three-same** | `add`, `sub`, `mul`, `and`, `orr`, `eor`, `orn`, `bic`, `bif`, `bit`, `bsl`, `cmeq`, `cmge`, `cmgt`, `cmhi`, `cmhs`, `cmtst`, `sqadd`, `uqadd`, `sqsub`, `uqsub`, `shadd`, `uhadd`, `shsub`, `uhsub`, `srhadd`, `urhadd`, `smax`, `umax`, `smin`, `umin`, `sabd`, `uabd`, `saba`, `uaba`, `sshl`, `ushl`, `sqshl`, `uqshl`, `srshl`, `urshl`, `sqrshl`, `uqrshl`, `addp`, `uminp`, `umaxp`, `sminp`, `smaxp`, 
`pmul` | -| **NEON two-misc** | `cnt`, `not`/`mvn`, `rev16`, `rev32`, `rev64`, `cls`, `clz`, `neg`, `abs`, `sqabs`, `sqneg`, `xtn`/`xtn2`, `uqxtn`/`uqxtn2`, `sqxtn`/`sqxtn2`, `sqxtun`/`sqxtun2`, `fcvtn`/`fcvtl`, `shll`/`shll2` | -| **NEON float vector** | `fadd`, `fsub`, `fmul`, `fdiv`, `fmax`, `fmin`, `fmaxnm`, `fminnm`, `fneg`, `fabs`, `fsqrt`, `frintn`/`p`/`m`/`z`/`a`/`x`/`i` (vector forms), `frecpe`, `frsqrte`, `frecps`, `frsqrts`, `faddp`, `fmaxp`, `fminp`, `fmaxnmp`, `fminnmp` | -| **NEON compare-zero** | `cmgt`/`cmge`/`cmeq`/`cmle`/`cmlt` `#0`, `fcmeq`/`fcmgt`/`fcmle`/`fcmlt` `#0.0` | -| **NEON shifts** | `sshr`, `ushr`, `srshr`, `urshr`, `ssra`, `usra`, `srsra`, `ursra`, `sri`, `sli`, `shl`, `sqshl`, `uqshl`, `sqshlu` | -| **NEON narrow** | `shrn`/`shrn2`, `rshrn`/`rshrn2`, `sqshrn`/`uqshrn`/`sqrshrn`/`uqrshrn` (+ `2` variants), `sqshrun`/`sqrshrun` (+ `2` variants) | -| **NEON widen/long** | `sshll`/`ushll`/`sxtl`/`uxtl` (+ `2` variants), `smull`/`umull`/`smlal`/`umlal`/`smlsl`/`umlsl`/`saddw`/`uaddw`/`ssubw`/`usubw`/`addhn`/`subhn`/`pmull` (+ `2` variants) | -| **NEON by-element** | `mul`, `mla`, `mls`, `fmul`, `fmla`, `fmls`, `smull`, `umull`, `smlal`, `umlal`, `sqdmulh`, `sqrdmulh` (with lane index) | -| **NEON reduce** | `addv`, `saddlv`, `umaxv`, `uminv`, `smaxv`, `sminv`, `fmaxv`, `fminv`, `fmaxnmv`, `fminnmv` | -| **NEON permute** | `zip1`, `zip2`, `uzp1`, `uzp2`, `trn1`, `trn2`, `ext`, `tbl`, `tbx` | -| **NEON insert/move** | `ins` (element/GPR), `umov`, `smov`, `dup` (element/GPR), `movi`, `mvni` | -| **NEON load/store** | `ld1`/`st1` (1-4 regs), `ld2`/`st2`, `ld3`/`st3`, `ld4`/`st4` (with post-index), `ld1r`/`ld2r`/`ld3r`/`ld4r` (with post-index) | -| **NEON convert** | `fcvtzs`, `fcvtzu`, `scvtf`, `ucvtf`, `fcvtns`, `fcvtms`, `fcvtas`, `fcvtps` (vector forms) | -| **NEON scalar** | `addp` (scalar), `add`/`sub` (d-regs), `sqabs`/`sqneg` (scalar), `sqshrn` (scalar) | -| **NEON crypto** | `aese`, `aesd`, `aesmc`, `aesimc`, `sha1h`, `sha1c`, 
`sha1m`, `sha1p`, `sha1su0`, `sha1su1`, `sha256h`, `sha256h2`, `sha256su0`, `sha256su1`, `eor3` | -| **System** | `nop`, `yield`, `wfe`, `wfi`, `sev`, `sevl`, `clrex`, `hint`, `bti`, `dc`, `ic`, `tlbi`, `dmb`, `dsb`, `isb`, `mrs`, `msr`, `svc`, `brk` | -| **Bit manipulation** | `clz`, `cls`, `rbit`, `rev`, `rev16`, `rev32` | -| **CRC32** | `crc32b`, `crc32h`, `crc32w`, `crc32x`, `crc32cb`, `crc32ch`, `crc32cw`, `crc32cx` | -| **LSE Atomics** | `cas`/`swp`/`ldadd`/`ldclr`/`ldeor`/`ldset` (with acquire/release/byte/halfword variants), `stadd`/`stclr`/`steor`/`stset` store aliases (with release/byte/halfword variants) | -| **Prefetch** | `prfm` | - -### Relocation Types Emitted - -When an instruction references an external symbol (e.g., `bl printf` or -`adrp x0, :got:variable`), the encoder returns `WordWithReloc`. The 21 -relocation types cover the full AArch64 static-linking relocation model: - -| Relocation | ELF Number | Usage | -|-----------|-----------|-------| -| `Call26` | 283 | `bl` (26-bit PC-relative call) | -| `Jump26` | 282 | `b` (26-bit PC-relative jump) | -| `AdrPrelLo21` | 274 | `adr` (21-bit PC-relative) | -| `AdrpPage21` | 275 | `adrp` (page-relative, bits [32:12]) | -| `AddAbsLo12` | 277 | `add :lo12:sym` (low 12 bits) | -| `Ldst8AbsLo12` | 278 | Load/store byte, low 12 | -| `Ldst16AbsLo12` | 284 | Load/store halfword, low 12 | -| `Ldst32AbsLo12` | 285 | Load/store word, low 12 | -| `Ldst64AbsLo12` | 286 | Load/store doubleword, low 12 | -| `Ldst128AbsLo12` | 299 | Load/store quadword, low 12 | -| `AdrGotPage21` | 311 | `adrp` via GOT | -| `Ld64GotLo12` | 312 | `ldr` from GOT entry | -| `TlsLeAddTprelHi12` | 549 | TLS Local Exec, high 12 | -| `TlsLeAddTprelLo12` | 551 | TLS Local Exec, low 12 (no overflow check) | -| `CondBr19` | 280 | Conditional branch, 19-bit | -| `TstBr14` | 279 | Test-and-branch, 14-bit | -| `Ldr19` | 273 | LDR literal, 19-bit PC-relative | -| `Abs64` | 257 | 64-bit absolute | -| `Abs32` | 258 | 32-bit absolute | -| `Prel32` | 
261 | 32-bit PC-relative | -| `Prel64` | 260 | 64-bit PC-relative | - -### Encoding Approach - -1. **Register parsing**: `parse_reg_num()` converts textual register names - (`x0`-`x30`, `w0`-`w30`, `sp`, `xzr`, `wzr`, `lr`, `d0`-`d31`, - `s0`-`s31`, `q0`-`q31`, `v0`-`v31`) to 5-bit register numbers. - -2. **Size inference**: The `sf` bit (bit 31) is set from the register name - prefix: `x`/`sp`/`xzr` = 64-bit, `w`/`wsp`/`wzr` = 32-bit. - -3. **Condition codes**: 16 codes (`eq`, `ne`, `cs`/`hs`, `cc`/`lo`, ..., - `al`, `nv`) are mapped to their 4-bit encoding. - -4. **Wide immediates**: `mov Xd, #large` first tries single-instruction - encodings (MOVZ for 0..0xFFFF, MOVN for bitwise-NOT in 16-bit range, - ORR with bitmask immediate for repeating patterns like 0x0101010101010101), - then falls back to a `movz`+`movk` sequence (up to 4 instructions for a - 64-bit constant), returned as `EncodeResult::Words`. - -5. **MOV special cases**: `mov` to/from `sp` encodes as `add Xd, Xn, #0`; - register-to-register `mov` encodes as `orr Rd, xzr, Rm`. NEON `mov` - variants (lane insert, lane extract, element-to-element) are detected by - operand type and encoded as `INS`/`UMOV`/`ORR` as appropriate. - - ---- - -## Stage 3: ELF Object Writer (`elf_writer.rs`) - -### Purpose - -Walk the `Vec`, accumulate section data, build the symbol table, -resolve local branches, and serialize everything into a valid ELF64 -relocatable object file. - -### Key Data Structures - -| Type | Role | -|------|------| -| `ElfWriter` | AArch64-specific ELF writer; composes with `ElfWriterBase` for shared infrastructure. | -| `ElfWriterBase` | Shared state machine from `elf.rs` -- section management, symbols, labels, directive processing, ELF serialization. | -| `ObjSection` | A section being built: name, type, flags, data bytes, alignment, relocation list (from `elf.rs`). | -| `ObjReloc` | A relocation entry: offset, type, symbol name, addend (from `elf.rs`). 
| -| `PendingReloc` | A deferred branch/local relocation (resolved after all labels are known). | -| `PendingSymDiff` | A deferred symbol-difference expression (e.g., `.long .LBB3 - .Ljt_0`). | -| `PendingExpr` | A deferred complex expression (section, offset, expression string, size). | - -### ElfWriter State - -```rust -pub struct ElfWriter { - pub base: ElfWriterBase, // Shared: sections, labels, symbols, directives - pending_branch_relocs: Vec, // Branch relocs to resolve at asm time - pending_sym_diffs: Vec, // Deferred A-B expressions - pending_exprs: Vec, // Deferred complex expressions -} -``` - -The `ElfWriterBase` (defined in `elf.rs`) holds all shared state: `current_section`, -`sections`, `section_order`, `labels`, `global_symbols`, `weak_symbols`, -`symbol_types`, `symbol_sizes`, `symbol_visibility`, and `aliases`. It also -provides shared methods for section management, directive processing, data -emission, and ELF serialization. This file only adds AArch64-specific branch -resolution and symbol difference handling. 
- -### Processing Algorithm - -``` -process_statements(statements): - for each statement: - Label(name) -> record (section, offset) in labels map - Directive(dir) -> process_directive(): - Section -> ensure_section(), update current_section - PushSection -> push current_section onto section_stack, switch - PopSection -> pop section_stack, restore current_section - Global -> mark in global_symbols - Weak -> mark in weak_symbols - Hidden/Protected/Internal -> mark visibility - SymbolType -> record in symbol_types - Size -> compute and record in symbol_sizes - Align/Balign -> pad current section to alignment - Byte/Short/Long/Quad -> emit data bytes - (Long/Quad with symbols emit relocations) - Zero -> emit fill bytes - Asciz/Ascii -> emit string bytes - Comm -> create COMMON symbol - Cfi/Ignored -> no-op - Instruction(m,ops) -> process_instruction(): - call encode_instruction(m, ops) - Word -> emit 4 bytes - WordWithReloc: - local (.L*) or branch reloc -> store in pending_branch_relocs - other external -> add_reloc() to section - Words -> emit all 4-byte words - Skip -> no-op - - resolve_sym_diffs(): resolve all A-B expressions - same-section -> patch data in place - cross-section -> emit R_AARCH64_PREL32 or PREL64 relocation (based on data size) - - resolve_local_branches(): resolve all deferred branch targets - same-section -> compute PC-relative offset, patch instruction word - JUMP26/CALL26 -> encode imm26 field - CONDBR19 -> encode imm19 field - TSTBR14 -> encode imm14 field - cross-section -> emit relocation with section symbol + addend - undefined symbol -> emit external relocation (symbol name + addend) -``` - -### ELF File Layout - -``` - +========================+ offset 0 - | ELF64 Header (64 B) | e_machine=EM_AARCH64 (183) - | | e_type=ET_REL (1) - +========================+ - | Section Data | .text, .data, .rodata, .bss, etc. 
- | (aligned per section) | Each section padded to its sh_addralign - +========================+ - | .rela sections | One per content section with relocations - | (8-byte aligned) | Each entry: 24 bytes (Elf64_Rela) - +========================+ - | .symtab | Symbol table (24 bytes per Elf64_Sym) - | (8-byte aligned) | Order: NULL, section syms, local, global - +========================+ - | .strtab | Symbol name strings - +========================+ - | .shstrtab | Section name strings - +========================+ - | Section Header Table | One Elf64_Shdr per section (64 bytes each) - | (8-byte aligned) | Order: NULL, content, rela, symtab, - | | strtab, shstrtab - +========================+ -``` - -### Symbol Table Construction - -`build_symbol_table()` runs just before ELF serialization: - -1. **Section symbols**: One `STT_SECTION` / `STB_LOCAL` symbol per content - section. - -2. **Defined symbols**: Every label recorded in `self.labels`, excluding - `.L*` / `.l*` local labels. Binding is determined from `global_symbols`, - `weak_symbols`, or defaults to local. Type and size come from - `symbol_types` and `symbol_sizes`. - -3. **Undefined symbols**: Every symbol referenced in relocations that has no - definition. These get `STB_GLOBAL` binding (or `STB_WEAK` if declared - `.weak`). - -4. **COMMON symbols**: Created by `.comm` directives with `SHN_COMMON` section - index. - -### Local Label and Data Relocation Resolution - -Two resolution passes run after all statements are processed: - -- **`resolve_local_data_relocs()`**: Rewrites relocations that reference `.L*` - labels (which will not appear in the symbol table) to instead reference the - section symbol plus the label's offset as addend. This matches the behavior - of GCC's assembler. - -- **`resolve_sym_diffs()`**: Handles `.long .LA - .LB` and `.quad sym+off - .` - style expressions. Same-section differences are computed and patched directly. 
- Cross-section and external-symbol differences produce `R_AARCH64_PREL32` - relocations for 4-byte data or `R_AARCH64_PREL64` for 8-byte data. Composite - symbol names like `sym+offset` are decomposed into a base symbol and numeric addend. - - ---- - -## Design Decisions and Trade-offs - -### 1. Single-pass parsing, two-pass encoding - -Parsing is single-pass and purely syntactic. The ELF writer makes two -logical passes: a forward pass to collect all labels and emit code/data, then -backward resolution passes to fix up local branches and symbol differences. -This avoids the complexity of a full two-pass assembler while handling forward -references correctly. - -### 2. Intra-section branch resolution at assembly time - -All branch-type relocations (B, BL, B.cond, CBZ/CBNZ, TBZ/TBNZ) are deferred -and resolved after all labels are known. Same-section branches -- whether to -`.L*` local labels or to named labels like `__primary_switch` -- are resolved -by the assembler itself, producing fully-linked instruction words. Only -cross-section or truly external symbol references generate relocations in the -`.o` file. This matches GAS behavior and avoids relying on the linker to -resolve intra-section PC-relative branches. - -### 3. No DWARF emission - -CFI directives (`.cfi_startproc`, `.cfi_offset`, etc.) are parsed and -silently ignored. The assembler does not emit `.eh_frame` or `.debug_*` -sections. This is acceptable for the compiler's use case but means -stack unwinding and debugger support rely on the external assembler path. - -### 4. Deterministic output - -Section order is tracked via `section_order: Vec` rather than relying -on `HashMap` iteration order. This ensures identical input always produces -bit-identical output. - -### 5. Fixed instruction width simplifies encoding - -AArch64's uniform 32-bit instruction encoding means every instruction is -exactly 4 bytes. There is no need for instruction-length calculation or -relaxation passes (unlike x86). 
The only multi-word output is the -`movz`+`movk` sequence for wide immediates, returned as `EncodeResult::Words`. - - ---- - -## File Inventory - -| File | Lines | Purpose | -|------|-------|---------| -| `mod.rs` | ~220 | Public API: `assemble()` entry point, GNU numeric label resolution (`1f`/`1b`) | -| `parser.rs` | ~2,600 | Preprocessor (macros, .rept, .irp, conditionals, aliases) and parser: text -> `Vec` | -| `encoder/` | ~6,190 | Instruction encoder (split into focused submodules, see below) | -| `elf_writer.rs` | ~580 | ELF object file writer: composes with `ElfWriterBase` (from `elf.rs`), adds AArch64-specific branch resolution and symbol difference handling | -| **Total** | **~9,600** | | - -### Encoder Submodules (`encoder/`) - -The instruction encoder is organized as a directory of focused submodules: - -| File | Lines | Role | -|------|-------|------| -| `mod.rs` | ~973 | `EncodeResult`/`RelocType`/`Relocation` types, register parsing helpers (`parse_reg_num`, `encode_cond`, `get_reg`, `get_imm`, `sf_bit`), `encode_instruction()` dispatch | -| `data_processing.rs` | ~1,067 | Data processing: MOV/MOVZ/MOVK/MOVN, ADD/SUB, logical (AND/ORR/EOR), multiply/divide, carry ops, shifts, extensions, ORN/EON/BICS/BIC | -| `compare_branch.rs` | ~322 | Compare (CMP/CMN/TST), conditional select (CSEL/CSINC/CSINV/CSNEG), branches (B/BL/BR/BLR/RET/CBZ/CBNZ/TBZ/TBNZ), conditional aliases | -| `load_store.rs` | ~884 | Load/store: LDR/STR variants (byte/half/word/double), exclusive/acquire/release, ADRP/ADR, prefetch, LSE atomic operations | -| `fp_scalar.rs` | ~271 | Scalar floating-point: FADD/FSUB/FMUL/FDIV/FSQRT, FCMP, FMA, rounding, conversions (FCVT/SCVTF/UCVTF) | -| `neon.rs` | ~1,854 | NEON/SIMD: three-same, two-misc, float vector, compare-zero, shifts, narrow/widen, by-element, reduce, permute, insert/move, load/store (LD1-LD4/ST1-ST4), crypto (AES/SHA) | -| `system.rs` | ~577 | System instructions: NOP, WFE, WFI, SEV, BTI, DMB, DSB, ISB, DC, IC, TLBI, MRS, MSR, 
SVC, BRK, CLREX, HINT | -| `bitfield.rs` | ~246 | Bit manipulation: UBFM/SBFM/UBFX/SBFX/UBFIZ/SBFIZ/BFM/BFI, CLZ/CLS, RBIT/REV/REV16/REV32, CRC32 | diff --git a/src/backend/arm/assembler/elf_writer.rs b/src/backend/arm/assembler/elf_writer.rs deleted file mode 100644 index 33c70b6d72..0000000000 --- a/src/backend/arm/assembler/elf_writer.rs +++ /dev/null @@ -1,788 +0,0 @@ -//! ELF object file writer for AArch64. -//! -//! Takes parsed assembly statements and produces an ELF .o (relocatable) file -//! with proper sections, symbols, and relocations for AArch64/ELF64. -//! -//! Uses `ElfWriterBase` from `elf.rs` for shared section/symbol/relocation -//! management, directive processing, and ELF serialization. This file only -//! contains AArch64-specific logic: instruction encoding dispatch, branch -//! resolution (AArch64 relocation types), and symbol difference resolution. - -// ELF writer helpers; some section/relocation utilities defined for completeness. -#![allow(dead_code)] - -use super::parser::{AsmStatement, AsmDirective, SymbolKind, SizeExpr, DataValue, Operand}; -use super::encoder::{encode_instruction, EncodeResult, RelocType}; -use crate::backend::elf::{ - self, - STT_NOTYPE, STT_OBJECT, STT_FUNC, STT_TLS, - STV_HIDDEN, STV_PROTECTED, STV_INTERNAL, - ELFCLASS64, EM_AARCH64, - ElfWriterBase, ObjReloc, -}; - -/// AArch64 NOP instruction: `d503201f` in little-endian -const AARCH64_NOP: [u8; 4] = [0x1f, 0x20, 0x03, 0xd5]; - -/// The ELF writer for AArch64. -/// -/// Composes with `ElfWriterBase` for shared infrastructure and adds -/// AArch64-specific branch resolution and symbol difference handling. -pub struct ElfWriter { - /// Shared ELF writer state (sections, symbols, labels, directives) - pub base: ElfWriterBase, - /// Pending branch/local relocations to resolve after all labels are known. - /// Includes all branch-type relocs (B, BL, B.cond, CBZ, TBZ, etc.) 
plus - /// local .L-prefixed label references, so intra-section targets can be - /// resolved at assembly time without emitting unnecessary relocations. - pending_branch_relocs: Vec, - /// Pending symbol differences to resolve after all labels are known - pending_sym_diffs: Vec, - /// Pending raw expressions to resolve after all labels are known - pending_exprs: Vec, - /// Pending instructions with symbolic offset expressions (resolved after labels are known) - pending_instructions: Vec, -} - -struct PendingReloc { - section: String, - offset: u64, - reloc_type: u32, - symbol: String, - addend: i64, -} - -/// Returns true if the given ELF relocation type is a branch/jump relocation -/// that should be deferred for intra-section resolution at assembly time. -fn is_branch_reloc_type(elf_type: u32) -> bool { - matches!(elf_type, - 279 | // R_AARCH64_TSTBR14 - 280 | // R_AARCH64_CONDBR19 - 282 | // R_AARCH64_JUMP26 - 283 // R_AARCH64_CALL26 - ) -} - -/// A pending raw expression to be resolved after all labels are known. -struct PendingExpr { - section: String, - offset: u64, - expr: String, - size: usize, -} - -/// A pending symbol difference to be resolved after all labels are known. -struct PendingSymDiff { - /// Section containing the data directive - section: String, - /// Offset within that section where the value should be written - offset: u64, - /// The positive symbol (A in A - B) - sym_a: String, - /// The negative symbol (B in A - B) - sym_b: String, - /// Extra addend - extra_addend: i64, - /// Size in bytes (1, 4, or 8) - size: usize, -} - -/// A pending instruction whose operands contain symbolic expressions. -/// A NOP placeholder is emitted at the instruction offset; after all labels -/// are known, the expression is evaluated, operands are updated, and the -/// instruction is re-encoded and patched in place. 
-struct PendingInstruction { - section: String, - offset: u64, - mnemonic: String, - operands: Vec, - raw_operands: String, -} - -impl ElfWriter { - pub fn new() -> Self { - Self { - base: ElfWriterBase::new(AARCH64_NOP, 4), - pending_branch_relocs: Vec::new(), - pending_sym_diffs: Vec::new(), - pending_exprs: Vec::new(), - pending_instructions: Vec::new(), - } - } - - /// Resolve pending symbol differences after all labels are known. - fn resolve_sym_diffs(&mut self) -> Result<(), String> { - let pending = std::mem::take(&mut self.pending_sym_diffs); - for diff in &pending { - let sym_a_info = self.base.labels.get(&diff.sym_a).cloned(); - let sym_b_info = self.base.labels.get(&diff.sym_b).cloned(); - - match (sym_a_info, sym_b_info) { - (Some((sec_a, off_a)), Some((sec_b, off_b))) => { - if sec_a == sec_b { - // Same section: resolve at assembly time by patching the data - let value = (off_a as i64) - (off_b as i64) + diff.extra_addend; - if let Some(section) = self.base.sections.get_mut(&diff.section) { - let off = diff.offset as usize; - if diff.size == 1 && off < section.data.len() { - section.data[off] = value as u8; - } else if diff.size == 4 && off + 4 <= section.data.len() { - section.data[off..off + 4].copy_from_slice(&(value as i32).to_le_bytes()); - } else if diff.size == 8 && off + 8 <= section.data.len() { - section.data[off..off + 8].copy_from_slice(&value.to_le_bytes()); - } - } - } else { - // Cross-section: emit R_AARCH64_PREL32 or PREL64 based on size - let addend = off_a as i64 + diff.offset as i64 - off_b as i64 + diff.extra_addend; - let reloc_type = if diff.size == 8 { - RelocType::Prel64.elf_type() - } else { - RelocType::Prel32.elf_type() - }; - if let Some(section) = self.base.sections.get_mut(&diff.section) { - section.relocs.push(ObjReloc { - offset: diff.offset, - reloc_type, - symbol_name: sec_a.clone(), - addend, - }); - } - } - } - _ => { - // Forward-referenced or external symbols: emit PREL32 or PREL64 based on size - let reloc_type 
= if diff.size == 8 { - RelocType::Prel64.elf_type() - } else { - RelocType::Prel32.elf_type() - }; - if let Some(section) = self.base.sections.get_mut(&diff.section) { - section.relocs.push(ObjReloc { - offset: diff.offset, - reloc_type, - symbol_name: diff.sym_a.clone(), - addend: diff.extra_addend, - }); - } - } - } - } - Ok(()) - } - - /// Resolve pending raw expressions by substituting label offsets and evaluating. - fn resolve_pending_exprs(&mut self) -> Result<(), String> { - let pending = std::mem::take(&mut self.pending_exprs); - for pexpr in &pending { - // Substitute label names with their offset values - let mut expr = pexpr.expr.clone(); - - // Collect all label names, sorted longest first to avoid partial replacements - let mut label_names: Vec<&String> = self.base.labels.keys().collect(); - label_names.sort_by_key(|name| std::cmp::Reverse(name.len())); - - let mut all_resolved = true; - for label_name in &label_names { - if expr.contains(label_name.as_str()) { - if let Some((_section, offset)) = self.base.labels.get(*label_name) { - expr = expr.replace(label_name.as_str(), &offset.to_string()); - } else { - all_resolved = false; - } - } - } - - if all_resolved { - // Try to evaluate the expression - if let Ok(val) = crate::backend::asm_expr::parse_integer_expr(&expr) { - if let Some(section) = self.base.sections.get_mut(&pexpr.section) { - let off = pexpr.offset as usize; - match pexpr.size { - 1 if off < section.data.len() => { - section.data[off] = val as u8; - } - 2 if off + 2 <= section.data.len() => { - section.data[off..off + 2].copy_from_slice(&(val as i16).to_le_bytes()); - } - 4 if off + 4 <= section.data.len() => { - section.data[off..off + 4].copy_from_slice(&(val as i32).to_le_bytes()); - } - 8 if off + 8 <= section.data.len() => { - section.data[off..off + 8].copy_from_slice(&val.to_le_bytes()); - } - _ => {} - } - } - } else { - // Couldn't evaluate - emit as symbol reference - if let Some(section) = 
self.base.sections.get_mut(&pexpr.section) { - section.relocs.push(ObjReloc { - offset: pexpr.offset, - reloc_type: RelocType::Abs32.elf_type(), - symbol_name: pexpr.expr.clone(), - addend: 0, - }); - } - } - } else { - // Unresolved symbols - emit as relocation - if let Some(section) = self.base.sections.get_mut(&pexpr.section) { - section.relocs.push(ObjReloc { - offset: pexpr.offset, - reloc_type: RelocType::Abs32.elf_type(), - symbol_name: pexpr.expr.clone(), - addend: 0, - }); - } - } - } - Ok(()) - } - - /// Resolve pending instructions that contain symbolic offset expressions. - /// After all labels are positioned, substitute label names with offsets, - /// evaluate the expression, update operands, re-encode, and patch in place. - fn resolve_pending_instructions(&mut self) -> Result<(), String> { - let pending = std::mem::take(&mut self.pending_instructions); - for pinstr in &pending { - // Resolve each operand that has a deferred expression - let resolved_operands: Vec = pinstr.operands.iter().map(|op| { - match op { - Operand::MemExpr { base, expr, writeback } => { - if let Some(val) = self.resolve_label_expr(expr) { - if *writeback { - Operand::MemPreIndex { base: base.clone(), offset: val } - } else { - Operand::Mem { base: base.clone(), offset: val } - } - } else { - op.clone() - } - } - Operand::Expr(expr) => { - if let Some(val) = self.resolve_label_expr(expr) { - Operand::Imm(val) - } else { - op.clone() - } - } - _ => op.clone(), - } - }).collect(); - - // Re-encode the instruction with resolved operands - match encode_instruction(&pinstr.mnemonic, &resolved_operands, &pinstr.raw_operands) { - Ok(EncodeResult::Word(word)) => { - if let Some(section) = self.base.sections.get_mut(&pinstr.section) { - let off = pinstr.offset as usize; - if off + 4 <= section.data.len() { - section.data[off..off + 4].copy_from_slice(&word.to_le_bytes()); - } - } - } - Ok(EncodeResult::WordWithReloc { word, reloc }) => { - let elf_type = reloc.reloc_type.elf_type(); - if 
let Some(section) = self.base.sections.get_mut(&pinstr.section) { - let off = pinstr.offset as usize; - if off + 4 <= section.data.len() { - section.data[off..off + 4].copy_from_slice(&word.to_le_bytes()); - } - let is_local = reloc.symbol.starts_with(".L") || reloc.symbol.starts_with(".l") || reloc.symbol == "."; - if is_local || is_branch_reloc_type(elf_type) { - self.pending_branch_relocs.push(PendingReloc { - section: pinstr.section.clone(), - offset: pinstr.offset, - reloc_type: elf_type, - symbol: reloc.symbol.clone(), - addend: reloc.addend, - }); - } else { - section.relocs.push(ObjReloc { - offset: pinstr.offset, - reloc_type: elf_type, - symbol_name: reloc.symbol, - addend: reloc.addend, - }); - } - } - } - Ok(EncodeResult::Words(words)) => { - if let Some(section) = self.base.sections.get_mut(&pinstr.section) { - let off = pinstr.offset as usize; - for (j, word) in words.iter().enumerate() { - let wo = off + j * 4; - if wo + 4 <= section.data.len() { - section.data[wo..wo + 4].copy_from_slice(&word.to_le_bytes()); - } - } - } - } - Ok(EncodeResult::Skip) => {} - Err(e) => { - // Log error but don't fail the whole assembly - eprintln!("warning: failed to resolve deferred instruction '{}': {}", pinstr.mnemonic, e); - } - } - } - Ok(()) - } - - /// Substitute all known label names in an expression string with their byte - /// offsets, then evaluate the resulting numeric expression. 
- fn resolve_label_expr(&self, expr: &str) -> Option { - let mut resolved = expr.to_string(); - - // Collect all label names, sorted longest first to avoid partial replacements - let mut label_names: Vec<&String> = self.base.labels.keys().collect(); - label_names.sort_by_key(|name| std::cmp::Reverse(name.len())); - - for label_name in &label_names { - if resolved.contains(label_name.as_str()) { - if let Some((_section, offset)) = self.base.labels.get(*label_name) { - resolved = resolved.replace(label_name.as_str(), &offset.to_string()); - } - } - } - - crate::backend::asm_expr::parse_integer_expr(&resolved).ok() - } - - /// Process all parsed assembly statements. - pub fn process_statements(&mut self, statements: &[AsmStatement]) -> Result<(), String> { - for stmt in statements { - self.process_statement(stmt)?; - } - // Merge subsections (e.g., .text.__subsection.1 → .text) before resolving - let remap = self.base.merge_subsections(); - // Fix up pending references that pointed to now-merged subsection names - if !remap.is_empty() { - for reloc in &mut self.pending_branch_relocs { - if let Some((parent, offset_adj)) = remap.get(&reloc.section) { - reloc.offset += offset_adj; - reloc.section = parent.clone(); - } - } - for diff in &mut self.pending_sym_diffs { - if let Some((parent, offset_adj)) = remap.get(&diff.section) { - diff.offset += offset_adj; - diff.section = parent.clone(); - } - } - for expr in &mut self.pending_exprs { - if let Some((parent, offset_adj)) = remap.get(&expr.section) { - expr.offset += offset_adj; - expr.section = parent.clone(); - } - } - for instr in &mut self.pending_instructions { - if let Some((parent, offset_adj)) = remap.get(&instr.section) { - instr.offset += offset_adj; - instr.section = parent.clone(); - } - } - } - // Resolve symbol differences first (needs all labels to be known) - self.resolve_sym_diffs()?; - self.resolve_pending_exprs()?; - self.resolve_pending_instructions()?; - self.resolve_local_branches()?; - Ok(()) - } 
- - fn process_statement(&mut self, stmt: &AsmStatement) -> Result<(), String> { - match stmt { - AsmStatement::Empty => Ok(()), - - AsmStatement::Label(name) => { - self.base.ensure_text_section(); - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.base.labels.insert(name.clone(), (section, offset)); - Ok(()) - } - - AsmStatement::Directive(dir) => { - self.process_directive(dir) - } - - AsmStatement::Instruction { mnemonic, operands, raw_operands } => { - self.process_instruction(mnemonic, operands, raw_operands) - } - - AsmStatement::LdrLiteralPool { .. } => { - // Should have been expanded by expand_literal_pools() before reaching here - Err("LdrLiteralPool should have been expanded before ELF writing".to_string()) - } - } - } - - fn process_directive(&mut self, dir: &AsmDirective) -> Result<(), String> { - match dir { - AsmDirective::Section(sec) => { - self.base.process_section_directive( - &sec.name, - sec.flags.as_deref().unwrap_or(""), - sec.flags.is_some(), // flags are explicit when provided - sec.section_type.as_deref(), - ); - Ok(()) - } - - AsmDirective::PushSection(sec) => { - self.base.push_section( - &sec.name, - sec.flags.as_deref().unwrap_or(""), - sec.flags.is_some(), - sec.section_type.as_deref(), - ); - Ok(()) - } - - AsmDirective::PopSection => { - self.base.pop_section(); - Ok(()) - } - - AsmDirective::Previous => { - self.base.restore_previous_section(); - Ok(()) - } - - AsmDirective::Subsection(n) => { - self.base.set_subsection(*n); - Ok(()) - } - - AsmDirective::Global(sym) => { - for s in sym.split(',') { - let s = s.trim(); - if !s.is_empty() { - self.base.set_global(s); - } - } - Ok(()) - } - AsmDirective::Weak(sym) => { self.base.set_weak(sym); Ok(()) } - AsmDirective::Hidden(sym) => { self.base.set_visibility(sym, STV_HIDDEN); Ok(()) } - AsmDirective::Protected(sym) => { self.base.set_visibility(sym, STV_PROTECTED); Ok(()) } - AsmDirective::Internal(sym) => { 
self.base.set_visibility(sym, STV_INTERNAL); Ok(()) } - - AsmDirective::SymbolType(sym, kind) => { - let st = match kind { - SymbolKind::Function => STT_FUNC, - SymbolKind::Object => STT_OBJECT, - SymbolKind::TlsObject => STT_TLS, - SymbolKind::NoType => STT_NOTYPE, - }; - self.base.set_symbol_type(sym, st); - Ok(()) - } - - AsmDirective::Size(sym, expr) => { - match expr { - SizeExpr::CurrentMinusSymbol(label) => { - self.base.set_symbol_size(sym, Some(label), None); - } - SizeExpr::Constant(size) => { - self.base.set_symbol_size(sym, None, Some(*size)); - } - } - Ok(()) - } - - AsmDirective::Align(bytes) | AsmDirective::Balign(bytes) => { - self.base.align_to(*bytes); - Ok(()) - } - - AsmDirective::Byte(vals) => self.emit_data_values(vals, 1), - - AsmDirective::Short(vals) => { - for val in vals { - self.base.emit_bytes(&(*val as u16).to_le_bytes()); - } - Ok(()) - } - - AsmDirective::Long(vals) => self.emit_data_values(vals, 4), - AsmDirective::Quad(vals) => self.emit_data_values(vals, 8), - - AsmDirective::Zero(size, fill) => { self.base.emit_bytes(&vec![*fill; *size]); Ok(()) } - AsmDirective::Asciz(bytes) => { self.base.emit_bytes(bytes); Ok(()) } - AsmDirective::Ascii(bytes) => { self.base.emit_bytes(bytes); Ok(()) } - - AsmDirective::Comm(sym, size, align) => { - self.base.emit_comm(sym, *size, *align); - Ok(()) - } - - AsmDirective::Local(_) => Ok(()), - - AsmDirective::Set(alias, target) => { - self.base.set_alias(alias, target); - Ok(()) - } - - AsmDirective::Incbin { path, skip, count } => { - let data = std::fs::read(path) - .map_err(|e| format!(".incbin: failed to read '{}': {}", path, e))?; - let skip = *skip as usize; - let data = if skip < data.len() { &data[skip..] 
} else { &[] }; - let data = match count { - Some(c) => { - let c = *c as usize; - if c < data.len() { &data[..c] } else { data } - } - None => data, - }; - self.base.emit_bytes(data); - Ok(()) - } - - AsmDirective::RawBytes(bytes) => { self.base.emit_bytes(bytes); Ok(()) } - - AsmDirective::Cfi | AsmDirective::Ignored | AsmDirective::Ltorg => Ok(()), - } - } - - /// Emit typed data values (Long or Quad) with proper relocations. - fn emit_data_values(&mut self, vals: &[DataValue], size: usize) -> Result<(), String> { - for val in vals { - match val { - DataValue::Integer(v) => { - self.base.emit_data_integer(*v, size); - } - DataValue::Symbol(sym) => { - let reloc_type = if size == 4 { - RelocType::Abs32.elf_type() - } else { - RelocType::Abs64.elf_type() - }; - self.base.emit_data_symbol_ref(sym, 0, size, reloc_type); - } - DataValue::SymbolOffset(sym, addend) => { - let reloc_type = if size == 4 { - RelocType::Abs32.elf_type() - } else { - RelocType::Abs64.elf_type() - }; - self.base.emit_data_symbol_ref(sym, *addend, size, reloc_type); - } - DataValue::SymbolDiff(sym_a, sym_b) => { - self.record_sym_diff(sym_a, sym_b, 0, size); - } - DataValue::SymbolDiffAddend(sym_a, sym_b, addend) => { - self.record_sym_diff(sym_a, sym_b, *addend, size); - } - DataValue::Expr(expr) => { - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.pending_exprs.push(PendingExpr { - section, - offset, - expr: expr.clone(), - size, - }); - self.base.emit_placeholder(size); - } - } - } - Ok(()) - } - - /// Record a pending symbol difference for deferred resolution. 
- fn record_sym_diff(&mut self, sym_a: &str, sym_b: &str, extra_addend: i64, size: usize) { - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.pending_sym_diffs.push(PendingSymDiff { - section, - offset, - sym_a: sym_a.to_string(), - sym_b: sym_b.to_string(), - extra_addend, - size, - }); - self.base.emit_placeholder(size); - } - - /// Check if any operand contains a deferred symbolic expression. - fn has_deferred_expr(operands: &[Operand]) -> bool { - operands.iter().any(|op| matches!(op, Operand::MemExpr { .. } | Operand::Expr(_))) - } - - fn process_instruction(&mut self, mnemonic: &str, operands: &[Operand], raw_operands: &str) -> Result<(), String> { - self.base.ensure_text_section(); - - // If any operand has a symbolic expression that needs deferred resolution, - // emit a NOP placeholder and queue for later resolution. - if Self::has_deferred_expr(operands) { - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.pending_instructions.push(PendingInstruction { - section, - offset, - mnemonic: mnemonic.to_string(), - operands: operands.to_vec(), - raw_operands: raw_operands.to_string(), - }); - // Emit NOP placeholder (will be patched during resolution) - self.base.emit_u32_le(u32::from_le_bytes(AARCH64_NOP)); - return Ok(()); - } - - match encode_instruction(mnemonic, operands, raw_operands) { - Ok(EncodeResult::Word(word)) => { - self.base.emit_u32_le(word); - Ok(()) - } - Ok(EncodeResult::WordWithReloc { word, reloc }) => { - let elf_type = reloc.reloc_type.elf_type(); - let is_local = reloc.symbol.starts_with(".L") || reloc.symbol.starts_with(".l") || reloc.symbol == "."; - - if is_local || is_branch_reloc_type(elf_type) { - let offset = self.base.current_offset(); - self.pending_branch_relocs.push(PendingReloc { - section: self.base.current_section.clone(), - offset, - reloc_type: elf_type, - symbol: reloc.symbol.clone(), - addend: reloc.addend, - }); - 
self.base.emit_u32_le(word); - } else { - self.base.add_reloc(elf_type, reloc.symbol, reloc.addend); - self.base.emit_u32_le(word); - } - Ok(()) - } - Ok(EncodeResult::Words(words)) => { - for word in words { - self.base.emit_u32_le(word); - } - Ok(()) - } - Ok(EncodeResult::Skip) => Ok(()), - Err(e) => Err(e), - } - } - - /// Resolve local branch labels to PC-relative offsets using AArch64 relocation types. - /// For symbols defined in the same section, the PC-relative offset is computed and - /// patched directly into the instruction (matching GAS behavior). For undefined or - /// cross-section symbols, an external relocation is emitted. - fn resolve_local_branches(&mut self) -> Result<(), String> { - for reloc in &self.pending_branch_relocs { - // "." means current address (branch to self) - let (target_section, target_offset) = if reloc.symbol == "." { - (reloc.section.clone(), reloc.offset) - } else if let Some(label_info) = self.base.labels.get(&reloc.symbol) { - label_info.clone() - } else { - // Symbol not defined locally - emit as external relocation - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: reloc.symbol.clone(), - addend: reloc.addend, - }); - } - continue; - }; - - if target_section != reloc.section { - // Cross-section reference - convert to section symbol + offset - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: target_section.clone(), - addend: target_offset as i64 + reloc.addend, - }); - } - continue; - } - - let pc_offset = (target_offset as i64) - (reloc.offset as i64) + reloc.addend; - - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - let instr_offset = reloc.offset as usize; - if instr_offset + 4 > section.data.len() { - continue; - } - - let mut word = 
u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - section.data[instr_offset + 3], - ]); - - match reloc.reloc_type { - 282 | 283 => { - // R_AARCH64_JUMP26 / R_AARCH64_CALL26 - let imm26 = ((pc_offset >> 2) as u32) & 0x3FFFFFF; - word |= imm26; - } - 280 => { - // R_AARCH64_CONDBR19 - let imm19 = ((pc_offset >> 2) as u32) & 0x7FFFF; - word |= imm19 << 5; - } - 279 => { - // R_AARCH64_TSTBR14 - let imm14 = ((pc_offset >> 2) as u32) & 0x3FFF; - word |= imm14 << 5; - } - 273 => { - // R_AARCH64_LD_PREL_LO19 - LDR literal - let imm19 = ((pc_offset >> 2) as u32) & 0x7FFFF; - word |= imm19 << 5; - } - 274 => { - // R_AARCH64_ADR_PREL_LO21 - ADR instruction - let imm = pc_offset as i32; - let immlo = (imm as u32) & 0x3; - let immhi = ((imm as u32) >> 2) & 0x7FFFF; - word |= (immlo << 29) | (immhi << 5); - } - 275 => { - // R_AARCH64_ADR_PREL_PG_HI21 - ADRP instruction (local resolution) - let pc_page = (reloc.offset as i64) & !0xFFF; - let target_page = (target_offset as i64) & !0xFFF; - let page_off = target_page - pc_page; - let imm = (page_off >> 12) as i32; - let immlo = (imm as u32) & 0x3; - let immhi = ((imm as u32) >> 2) & 0x7FFFF; - word |= (immlo << 29) | (immhi << 5); - } - _ => { - // Unknown reloc type for local branch - leave as external - section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: reloc.symbol.clone(), - addend: reloc.addend, - }); - continue; - } - } - - section.data[instr_offset..instr_offset + 4].copy_from_slice(&word.to_le_bytes()); - } - } - Ok(()) - } - - /// Write the final ELF object file. 
- pub fn write_elf(&mut self, output_path: &str) -> Result<(), String> { - let config = elf::ElfConfig { - e_machine: EM_AARCH64, - e_flags: 0, - elf_class: ELFCLASS64, - force_rela: false, - }; - self.base.write_elf(output_path, &config, false) - } -} diff --git a/src/backend/arm/assembler/encoder/bitfield.rs b/src/backend/arm/assembler/encoder/bitfield.rs deleted file mode 100644 index 99fb7c78b0..0000000000 --- a/src/backend/arm/assembler/encoder/bitfield.rs +++ /dev/null @@ -1,246 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── Bitfield extract/insert ────────────────────────────────────────────── - -/// Encode UBFX Rd, Rn, #lsb, #width -> UBFM Rd, Rn, #lsb, #(lsb+width-1) -pub(crate) fn encode_ubfx(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let lsb = get_imm(operands, 2)? as u32; - let width = get_imm(operands, 3)? as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let immr = lsb; - let imms = lsb + width - 1; - // UBFM: sf 10 100110 N immr imms Rn Rd - let word = (sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode SBFX Rd, Rn, #lsb, #width -> SBFM Rd, Rn, #lsb, #(lsb+width-1) -pub(crate) fn encode_sbfx(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let lsb = get_imm(operands, 2)? as u32; - let width = get_imm(operands, 3)? 
as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let immr = lsb; - let imms = lsb + width - 1; - // SBFM: sf 00 100110 N immr imms Rn Rd - let word = (sf << 31) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode UBFM Rd, Rn, #immr, #imms (raw form) -pub(crate) fn encode_ubfm(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let immr = get_imm(operands, 2)? as u32; - let imms = get_imm(operands, 3)? as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let word = (sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode SBFM Rd, Rn, #immr, #imms (raw form) -pub(crate) fn encode_sbfm(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let immr = get_imm(operands, 2)? as u32; - let imms = get_imm(operands, 3)? as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let word = (sf << 31) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode SBFIZ Rd, Rn, #lsb, #width — alias for SBFM Rd, Rn, #(-lsb MOD regsize), #(width-1) -pub(crate) fn encode_sbfiz(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let lsb = get_imm(operands, 2)? as u32; - let width = get_imm(operands, 3)? 
as u32; - let regsize = if is_64 { 64u32 } else { 32 }; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let immr = (regsize.wrapping_sub(lsb)) & (regsize - 1); - let imms = width - 1; - let word = (sf << 31) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode UBFIZ Rd, Rn, #lsb, #width — alias for UBFM Rd, Rn, #(-lsb MOD regsize), #(width-1) -pub(crate) fn encode_ubfiz(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let lsb = get_imm(operands, 2)? as u32; - let width = get_imm(operands, 3)? as u32; - let regsize = if is_64 { 64u32 } else { 32 }; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let immr = (regsize.wrapping_sub(lsb)) & (regsize - 1); - let imms = width - 1; - let word = (sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode BFM Rd, Rn, #immr, #imms (bitfield move) -pub(crate) fn encode_bfm(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let immr = get_imm(operands, 2)? as u32; - let imms = get_imm(operands, 3)? as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - // BFM: sf 01 100110 N immr imms Rn Rd - let word = (sf << 31) | (0b01 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode BFI Rd, Rn, #lsb, #width -> BFM Rd, Rn, #(-lsb mod width_reg), #(width-1) -pub(crate) fn encode_bfi(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let lsb = get_imm(operands, 2)? as u32; - let width = get_imm(operands, 3)? 
as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let reg_width = if is_64 { 64u32 } else { 32u32 }; - let immr = (reg_width - lsb) % reg_width; - let imms = width - 1; - let word = (sf << 31) | (0b01 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode BFXIL Rd, Rn, #lsb, #width -> BFM Rd, Rn, #lsb, #(lsb+width-1) -pub(crate) fn encode_bfxil(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let lsb = get_imm(operands, 2)? as u32; - let width = get_imm(operands, 3)? as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - let immr = lsb; - let imms = lsb + width - 1; - let word = (sf << 31) | (0b01 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode EXTR Rd, Rn, Rm, #lsb -pub(crate) fn encode_extr(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let lsb = get_imm(operands, 3)? 
as u32; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0u32 }; - // EXTR: sf 0 0 100111 N 0 Rm imms Rn Rd - let word = (sf << 31) | (0b00100111 << 23) | (n << 22) | (rm << 16) - | (lsb << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── Bit manipulation ───────────────────────────────────────────────────── - -pub(crate) fn encode_clz(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - // CLZ: sf 1 0 11010110 00000 00010 0 Rn Rd - let word = ((sf << 31) | (1 << 30) | (0b011010110 << 21)) - | (0b000100 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_cls(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let word = ((sf << 31) | (1 << 30) | (0b011010110 << 21)) - | (0b000101 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_rbit(operands: &[Operand]) -> Result { - // NEON vector form: RBIT Vd.T, Vn.T (reverse bits in each byte) - if let Some(Operand::RegArrangement { .. 
}) = operands.first() { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - // RBIT (vector): 0 Q 1 01110 01 10000 00101 10 Rn Rd - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (0b01 << 22) - | (0b10000 << 17) | (0b00101 << 12) | (0b10 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - // Scalar form: RBIT Rd, Rn - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let word = ((sf << 31) | (1 << 30) | (0b011010110 << 21)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_rev(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let opc = if is_64 { 0b000011 } else { 0b000010 }; - let word = ((sf << 31) | (1 << 30) | (0b011010110 << 21)) - | (opc << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_rev16(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let word = ((sf << 31) | (1 << 30) | (0b011010110 << 21)) - | (0b000001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_rev32(operands: &[Operand]) -> Result { - // Check for NEON vector form: REV32 Vd.T, Vn.T - if let Some(Operand::RegArrangement { .. 
}) = operands.first() { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - // REV32 Vd.T, Vn.T: 0 Q 1 01110 size 10 0000 0000 10 Rn Rd - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (size << 22) - | (0b100000 << 16) | (0b000010 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - // REV32 is 64-bit only: 1 1 0 11010110 00000 000010 Rn Rd - let word = ((1u32 << 31) | (1 << 30) | (0b011010110 << 21)) - | (0b000010 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── CRC32 ──────────────────────────────────────────────────────────────── - -pub(crate) fn encode_crc32(mnemonic: &str, operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - - let is_c = mnemonic.contains("crc32c"); - let c_bit = if is_c { 1u32 } else { 0 }; - - let (sf, sz) = match mnemonic { - "crc32b" | "crc32cb" => (0u32, 0b00u32), - "crc32h" | "crc32ch" => (0, 0b01), - "crc32w" | "crc32cw" => (0, 0b10), - "crc32x" | "crc32cx" => (1, 0b11), - _ => (0, 0b00), - }; - - // CRC32: sf 0 0 11010110 Rm 010 C sz Rn Rd - let word = (sf << 31) | (0b0011010110 << 21) | (rm << 16) | (0b010 << 13) - | (c_bit << 12) | (sz << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} diff --git a/src/backend/arm/assembler/encoder/compare_branch.rs b/src/backend/arm/assembler/encoder/compare_branch.rs deleted file mode 100644 index 021af9f874..0000000000 --- a/src/backend/arm/assembler/encoder/compare_branch.rs +++ /dev/null @@ -1,322 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── Compare ────────────────────────────────────────────────────────────── - -pub(crate) fn encode_cmp(operands: &[Operand]) -> Result { - // CMP Rn, op -> SUBS XZR, Rn, op - let mut new_ops = 
vec![Operand::Reg("xzr".to_string())]; - new_ops.extend(operands.iter().cloned()); - // Determine if 32-bit or 64-bit from the first operand - let is_32 = if let Some(Operand::Reg(r)) = operands.first() { - is_32bit_reg(r) - } else { - false - }; - if is_32 { - new_ops[0] = Operand::Reg("wzr".to_string()); - } - encode_add_sub(&new_ops, true, true) -} - -pub(crate) fn encode_cmn(operands: &[Operand]) -> Result { - // CMN Rn, op -> ADDS XZR, Rn, op - let mut new_ops = vec![Operand::Reg("xzr".to_string())]; - new_ops.extend(operands.iter().cloned()); - let is_32 = if let Some(Operand::Reg(r)) = operands.first() { - is_32bit_reg(r) - } else { - false - }; - if is_32 { - new_ops[0] = Operand::Reg("wzr".to_string()); - } - encode_add_sub(&new_ops, false, true) -} - -pub(crate) fn encode_tst(operands: &[Operand]) -> Result { - // TST Rn, op -> ANDS XZR, Rn, op - let mut new_ops = vec![Operand::Reg("xzr".to_string())]; - new_ops.extend(operands.iter().cloned()); - let is_32 = if let Some(Operand::Reg(r)) = operands.first() { - is_32bit_reg(r) - } else { - false - }; - if is_32 { - new_ops[0] = Operand::Reg("wzr".to_string()); - } - encode_logical(&new_ops, 0b11) -} - -pub(crate) fn encode_ccmp_ccmn(operands: &[Operand], is_ccmp: bool) -> Result { - // CCMP/CCMN Rn, #imm5, #nzcv, cond - // The only difference: CCMP has bit 30 = 1, CCMN has bit 30 = 0 - let (rn, is_64) = get_reg(operands, 0)?; - let sf = sf_bit(is_64); - let op = if is_ccmp { 1u32 << 30 } else { 0u32 }; - - if let (Some(Operand::Imm(imm5)), Some(Operand::Imm(nzcv)), Some(Operand::Cond(cond))) = - (operands.get(1), operands.get(2), operands.get(3)) - { - let cond_val = encode_cond(cond).ok_or("invalid condition")?; - let word = (sf << 31) | op | (1 << 29) | (0b11010010 << 21) - | ((*imm5 as u32 & 0x1F) << 16) | (cond_val << 12) | (1 << 11) | (rn << 5) | (*nzcv as u32 & 0xF); - return Ok(EncodeResult::Word(word)); - } - - // CCMP/CCMN Rn, Rm, #nzcv, cond - if let (Some(Operand::Reg(rm_name)), 
Some(Operand::Imm(nzcv)), Some(Operand::Cond(cond))) = - (operands.get(1), operands.get(2), operands.get(3)) - { - let rm = parse_reg_num(rm_name).ok_or("invalid rm")?; - let cond_val = encode_cond(cond).ok_or("invalid condition")?; - let word = (sf << 31) | op | (1 << 29) | (0b11010010 << 21) - | (rm << 16) | (cond_val << 12) | (rn << 5) | (*nzcv as u32 & 0xF); - return Ok(EncodeResult::Word(word)); - } - - let name = if is_ccmp { "ccmp" } else { "ccmn" }; - Err(format!("unsupported {} operands", name)) -} - -// ── Conditional select ─────────────────────────────────────────────────── - -pub(crate) fn encode_csel(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let cond = match operands.get(3) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or("invalid cond")?, - _ => return Err("csel requires condition".to_string()), - }; - let sf = sf_bit(is_64); - let word = ((sf << 31) | (0b11010100 << 21) - | (rm << 16) | (cond << 12)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_csinc(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let cond = match operands.get(3) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or("invalid cond")?, - _ => return Err("csinc requires condition".to_string()), - }; - let sf = sf_bit(is_64); - let word = (sf << 31) | (0b11010100 << 21) - | (rm << 16) | (cond << 12) | (0b01 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_csinv(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let cond = match operands.get(3) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or("invalid cond")?, - _ => return Err("csinv requires condition".to_string()), - }; - let sf = sf_bit(is_64); 
- let word = (((sf << 31) | (1 << 30)) | (0b11010100 << 21) - | (rm << 16) | (cond << 12)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_csneg(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let cond = match operands.get(3) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or("invalid cond")?, - _ => return Err("csneg requires condition".to_string()), - }; - let sf = sf_bit(is_64); - let word = ((sf << 31) | (1 << 30)) | (0b11010100 << 21) - | (rm << 16) | (cond << 12) | (0b01 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_cset(operands: &[Operand]) -> Result { - // CSET Rd, cond -> CSINC Rd, XZR, XZR, invert(cond) - let (rd, is_64) = get_reg(operands, 0)?; - let cond = match operands.get(1) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or("invalid cond")?, - _ => return Err("cset requires condition".to_string()), - }; - let sf = sf_bit(is_64); - let inv_cond = cond ^ 1; // invert least significant bit - let word = (sf << 31) | (0b11010100 << 21) - | (0b11111 << 16) | (inv_cond << 12) | (0b01 << 10) | (0b11111 << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_csetm(operands: &[Operand]) -> Result { - // CSETM Rd, cond -> CSINV Rd, XZR, XZR, invert(cond) - let (rd, is_64) = get_reg(operands, 0)?; - let cond = match operands.get(1) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or("invalid cond")?, - _ => return Err("csetm requires condition".to_string()), - }; - let sf = sf_bit(is_64); - let inv_cond = cond ^ 1; - let word = (((sf << 31) | (1 << 30)) | (0b11010100 << 21) - | (0b11111 << 16) | (inv_cond << 12)) | (0b11111 << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── Branches ───────────────────────────────────────────────────────────── - -pub(crate) fn encode_branch(operands: &[Operand]) -> Result { - let (sym, addend) = get_symbol(operands, 0)?; - // B: 000101 
imm26 (filled by linker/assembler) - Ok(EncodeResult::WordWithReloc { - word: 0b000101 << 26, - reloc: Relocation { - reloc_type: RelocType::Jump26, - symbol: sym, - addend, - }, - }) -} - -pub(crate) fn encode_bl(operands: &[Operand]) -> Result { - let (sym, addend) = get_symbol(operands, 0)?; - // BL: 100101 imm26 - Ok(EncodeResult::WordWithReloc { - word: 0b100101 << 26, - reloc: Relocation { - reloc_type: RelocType::Call26, - symbol: sym, - addend, - }, - }) -} - -pub(crate) fn encode_cond_branch(cond: &str, operands: &[Operand]) -> Result { - let cond_val = encode_cond(cond).ok_or_else(|| format!("unknown condition: {}", cond))?; - let (sym, addend) = get_symbol(operands, 0)?; - // B.cond: 01010100 imm19 0 cond - let word = (0b01010100 << 24) | cond_val; - Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::CondBr19, - symbol: sym, - addend, - }, - }) -} - -pub(crate) fn encode_br(operands: &[Operand]) -> Result { - let (rn, _) = get_reg(operands, 0)?; - // BR: 1101011 0000 11111 000000 Rn 00000 - let word = 0xd61f0000 | (rn << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_blr(operands: &[Operand]) -> Result { - let (rn, _) = get_reg(operands, 0)?; - // BLR: 1101011 0001 11111 000000 Rn 00000 - let word = 0xd63f0000 | (rn << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_ret(operands: &[Operand]) -> Result { - let rn = if operands.is_empty() { - 30 // default to x30 (LR) - } else { - get_reg(operands, 0)?.0 - }; - // RET: 1101011 0010 11111 000000 Rn 00000 - let word = 0xd65f0000 | (rn << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_cbz(operands: &[Operand], is_nz: bool) -> Result { - let (rt, is_64) = get_reg(operands, 0)?; - let (sym, addend) = get_symbol(operands, 1)?; - let sf = sf_bit(is_64); - let op = if is_nz { 1u32 } else { 0u32 }; - // CBZ/CBNZ: sf 011010 op imm19 Rt - let word = (sf << 31) | (0b011010 << 25) | (op << 24) | rt; - Ok(EncodeResult::WordWithReloc { - 
word, - reloc: Relocation { - reloc_type: RelocType::CondBr19, - symbol: sym, - addend, - }, - }) -} - -pub(crate) fn encode_tbz(operands: &[Operand], is_nz: bool) -> Result { - let (rt, _) = get_reg(operands, 0)?; - let bit = get_imm(operands, 1)?; - let (sym, addend) = get_symbol(operands, 2)?; - let b5 = ((bit as u32) >> 5) & 1; - let b40 = (bit as u32) & 0x1F; - let op = if is_nz { 1u32 } else { 0u32 }; - // TBZ/TBNZ: b5 011011 op b40 imm14 Rt - let word = (b5 << 31) | (0b011011 << 25) | (op << 24) | (b40 << 19) | rt; - Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::TstBr14, - symbol: sym, - addend, - }, - }) -} - -// ── Additional conditional operations ──────────────────────────────────── - -/// Encode CNEG Rd, Rn, cond -> CSNEG Rd, Rn, Rn, invert(cond) -pub(crate) fn encode_cneg(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let cond = match operands.get(2) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or_else(|| format!("unknown condition: {}", c))?, - _ => return Err("cneg: expected condition code as third operand".to_string()), - }; - let sf = sf_bit(is_64); - // Invert the condition (flip bit 0) - let inv_cond = cond ^ 1; - // CSNEG: sf 1 0 11010100 Rm cond 0 1 Rn Rd (with Rm = Rn) - let word = (sf << 31) | (1 << 30) | (0b011010100 << 21) | (rn << 16) - | (inv_cond << 12) | (0b01 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode CINC Rd, Rn, cond -> CSINC Rd, Rn, Rn, invert(cond) -pub(crate) fn encode_cinc(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let cond = match operands.get(2) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or_else(|| format!("unknown condition: {}", c))?, - _ => return Err("cinc: expected condition code as third operand".to_string()), - }; - let sf = sf_bit(is_64); - let inv_cond = cond ^ 1; - // CSINC: sf 0 0 11010100 Rm 
cond 0 1 Rn Rd (with Rm = Rn) - let word = (sf << 31) | (0b011010100 << 21) | (rn << 16) - | (inv_cond << 12) | (0b01 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode CINV Rd, Rn, cond -> CSINV Rd, Rn, Rn, invert(cond) -pub(crate) fn encode_cinv(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let cond = match operands.get(2) { - Some(Operand::Cond(c)) => encode_cond(c).ok_or_else(|| format!("unknown condition: {}", c))?, - _ => return Err("cinv: expected condition code as third operand".to_string()), - }; - let sf = sf_bit(is_64); - let inv_cond = cond ^ 1; - // CSINV: sf 1 0 11010100 Rm cond 0 0 Rn Rd (with Rm = Rn) - let word = (sf << 31) | (1 << 30) | (0b011010100 << 21) | (rn << 16) - | (inv_cond << 12) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} diff --git a/src/backend/arm/assembler/encoder/data_processing.rs b/src/backend/arm/assembler/encoder/data_processing.rs deleted file mode 100644 index ac65e2a156..0000000000 --- a/src/backend/arm/assembler/encoder/data_processing.rs +++ /dev/null @@ -1,1067 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── MOV ────────────────────────────────────────────────────────────────── - -pub(crate) fn encode_mov(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("mov requires 2 operands".to_string()); - } - - // NEON register-to-register move: mov v1.16b, v0.16b -> ORR v1.16b, v0.16b, v0.16b - if let (Some(Operand::RegArrangement { reg: rd_name, arrangement: arr_d }), - Some(Operand::RegArrangement { reg: rm_name, arrangement: _arr_m })) = - (operands.first(), operands.get(1)) - { - let rd = parse_reg_num(rd_name).ok_or("invalid NEON rd")?; - let rm = parse_reg_num(rm_name).ok_or("invalid NEON rm")?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - // ORR Vd.T, Vm.T, Vm.T: 0 Q 0 01110 10 1 Rm 0 00111 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (0b10 << 22) | 
(1 << 21) - | (rm << 16) | (0b000111 << 10) | (rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // NEON lane insert: mov v0.d[1], x1 -> INS Vd.D[index], Xn - if let (Some(Operand::RegLane { reg: vd_name, elem_size, index }), - Some(Operand::Reg(rn_name))) = - (operands.first(), operands.get(1)) - { - let vd = parse_reg_num(vd_name).ok_or("invalid NEON vd")?; - let rn = parse_reg_num(rn_name).ok_or("invalid rn")?; - // INS Vd.Ts[index], Rn - // Encoding: 0 1 0 0 1110 000 imm5 0 0011 1 Rn Rd - // imm5 encoding depends on element size and index - let imm5 = match elem_size.as_str() { - "b" => ((*index & 0xF) << 1) | 0b00001, - "h" => ((*index & 0x7) << 2) | 0b00010, - "s" => ((*index & 0x3) << 3) | 0b00100, - "d" => ((*index & 0x1) << 4) | 0b01000, - _ => return Err(format!("unsupported element size for ins: {}", elem_size)), - }; - let word = (0b01001110000u32 << 21) | (imm5 << 16) | (0b000111 << 10) | (rn << 5) | vd; - return Ok(EncodeResult::Word(word)); - } - - // NEON lane extract: mov x0, v0.d[1] -> UMOV Xd, Vn.D[index] - if let (Some(Operand::Reg(rd_name)), - Some(Operand::RegLane { reg: vn_name, elem_size, index })) = - (operands.first(), operands.get(1)) - { - let rd = parse_reg_num(rd_name).ok_or("invalid rd")?; - let vn = parse_reg_num(vn_name).ok_or("invalid NEON vn")?; - // UMOV Rd, Vn.Ts[index] - // Encoding: 0 Q 0 0 1110 000 imm5 0 0111 1 Rn Rd - let (q, imm5) = match elem_size.as_str() { - "b" => (0u32, ((*index & 0xF) << 1) | 0b00001), - "h" => (0, ((*index & 0x7) << 2) | 0b00010), - "s" => (0, ((*index & 0x3) << 3) | 0b00100), - "d" => (1, ((*index & 0x1) << 4) | 0b01000), - _ => return Err(format!("unsupported element size for umov: {}", elem_size)), - }; - let word = (q << 30) | (0b001110000u32 << 21) | (imm5 << 16) | (0b001111 << 10) | (vn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // NEON element-to-element move: mov v0.s[3], v1.s[0] -> INS Vd.Ts[i1], Vn.Ts[i2] - if let (Some(Operand::RegLane { reg: vd_name, 
elem_size: es_d, index: idx_d }), - Some(Operand::RegLane { reg: vn_name, elem_size: _es_n, index: idx_n })) = - (operands.first(), operands.get(1)) - { - let vd = parse_reg_num(vd_name).ok_or("invalid NEON vd")?; - let vn = parse_reg_num(vn_name).ok_or("invalid NEON vn")?; - // INS Vd.Ts[i1], Vn.Ts[i2] - // Encoding: 0 1 1 01110 000 imm5 0 imm4 1 Rn Rd - let (imm5, imm4) = match es_d.as_str() { - "b" => ((idx_d << 1) | 0b00001, *idx_n), - "h" => ((idx_d << 2) | 0b00010, idx_n << 1), - "s" => ((idx_d << 3) | 0b00100, idx_n << 2), - "d" => ((idx_d << 4) | 0b01000, idx_n << 3), - _ => return Err(format!("unsupported element size for ins: {}", es_d)), - }; - let word = ((0b01101110000u32 << 21) | (imm5 << 16)) | (imm4 << 11) | (1 << 10) | (vn << 5) | vd; - return Ok(EncodeResult::Word(word)); - } - - // mov Xd, #imm -> movz or movn - if let Some(Operand::Imm(imm)) = operands.get(1) { - let (rd, is_64) = get_reg(operands, 0)?; - let imm = *imm; - - // Check if it can be a simple MOVZ - if (0..=0xFFFF).contains(&imm) { - let sf = sf_bit(is_64); - let word = (sf << 31) | (0b10100101 << 23) | ((imm as u32 & 0xFFFF) << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // Negative: try MOVN - if imm < 0 { - let not_imm = !imm; - if (0..=0xFFFF).contains(¬_imm) { - let sf = sf_bit(is_64); - let word = (sf << 31) | (0b00100101 << 23) | ((not_imm as u32 & 0xFFFF) << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - } - - // Try encoding as ORR Rd, XZR, #imm (logical/bitmask immediate) - // This handles patterns like 0x0101010101010101 in a single instruction - if let Some((n, immr, imms)) = encode_bitmask_imm(imm as u64, is_64) { - let sf = sf_bit(is_64); - // ORR Rd, XZR, #imm: sf 01 100100 N immr imms 11111 Rd - let word = (sf << 31) | (0b01 << 29) | (0b100100 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (0b11111 << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // Need movz + movk sequence for large immediates - return encode_mov_wide_imm(rd, is_64, 
imm as u64); - } - - // mov Xd, Xm -> ORR Xd, XZR, Xm - if let (Some(Operand::Reg(rd_name)), Some(Operand::Reg(rm_name))) = (operands.first(), operands.get(1)) { - let rd = parse_reg_num(rd_name).ok_or("invalid rd")?; - let rm = parse_reg_num(rm_name).ok_or("invalid rm")?; - let is_64 = is_64bit_reg(rd_name); - - // Check for MOV to/from SP: uses ADD Xd, Xn, #0 - if rd_name.to_lowercase() == "sp" || rm_name.to_lowercase() == "sp" { - let sf = sf_bit(is_64); - // ADD Xd, Xn, #0: sf 0 0 10001 00 imm12=0 Rn Rd - let word = ((sf << 31) | (0b10001 << 24)) | (rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - let sf = sf_bit(is_64); - // ORR Rd, XZR, Rm: sf 01 01010 00 0 Rm 000000 11111 Rd - let word = ((sf << 31) | (0b01 << 29) | (0b01010 << 24)) | (rm << 16) | (0b11111 << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - Err(format!("unsupported mov operands: {:?}", operands)) -} - -pub(crate) fn encode_mov_wide_imm(rd: u32, is_64: bool, imm: u64) -> Result { - let sf = sf_bit(is_64); - let mut words = Vec::new(); - let max_hw = if is_64 { 4 } else { 2 }; - let mut first = true; - - for hw in 0..max_hw { - let chunk = ((imm >> (hw * 16)) & 0xFFFF) as u32; - if chunk != 0 || (hw == 0 && imm == 0) { - if first { - // MOVZ - let word = (sf << 31) | (0b10100101 << 23) | (hw << 21) | (chunk << 5) | rd; - words.push(word); - first = false; - } else { - // MOVK - let word = (sf << 31) | (0b11100101 << 23) | (hw << 21) | (chunk << 5) | rd; - words.push(word); - } - } - } - - if words.is_empty() { - // imm is 0 - let word = (sf << 31) | (0b10100101 << 23) | rd; - words.push(word); - } - - if words.len() == 1 { - Ok(EncodeResult::Word(words[0])) - } else { - Ok(EncodeResult::Words(words)) - } -} - -/// Resolve `:abs_g0:`, `:abs_g1:`, etc. modifiers for movz/movk. -/// If the expression is a pure constant, returns Some((imm16, hw)) where -/// imm16 is the relevant 16-bit chunk and hw is the halfword selector. 
-/// If the expression contains a symbol reference, returns None (needs relocation). -pub(crate) fn resolve_abs_g_modifier(kind: &str, symbol: &str) -> Result, String> { - let shift = match kind { - "abs_g0" | "abs_g0_nc" | "abs_g0_s" => 0, - "abs_g1" | "abs_g1_nc" | "abs_g1_s" => 16, - "abs_g2" | "abs_g2_nc" | "abs_g2_s" => 32, - "abs_g3" => 48, - _ => return Ok(None), // Not an abs_g modifier - }; - let hw = shift / 16; - // Try to evaluate the expression as a constant - if let Ok(val) = crate::backend::asm_expr::parse_integer_expr(symbol) { - let imm16 = ((val as u64) >> shift) as u32 & 0xFFFF; - Ok(Some((imm16, hw))) - } else { - Ok(None) // Contains symbol reference - needs relocation - } -} - -pub(crate) fn encode_movz(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let sf = sf_bit(is_64); - - // Handle :abs_g*: modifiers - if let Some(Operand::Modifier { kind, symbol }) = operands.get(1) { - if let Some((imm16, hw)) = resolve_abs_g_modifier(kind, symbol)? { - let word = (sf << 31) | (0b10100101 << 23) | (hw << 21) | ((imm16 & 0xFFFF) << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - } - - let imm = get_imm(operands, 1)?; - - // Check for lsl #N shift - let hw = if operands.len() > 2 { - if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - if kind == "lsl" { - *amount / 16 - } else { - 0 - } - } else { - 0 - } - } else { - 0 - }; - - let word = (sf << 31) | (0b10100101 << 23) | (hw << 21) | (((imm as u32) & 0xFFFF) << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_movk(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let sf = sf_bit(is_64); - - // Handle :abs_g*: modifiers - if let Some(Operand::Modifier { kind, symbol }) = operands.get(1) { - if let Some((imm16, hw)) = resolve_abs_g_modifier(kind, symbol)? 
{ - let word = (sf << 31) | (0b11100101 << 23) | (hw << 21) | ((imm16 & 0xFFFF) << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - } - - let imm = get_imm(operands, 1)?; - - let hw = if operands.len() > 2 { - if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - if kind == "lsl" { - *amount / 16 - } else { - 0 - } - } else { - 0 - } - } else { - 0 - }; - - let word = (sf << 31) | (0b11100101 << 23) | (hw << 21) | (((imm as u32) & 0xFFFF) << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_movn(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let imm = get_imm(operands, 1)?; - let sf = sf_bit(is_64); - - let hw = if operands.len() > 2 { - if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - if kind == "lsl" { - *amount / 16 - } else { - 0 - } - } else { - 0 - } - } else { - 0 - }; - - let word = (sf << 31) | (0b00100101 << 23) | (hw << 21) | (((imm as u32) & 0xFFFF) << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── ADD/SUB ────────────────────────────────────────────────────────────── - -pub(crate) fn encode_add_sub(operands: &[Operand], is_sub: bool, set_flags: bool) -> Result { - if operands.len() < 3 { - return Err(format!("add/sub requires 3 operands, got {}", operands.len())); - } - - // NEON vector form: ADD/SUB Vd.T, Vn.T, Vm.T - if let Some(Operand::RegArrangement { .. 
}) = operands.first() { - if !set_flags { - return encode_neon_add_sub(operands, is_sub); - } - } - - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let op = if is_sub { 1u32 } else { 0u32 }; - let s_bit = if set_flags { 1u32 } else { 0u32 }; - - // ADD Rd, Rn, #imm - if let Some(Operand::Imm(imm)) = operands.get(2) { - let imm_signed = *imm; - // Handle negative immediates: add #-N -> sub #N and vice versa - let (imm_val, actual_op) = if imm_signed < 0 { - ((-imm_signed) as u64, if is_sub { 0u32 } else { 1u32 }) - } else { - (imm_signed as u64, op) - }; - // Check for explicit lsl #12 shift - let explicit_shift = if operands.len() > 3 { - if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - kind == "lsl" && *amount == 12 - } else { false } - } else { false }; - - let (imm12, sh) = if explicit_shift { - // Explicit lsl #12: use the immediate as-is (must fit in 12 bits) - ((imm_val as u32) & 0xFFF, 1u32) - } else if imm_val <= 0xFFF { - // Fits in 12 bits unshifted - (imm_val as u32, 0u32) - } else if (imm_val & 0xFFF) == 0 && (imm_val >> 12) <= 0xFFF { - // Low 12 bits are zero and shifted value fits: auto-shift - // e.g., #4096 -> #1, lsl #12 - ((imm_val >> 12) as u32, 1u32) - } else { - return Err(format!("immediate {} does not fit in add/sub imm12 encoding", imm_val)); - }; - - let word = (sf << 31) | (actual_op << 30) | (s_bit << 29) | (0b10001 << 24) | (sh << 22) | (imm12 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // ADD Rd, Rn, :lo12:symbol - if let Some(Operand::Modifier { kind, symbol }) = operands.get(2) { - if kind == "lo12" { - let word = ((sf << 31) | (op << 30) | (s_bit << 29) | (0b10001 << 24)) | (rn << 5) | rd; - return Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::AddAbsLo12, - symbol: symbol.clone(), - addend: 0, - }, - }); - } - if kind == "tprel_lo12_nc" { - let word = ((sf << 31) | (op << 30) | (s_bit 
<< 29) | (0b10001 << 24)) | (rn << 5) | rd; - return Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::TlsLeAddTprelLo12, - symbol: symbol.clone(), - addend: 0, - }, - }); - } - if kind == "tprel_hi12" { - let word = ((sf << 31) | (op << 30) | (s_bit << 29) | (0b10001 << 24) | (1 << 22)) | (rn << 5) | rd; - return Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::TlsLeAddTprelHi12, - symbol: symbol.clone(), - addend: 0, - }, - }); - } - } - if let Some(Operand::ModifierOffset { kind, symbol, offset }) = operands.get(2) { - if kind == "lo12" { - let word = ((sf << 31) | (op << 30) | (s_bit << 29) | (0b10001 << 24)) | (rn << 5) | rd; - return Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::AddAbsLo12, - symbol: symbol.clone(), - addend: *offset, - }, - }); - } - } - - // ADD Rd, Rn, Rm - if let Some(Operand::Reg(rm_name)) = operands.get(2) { - let rm = parse_reg_num(rm_name).ok_or("invalid rm")?; - - // Check for extended register: add Xd, Xn, Wm, sxtw [#N] - if let Some(Operand::Extend { kind, amount }) = operands.get(3) { - let option = match kind.as_str() { - "uxtb" => 0b000u32, - "uxth" => 0b001, - "uxtw" => 0b010, - "uxtx" => 0b011, - "sxtb" => 0b100, - "sxth" => 0b101, - "sxtw" => 0b110, - "sxtx" => 0b111, - _ => 0b011, // default UXTX/LSL - }; - let imm3 = *amount & 0x7; - // Extended register form: sf op S 01011 00 1 Rm option imm3 Rn Rd - let word = ((sf << 31) | (op << 30) | (s_bit << 29) | (0b01011 << 24)) | (1 << 21) | (rm << 16) | (option << 13) | (imm3 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // When Rn or Rd is SP (register 31), the shifted register form encodes - // register 31 as XZR, not SP. We must use the extended register form - // with UXTX (option=0b011) to get SP semantics. 
- let rn_is_sp = matches!(&operands[1], Operand::Reg(name) if { - let n = name.to_lowercase(); n == "sp" || n == "wsp" - }); - let rd_is_sp = matches!(&operands[0], Operand::Reg(name) if { - let n = name.to_lowercase(); n == "sp" || n == "wsp" - }); - - if (rn_is_sp || rd_is_sp) && operands.len() <= 3 { - // Extended register form with UXTX #0: sf op S 01011 00 1 Rm 011 000 Rn Rd - let option = if is_64 { 0b011u32 } else { 0b010u32 }; // UXTX for 64-bit, UXTW for 32-bit - let word = (((sf << 31) | (op << 30) | (s_bit << 29) | (0b01011 << 24)) | (1 << 21) | (rm << 16) | (option << 13)) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - // Check for shifted register: add Xd, Xn, Xm, lsl #N - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - - let word = ((sf << 31) | (op << 30) | (s_bit << 29) | (0b01011 << 24) | (shift_type << 22)) | (rm << 16) | ((shift_amount & 0x3F) << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - Err(format!("unsupported add/sub operands: {:?}", operands)) -} - -// ── Logical ────────────────────────────────────────────────────────────── - -pub(crate) fn encode_logical(operands: &[Operand], opc: u32) -> Result { - if operands.len() < 3 { - return Err("logical op requires 3 operands".to_string()); - } - - // NEON vector form: ORR/AND/EOR Vd.T, Vn.T, Vm.T - if let Some(Operand::RegArrangement { .. 
}) = operands.first() { - return encode_neon_logical(operands, opc); - } - - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - - // AND/ORR/EOR Rd, Rn, #imm (bitmask immediate) - if let Some(Operand::Imm(imm)) = operands.get(2) { - if let Some((n, immr, imms)) = encode_bitmask_imm(*imm as u64, is_64) { - let word = (sf << 31) | (opc << 29) | (0b100100 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - return Err(format!("cannot encode bitmask immediate: 0x{:x}", imm)); - } - - // AND/ORR/EOR Rd, Rn, Rm [, shift #amount] - if let Some(Operand::Reg(rm_name)) = operands.get(2) { - let rm = parse_reg_num(rm_name).ok_or("invalid rm")?; - - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - "ror" => 0b11, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - - let word = ((sf << 31) | (opc << 29) | (0b01010 << 24) | (shift_type << 22)) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - Err("unsupported logical operands".to_string()) -} - -/// Encode a bitmask immediate for AArch64. -/// Returns (N, immr, imms) if the value is a valid bitmask immediate. 
-pub(crate) fn encode_bitmask_imm(val: u64, is_64: bool) -> Option<(u32, u32, u32)> { - if val == 0 || (!is_64 && val == 0xFFFFFFFF) || (is_64 && val == u64::MAX) { - return None; // Not a valid bitmask immediate - } - - let width = if is_64 { 64 } else { 32 }; - let val = if !is_64 { val & 0xFFFFFFFF } else { val }; - - // Try each possible element size: 2, 4, 8, 16, 32, 64 - for size in [2u32, 4, 8, 16, 32, 64] { - if size > width { - continue; - } - - let mask = if size == 64 { u64::MAX } else { (1u64 << size) - 1 }; - let elem = val & mask; - - // Check that the pattern repeats - let mut repeats = true; - let mut pos = size; - while pos < width { - if ((val >> pos) & mask) != elem { - repeats = false; - break; - } - pos += size; - } - if !repeats { - continue; - } - - // Check that elem is a contiguous run of 1s (possibly rotated) - let ones = elem.count_ones(); - if ones == 0 || ones == size { - continue; // All zeros or all ones in element - } - - // Find rotation: rotate elem right until the least significant bit is 1 - // and the run of 1s starts at bit 0. - // The `r` we find is the right-rotation from actual -> base. - // immr is the right-rotation from base -> actual = size - r (mod size). 
- let mut found_rotation = false; - let mut rotation = 0u32; - for r in 0..size { - let rot = if r == 0 { elem } else { ((elem >> r) | (elem << (size - r))) & mask }; - // Check if this is a contiguous run from bit 0 - let run = rot.trailing_ones(); - if run == ones { - // r rotates actual -> base, so immr = size - r (mod size) rotates base -> actual - rotation = if r == 0 { 0 } else { size - r }; - found_rotation = true; - break; - } - } - if !found_rotation { - continue; - } - - // Encode the fields - let n = if size == 64 { 1u32 } else { 0u32 }; - let immr = rotation; - let imms = match size { - 2 => 0b111100 | (ones - 1), - 4 => 0b111000 | (ones - 1), - 8 => 0b110000 | (ones - 1), - 16 => 0b100000 | (ones - 1), - 32 => ones - 1, - 64 => ones - 1, - _ => unreachable!(), - }; - - return Some((n, immr, imms)); - } - - None -} - -// ── MUL/DIV ────────────────────────────────────────────────────────────── - -pub(crate) fn encode_mul(operands: &[Operand]) -> Result { - // NEON vector form: MUL Vd.T, Vn.T, Vm.T - if let Some(Operand::RegArrangement { .. 
}) = operands.first() { - return encode_neon_mul(operands); - } - // MUL Rd, Rn, Rm is MADD Rd, Rn, Rm, XZR - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - let word = (sf << 31) | (0b0011011000 << 21) | (rm << 16) | (0b11111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_madd(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let (ra, _) = get_reg(operands, 3)?; - let sf = sf_bit(is_64); - let word = ((sf << 31) | (0b0011011000 << 21) | (rm << 16)) | (ra << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_msub(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let (ra, _) = get_reg(operands, 3)?; - let sf = sf_bit(is_64); - let word = (sf << 31) | (0b0011011000 << 21) | (rm << 16) | (1 << 15) | (ra << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_div(operands: &[Operand], unsigned: bool) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - let o1 = if unsigned { 0u32 } else { 1u32 }; - // Data-processing (2 source): sf 0 S=0 11010110 Rm 00001 o1 Rn Rd - let word = (sf << 31) | (0b0011010110 << 21) | (rm << 16) - | (0b00001 << 11) | (o1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode SMULL Xd, Wn, Wm -> SMADDL Xd, Wn, Wm, XZR -pub(crate) fn encode_smull(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - // SMADDL: 1 00 11011 001 Rm 0 11111 Rn Rd (Ra=XZR makes it SMULL) - let word = (1u32 << 31) | (0b0011011001 << 21) | (rm << 16) 
- | (0b011111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode UMULL Xd, Wn, Wm -> UMADDL Xd, Wn, Wm, XZR -pub(crate) fn encode_umull(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - // UMADDL: 1 00 11011 101 Rm 0 11111 Rn Rd (Ra=XZR makes it UMULL) - let word = (1u32 << 31) | (0b0011011101 << 21) | (rm << 16) - | (0b011111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode SMADDL Xd, Wn, Wm, Xa (signed multiply-add long) -pub(crate) fn encode_smaddl(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let (ra, _) = get_reg(operands, 3)?; - // SMADDL: 1 00 11011 001 Rm 0 Ra Rn Rd - let word = (1u32 << 31) | (0b0011011001 << 21) | (rm << 16) - | (ra << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode UMADDL Xd, Wn, Wm, Xa (unsigned multiply-add long) -pub(crate) fn encode_umaddl(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let (ra, _) = get_reg(operands, 3)?; - // UMADDL: 1 00 11011 101 Rm 0 Ra Rn Rd - let word = (1u32 << 31) | (0b0011011101 << 21) | (rm << 16) - | (ra << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode MNEG Xd, Xn, Xm -> MSUB Xd, Xn, Xm, XZR -pub(crate) fn encode_mneg(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - // MSUB with Ra=XZR: sf 00 11011 000 Rm 1 11111 Rn Rd - let word = (sf << 31) | (0b0011011000 << 21) | (rm << 16) - | (1 << 15) | (0b11111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_umulh(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) 
= get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - // UMULH: 1 00 11011 1 10 Rm 0 11111 Rn Rd - let word = (1u32 << 31) | (0b0011011110 << 21) | (rm << 16) | (0b011111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_smulh(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - // SMULH: 1 00 11011 0 10 Rm 0 11111 Rn Rd - let word = (1u32 << 31) | (0b0011011010 << 21) | (rm << 16) | (0b011111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_neg(operands: &[Operand]) -> Result { - // NEG Rd, Rm [, shift #amount] -> SUB Rd, XZR, Rm [, shift #amount] - let (rd, is_64) = get_reg(operands, 0)?; - let (rm, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - let word = (sf << 31) | (1 << 30) | (0b01011 << 24) | (shift_type << 22) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (0b11111 << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_negs(operands: &[Operand]) -> Result { - // NEGS Rd, Rm [, shift #amount] -> SUBS Rd, XZR, Rm [, shift #amount] - let (rd, is_64) = get_reg(operands, 0)?; - let (rm, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - let word = (sf << 31) | (1 << 30) | (1 << 29) | (0b01011 << 24) | (shift_type << 22) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (0b11111 << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_mvn(operands: &[Operand]) 
-> Result { - // NEON vector form: MVN Vd.T, Vn.T (alias of NOT) - if let Some(Operand::RegArrangement { .. }) = operands.first() { - return encode_neon_not(operands); - } - // MVN Rd, Rm [, shift #amount] -> ORN Rd, XZR, Rm [, shift #amount] - let (rd, is_64) = get_reg(operands, 0)?; - let (rm, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - "ror" => 0b11, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - let word = (sf << 31) | (0b01 << 29) | (0b01010 << 24) | (shift_type << 22) | (1 << 21) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (0b11111 << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_adc(operands: &[Operand], set_flags: bool) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - let s = if set_flags { 1u32 } else { 0 }; - let word = ((sf << 31) | (s << 29) | (0b11010000 << 21) | (rm << 16)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_sbc(operands: &[Operand], set_flags: bool) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - let s = if set_flags { 1u32 } else { 0 }; - let word = ((sf << 31) | (1 << 30) | (s << 29) | (0b11010000 << 21) | (rm << 16)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── Shifts ─────────────────────────────────────────────────────────────── - -pub(crate) fn encode_shift(operands: &[Operand], shift_type: u32) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - - // LSL/LSR/ASR Rd, Rn, #imm (immediate form -> UBFM/SBFM) - if let Some(Operand::Imm(imm)) = operands.get(2) { - let sf = sf_bit(is_64); - let 
imm = *imm as u32; - let width = if is_64 { 64 } else { 32 }; - let n = if is_64 { 1u32 } else { 0u32 }; - - match shift_type { - 0b00 => { - // LSL #imm -> UBFM Rd, Rn, #(-imm mod width), #(width-1-imm) - let immr = (width - imm) % width; - let imms = width - 1 - imm; - let word = (sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - 0b01 => { - // LSR #imm -> UBFM Rd, Rn, #imm, #(width-1) - let immr = imm; - let imms = width - 1; - let word = (sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - 0b10 => { - // ASR #imm -> SBFM Rd, Rn, #imm, #(width-1) - let immr = imm; - let imms = width - 1; - let word = (sf << 31) | (0b100110 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - 0b11 => { - // ROR #imm -> EXTR Rd, Rn, Rn, #imm - // EXTR: sf 0 0 100111 N 0 Rm imms Rn Rd - let word = (sf << 31) | (0b00100111 << 23) | (n << 22) | (rn << 16) - | (imm << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - _ => {} - } - } - - // LSL/LSR/ASR Rd, Rn, Rm (register form) - if let Some(Operand::Reg(rm_name)) = operands.get(2) { - let rm = parse_reg_num(rm_name).ok_or("invalid rm")?; - let sf = sf_bit(is_64); - // Data-processing (2 source): sf 0 S=0 11010110 Rm 0010 op2 Rn Rd - let op2 = shift_type; // 00=LSL, 01=LSR, 10=ASR, 11=ROR - let word = (sf << 31) | (0b0011010110 << 21) | (rm << 16) | (0b0010 << 12) | (op2 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - Err("unsupported shift operands".to_string()) -} - -// ── Extensions ─────────────────────────────────────────────────────────── - -pub(crate) fn encode_sxtw(operands: &[Operand]) -> Result { - // SXTW Xd, Wn -> SBFM Xd, Xn, #0, #31 - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let word = ((1u32 
<< 31) | (0b100110 << 23) | (1 << 22)) | (31 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_sxth(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0 }; - let word = ((sf << 31) | (0b100110 << 23) | (n << 22)) | (15 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_sxtb(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0 }; - let word = ((sf << 31) | (0b100110 << 23) | (n << 22)) | (7 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_uxtw(operands: &[Operand]) -> Result { - // UXTW is MOV Wd, Wn (the upper 32 bits are zeroed) - // Or: UBFM Xd, Xn, #0, #31 - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - // Use 32-bit ORR (MOV alias) - let word = (0b001010100 << 23) | (rn << 16) | (0b11111 << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_uxth(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0 }; - let word = ((sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22)) | (15 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_uxtb(operands: &[Operand]) -> Result { - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - let n = if is_64 { 1u32 } else { 0 }; - let word = ((sf << 31) | (0b10 << 29) | (0b100110 << 23) | (n << 22)) | (7 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode ORN (logical OR NOT): ORN Rd, Rn, Rm (scalar or vector) -pub(crate) fn encode_orn(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("orn 
requires 3 operands".to_string()); - } - - // NEON vector form: ORN Vd.T, Vn.T, Vm.T - if let Some(Operand::RegArrangement { .. }) = operands.first() { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - // ORN Vd.T, Vn.T, Vm.T: 0 Q 0 01110 11 1 Rm 000111 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (0b11 << 22) | (1 << 21) - | (rm << 16) | (0b000111 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - "ror" => 0b11, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - - // ORN Rd, Rn, Rm [, shift #amount]: sf 01 01010 shift 1 Rm imm6 Rn Rd - let word = (sf << 31) | (0b01 << 29) | (0b01010 << 24) | (shift_type << 22) | (1 << 21) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode EON (exclusive OR NOT): EON Rd, Rn, Rm [, shift #amount] -pub(crate) fn encode_eon(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("eon requires 3 operands".to_string()); - } - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - "ror" => 0b11, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - - // EON Rd, Rn, Rm [, shift #amount]: sf 10 01010 shift 1 Rm imm6 Rn Rd (opc=10, N=1) - let word = (sf << 31) | (0b10 << 29) | 
(0b01010 << 24) | (shift_type << 22) | (1 << 21) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode BICS (bitwise clear, setting flags): BICS Rd, Rn, Rm [, shift #amount] -pub(crate) fn encode_bics(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("bics requires 3 operands".to_string()); - } - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let sf = sf_bit(is_64); - - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - "ror" => 0b11, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - - // BICS Rd, Rn, Rm [, shift #amount]: sf 11 01010 shift 1 Rm imm6 Rn Rd - let word = (sf << 31) | (0b11 << 29) | (0b01010 << 24) | (shift_type << 22) | (1 << 21) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode BIC instruction - disambiguates between scalar and NEON forms. -/// Scalar register: BIC Xd, Xn, Xm [, shift #amount] -> AND NOT (opc=00, N=1) -/// Scalar immediate: BIC Xd, Xn, #imm -> AND Xd, Xn, #~imm -/// NEON vector: BIC Vd.T, Vn.T, Vm.T -pub(crate) fn encode_bic(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("bic requires 3 operands".to_string()); - } - - // NEON vector form: BIC Vd.T, Vn.T, Vm.T - if let Some(Operand::RegArrangement { .. 
}) = operands.first() { - return encode_neon_bic(operands); - } - - let (rd, is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let sf = sf_bit(is_64); - - // BIC Xd, Xn, #imm -> AND Xd, Xn, #~imm (bitmask immediate, inverted) - if let Some(Operand::Imm(imm)) = operands.get(2) { - let inverted = if is_64 { - !(*imm as u64) - } else { - (!(*imm as u32)) as u64 - }; - if let Some((n, immr, imms)) = encode_bitmask_imm(inverted, is_64) { - // AND Rd, Rn, #~imm: sf 00 100100 N immr imms Rn Rd - // AND Rd, Rn, #~imm encoding: sf=bit31, opc=00 (bits29:30), 100100 (bits23:28), N, immr, imms, Rn, Rd - let word = (sf << 31) | (0b100100 << 23) | (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - return Err(format!("cannot encode bitmask immediate for bic: 0x{:x} (inverted: 0x{:x})", imm, inverted)); - } - - // BIC Xd, Xn, Xm [, shift #amount]: sf 00 01010 shift 1 Rm imm6 Rn Rd (N=1) - if let Some(Operand::Reg(rm_name)) = operands.get(2) { - let rm = parse_reg_num(rm_name).ok_or("invalid rm register for bic")?; - - let (shift_type, shift_amount) = if let Some(Operand::Shift { kind, amount }) = operands.get(3) { - let st = match kind.as_str() { - "lsl" => 0b00u32, - "lsr" => 0b01, - "asr" => 0b10, - "ror" => 0b11, - _ => 0b00, - }; - (st, *amount) - } else { - (0, 0) - }; - - // BIC is AND with N=1 (bit 21): sf opc=00(bits29:30) 01010 shift 1 Rm imm6 Rn Rd - let word = (sf << 31) | (0b01010 << 24) | (shift_type << 22) | (1 << 21) - | (rm << 16) | ((shift_amount & 0x3F) << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - Err("unsupported bic operands".to_string()) -} diff --git a/src/backend/arm/assembler/encoder/fp_scalar.rs b/src/backend/arm/assembler/encoder/fp_scalar.rs deleted file mode 100644 index 678f61cd9c..0000000000 --- a/src/backend/arm/assembler/encoder/fp_scalar.rs +++ /dev/null @@ -1,271 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── 
Floating point ─────────────────────────────────────────────────────── - -pub(crate) fn encode_fmov(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("fmov requires 2 operands".to_string()); - } - - let (rd_name, rm_name) = match (&operands[0], &operands[1]) { - (Operand::Reg(a), Operand::Reg(b)) => (a.clone(), b.clone()), - (Operand::Reg(_a), Operand::Imm(_)) => { - // TODO: implement fmov with float immediate encoding - return Err("fmov with immediate operand not yet supported".to_string()); - } - _ => return Err("fmov needs register operands".to_string()), - }; - - let rd = parse_reg_num(&rd_name).ok_or("invalid rd")?; - let rm = parse_reg_num(&rm_name).ok_or("invalid rm")?; - - let rd_is_fp = is_fp_reg(&rd_name); - let rm_is_fp = is_fp_reg(&rm_name); - let rd_lower = rd_name.to_lowercase(); - let rm_lower = rm_name.to_lowercase(); - - if rd_is_fp && rm_is_fp { - // FMOV between FP registers - let is_double = rd_lower.starts_with('d') || rm_lower.starts_with('d'); - let ftype = if is_double { 0b01 } else { 0b00 }; - // 0 00 11110 ftype 1 0000 00 10000 Rn Rd - let word = (0b00011110 << 24) | (ftype << 22) | (0b100000 << 16) | (0b10000 << 10) | (rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - if rd_is_fp && !rm_is_fp { - // FMOV from GP to FP: FMOV Dn, Xn or FMOV Sn, Wn - let is_double = rd_lower.starts_with('d'); - if is_double { - // FMOV Dd, Xn: 1 00 11110 01 1 00 111 000000 Rn Rd - let word = ((0b1001111001 << 22) | (0b100111 << 16)) | (rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } else { - // FMOV Sd, Wn: 0 00 11110 00 1 00 111 000000 Rn Rd - let word = ((0b0001111000 << 22) | (0b100111 << 16)) | (rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - } - - if !rd_is_fp && rm_is_fp { - // FMOV from FP to GP: FMOV Xn, Dn or FMOV Wn, Sn - let is_double = rm_lower.starts_with('d'); - if is_double { - // FMOV Xd, Dn: 1 00 11110 01 1 00 110 000000 Rn Rd - let word = ((0b1001111001 << 22) | (0b100110 << 16)) | 
(rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } else { - // FMOV Wd, Sn: 0 00 11110 00 1 00 110 000000 Rn Rd - let word = ((0b0001111000 << 22) | (0b100110 << 16)) | (rm << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - } - - Err(format!("unsupported fmov operands: {} -> {}", rd_name, rm_name)) -} - -pub(crate) fn encode_fp_arith(operands: &[Operand], opcode: u32) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01 } else { 0b00 }; - - // 0 00 11110 ftype 1 Rm opcode 10 Rn Rd - let word = (0b00011110 << 24) | (ftype << 22) | (1 << 21) | (rm << 16) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_fneg(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01 } else { 0b00 }; - // FNEG: 0 00 11110 ftype 1 0000 10 10000 Rn Rd - let word = (0b00011110 << 24) | (ftype << 22) | (0b100001 << 16) | (0b10000 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_fabs(operands: &[Operand]) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01 } else { 0b00 }; - // FABS: 0 00 11110 ftype 1 0000 01 10000 Rn Rd - let word = (0b00011110 << 24) | (ftype << 22) | (0b100000 << 16) | (0b110000 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_fsqrt(operands: &[Operand]) 
-> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01 } else { 0b00 }; - // FSQRT: 0 00 11110 ftype 1 0000 11 10000 Rn Rd - let word = (0b00011110 << 24) | (ftype << 22) | (0b100001 << 16) | (0b110000 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode FP 1-source ops: FRINTN/P/M/Z/A/X/I -/// Format: 0 00 11110 ftype 1 opcode 10000 Rn Rd -pub(crate) fn encode_fp_1src(operands: &[Operand], opcode: u32) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01u32 } else { 0b00 }; - let word = (0b00011110u32 << 24) | (ftype << 22) | (1 << 21) - | (opcode << 15) | (0b10000 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode FMADD/FMSUB: Rd = Ra +/- (Rn * Rm) -/// Format: 0 00 11111 ftype 0 Rm o1 Ra Rn Rd -pub(crate) fn encode_fmadd_fmsub(operands: &[Operand], is_sub: bool) -> Result { - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let (ra, _) = get_reg(operands, 3)?; - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01u32 } else { 0b00 }; - let o1 = if is_sub { 1u32 } else { 0 }; - let word = (0b00011111u32 << 24) | (ftype << 22) | (rm << 16) - | (o1 << 15) | (ra << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode FNMADD/FNMSUB: Rd = -Ra +/- (Rn * Rm) -/// Format: 0 00 11111 ftype 1 Rm o1 Ra Rn Rd -pub(crate) fn encode_fnmadd_fnmsub(operands: &[Operand], is_sub: bool) -> Result { - let (rd, _) = 
get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - let (rm, _) = get_reg(operands, 2)?; - let (ra, _) = get_reg(operands, 3)?; - let rd_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rd_name.starts_with('d'); - let ftype = if is_double { 0b01u32 } else { 0b00 }; - let o1 = if is_sub { 1u32 } else { 0 }; - let word = (0b00011111u32 << 24) | (ftype << 22) | (1 << 21) | (rm << 16) - | (o1 << 15) | (ra << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_fcmp(operands: &[Operand]) -> Result { - let (rn, _) = get_reg(operands, 0)?; - let rn_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let is_double = rn_name.starts_with('d'); - let ftype = if is_double { 0b01 } else { 0b00 }; - - // FCMP Dn, #0.0 - if operands.len() < 2 || matches!(operands.get(1), Some(Operand::Imm(0))) { - let word = ((0b00011110 << 24) | (ftype << 22) | (1 << 21)) | (0b001000 << 10) | (rn << 5) | 0b01000; - return Ok(EncodeResult::Word(word)); - } - - let (rm, _) = get_reg(operands, 1)?; - // FCMP Dn, Dm: 0 00 11110 ftype 1 Rm 00 1000 Rn 00 000 - let word = (0b00011110 << 24) | (ftype << 22) | (1 << 21) | (rm << 16) | (0b001000 << 10) | (rn << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_fcvt_rounding(operands: &[Operand], rmode: u32, opcode: u32) -> Result { - // Float-to-integer conversion with specified rounding mode - // Encoding: sf 00 11110 ftype 1 rmode opcode 000000 Rn Rd - // sf: 0=W dest, 1=X dest - // ftype: 00=S source, 01=D source - // rmode+opcode: determines rounding mode and signedness - if operands.len() < 2 { - return Err("fcvt* requires 2 operands".to_string()); - } - let (rd, rd_is_64) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - - let src_name = match &operands[1] { - Operand::Reg(name) => name.to_lowercase(), - _ => return Err("fcvt*: expected register source".to_string()), - }; - let ftype: u32 = 
if src_name.starts_with('d') { 0b01 } else { 0b00 }; - let sf: u32 = if rd_is_64 { 1 } else { 0 }; - - let word = ((sf << 31) | (0b11110 << 24) | (ftype << 22) - | (1 << 21) | (rmode << 19) | (opcode << 16)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_ucvtf(operands: &[Operand]) -> Result { - encode_int_to_float(operands, false) -} - -pub(crate) fn encode_scvtf(operands: &[Operand]) -> Result { - encode_int_to_float(operands, true) -} - -pub(crate) fn encode_int_to_float(operands: &[Operand], is_signed: bool) -> Result { - // SCVTF/UCVTF: integer-to-float conversion - // Encoding: sf 00 11110 ftype 1 00 opcode 000000 Rn Rd - // sf: 0=W source, 1=X source - // ftype: 00=S dest, 01=D dest - // opcode: 010=signed (SCVTF), 011=unsigned (UCVTF) - if operands.len() < 2 { - return Err("scvtf/ucvtf requires 2 operands".to_string()); - } - let (rd, _) = get_reg(operands, 0)?; - let (rn, rn_is_64) = get_reg(operands, 1)?; - - let dst_name = match &operands[0] { - Operand::Reg(name) => name.to_lowercase(), - _ => return Err("scvtf/ucvtf: expected register dest".to_string()), - }; - let ftype: u32 = if dst_name.starts_with('d') { 0b01 } else { 0b00 }; - let sf: u32 = if rn_is_64 { 1 } else { 0 }; - let opcode: u32 = if is_signed { 0b010 } else { 0b011 }; - - let word = (((sf << 31) | (0b11110 << 24) | (ftype << 22) - | (1 << 21)) | (opcode << 16)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_fcvt_precision(operands: &[Operand]) -> Result { - // FCVT: float precision conversion (e.g., FCVT Dd, Sn or FCVT Sd, Dn) - // Encoding: 0 00 11110 ftype 1 0001 opc 10000 Rn Rd - // ftype: source precision (00=S, 01=D, 11=H) - // opc: dest precision (00=S, 01=D, 11=H) - if operands.len() < 2 { - return Err("fcvt requires 2 operands".to_string()); - } - let (rd, _) = get_reg(operands, 0)?; - let (rn, _) = get_reg(operands, 1)?; - - let dst_name = match &operands[0] { - Operand::Reg(name) => name.to_lowercase(), - _ => return 
Err("fcvt: expected register dest".to_string()), - }; - let src_name = match &operands[1] { - Operand::Reg(name) => name.to_lowercase(), - _ => return Err("fcvt: expected register source".to_string()), - }; - - let ftype: u32 = match src_name.chars().next() { - Some('s') => 0b00, - Some('d') => 0b01, - Some('h') => 0b11, - _ => return Err(format!("fcvt: unsupported source type: {}", src_name)), - }; - let opc: u32 = match dst_name.chars().next() { - Some('s') => 0b00, - Some('d') => 0b01, - Some('h') => 0b11, - _ => return Err(format!("fcvt: unsupported dest type: {}", dst_name)), - }; - - let word = (0b00011110 << 24) | (ftype << 22) | (1 << 21) | (0b0001 << 17) - | (opc << 15) | (0b10000 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} diff --git a/src/backend/arm/assembler/encoder/load_store.rs b/src/backend/arm/assembler/encoder/load_store.rs deleted file mode 100644 index c60b264820..0000000000 --- a/src/backend/arm/assembler/encoder/load_store.rs +++ /dev/null @@ -1,966 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── Loads/Stores ───────────────────────────────────────────────────────── - -/// Auto-detect LDR/STR size from the first register operand. 
-pub(crate) fn encode_ldr_str_auto(operands: &[Operand], is_load: bool) -> Result { - // Determine size from register: Wn -> 32-bit (size=10), Xn -> 64-bit (size=11) - // FP: Sn -> 32-bit, Dn -> 64-bit, Qn -> 128-bit - let reg_name = match operands.first() { - Some(Operand::Reg(r)) => r.to_lowercase(), - _ => return Err("ldr/str needs register operand".to_string()), - }; - - let size = if reg_name.starts_with('w') { - 0b10 // 32-bit - } else if reg_name.starts_with('x') || reg_name == "sp" || reg_name == "xzr" || reg_name == "lr" { - 0b11 // 64-bit - } else if reg_name.starts_with('s') { - 0b10 // 32-bit float - } else if reg_name.starts_with('d') { - 0b11 // 64-bit float - } else if reg_name.starts_with('q') { - 0b00 // 128-bit: size=00 with opc adjustment in encode_ldr_str - } else { - 0b11 // default 64-bit - }; - - let is_128bit = reg_name.starts_with('q'); - encode_ldr_str(operands, is_load, size, false, is_128bit) -} - -pub(crate) fn encode_ldr_str(operands: &[Operand], is_load: bool, size: u32, is_signed: bool, is_128bit: bool) -> Result { - if operands.len() < 2 { - return Err("ldr/str requires at least 2 operands".to_string()); - } - - let (rt, _) = get_reg(operands, 0)?; - let fp = is_fp_reg(operands.first().map(|o| match o { Operand::Reg(r) => r.as_str(), _ => "" }).unwrap_or("")); - - // Use the size parameter as-is (auto-detection happens in encode_ldr_str_auto) - let actual_size = size; - - let v = if fp { 1u32 } else { 0u32 }; - - match operands.get(1) { - // [base, #offset] - Some(Operand::Mem { base, offset }) => { - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - - // Unsigned offset encoding - // Size determines the shift for offset alignment - // For 128-bit Q registers: shift=4, opc=11 (load) or 10 (store) - let shift = if is_128bit { 4 } else { actual_size }; - let opc = if is_128bit { - if is_load { 0b11 } else { 0b10 } - } else if is_load { - if is_signed { 0b10 } else { 0b01 } - } else { - 0b00 - }; - - // Check if offset is 
aligned and fits in 12-bit unsigned field - let abs_offset = *offset as u64; - let align = 1u64 << shift; - if *offset >= 0 && abs_offset.is_multiple_of(align) { - let imm12 = (abs_offset / align) as u32; - if imm12 < 4096 { - // Unsigned offset form: size 111 V 01 opc imm12 Rn Rt - let word = (actual_size << 30) | (0b111 << 27) | (v << 26) | (0b01 << 24) - | (opc << 22) | (imm12 << 10) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - } - - // Unscaled offset (LDUR/STUR form) - let imm9 = (*offset as i32) & 0x1FF; - let opc = if is_128bit { - if is_load { 0b11 } else { 0b10 } - } else if is_load { - if is_signed { 0b10 } else { 0b01 } - } else { - 0b00 - }; - let word = (((actual_size << 30) | (0b111 << 27) | (v << 26)) | (opc << 22) - | ((imm9 as u32 & 0x1FF) << 12)) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - - // [base, #offset]! (pre-index) - Some(Operand::MemPreIndex { base, offset }) => { - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - let imm9 = (*offset as i32) & 0x1FF; - let opc = if is_128bit { - if is_load { 0b11 } else { 0b10 } - } else if is_load { 0b01 } else { 0b00 }; - let word = ((actual_size << 30) | (0b111 << 27) | (v << 26)) | (opc << 22) - | ((imm9 as u32 & 0x1FF) << 12) | (0b11 << 10) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - - // [base], #offset (post-index) - Some(Operand::MemPostIndex { base, offset }) => { - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - let imm9 = (*offset as i32) & 0x1FF; - let opc = if is_128bit { - if is_load { 0b11 } else { 0b10 } - } else if is_load { 0b01 } else { 0b00 }; - let word = ((actual_size << 30) | (0b111 << 27) | (v << 26)) | (opc << 22) - | ((imm9 as u32 & 0x1FF) << 12) | (0b01 << 10) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - - // [base, Xm] register offset - Some(Operand::MemRegOffset { base, index, extend, shift }) => { - // Check if index is a :lo12: modifier - if index.starts_with(':') { - // Parse 
modifier from the index string - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - let mod_str = index.trim_start_matches(':'); - let (kind, sym) = if let Some(colon_pos) = mod_str.find(':') { - (&mod_str[..colon_pos], &mod_str[colon_pos + 1..]) - } else { - return Err(format!("malformed modifier in memory operand: {}", index)); - }; - - let (symbol, addend) = if let Some(plus_pos) = sym.find('+') { - let s = &sym[..plus_pos]; - let off: i64 = sym[plus_pos + 1..].parse().unwrap_or(0); - (s.to_string(), off) - } else { - (sym.to_string(), 0i64) - }; - - let opc = if is_128bit { - if is_load { 0b11 } else { 0b10 } - } else if is_load { 0b01 } else { 0b00 }; - - let reloc_type = match kind { - "lo12" => { - if is_128bit { - RelocType::Ldst128AbsLo12 - } else { - match actual_size { - 0b00 => RelocType::Ldst8AbsLo12, - 0b01 => RelocType::Ldst16AbsLo12, - 0b10 => RelocType::Ldst32AbsLo12, - 0b11 => RelocType::Ldst64AbsLo12, - _ => RelocType::Ldst64AbsLo12, - } - } - } - "got_lo12" => RelocType::Ld64GotLo12, - _ => return Err(format!("unsupported modifier in load/store: {}", kind)), - }; - - let word = ((actual_size << 30) | (0b111 << 27) | (v << 26) | (0b01 << 24) | (opc << 22)) | (rn << 5) | rt; - return Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type, - symbol, - addend, - }, - }); - } - - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - let rm = parse_reg_num(index).ok_or("invalid index reg")?; - let opc = if is_128bit { - if is_load { 0b11 } else { 0b10 } - } else if is_load { 0b01 } else { 0b00 }; - // Register offset: size 111 V opc 1 Rm option S 10 Rn Rt - // Determine option and S from extend/shift specifiers - let is_w_index = index.starts_with('w') || index.starts_with('W'); - let shift_amount: u8 = match shift { Some(s) => *s, None => 0 }; - let (option, s_bit) = match extend.as_deref() { - Some("lsl") => { - // LSL with shift: S=1 if shift amount > 0 - let s_val = if shift_amount > 0 { 1u32 } else { 0u32 }; - 
(0b011u32, s_val) - } - Some("sxtw") => { - let s_val = if shift_amount > 0 { 1u32 } else { 0u32 }; - (0b110u32, s_val) - } - Some("sxtx") => { - let s_val = if shift_amount > 0 { 1u32 } else { 0u32 }; - (0b111u32, s_val) - } - Some("uxtw") => { - let s_val = if shift_amount > 0 { 1u32 } else { 0u32 }; - (0b010u32, s_val) - } - Some("uxtx") => { - let s_val = if shift_amount > 0 { 1u32 } else { 0u32 }; - (0b011u32, s_val) - } - None => { - // Default: if W register index, use UXTW; if X register, use LSL - if is_w_index { - (0b010u32, 0u32) // UXTW, no shift - } else { - (0b011u32, 0u32) // LSL, no shift - } - } - _ => (0b011u32, 0u32), // default LSL - }; - let word = (actual_size << 30) | (0b111 << 27) | (v << 26) | (opc << 22) - | (1 << 21) | (rm << 16) | (option << 13) | (s_bit << 12) | (0b10 << 10) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - - // LDR (literal): ldr Rt, label — PC-relative load - Some(Operand::Symbol(sym)) if is_load => { - // opc V 011 00 imm19 Rt - // For GP registers: opc=00 → 32-bit (W), opc=01 → 64-bit (X), opc=11 → PRFM - // For FP/SIMD: opc=00 → 32-bit (S), opc=01 → 64-bit (D), opc=10 → 128-bit (Q) - // Note: actual_size uses 10=32-bit, 11=64-bit but LDR literal uses 00=32-bit, 01=64-bit - let opc = if is_128bit { - 0b10u32 - } else if fp { - // FP: S=00, D=01 (same mapping as GP) - if actual_size == 0b11 { 0b01 } else { 0b00 } - } else { - // GP: W=00, X=01 - if actual_size == 0b11 { 0b01 } else { 0b00 } - }; - let word = (opc << 30) | (v << 26) | (0b011 << 27) | rt; - return Ok(EncodeResult::WordWithReloc { - word, - reloc: Relocation { - reloc_type: RelocType::Ldr19, - symbol: sym.clone(), - addend: 0, - }, - }); - } - - _ => {} - } - - Err(format!("unsupported ldr/str operands: {:?}", operands)) -} - -/// Encode LDUR/STUR (unscaled immediate offset load/store) -/// Format: size 111 V 00 opc 0 imm9 00 Rn Rt -pub(crate) fn encode_ldur_stur(operands: &[Operand], is_load: bool, op2_bits: u32) -> Result { - if 
operands.len() < 2 { - return Err("ldur/stur requires 2 operands".to_string()); - } - let (rt, _) = get_reg(operands, 0)?; - let reg_name = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() }; - let fp = is_fp_reg(®_name); - let v = if fp { 1u32 } else { 0u32 }; - - let (size, opc) = if fp { - if reg_name.starts_with('q') { - (0b00u32, if is_load { 0b11u32 } else { 0b10 }) - } else if reg_name.starts_with('d') { - (0b11, if is_load { 0b01 } else { 0b00 }) - } else if reg_name.starts_with('s') { - (0b10, if is_load { 0b01 } else { 0b00 }) - } else if reg_name.starts_with('h') { - (0b01, if is_load { 0b01 } else { 0b00 }) - } else if reg_name.starts_with('b') { - (0b00, if is_load { 0b01 } else { 0b00 }) - } else { - (0b11, if is_load { 0b01 } else { 0b00 }) - } - } else { - let is_64 = reg_name.starts_with('x'); - let sz = if is_64 { 0b11u32 } else { 0b10 }; - (sz, if is_load { 0b01u32 } else { 0b00 }) - }; - - let (rn, imm9) = match &operands[1] { - Operand::Mem { base, offset } => { - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - (rn, *offset as i32) - } - _ => return Err(format!("ldur/stur: expected memory operand, got {:?}", operands[1])), - }; - - let imm9_enc = (imm9 as u32) & 0x1FF; - let word = (size << 30) | (0b111 << 27) | (v << 26) | (opc << 22) - | (imm9_enc << 12) | (op2_bits << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode LDTR/STTR with explicit size (for ldtrh, ldtrb, etc.) 
/// Encode LDTR/STTR (unprivileged load/store) with an explicit size field.
/// Used for the byte (size=00) and halfword (size=01) variants
/// LDTRB/STTRB/LDTRH/STTRH.
///
/// Encoding: size 111 V=0 00 opc 0 imm9 10 Rn Rt
///
/// NOTE(review): `Result` (and `Option`/`Vec` elsewhere in this file) appears
/// with its generic arguments missing — presumably `Result<EncodeResult,
/// String>` originally; looks like extraction damage. TODO confirm.
pub(crate) fn encode_ldtr_sized(operands: &[Operand], is_load: bool, size: u32) -> Result {
    if operands.len() < 2 {
        return Err("ldtr/sttr requires 2 operands".to_string());
    }
    let (rt, _) = get_reg(operands, 0)?;
    // opc: 01 = load, 00 = store.
    let opc = if is_load { 0b01u32 } else { 0b00 };
    // Only the [Xn, #imm] addressing form exists for LDTR/STTR.
    let (rn, imm9) = match &operands[1] {
        Operand::Mem { base, offset } => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            (rn, *offset as i32)
        }
        _ => return Err("ldtr/sttr: expected memory operand".to_string()),
    };
    // Signed 9-bit immediate, truncated to the field width (range not validated).
    let imm9_enc = (imm9 as u32) & 0x1FF;
    let word = (size << 30) | (0b111 << 27) | (opc << 22)
        | (imm9_enc << 12) | (0b10 << 10) | (rn << 5) | rt;
    Ok(EncodeResult::Word(word))
}

/// Encode LDRSW (load word, sign-extend to 64 bits).
///
/// Supports unsigned scaled offset, unscaled offset (LDURSW), post-index,
/// pre-index and register-offset addressing forms.
pub(crate) fn encode_ldrsw(operands: &[Operand]) -> Result {
    if operands.len() < 2 {
        return Err("ldrsw requires 2 operands".to_string());
    }

    let (rt, _) = get_reg(operands, 0)?;

    match operands.get(1) {
        Some(Operand::Mem { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            // LDRSW: size=10 111 V=0 01 opc=10 -> unsigned offset
            // Actually: 10 111 0 01 10 imm12 Rn Rt
            let abs_offset = *offset as u64;
            // Prefer the scaled unsigned-offset form when the offset is a
            // non-negative multiple of 4 that fits in imm12.
            if *offset >= 0 && abs_offset.is_multiple_of(4) {
                let imm12 = (abs_offset / 4) as u32;
                if imm12 < 4096 {
                    let word = ((0b10 << 30) | (0b111 << 27)) | (0b01 << 24) | (0b10 << 22)
                        | (imm12 << 10) | (rn << 5) | rt;
                    return Ok(EncodeResult::Word(word));
                }
            }
            // Unscaled: LDURSW
            let imm9 = (*offset as i32) & 0x1FF;
            let word = (((0b10 << 30) | (0b111 << 27)) | (0b10 << 22)
                | ((imm9 as u32 & 0x1FF) << 12)) | (rn << 5) | rt;
            return Ok(EncodeResult::Word(word));
        }

        // Post-index: ldrsw Rt, [Xn], #imm  (bits [11:10] = 01)
        Some(Operand::MemPostIndex { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let imm9 = (*offset as i32) & 0x1FF;
            let word = ((0b10 << 30) | (0b111 << 27)) | (0b10 << 22)
                | ((imm9 as u32 & 0x1FF) << 12) | (0b01 << 10) | (rn << 5) | rt;
            return Ok(EncodeResult::Word(word));
        }

        // Pre-index: ldrsw Rt, [Xn, #imm]!  (bits [11:10] = 11)
        Some(Operand::MemPreIndex { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let imm9 = (*offset as i32) & 0x1FF;
            let word = ((0b10 << 30) | (0b111 << 27)) | (0b10 << 22)
                | ((imm9 as u32 & 0x1FF) << 12) | (0b11 << 10) | (rn << 5) | rt;
            return Ok(EncodeResult::Word(word));
        }

        // Register offset: ldrsw Rt, [Xn, Xm/Wm{, extend {#2}}]
        Some(Operand::MemRegOffset { base, index, extend, shift }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let rm = parse_reg_num(index).ok_or("invalid index reg")?;
            // `option` selects the extend kind, S is the scale bit
            // (shift by 2 for word accesses).
            let (option, s_bit) = match (extend.as_deref(), shift) {
                (Some("lsl"), Some(2)) => (0b011u32, 1u32),
                (Some("lsl"), Some(0)) | (Some("lsl"), None) => (0b011, 0),
                (None, None) | (None, Some(0)) => (0b011, 0),
                (Some("sxtw"), Some(2)) => (0b110, 1),
                (Some("sxtw"), Some(0)) | (Some("sxtw"), None) => (0b110, 0),
                (Some("uxtw"), Some(2)) => (0b010, 1),
                (Some("uxtw"), Some(0)) | (Some("uxtw"), None) => (0b010, 0),
                (Some("sxtx"), Some(2)) => (0b111, 1),
                (Some("sxtx"), Some(0)) | (Some("sxtx"), None) => (0b111, 0),
                _ => return Err(format!("unsupported ldrsw extend/shift: {:?}/{:?}", extend, shift)),
            };
            // LDRSW reg: 10 111 0 00 10 1 Rm option S 10 Rn Rt
            let word = (0b10 << 30) | (0b111 << 27) | (0b10 << 22) | (1 << 21)
                | (rm << 16) | (option << 13) | (s_bit << 12) | (0b10 << 10) | (rn << 5) | rt;
            return Ok(EncodeResult::Word(word));
        }

        _ => {}
    }

    Err(format!("unsupported ldrsw operands: {:?}", operands))
}

/// Encode LDRSB/LDRSH (sign-extending byte/halfword loads).
///
/// `size`: 0b00 = byte, 0b01 = halfword. The destination register width
/// selects opc (10 = extend to 64-bit X register, 11 = to 32-bit W register).
pub(crate) fn encode_ldrs(operands: &[Operand], size: u32) -> Result {
    // LDRSB/LDRSH: sign-extending byte/halfword loads
    if operands.len() < 2 {
        return Err("ldrsb/ldrsh requires 2 operands".to_string());
    }

    let (rt, is_64) = get_reg(operands, 0)?;
    let opc = if is_64 { 0b10 } else { 0b11 }; // 64-bit target: opc=10, 32-bit: opc=11

    if let Some(Operand::Mem { base, offset }) = operands.get(1) {
        let rn = parse_reg_num(base).ok_or("invalid base reg")?;
        // Scaled unsigned-offset form: offset must be aligned to the access
        // size and fit in imm12; otherwise fall back to the unscaled form.
        let shift = size;
        let abs_offset = *offset as u64;
        let align = 1u64 << shift;
        if *offset >= 0 && abs_offset.is_multiple_of(align) {
            let imm12 = (abs_offset / align) as u32;
            if imm12 < 4096 {
                let word = ((size << 30) | (0b111 << 27)) | (0b01 << 24) | (opc << 22)
                    | (imm12 << 10) | (rn << 5) | rt;
                return Ok(EncodeResult::Word(word));
            }
        }
        // Unscaled
        let imm9 = (*offset as i32) & 0x1FF;
        let word = (((size << 30) | (0b111 << 27)) | (opc << 22)
            | ((imm9 as u32 & 0x1FF) << 12)) | (rn << 5) | rt;
        return Ok(EncodeResult::Word(word));
    }

    // Post-index: ldrsb/ldrsh Rt, [Xn], #imm
    if let Some(Operand::MemPostIndex { base, offset }) = operands.get(1) {
        let rn = parse_reg_num(base).ok_or("invalid base reg")?;
        let imm9 = (*offset as i32) & 0x1FF;
        let word = (size << 30) | (0b111 << 27) | (opc << 22)
            | ((imm9 as u32 & 0x1FF) << 12) | (0b01 << 10) | (rn << 5) | rt;
        return Ok(EncodeResult::Word(word));
    }

    // Pre-index: ldrsb/ldrsh Rt, [Xn, #imm]!
    if let Some(Operand::MemPreIndex { base, offset }) = operands.get(1) {
        let rn = parse_reg_num(base).ok_or("invalid base reg")?;
        let imm9 = (*offset as i32) & 0x1FF;
        let word = (size << 30) | (0b111 << 27) | (opc << 22)
            | ((imm9 as u32 & 0x1FF) << 12) | (0b11 << 10) | (rn << 5) | rt;
        return Ok(EncodeResult::Word(word));
    }

    // Register offset: ldrsb/ldrsh Rt, [Xn, Xm{, extend {#amount}}]
    if let Some(Operand::MemRegOffset { base, index, extend, shift }) = operands.get(1) {
        let rn = parse_reg_num(base).ok_or("invalid base reg")?;
        let rm = parse_reg_num(index).ok_or("invalid index reg")?;
        let is_w_index = index.starts_with('w') || index.starts_with('W');
        let shift_amount: u8 = match shift { Some(s) => *s, None => 0 };
        // Any non-zero shift amount sets the S (scale) bit; a missing extend
        // defaults to UXTW for W index registers and LSL for X ones.
        let (option, s_bit) = match extend.as_deref() {
            Some("lsl") => (0b011u32, if shift_amount > 0 { 1u32 } else { 0 }),
            Some("sxtw") => (0b110u32, if shift_amount > 0 { 1u32 } else { 0 }),
            Some("sxtx") => (0b111u32, if shift_amount > 0 { 1u32 } else { 0 }),
            Some("uxtw") => (0b010u32, if shift_amount > 0 { 1u32 } else { 0 }),
            Some("uxtx") => (0b011u32, if shift_amount > 0 { 1u32 } else { 0 }),
            None => if is_w_index { (0b010u32, 0u32) } else { (0b011u32, 0u32) },
            _ => (0b011u32, 0u32),
        };
        let word = (size << 30) | (0b111 << 27) | (opc << 22) | (1 << 21)
            | (rm << 16) | (option << 13) | (s_bit << 12) | (0b10 << 10) | (rn << 5) | rt;
        return Ok(EncodeResult::Word(word));
    }

    Err(format!("unsupported ldrsb/ldrsh operands: {:?}", operands))
}

/// Encode LDP/STP (load/store pair) for integer and FP/SIMD registers.
///
/// Handles pre-index, post-index and signed-offset addressing. The imm7
/// field is scaled by the register size; out-of-range offsets are silently
/// truncated to 7 bits (no range validation — matches the rest of this file).
pub(crate) fn encode_ldp_stp(operands: &[Operand], is_load: bool) -> Result {
    if operands.len() < 3 {
        return Err("ldp/stp requires 3 operands".to_string());
    }

    let (rt1, is_64) = get_reg(operands, 0)?;
    let (rt2, _) = get_reg(operands, 1)?;
    let fp = is_fp_reg(match &operands[0] { Operand::Reg(r) => r.as_str(), _ => "" });

    // opc distinguishes 32/64/128-bit pairs; for FP registers it is derived
    // from the register prefix (s/d/q).
    let opc = if fp {
        let r = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() };
        if r.starts_with('s') { 0b00 }
        else if r.starts_with('d') { 0b01 }
        else if r.starts_with('q') || is_64 { 0b10 }
        else { 0b00 }
    } else if is_64 { 0b10 } else { 0b00 };

    let v = if fp { 1u32 } else { 0u32 };
    let l = if is_load { 1u32 } else { 0u32 };

    // Shift depends on register size
    let shift = if fp {
        let r = match &operands[0] { Operand::Reg(r) => r.to_lowercase(), _ => String::new() };
        if r.starts_with('s') { 2 }
        else if r.starts_with('d') { 3 }
        else if r.starts_with('q') { 4 }
        else if is_64 { 3 } else { 2 }
    } else if is_64 { 3 } else { 2 };

    match operands.get(2) {
        // STP rt1, rt2, [base, #offset]! (pre-index)
        Some(Operand::MemPreIndex { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let imm7 = ((*offset >> shift) as i32) & 0x7F;
            let word = (opc << 30) | (0b101 << 27) | (v << 26) | (0b011 << 23) | (l << 22)
                | ((imm7 as u32 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt1;
            return Ok(EncodeResult::Word(word));
        }

        // LDP/STP rt1, rt2, [base], #offset (post-index)
        Some(Operand::MemPostIndex { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let imm7 = ((*offset >> shift) as i32) & 0x7F;
            let word = (opc << 30) | (0b101 << 27) | (v << 26) | (0b001 << 23) | (l << 22)
                | ((imm7 as u32 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt1;
            return Ok(EncodeResult::Word(word));
        }

        // LDP/STP rt1, rt2, [base, #offset] (signed offset)
        Some(Operand::Mem { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let imm7 = ((*offset >> shift) as i32) & 0x7F;
            let word = (opc << 30) | (0b101 << 27) | (v << 26) | (0b010 << 23) | (l << 22)
                | ((imm7 as u32 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt1;
            return Ok(EncodeResult::Word(word));
        }

        _ => {}
    }

    Err(format!("unsupported ldp/stp operands: {:?}", operands))
}

/// Encode LDNP/STNP (load/store pair non-temporal)
/// Encoding: opc 101 V 000 L imm7 Rt2 Rn Rt
/// TODO: Only handles integer registers (V=0). FP/SIMD register support needed for V=1.
pub(crate) fn encode_ldnp_stnp(operands: &[Operand], is_load: bool) -> Result {
    if operands.len() < 3 {
        return Err("ldnp/stnp requires 3 operands".to_string());
    }

    let (rt1, is_64) = get_reg(operands, 0)?;
    let (rt2, _) = get_reg(operands, 1)?;

    let opc: u32 = if is_64 { 0b10 } else { 0b00 };
    let l: u32 = if is_load { 1 } else { 0 };
    let shift = if is_64 { 3 } else { 2 }; // scale factor: 8 for 64-bit, 4 for 32-bit

    match operands.get(2) {
        // Only the signed-offset form exists for non-temporal pairs.
        Some(Operand::Mem { base, offset }) => {
            let rn = parse_reg_num(base).ok_or("invalid base reg")?;
            let imm7 = ((*offset >> shift) as i32) & 0x7F;
            // LDNP/STNP: opc 101 V=0 000 L imm7 Rt2 Rn Rt
            let word = (opc << 30) | (0b101 << 27) | (l << 22)
                | ((imm7 as u32 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt1;
            Ok(EncodeResult::Word(word))
        }
        _ => Err(format!("unsupported ldnp/stnp operands: {:?}", operands)),
    }
}

// ── Exclusive loads/stores ───────────────────────────────────────────────

/// Encode LDXR/STXR and byte/halfword variants.
/// `forced_size`: None = auto-detect from register width, Some(0b00) = byte, Some(0b01) = halfword
pub(crate) fn encode_ldxr_stxr(operands: &[Operand], is_load: bool, forced_size: Option) -> Result {
    if is_load {
        // LDXR Rt, [Xn]
        let (rt, is_64) = get_reg(operands, 0)?;
        let rn = match operands.get(1) {
            Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("invalid base")?,
            _ => return Err("ldxr needs memory operand".to_string()),
        };
        let size = forced_size.unwrap_or(if is_64 { 0b11 } else { 0b10 });
        // size 001000 0 1 0 11111 0 11111 Rn Rt (Rs and Rt2 fields = 11111)
        let word = ((size << 30) | (0b001000010 << 21) | (0b11111 << 16))
            | (0b11111 << 10) | (rn << 5) | rt;
        Ok(EncodeResult::Word(word))
    } else {
        // STXR Ws, Rt, [Xn] — Ws receives the exclusive-store status.
        let (ws, _) = get_reg(operands, 0)?;
        let (rt, is_64) = get_reg(operands, 1)?;
        let rn = match operands.get(2) {
            Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("invalid base")?,
            _ => return Err("stxr needs memory operand".to_string()),
        };
        let size = forced_size.unwrap_or(if is_64 { 0b11 } else { 0b10 });
        // size 001000 0 0 0 Rs 0 11111 Rn Rt
        let word = ((size << 30) | (0b001000000 << 21) | (ws << 16))
            | (0b11111 << 10) | (rn << 5) | rt;
        Ok(EncodeResult::Word(word))
    }
}

/// Encode LDAXR/STLXR and byte/halfword variants.
pub(crate) fn encode_ldaxr_stlxr(operands: &[Operand], is_load: bool, forced_size: Option) -> Result {
    // Identical to LDXR/STXR except bit 15 (o0) is set, adding
    // acquire (load) / release (store) ordering semantics.
    if is_load {
        let (rt, is_64) = get_reg(operands, 0)?;
        let rn = match operands.get(1) {
            Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("invalid base")?,
            _ => return Err("ldaxr needs memory operand".to_string()),
        };
        let size = forced_size.unwrap_or(if is_64 { 0b11 } else { 0b10 });
        let word = (size << 30) | (0b001000010 << 21) | (0b11111 << 16) | (1 << 15)
            | (0b11111 << 10) | (rn << 5) | rt;
        Ok(EncodeResult::Word(word))
    } else {
        let (ws, _) = get_reg(operands, 0)?;
        let (rt, is_64) = get_reg(operands, 1)?;
        let rn = match operands.get(2) {
            Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("invalid base")?,
            _ => return Err("stlxr needs memory operand".to_string()),
        };
        let size = forced_size.unwrap_or(if is_64 { 0b11 } else { 0b10 });
        let word = (size << 30) | (0b001000000 << 21) | (ws << 16) | (1 << 15)
            | (0b11111 << 10) | (rn << 5) | rt;
        Ok(EncodeResult::Word(word))
    }
}

/// Encode LDXP/STXP/LDAXP/STLXP (exclusive pair) instructions.
///
/// LDXP Xt1, Xt2, [Xn] : sz 001000 0 1 1 11111 0 Rt2 Rn Rt
/// LDAXP Xt1, Xt2, [Xn] : sz 001000 0 1 1 11111 1 Rt2 Rn Rt
/// STXP Ws, Xt1, Xt2, [Xn] : sz 001000 0 0 1 Rs 0 Rt2 Rn Rt
/// STLXP Ws, Xt1, Xt2, [Xn] : sz 001000 0 0 1 Rs 1 Rt2 Rn Rt
pub(crate) fn encode_ldxp_stxp(operands: &[Operand], is_load: bool, acquire_release: bool) -> Result {
    // o0 (bit 15) selects the acquire/release variants (LDAXP/STLXP).
    let o0 = if acquire_release { 1u32 } else { 0 };
    if is_load {
        // LDXP/LDAXP Rt, Rt2, [Rn]
        let (rt, is_64) = get_reg(operands, 0)?;
        let (rt2, _) = get_reg(operands, 1)?;
        let rn = match operands.get(2) {
            Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("ldxp needs memory operand")?,
            _ => return Err("ldxp needs memory operand".to_string()),
        };
        let sz = if is_64 { 1u32 } else { 0 };
        // 1 sz 001000 0 1 1 11111 o0 Rt2 Rn Rt (bit23=0)
        let word = (1u32 << 31) | (sz << 30) | (0b001000 << 24) | (1 << 22)
            | (1 << 21) | (0b11111 << 16) | (o0 << 15) | (rt2 << 10) | (rn << 5) | rt;
        Ok(EncodeResult::Word(word))
    } else {
        // STXP/STLXP Ws, Rt, Rt2, [Rn]
        let (ws, _) = get_reg(operands, 0)?; // status register (always W)
        let (rt, is_64) = get_reg(operands, 1)?;
        let (rt2, _) = get_reg(operands, 2)?;
        let rn = match operands.get(3) {
            Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("stxp needs memory operand")?,
            _ => return Err("stxp needs memory operand".to_string()),
        };
        let sz = if is_64 { 1u32 } else { 0 };
        // 1 sz 001000 0 0 1 Rs o0 Rt2 Rn Rt (bit23=0, bit22=0)
        let word = (1u32 << 31) | (sz << 30) | (0b001000 << 24)
            | (1 << 21) | (ws << 16) | (o0 << 15) | (rt2 << 10) | (rn << 5) | rt;
        Ok(EncodeResult::Word(word))
    }
}

/// Encode LDAR/STLR and byte/halfword variants.
pub(crate) fn encode_ldar_stlr(operands: &[Operand], is_load: bool, forced_size: Option) -> Result {
    let (rt, is_64) = get_reg(operands, 0)?;
    let rn = match operands.get(1) {
        Some(Operand::Mem { base, .. }) => parse_reg_num(base).ok_or("invalid base")?,
        _ => return Err("ldar/stlr needs memory operand".to_string()),
    };
    let size = forced_size.unwrap_or(if is_64 { 0b11 } else { 0b10 });
    let l = if is_load { 1u32 } else { 0 };
    // LDAR/STLR: size 001000 1 L 0 11111 1 11111 Rn Rt
    let word = ((size << 30) | (0b001000 << 24) | (1 << 23) | (l << 22))
        | (0b11111 << 16) | (1 << 15) | (0b11111 << 10) | (rn << 5) | rt;
    Ok(EncodeResult::Word(word))
}

// ── Address computation ──────────────────────────────────────────────────

/// Encode ADRP (form PC-relative page address).
///
/// The immediate fields are left zero; the page offset is supplied by a
/// relocation (ADR_PREL_PG_HI21, or ADR_GOT_PAGE21 for `:got:` operands).
pub(crate) fn encode_adrp(operands: &[Operand]) -> Result {
    let (rd, _) = get_reg(operands, 0)?;

    let (sym, addend) = match operands.get(1) {
        Some(Operand::Symbol(s)) => (s.clone(), 0i64),
        Some(Operand::Modifier { kind, symbol }) if kind == "got" => {
            // adrp x0, :got:symbol
            let word = (1u32 << 31) | (0b10000 << 24) | rd;
            return Ok(EncodeResult::WordWithReloc {
                word,
                reloc: Relocation {
                    reloc_type: RelocType::AdrGotPage21,
                    symbol: symbol.clone(),
                    addend: 0,
                },
            });
        }
        Some(Operand::SymbolOffset(s, off)) => (s.clone(), *off),
        Some(Operand::Label(s)) => (s.clone(), 0i64),
        // Parser misclassifies symbol names that collide with register names (s1, v0, d1, etc.),
        // condition codes (cc, lt, le), or barrier names (st, ld).
        // ADRP never takes these as actual operand types, so treat them as symbols.
        Some(Operand::Reg(name)) => (name.clone(), 0i64),
        Some(Operand::Cond(name)) => (name.clone(), 0i64),
        Some(Operand::Barrier(name)) => (name.clone(), 0i64),
        _ => return Err(format!("adrp needs symbol operand, got {:?}", operands.get(1))),
    };

    // ADRP: 1 immlo[1:0] 10000 immhi[18:0] Rd
    let word = (1u32 << 31) | (0b10000 << 24) | rd;
    Ok(EncodeResult::WordWithReloc {
        word,
        reloc: Relocation {
            reloc_type: RelocType::AdrpPage21,
            symbol: sym,
            addend,
        },
    })
}

/// Encode ADR (form PC-relative address).
///
/// Immediate operands are encoded directly; symbol operands emit a zeroed
/// immediate plus an ADR_PREL_LO21 relocation.
pub(crate) fn encode_adr(operands: &[Operand]) -> Result {
    let (rd, _) = get_reg(operands, 0)?;

    // Check for immediate offset form: adr Rd, #imm
    // TODO: validate 21-bit signed immediate range
    if let Some(Operand::Imm(imm)) = operands.get(1) {
        let imm = *imm;
        // ADR: 0 immlo[1:0] 10000 immhi[18:0] Rd
        let immlo = ((imm as u32) & 3) << 29;
        let immhi = (((imm as u32) >> 2) & 0x7FFFF) << 5;
        let word = immlo | (0b10000 << 24) | immhi | rd;
        return Ok(EncodeResult::Word(word));
    }

    let (sym, addend) = get_symbol(operands, 1)?;
    // ADR: 0 immlo[1:0] 10000 immhi[18:0] Rd
    let word = (0b10000 << 24) | rd;
    Ok(EncodeResult::WordWithReloc {
        word,
        reloc: Relocation {
            reloc_type: RelocType::AdrPrelLo21,
            symbol: sym,
            addend,
        },
    })
}

// ── Prefetch ─────────────────────────────────────────────────────────────

/// Encode the PRFM (prefetch memory) instruction.
/// Format: PRFM <prfop>, [<Xn|SP>{, #<imm>}]
/// Encoding: 1111 1001 10 imm12 Rn Rt
/// where Rt is the 5-bit prefetch operation type.
-pub(crate) fn encode_prfm(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("prfm requires 2 operands".to_string()); - } - - // First operand: prefetch operation type (parsed as Symbol) - let prfop = match &operands[0] { - Operand::Symbol(s) => encode_prfop(s)?, - Operand::Imm(v) => { - if *v < 0 || *v > 31 { - return Err(format!("prfm: immediate prefetch type out of range: {}", v)); - } - *v as u32 - } - _ => return Err(format!("prfm: expected prefetch operation name, got {:?}", operands[0])), - }; - - // Second operand: memory address [Xn{, #imm}] - match &operands[1] { - Operand::Mem { base, offset } => { - let rn = parse_reg_num(base).ok_or_else(|| format!("prfm: invalid base register: {}", base))?; - let imm = *offset; - if imm < 0 || imm % 8 != 0 { - return Err(format!("prfm: offset must be non-negative and 8-byte aligned, got {}", imm)); - } - let imm12 = (imm / 8) as u32; - if imm12 > 0xFFF { - return Err(format!("prfm: offset too large: {}", imm)); - } - // PRFM (imm): 1111 1001 10 imm12(12) Rn(5) Rt(5) - let word = 0xF9800000 | (imm12 << 10) | (rn << 5) | prfop; - Ok(EncodeResult::Word(word)) - } - Operand::Symbol(_sym) => { - // PRFM (literal) with symbol reference is not yet supported - Err("prfm with symbol/label operand not yet supported".to_string()) - } - Operand::MemRegOffset { base, index, extend, shift } => { - // PRFM (register): 11 111 0 00 10 1 Rm option S 10 Rn Rt - let rn = parse_reg_num(base).ok_or_else(|| format!("prfm: invalid base register: {}", base))?; - let rm = parse_reg_num(index).ok_or_else(|| format!("prfm: invalid index register: {}", index))?; - let is_w_index = index.starts_with('w') || index.starts_with('W'); - let shift_amount: u8 = match shift { Some(s) => *s, None => 0 }; - let (option, s_bit) = match extend.as_deref() { - Some("lsl") => (0b011u32, if shift_amount > 0 { 1u32 } else { 0 }), - Some("sxtw") => (0b110u32, if shift_amount > 0 { 1u32 } else { 0 }), - Some("sxtx") => (0b111u32, if 
shift_amount > 0 { 1u32 } else { 0 }), - Some("uxtw") => (0b010u32, if shift_amount > 0 { 1u32 } else { 0 }), - None => if is_w_index { (0b010u32, 0u32) } else { (0b011u32, 0u32) }, - _ => (0b011u32, 0u32), - }; - let word = (0b11 << 30) | (0b111 << 27) | (0b10 << 23) | (1 << 21) - | (rm << 16) | (option << 13) | (s_bit << 12) | (0b10 << 10) | (rn << 5) | prfop; - Ok(EncodeResult::Word(word)) - } - _ => Err(format!("prfm: expected memory operand, got {:?}", operands[1])), - } -} - -/// Map prefetch operation name to its 5-bit encoding. -pub(crate) fn encode_prfop(name: &str) -> Result { - match name.to_lowercase().as_str() { - "pldl1keep" => Ok(0b00000), - "pldl1strm" => Ok(0b00001), - "pldl2keep" => Ok(0b00010), - "pldl2strm" => Ok(0b00011), - "pldl3keep" => Ok(0b00100), - "pldl3strm" => Ok(0b00101), - "plil1keep" => Ok(0b01000), - "plil1strm" => Ok(0b01001), - "plil2keep" => Ok(0b01010), - "plil2strm" => Ok(0b01011), - "plil3keep" => Ok(0b01100), - "plil3strm" => Ok(0b01101), - "pstl1keep" => Ok(0b10000), - "pstl1strm" => Ok(0b10001), - "pstl2keep" => Ok(0b10010), - "pstl2strm" => Ok(0b10011), - "pstl3keep" => Ok(0b10100), - "pstl3strm" => Ok(0b10101), - _ => Err(format!("prfm: unknown prefetch operation: {}", name)), - } -} - -// ── LSE Atomics ────────────────────────────────────────────────────────── - -/// Encode CAS/CASA/CASAL/CASL and byte/halfword variants (Compare and Swap). -/// CAS Xs, Xt, [Xn]: size 001000 1 L 1 Rs o0 11111 Rn Rt -pub(crate) fn encode_cas(mnemonic: &str, operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err(format!("{} requires 3 operands", mnemonic)); - } - let (rs, is_64) = get_reg(operands, 0)?; - let (rt, _) = get_reg(operands, 1)?; - let rn = match operands.get(2) { - Some(Operand::Mem { base, .. 
}) => parse_reg_num(base).ok_or("cas: invalid base")?, - _ => return Err("cas requires memory operand [Xn]".to_string()), - }; - let mn = mnemonic.to_lowercase(); - let suffix = mn.strip_prefix("cas").unwrap_or(""); - // Determine size: 'b' suffix = byte (00), 'h' suffix = half (01), else register-based - let size = if suffix.contains('b') { - 0b00u32 - } else if suffix.contains('h') { - 0b01u32 - } else if is_64 { - 0b11u32 - } else { - 0b10u32 - }; - // L bit (acquire): set for casa, casal - let l = if suffix.contains('a') { 1u32 } else { 0u32 }; - // o0 bit (release): set for casl, casal - let o0 = if suffix.contains('l') { 1u32 } else { 0u32 }; - // size 001000 1 L 1 Rs o0 11111 Rn Rt - let word = (size << 30) | (0b001000 << 24) | (1 << 23) | (l << 22) | (1 << 21) - | (rs << 16) | (o0 << 15) | (0b11111 << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode SWP/SWPA/SWPAL/SWPL and byte/halfword variants (Swap). -/// SWP Xs, Xt, [Xn]: size 111000 AR 1 Rs 1 000 00 Rn Rt -/// Variants: swp, swpa, swpal, swpl, swpb, swpab, swpalb, swplb, swph, swpah, swpalh, swplh -pub(crate) fn encode_swp(mnemonic: &str, operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err(format!("{} requires 3 operands", mnemonic)); - } - let (rs, is_64) = get_reg(operands, 0)?; - let (rt, _) = get_reg(operands, 1)?; - let rn = match operands.get(2) { - Some(Operand::Mem { base, .. 
}) => parse_reg_num(base).ok_or("swp: invalid base")?, - _ => return Err("swp requires memory operand [Xn]".to_string()), - }; - let mn = mnemonic.to_lowercase(); - let suffix = mn.strip_prefix("swp").unwrap_or(""); - // Determine size: 'b' suffix = byte (00), 'h' suffix = half (01), else register-based - let size = if suffix.contains('b') { - 0b00u32 - } else if suffix.contains('h') { - 0b01u32 - } else if is_64 { - 0b11u32 - } else { - 0b10u32 - }; - let a = if suffix.contains('a') { 1u32 } else { 0u32 }; - let r = if suffix.contains('l') { 1u32 } else { 0u32 }; - // size 111000 A R 1 Rs 1 000 00 Rn Rt - let word = (size << 30) | (0b111000 << 24) | (a << 23) | (r << 22) | (1 << 21) - | (rs << 16) | (1 << 15) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode LDADD/LDCLR/LDEOR/LDSET and their acquire/release/byte/halfword variants (LSE atomics). -/// LDADD Rs, Rt, [Xn]: size 111000 A R 1 Rs 0 opc 00 Rn Rt -/// opc: LDADD=000, LDCLR=001, LDEOR=010, LDSET=011 -pub(crate) fn encode_ldop(mnemonic: &str, operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err(format!("{} requires 3 operands", mnemonic)); - } - let (rs, is_64) = get_reg(operands, 0)?; - let (rt, _) = get_reg(operands, 1)?; - let rn = match operands.get(2) { - Some(Operand::Mem { base, .. 
}) => parse_reg_num(base).ok_or("ldop: invalid base")?, - _ => return Err(format!("{} requires memory operand [Xn]", mnemonic)), - }; - let mn = mnemonic.to_lowercase(); - // Determine base op and suffix - let (base, suffix) = if let Some(s) = mn.strip_prefix("ldadd") { - (0b000u32, s) - } else if let Some(s) = mn.strip_prefix("ldclr") { - (0b001u32, s) - } else if let Some(s) = mn.strip_prefix("ldeor") { - (0b010u32, s) - } else if let Some(s) = mn.strip_prefix("ldset") { - (0b011u32, s) - } else { - return Err(format!("unknown ld atomic op: {}", mnemonic)); - }; - // Determine size: 'b' suffix = byte (00), 'h' suffix = half (01), else register-based - let size = if suffix.contains('b') { - 0b00u32 - } else if suffix.contains('h') { - 0b01u32 - } else if is_64 { - 0b11u32 - } else { - 0b10u32 - }; - let a = if suffix.contains('a') { 1u32 } else { 0u32 }; - let r = if suffix.contains('l') { 1u32 } else { 0u32 }; - // size 111000 A R 1 Rs 0 opc 00 Rn Rt - let word = (size << 30) | (0b111000 << 24) | (a << 23) | (r << 22) | (1 << 21) - | (rs << 16) | (base << 12) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode STADD/STCLR/STEOR/STSET and their release/byte/halfword variants. -/// These are aliases for LDADD/LDCLR/LDEOR/LDSET with Rt=XZR (register 31). -/// STADD Ws, [Xn] encodes as LDADD Ws, WZR, [Xn] -/// Variants: stadd/stclr/steor/stset, plus 'l' (release), 'b' (byte), 'h' (half). -pub(crate) fn encode_stop(mnemonic: &str, operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - let (rs, is_64) = get_reg(operands, 0)?; - let rn = match operands.get(1) { - Some(Operand::Mem { base, .. 
}) => parse_reg_num(base).ok_or_else(|| format!("{}: invalid base", mnemonic))?, - _ => return Err(format!("{} requires memory operand [Xn]", mnemonic)), - }; - let mn = mnemonic.to_lowercase(); - // Determine base op from the prefix - let (opc, suffix) = if let Some(s) = mn.strip_prefix("stadd") { - (0b000u32, s) - } else if let Some(s) = mn.strip_prefix("stclr") { - (0b001u32, s) - } else if let Some(s) = mn.strip_prefix("steor") { - (0b010u32, s) - } else if let Some(s) = mn.strip_prefix("stset") { - (0b011u32, s) - } else { - return Err(format!("unknown st atomic op: {}", mnemonic)); - }; - // Determine size: 'b' suffix = byte (00), 'h' suffix = half (01), else register-based - let size = if suffix.contains('b') { - 0b00u32 - } else if suffix.contains('h') { - 0b01u32 - } else if is_64 { - 0b11u32 - } else { - 0b10u32 - }; - // A=0 (no acquire for store aliases), R from 'l' suffix (release) - let r = if suffix.contains('l') { 1u32 } else { 0u32 }; - let rt = 31u32; // XZR/WZR - discard result - // size 111000 A R 1 Rs 0 opc 00 Rn Rt - let word = (size << 30) | (0b111000 << 24) | (r << 22) | (1 << 21) - | (rs << 16) | (opc << 12) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) -} diff --git a/src/backend/arm/assembler/encoder/mod.rs b/src/backend/arm/assembler/encoder/mod.rs deleted file mode 100644 index 3b54d6cf49..0000000000 --- a/src/backend/arm/assembler/encoder/mod.rs +++ /dev/null @@ -1,993 +0,0 @@ -//! AArch64 instruction encoder. -//! -//! Encodes AArch64 instructions into 32-bit machine code words. -//! This covers the subset of instructions emitted by our codegen. -//! -//! AArch64 instructions are always 4 bytes (32 bits), little-endian. -//! The encoding format varies by instruction class. - -// Encoding helpers for all AArch64 instruction formats; not all formats used yet. 
#![allow(dead_code)]

use super::parser::Operand;

// One sub-module per AArch64 encoding class.
mod data_processing;
mod compare_branch;
mod load_store;
mod fp_scalar;
mod system;
mod bitfield;
mod neon;

// Re-export all encoders flat so callers use `encoder::encode_xxx`.
pub(crate) use data_processing::*;
pub(crate) use compare_branch::*;
pub(crate) use load_store::*;
pub(crate) use fp_scalar::*;
pub(crate) use system::*;
pub(crate) use bitfield::*;
pub(crate) use neon::*;

/// Result of encoding an instruction.
#[derive(Debug, Clone)]
pub enum EncodeResult {
    /// Successfully encoded as a 4-byte instruction word
    Word(u32),
    /// Instruction needs a relocation to be applied later
    WordWithReloc {
        word: u32,
        reloc: Relocation,
    },
    /// Multiple encoded words (e.g., movz+movk sequence)
    // NOTE(review): generic argument appears stripped by extraction —
    // presumably `Vec<u32>`. TODO confirm.
    Words(Vec),
    /// Skip this instruction (e.g., pseudo-instruction handled elsewhere)
    Skip,
}

/// Relocation types for AArch64 ELF
#[derive(Debug, Clone)]
pub enum RelocType {
    /// R_AARCH64_CALL26 - for BL instruction (26-bit PC-relative)
    Call26,
    /// R_AARCH64_JUMP26 - for B instruction (26-bit PC-relative)
    Jump26,
    /// R_AARCH64_ADR_PREL_PG_HI21 - for ADRP (page-relative, bits [32:12])
    AdrpPage21,
    /// R_AARCH64_ADD_ABS_LO12_NC - for ADD :lo12: (low 12 bits)
    AddAbsLo12,
    /// R_AARCH64_LDST8_ABS_LO12_NC
    Ldst8AbsLo12,
    /// R_AARCH64_LDST16_ABS_LO12_NC
    Ldst16AbsLo12,
    /// R_AARCH64_LDST32_ABS_LO12_NC
    Ldst32AbsLo12,
    /// R_AARCH64_LDST64_ABS_LO12_NC
    Ldst64AbsLo12,
    /// R_AARCH64_LDST128_ABS_LO12_NC
    Ldst128AbsLo12,
    /// R_AARCH64_ADR_GOT_PAGE21 - GOT-relative ADRP
    AdrGotPage21,
    /// R_AARCH64_LD64_GOT_LO12_NC - GOT entry LDR
    Ld64GotLo12,
    /// R_AARCH64_TLSLE_ADD_TPREL_HI12
    TlsLeAddTprelHi12,
    /// R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
    TlsLeAddTprelLo12,
    /// R_AARCH64_CONDBR19 - conditional branch, 19-bit offset
    CondBr19,
    /// R_AARCH64_TSTBR14 - test-and-branch, 14-bit offset
    TstBr14,
    /// R_AARCH64_ADR_PREL_LO21 - for ADR (21-bit PC-relative)
    AdrPrelLo21,
    /// R_AARCH64_ABS64 - 64-bit absolute
    Abs64,
    /// R_AARCH64_ABS32 - 32-bit absolute
    Abs32,
    /// R_AARCH64_PREL32 - 32-bit PC-relative
    Prel32,
    /// R_AARCH64_PREL64 - 64-bit PC-relative
    Prel64,
    /// R_AARCH64_LD_PREL_LO19 - LDR literal, 19-bit PC-relative
    Ldr19,
}

impl RelocType {
    /// Get the ELF relocation type number
    pub fn elf_type(&self) -> u32 {
        match self {
            RelocType::Abs64 => 257, // R_AARCH64_ABS64
            RelocType::Abs32 => 258, // R_AARCH64_ABS32
            RelocType::Prel32 => 261, // R_AARCH64_PREL32
            RelocType::Prel64 => 260, // R_AARCH64_PREL64
            RelocType::Call26 => 283, // R_AARCH64_CALL26
            RelocType::Jump26 => 282, // R_AARCH64_JUMP26
            RelocType::AdrPrelLo21 => 274, // R_AARCH64_ADR_PREL_LO21
            RelocType::AdrpPage21 => 275, // R_AARCH64_ADR_PREL_PG_HI21
            RelocType::AddAbsLo12 => 277, // R_AARCH64_ADD_ABS_LO12_NC
            RelocType::Ldst8AbsLo12 => 278, // R_AARCH64_LDST8_ABS_LO12_NC
            RelocType::Ldst16AbsLo12 => 284, // R_AARCH64_LDST16_ABS_LO12_NC
            RelocType::Ldst32AbsLo12 => 285, // R_AARCH64_LDST32_ABS_LO12_NC
            RelocType::Ldst64AbsLo12 => 286, // R_AARCH64_LDST64_ABS_LO12_NC
            RelocType::Ldst128AbsLo12 => 299, // R_AARCH64_LDST128_ABS_LO12_NC
            RelocType::AdrGotPage21 => 311, // R_AARCH64_ADR_GOT_PAGE21
            RelocType::Ld64GotLo12 => 312, // R_AARCH64_LD64_GOT_LO12_NC
            RelocType::TlsLeAddTprelHi12 => 549, // R_AARCH64_TLSLE_ADD_TPREL_HI12
            RelocType::TlsLeAddTprelLo12 => 551, // R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
            RelocType::CondBr19 => 280, // R_AARCH64_CONDBR19
            RelocType::TstBr14 => 279, // R_AARCH64_TSTBR14
            RelocType::Ldr19 => 273, // R_AARCH64_LD_PREL_LO19
        }
    }
}

/// A relocation to be applied.
#[derive(Debug, Clone)]
pub struct Relocation {
    pub reloc_type: RelocType,
    pub symbol: String,
    pub addend: i64,
}

/// Parse a register name to its 5-bit encoding number (0-30, 31 for sp/zr).
pub fn parse_reg_num(name: &str) -> Option {
    let name = name.to_lowercase();
    match name.as_str() {
        // sp/wsp and xzr/wzr share encoding 31; context disambiguates.
        "sp" | "wsp" => Some(31),
        "xzr" | "wzr" => Some(31),
        "lr" => Some(30),
        _ => {
            // Any register class prefix followed by a number 0..=31.
            let prefix = name.chars().next()?;
            match prefix {
                'x' | 'w' | 'd' | 's' | 'q' | 'v' | 'h' | 'b' => {
                    let num: u32 = name[1..].parse().ok()?;
                    if num <= 31 { Some(num) } else { None }
                }
                _ => None,
            }
        }
    }
}

/// Check if a register name is a 64-bit (X) register or SP.
fn is_64bit_reg(name: &str) -> bool {
    let name = name.to_lowercase();
    name.starts_with('x') || name == "sp" || name == "xzr" || name == "lr"
}

/// Check if a register name is a 32-bit (W) register.
fn is_32bit_reg(name: &str) -> bool {
    let name = name.to_lowercase();
    name.starts_with('w') || name == "wsp" || name == "wzr"
}

/// Check if a register is a floating-point/SIMD register.
fn is_fp_reg(name: &str) -> bool {
    // First character selects the class: b/h/s/d/q scalar widths, v vector.
    let c = name.chars().next().unwrap_or(' ').to_ascii_lowercase();
    matches!(c, 'd' | 's' | 'q' | 'v' | 'h' | 'b')
}

/// Encode a condition code string to 4-bit encoding.
fn encode_cond(cond: &str) -> Option {
    match cond.to_lowercase().as_str() {
        "eq" => Some(0),
        "ne" => Some(1),
        "cs" | "hs" => Some(2),
        "cc" | "lo" => Some(3),
        "mi" => Some(4),
        "pl" => Some(5),
        "vs" => Some(6),
        "vc" => Some(7),
        "hi" => Some(8),
        "ls" => Some(9),
        "ge" => Some(10),
        "lt" => Some(11),
        "gt" => Some(12),
        "le" => Some(13),
        "al" => Some(14),
        "nv" => Some(15),
        _ => None,
    }
}

/// Encode an AArch64 instruction from its mnemonic and parsed operands.
pub fn encode_instruction(mnemonic: &str, operands: &[Operand], raw_operands: &str) -> Result {
    let mn = mnemonic.to_lowercase();

    // Handle condition-code suffixed branches: b.eq, b.ne, b.lt, etc.
    if let Some(cond) = mn.strip_prefix("b.") {
        return encode_cond_branch(cond, operands);
    }

    // Handle condition-code branches without the dot: beq, bne, bge, blt, etc.
- // These are common aliases used in GNU assembler syntax. - { - let cond_aliases: &[(&str, &str)] = &[ - ("beq", "eq"), ("bne", "ne"), ("bcs", "cs"), ("bhs", "hs"), - ("bcc", "cc"), ("blo", "lo"), ("bmi", "mi"), ("bpl", "pl"), - ("bvs", "vs"), ("bvc", "vc"), ("bhi", "hi"), ("bls", "ls"), - ("bge", "ge"), ("blt", "lt"), ("bgt", "gt"), ("ble", "le"), - ("bal", "al"), - ]; - for &(alias, cond) in cond_aliases { - if mn == alias { - return encode_cond_branch(cond, operands); - } - } - } - - match mn.as_str() { - // Data processing - register - "mov" => encode_mov(operands), - "movz" => encode_movz(operands), - "movk" => encode_movk(operands), - "movn" => encode_movn(operands), - "add" => if is_neon_scalar_d_reg_op(operands) { - encode_neon_scalar_three_same(operands, 0, 0b10000, 0b11) - } else { encode_add_sub(operands, false, false) }, - "adds" => encode_add_sub(operands, false, true), - "sub" => if is_neon_scalar_d_reg_op(operands) { - encode_neon_scalar_three_same(operands, 1, 0b10000, 0b11) - } else { encode_add_sub(operands, true, false) }, - "subs" => encode_add_sub(operands, true, true), - "and" => encode_logical(operands, 0b00), - "orr" => encode_logical(operands, 0b01), - "eor" => encode_logical(operands, 0b10), - "ands" => encode_logical(operands, 0b11), - "orn" => encode_orn(operands), - "eon" => encode_eon(operands), - "bics" => encode_bics(operands), - "mul" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem(operands, 0, 0b1000) - } else { - encode_neon_three_same(operands, 0, 0b10011) - } - } else { encode_mul(operands) }, - "madd" => encode_madd(operands), - "msub" => encode_msub(operands), - "smull" => { - if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - if matches!(operands.get(2), Some(Operand::RegLane { .. 
})) { - encode_neon_elem_long(operands, 0, 0b1010, false) // SMULL (by element) - } else { - encode_neon_three_diff(operands, 0, 0b1100, false) // SMULL (vector) - } - } else { - encode_smull(operands) // SMULL (scalar) - } - } - "umull" => { - if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 1, 0b1010, false) // UMULL (by element) - } else { - encode_neon_three_diff(operands, 1, 0b1100, false) // UMULL (vector) - } - } else { - encode_umull(operands) // UMULL (scalar) - } - } - "smaddl" => encode_smaddl(operands), - "umaddl" => encode_umaddl(operands), - "mneg" => encode_mneg(operands), - "udiv" => encode_div(operands, true), - "sdiv" => encode_div(operands, false), - "umulh" => encode_umulh(operands), - "smulh" => encode_smulh(operands), - "neg" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_two_misc(operands, 1, 0b01011) - } else { encode_neg(operands) }, - "negs" => encode_negs(operands), - "mvn" => encode_mvn(operands), - "adc" => encode_adc(operands, false), - "adcs" => encode_adc(operands, true), - "sbc" => encode_sbc(operands, false), - "sbcs" => encode_sbc(operands, true), - - // Shifts - "lsl" => encode_shift(operands, 0b00), - "lsr" => encode_shift(operands, 0b01), - "asr" => encode_shift(operands, 0b10), - "ror" => encode_shift(operands, 0b11), - - // Extensions - "sxtw" => encode_sxtw(operands), - "sxth" => encode_sxth(operands), - "sxtb" => encode_sxtb(operands), - "uxtw" => encode_uxtw(operands), - "uxth" => encode_uxth(operands), - "uxtb" => encode_uxtb(operands), - - // Compare - "cmp" => encode_cmp(operands), - "cmn" => encode_cmn(operands), - "tst" => encode_tst(operands), - "ccmp" => encode_ccmp_ccmn(operands, true), - "ccmn" => encode_ccmp_ccmn(operands, false), - - // Conditional select - "csel" => encode_csel(operands), - "csinc" => encode_csinc(operands), - "csinv" => 
encode_csinv(operands), - "csneg" => encode_csneg(operands), - "cset" => encode_cset(operands), - "csetm" => encode_csetm(operands), - - // Branches - "b" => encode_branch(operands), - "bl" => encode_bl(operands), - "br" => encode_br(operands), - "blr" => encode_blr(operands), - "ret" => encode_ret(operands), - "cbz" => encode_cbz(operands, false), - "cbnz" => encode_cbz(operands, true), - "tbz" => encode_tbz(operands, false), - "tbnz" => encode_tbz(operands, true), - - // Loads/stores - size determined from register width - "ldr" => encode_ldr_str_auto(operands, true), - "str" => encode_ldr_str_auto(operands, false), - "ldrb" => encode_ldr_str(operands, true, 0b00, false, false), // byte load - "strb" => encode_ldr_str(operands, false, 0b00, false, false), - "ldrh" => encode_ldr_str(operands, true, 0b01, false, false), // halfword load - "strh" => encode_ldr_str(operands, false, 0b01, false, false), - "ldrw" | "ldrsw" => encode_ldrsw(operands), - "ldrsb" => encode_ldrs(operands, 0b00), - "ldrsh" => encode_ldrs(operands, 0b01), - "ldur" => encode_ldur_stur(operands, true, 0b00), - "stur" => encode_ldur_stur(operands, false, 0b00), - "ldtr" => encode_ldur_stur(operands, true, 0b10), - "sttr" => encode_ldur_stur(operands, false, 0b10), - "ldtrh" => encode_ldtr_sized(operands, true, 0b01), - "sttrh" => encode_ldtr_sized(operands, false, 0b01), - "ldtrb" => encode_ldtr_sized(operands, true, 0b00), - "sttrb" => encode_ldtr_sized(operands, false, 0b00), - "ldp" => encode_ldp_stp(operands, true), - "stp" => encode_ldp_stp(operands, false), - "ldnp" => encode_ldnp_stnp(operands, true), - "stnp" => encode_ldnp_stnp(operands, false), - "ldxr" => encode_ldxr_stxr(operands, true, None), - "stxr" => encode_ldxr_stxr(operands, false, None), - "ldxrb" => encode_ldxr_stxr(operands, true, Some(0b00)), - "stxrb" => encode_ldxr_stxr(operands, false, Some(0b00)), - "ldxrh" => encode_ldxr_stxr(operands, true, Some(0b01)), - "stxrh" => encode_ldxr_stxr(operands, false, Some(0b01)), - 
"ldaxr" => encode_ldaxr_stlxr(operands, true, None), - "stlxr" => encode_ldaxr_stlxr(operands, false, None), - "ldaxrb" => encode_ldaxr_stlxr(operands, true, Some(0b00)), - "stlxrb" => encode_ldaxr_stlxr(operands, false, Some(0b00)), - "ldaxrh" => encode_ldaxr_stlxr(operands, true, Some(0b01)), - "stlxrh" => encode_ldaxr_stlxr(operands, false, Some(0b01)), - "ldar" => encode_ldar_stlr(operands, true, None), - "stlr" => encode_ldar_stlr(operands, false, None), - "ldarb" => encode_ldar_stlr(operands, true, Some(0b00)), - "stlrb" => encode_ldar_stlr(operands, false, Some(0b00)), - "ldarh" => encode_ldar_stlr(operands, true, Some(0b01)), - "stlrh" => encode_ldar_stlr(operands, false, Some(0b01)), - "ldxp" => encode_ldxp_stxp(operands, true, false), - "ldaxp" => encode_ldxp_stxp(operands, true, true), - "stxp" => encode_ldxp_stxp(operands, false, false), - "stlxp" => encode_ldxp_stxp(operands, false, true), - - // Address computation - "adrp" => encode_adrp(operands), - "adr" => encode_adr(operands), - - // Floating point (scalar or vector based on operand type) - "fmov" => encode_fmov(operands), - "fadd" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_three_same(operands, 0, 0, 0b11010) - } else { encode_fp_arith(operands, 0b0010) }, - "fsub" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_three_same(operands, 0, 1, 0b11010) - } else { encode_fp_arith(operands, 0b0011) }, - "fmul" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_float_elem(operands, 1, 0b1001) - } else { - encode_neon_float_three_same(operands, 1, 0, 0b11011) - } - } else { encode_fp_arith(operands, 0b0000) }, - "fdiv" => if matches!(operands.first(), Some(Operand::RegArrangement { .. 
})) { - encode_neon_float_three_same(operands, 1, 0, 0b11111) - } else { encode_fp_arith(operands, 0b0001) }, - "fmax" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_three_same(operands, 0, 0, 0b11110) - } else { encode_fp_arith(operands, 0b0100) }, - "fmin" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_three_same(operands, 0, 1, 0b11110) - } else { encode_fp_arith(operands, 0b0101) }, - "fmaxnm" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_three_same(operands, 0, 0, 0b11000) - } else { encode_fp_arith(operands, 0b0110) }, - "fminnm" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_three_same(operands, 0, 1, 0b11000) - } else { encode_fp_arith(operands, 0b0111) }, - "fneg" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 1, 0, 0b01111) - } else { encode_fneg(operands) }, - "fabs" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 0, 1, 0b01111) - } else { encode_fabs(operands) }, - "fsqrt" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 1, 1, 0b11111) - } else { encode_fsqrt(operands) }, - "frintn" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 0, 0, 0b11000) - } else { encode_fp_1src(operands, 0b001000) }, - "frintp" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 0, 1, 0b11000) - } else { encode_fp_1src(operands, 0b001001) }, - "frintm" => if matches!(operands.first(), Some(Operand::RegArrangement { .. 
})) { - encode_neon_float_two_misc(operands, 0, 0, 0b11001) - } else { encode_fp_1src(operands, 0b001010) }, - "frintz" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 0, 1, 0b11001) - } else { encode_fp_1src(operands, 0b001011) }, - "frinta" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 1, 0, 0b11000) - } else { encode_fp_1src(operands, 0b001100) }, - "frintx" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 1, 0, 0b11001) - } else { encode_fp_1src(operands, 0b001110) }, - "frinti" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 1, 1, 0b11001) - } else { encode_fp_1src(operands, 0b001111) }, - "fmadd" => encode_fmadd_fmsub(operands, false), - "fmsub" => encode_fmadd_fmsub(operands, true), - "fnmadd" => encode_fnmadd_fnmsub(operands, false), - "fnmsub" => encode_fnmadd_fnmsub(operands, true), - "fcmp" => encode_fcmp(operands), - "fcvtzs" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 0, 1, 0b11011) - } else { encode_fcvt_rounding(operands, 0b11, 0b000) }, - "fcvtzu" => if matches!(operands.first(), Some(Operand::RegArrangement { .. 
})) { - encode_neon_float_two_misc(operands, 1, 1, 0b11011) - } else { encode_fcvt_rounding(operands, 0b11, 0b001) }, - "fcvtas" => encode_fcvt_rounding(operands, 0b00, 0b100), - "fcvtau" => encode_fcvt_rounding(operands, 0b00, 0b101), - "fcvtns" => encode_fcvt_rounding(operands, 0b00, 0b000), - "fcvtnu" => encode_fcvt_rounding(operands, 0b00, 0b001), - "fcvtms" => encode_fcvt_rounding(operands, 0b10, 0b000), - "fcvtmu" => encode_fcvt_rounding(operands, 0b10, 0b001), - "fcvtps" => encode_fcvt_rounding(operands, 0b01, 0b000), - "fcvtpu" => encode_fcvt_rounding(operands, 0b01, 0b001), - "ucvtf" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 1, 0, 0b11101) - } else { encode_ucvtf(operands) }, - "scvtf" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_float_two_misc(operands, 0, 0, 0b11101) - } else { encode_scvtf(operands) }, - "fcvt" => encode_fcvt_precision(operands), - "fcvtl" => encode_neon_fcvtl(operands, false), - "fcvtl2" => encode_neon_fcvtl(operands, true), - "fcvtn" => encode_neon_fcvtn(operands, false), - "fcvtn2" => encode_neon_fcvtn(operands, true), - // NEON float three-same instructions (vector-only) - "fmla" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_float_elem(operands, 0, 0b0001) - } else { - encode_neon_float_three_same(operands, 0, 0, 0b11001) - }, - "fmls" => if matches!(operands.get(2), Some(Operand::RegLane { .. 
})) { - encode_neon_float_elem(operands, 0, 0b0101) - } else { - encode_neon_float_three_same(operands, 0, 1, 0b11001) - }, - "frecps" => encode_neon_float_three_same(operands, 0, 0, 0b11111), - "frsqrts" => encode_neon_float_three_same(operands, 0, 1, 0b11111), - "fcmeq" => if matches!(operands.get(2), Some(Operand::Imm(0))) { - encode_neon_float_cmp_zero(operands, 0, 0, 0b01101) - } else { - encode_neon_float_three_same(operands, 0, 0, 0b11100) - }, - "fcmge" => if matches!(operands.get(2), Some(Operand::Imm(0))) { - encode_neon_float_cmp_zero(operands, 1, 0, 0b01100) - } else { - encode_neon_float_three_same(operands, 1, 0, 0b11100) - }, - "fcmgt" => if matches!(operands.get(2), Some(Operand::Imm(0))) { - encode_neon_float_cmp_zero(operands, 0, 1, 0b01100) - } else { - encode_neon_float_three_same(operands, 1, 1, 0b11100) - }, - "fcmle" => encode_neon_float_cmp_zero(operands, 1, 0, 0b01101), - "fcmlt" => encode_neon_float_cmp_zero(operands, 0, 1, 0b01101), - "facge" => encode_neon_float_three_same(operands, 1, 0, 0b11101), - "facgt" => encode_neon_float_three_same(operands, 1, 1, 0b11101), - - // NEON/SIMD - "cnt" => encode_cnt(operands), - "cmeq" => { - // CMEQ has two forms: - // - CMEQ Vd, Vn, Vm (three-same, U=1): compare registers - // - CMEQ Vd, Vn, #0 (two-reg misc, U=0): compare to zero - if matches!(operands.get(2), Some(Operand::Imm(0))) { - encode_neon_cmp_zero(operands, 0, 0b01001) - } else { - encode_neon_three_same(operands, 1, 0b10001) - } - } - "cmhi" => encode_neon_three_same(operands, 1, 0b00110), - "cmhs" => encode_neon_three_same(operands, 1, 0b00111), - "cmge" => if matches!(operands.get(2), Some(Operand::Imm(0))) { - encode_neon_cmp_zero(operands, 1, 0b01000) // CMGE #0 - } else { - encode_neon_three_same(operands, 0, 0b00111) - }, - "cmgt" => if matches!(operands.get(2), Some(Operand::Imm(0))) { - encode_neon_cmp_zero(operands, 0, 0b01000) // CMGT #0 - } else { - encode_neon_three_same(operands, 0, 0b00110) - }, - "cmtst" => 
encode_neon_three_same(operands, 0, 0b10001), - "uqsub" => encode_neon_three_same(operands, 1, 0b00101), - "sqsub" => encode_neon_three_same(operands, 0, 0b00101), - "uhadd" => encode_neon_three_same(operands, 1, 0b00000), - "shadd" => encode_neon_three_same(operands, 0, 0b00000), - "urhadd" => encode_neon_three_same(operands, 1, 0b00010), - "srhadd" => encode_neon_three_same(operands, 0, 0b00010), - "uhsub" => encode_neon_three_same(operands, 1, 0b00100), - "shsub" => encode_neon_three_same(operands, 0, 0b00100), - "umax" => encode_neon_three_same(operands, 1, 0b01100), - "smax" => encode_neon_three_same(operands, 0, 0b01100), - "umin" => encode_neon_three_same(operands, 1, 0b01101), - "smin" => encode_neon_three_same(operands, 0, 0b01101), - "uabd" => encode_neon_three_same(operands, 1, 0b01110), - "sabd" => encode_neon_three_same(operands, 0, 0b01110), - "uaba" => encode_neon_three_same(operands, 1, 0b01111), - "saba" => encode_neon_three_same(operands, 0, 0b01111), - "uqadd" => encode_neon_three_same(operands, 1, 0b00001), - "sqadd" => encode_neon_three_same(operands, 0, 0b00001), - "sshl" => encode_neon_three_same(operands, 0, 0b01000), - "ushl" => encode_neon_three_same(operands, 1, 0b01000), - "sqshl" => if matches!(operands.get(2), Some(Operand::Imm(_))) { - encode_neon_shift_left_imm(operands, 0, 0b01110) - } else { - encode_neon_three_same(operands, 0, 0b01001) - }, - "uqshl" => if matches!(operands.get(2), Some(Operand::Imm(_))) { - encode_neon_shift_left_imm(operands, 1, 0b01110) - } else { - encode_neon_three_same(operands, 1, 0b01001) - }, - "srshl" => encode_neon_three_same(operands, 0, 0b01010), - "urshl" => encode_neon_three_same(operands, 1, 0b01010), - "sqrshl" => encode_neon_three_same(operands, 0, 0b01011), - "uqrshl" => encode_neon_three_same(operands, 1, 0b01011), - "addp" => if operands.len() == 2 && matches!(operands.first(), Some(Operand::Reg(r)) if r.starts_with('d') || r.starts_with('D')) { - // Scalar ADDP: addp Dd, Vn.2d - 
encode_neon_scalar_addp(operands) - } else { - encode_neon_three_same(operands, 0, 0b10111) - }, - "uminp" => encode_neon_three_same(operands, 1, 0b10101), - "umaxp" => encode_neon_three_same(operands, 1, 0b10100), - "sminp" => encode_neon_three_same(operands, 0, 0b10101), - "smaxp" => encode_neon_three_same(operands, 0, 0b10100), - // NEON two-reg misc (integer) - "abs" => encode_neon_two_misc(operands, 0, 0b01011), - // neg dispatch moved to early scalar section - "cls" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_two_misc(operands, 0, 0b00100) - } else { encode_cls(operands) }, - "clz" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_two_misc(operands, 1, 0b00100) - } else { encode_clz(operands) }, - "rev16" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_two_misc(operands, 0, 0b00001) - } else { encode_rev16(operands) }, - "rev32" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_two_misc(operands, 1, 0b00000) - } else { encode_rev32(operands) }, - "saddlp" => encode_neon_two_misc(operands, 0, 0b00010), - "uaddlp" => encode_neon_two_misc(operands, 1, 0b00010), - "sadalp" => encode_neon_two_misc(operands, 0, 0b00110), - "uadalp" => encode_neon_two_misc(operands, 1, 0b00110), - "sqxtun" => encode_neon_two_misc_narrow(operands, 1, 0b10010, false), - "sqxtun2" => encode_neon_two_misc_narrow(operands, 1, 0b10010, true), - "sqabs" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_two_misc(operands, 0, 0b00111) - } else { - encode_neon_scalar_two_misc(operands, 0, 0b00111) - }, - "sqneg" => if matches!(operands.first(), Some(Operand::RegArrangement { .. 
})) { - encode_neon_two_misc(operands, 0, 0b01000) - } else { - encode_neon_scalar_two_misc(operands, 0, 0b01000) - }, - // Compare to zero forms - "cmlt" => encode_neon_cmp_zero(operands, 0, 0b01010), // CMLT #0 - "cmle" => encode_neon_cmp_zero(operands, 1, 0b01001), // CMLE #0 - // NEON shift right narrow - "shrn" => encode_neon_shrn(operands, 0b100001, false), - "shrn2" => encode_neon_shrn(operands, 0b100001, true), - "rshrn" => encode_neon_shrn(operands, 0b100011, false), - "rshrn2" => encode_neon_shrn(operands, 0b100011, true), - // NEON shift right accumulate and rounding shift right - "srshr" => encode_neon_shift_right(operands, 0, 0b001001), - "urshr" => encode_neon_shift_right(operands, 1, 0b001001), - "ssra" => encode_neon_shift_right(operands, 0, 0b000101), - "usra" => encode_neon_shift_right(operands, 1, 0b000101), - "srsra" => encode_neon_shift_right(operands, 0, 0b001101), - "ursra" => encode_neon_shift_right(operands, 1, 0b001101), - // NEON shift left long - "ushll" => encode_neon_shll(operands, 1, false), - "ushll2" => encode_neon_shll(operands, 1, true), - "sshll" => encode_neon_shll(operands, 0, false), - "sshll2" => encode_neon_shll(operands, 0, true), - // NEON three-different extras - "uabal" => encode_neon_three_diff(operands, 1, 0b0101, false), - "uabal2" => encode_neon_three_diff(operands, 1, 0b0101, true), - "sabal" => encode_neon_three_diff(operands, 0, 0b0101, false), - "sabal2" => encode_neon_three_diff(operands, 0, 0b0101, true), - "uabdl" => encode_neon_three_diff(operands, 1, 0b0111, false), - "uabdl2" => encode_neon_three_diff(operands, 1, 0b0111, true), - "sabdl" => encode_neon_three_diff(operands, 0, 0b0111, false), - "sabdl2" => encode_neon_three_diff(operands, 0, 0b0111, true), - // ADDHN/RADDHN/SUBHN/RSUBHN (narrowing high) - "addhn" => encode_neon_three_diff_narrow(operands, 0, 0b0100, false), - "addhn2" => encode_neon_three_diff_narrow(operands, 0, 0b0100, true), - "raddhn" => encode_neon_three_diff_narrow(operands, 1, 
0b0100, false), - "raddhn2" => encode_neon_three_diff_narrow(operands, 1, 0b0100, true), - "subhn" => encode_neon_three_diff_narrow(operands, 0, 0b0110, false), - "subhn2" => encode_neon_three_diff_narrow(operands, 0, 0b0110, true), - "rsubhn" => encode_neon_three_diff_narrow(operands, 1, 0b0110, false), - "rsubhn2" => encode_neon_three_diff_narrow(operands, 1, 0b0110, true), - // NEON sat shift right narrow - "sqshrn" => if matches!(operands.first(), Some(Operand::RegArrangement { .. })) { - encode_neon_qshrn(operands, 0, false, false) - } else { - encode_neon_scalar_qshrn(operands, 0, false) - }, - "sqshrn2" => encode_neon_qshrn(operands, 0, false, true), - "uqshrn" => encode_neon_qshrn(operands, 1, false, false), - "uqshrn2" => encode_neon_qshrn(operands, 1, false, true), - "sqrshrn" => encode_neon_qshrn(operands, 0, true, false), - "sqrshrn2" => encode_neon_qshrn(operands, 0, true, true), - "uqrshrn" => encode_neon_qshrn(operands, 1, true, false), - "uqrshrn2" => encode_neon_qshrn(operands, 1, true, true), - "sqrshrun" => encode_neon_sqshrun(operands, true, false), - "sqrshrun2" => encode_neon_sqshrun(operands, true, true), - // NEON permute: TRN1/TRN2 - "trn1" => encode_neon_zip_uzp(operands, 0b010, false), - "trn2" => encode_neon_zip_uzp(operands, 0b110, false), - // NEON replicate loads - "ld2r" => encode_neon_ldnr(operands, 2), - "ld3r" => encode_neon_ldnr(operands, 3), - "ld4r" => encode_neon_ldnr(operands, 4), - "ushr" => encode_neon_ushr(operands), - "sshr" => encode_neon_sshr(operands), - "shl" => encode_neon_shl(operands), - "sli" => encode_neon_sli(operands), - "sri" => encode_neon_sri(operands), - "ext" => encode_neon_ext(operands), - "addv" => encode_neon_addv(operands), - "umaxv" => encode_neon_across(operands, 1, 0b01010), - "uminv" => encode_neon_across(operands, 1, 0b11010), - "smaxv" => encode_neon_across(operands, 0, 0b01010), - "sminv" => encode_neon_across(operands, 0, 0b11010), - "umov" => encode_neon_umov(operands), - "dup" => 
encode_neon_dup(operands), - "ins" => encode_neon_ins(operands), - "not" => encode_neon_not(operands), - "movi" => encode_neon_movi(operands), - "bic" => encode_bic(operands), - "bsl" => encode_neon_bsl(operands), - "bit" => encode_neon_bitwise_insert(operands, 0b10), - "bif" => encode_neon_bitwise_insert(operands, 0b11), - "faddp" => encode_neon_faddp(operands), - "saddlv" => encode_neon_across_long(operands, 0, 0b00011), - "uaddlv" => encode_neon_across_long(operands, 1, 0b00011), - "sqdmlal" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0011, false) - } else { - encode_neon_three_diff(operands, 0, 0b1001, false) - }, - "sqdmlal2" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0011, true) - } else { - encode_neon_three_diff(operands, 0, 0b1001, true) - }, - "sqdmlsl" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0111, false) - } else { - encode_neon_three_diff(operands, 0, 0b1011, false) - }, - "sqdmlsl2" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0111, true) - } else { - encode_neon_three_diff(operands, 0, 0b1011, true) - }, - "sqdmull" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b1011, false) - } else { - encode_neon_three_diff(operands, 0, 0b1101, false) - }, - "sqdmull2" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b1011, true) - } else { - encode_neon_three_diff(operands, 0, 0b1101, true) - }, - "sqdmulh" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem(operands, 0, 0b1100) - } else { - encode_neon_three_same(operands, 0, 0b10110) - }, - "sqrdmulh" => if matches!(operands.get(2), Some(Operand::RegLane { .. 
})) { - encode_neon_elem(operands, 0, 0b1101) - } else { - encode_neon_three_same(operands, 1, 0b10110) - }, - "pmul" => encode_neon_pmul(operands), - "mla" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem(operands, 0, 0b0000) - } else { encode_neon_mla(operands) }, - "mls" => if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem(operands, 0, 0b0100) - } else { encode_neon_mls(operands) }, - "rev64" => encode_neon_rev64(operands), - "tbl" => encode_neon_tbl(operands), - "tbx" => encode_neon_tbx(operands), - "ld1" => encode_neon_ld_st_dispatch(operands, true, 1), - "ld1r" => encode_neon_ld1r(operands), - "ld2" => encode_neon_ld_st_dispatch(operands, true, 2), - "ld3" => encode_neon_ld_st_dispatch(operands, true, 3), - "ld4" => encode_neon_ld_st_dispatch(operands, true, 4), - "st1" => encode_neon_ld_st_dispatch(operands, false, 1), - "st2" => encode_neon_ld_st_dispatch(operands, false, 2), - "st3" => encode_neon_ld_st_dispatch(operands, false, 3), - "st4" => encode_neon_ld_st_dispatch(operands, false, 4), - "uzp1" => encode_neon_zip_uzp(operands, 0b001, false), - "uzp2" => encode_neon_zip_uzp(operands, 0b101, false), - "zip1" => encode_neon_zip_uzp(operands, 0b011, false), - "zip2" => encode_neon_zip_uzp(operands, 0b111, false), - "eor3" => encode_neon_eor3(operands), - "pmull" => encode_neon_pmull(operands, false), - "pmull2" => encode_neon_pmull(operands, true), - "aese" => encode_neon_aes(operands, 0b00100), - "aesd" => encode_neon_aes(operands, 0b00101), - "aesmc" => encode_neon_aes(operands, 0b00110), - "aesimc" => encode_neon_aes(operands, 0b00111), - - // NEON three-different (widening/narrowing) - "usubl" => encode_neon_three_diff(operands, 1, 0b0010, false), - "usubl2" => encode_neon_three_diff(operands, 1, 0b0010, true), - "ssubl" => encode_neon_three_diff(operands, 0, 0b0010, false), - "ssubl2" => encode_neon_three_diff(operands, 0, 0b0010, true), - "usubw" => encode_neon_three_diff(operands, 1, 
0b0011, false), - "usubw2" => encode_neon_three_diff(operands, 1, 0b0011, true), - "ssubw" => encode_neon_three_diff(operands, 0, 0b0011, false), - "ssubw2" => encode_neon_three_diff(operands, 0, 0b0011, true), - "uaddl" => encode_neon_three_diff(operands, 1, 0b0000, false), - "uaddl2" => encode_neon_three_diff(operands, 1, 0b0000, true), - "saddl" => encode_neon_three_diff(operands, 0, 0b0000, false), - "saddl2" => encode_neon_three_diff(operands, 0, 0b0000, true), - "uaddw" => encode_neon_three_diff(operands, 1, 0b0001, false), - "uaddw2" => encode_neon_three_diff(operands, 1, 0b0001, true), - "saddw" => encode_neon_three_diff(operands, 0, 0b0001, false), - "saddw2" => encode_neon_three_diff(operands, 0, 0b0001, true), - "umlal" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 1, 0b0010, false) - } else { - encode_neon_three_diff(operands, 1, 0b1000, false) - } - } - "umlal2" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 1, 0b0010, true) - } else { - encode_neon_three_diff(operands, 1, 0b1000, true) - } - } - "smlal" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0010, false) - } else { - encode_neon_three_diff(operands, 0, 0b1000, false) - } - } - "smlal2" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0010, true) - } else { - encode_neon_three_diff(operands, 0, 0b1000, true) - } - } - "umlsl" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 1, 0b0110, false) - } else { - encode_neon_three_diff(operands, 1, 0b1010, false) - } - } - "umlsl2" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. 
})) { - encode_neon_elem_long(operands, 1, 0b0110, true) - } else { - encode_neon_three_diff(operands, 1, 0b1010, true) - } - } - "smlsl" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0110, false) - } else { - encode_neon_three_diff(operands, 0, 0b1010, false) - } - } - "smlsl2" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b0110, true) - } else { - encode_neon_three_diff(operands, 0, 0b1010, true) - } - } - "umull2" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 1, 0b1010, true) - } else { - encode_neon_three_diff(operands, 1, 0b1100, true) - } - } - "smull2" => { - if matches!(operands.get(2), Some(Operand::RegLane { .. })) { - encode_neon_elem_long(operands, 0, 0b1010, true) - } else { - encode_neon_three_diff(operands, 0, 0b1100, true) - } - } - - // NEON saturating shift right narrow - "sqshrun" => encode_neon_sqshrun(operands, false, false), - "sqshrun2" => encode_neon_sqshrun(operands, false, true), - - // NEON extend long (aliases for USHLL/SSHLL #0) - "uxtl" => encode_neon_xtl(operands, 1, false), - "uxtl2" => encode_neon_xtl(operands, 1, true), - "sxtl" => encode_neon_xtl(operands, 0, false), - "sxtl2" => encode_neon_xtl(operands, 0, true), - - // NEON two-register narrowing - "uqxtn" => encode_neon_two_misc_narrow(operands, 1, 0b10100, false), - "uqxtn2" => encode_neon_two_misc_narrow(operands, 1, 0b10100, true), - "sqxtn" => encode_neon_two_misc_narrow(operands, 0, 0b10100, false), - "sqxtn2" => encode_neon_two_misc_narrow(operands, 0, 0b10100, true), - "xtn" => encode_neon_two_misc_narrow(operands, 0, 0b10010, false), - "xtn2" => encode_neon_two_misc_narrow(operands, 0, 0b10010, true), - - // System - "hint" => encode_hint(operands), - "bti" => encode_bti(raw_operands), - "nop" => Ok(EncodeResult::Word(0xd503201f)), - "yield" => Ok(EncodeResult::Word(0xd503203f)), - "wfe" => 
Ok(EncodeResult::Word(0xd503205f)), - "wfi" => Ok(EncodeResult::Word(0xd503207f)), - "sev" => Ok(EncodeResult::Word(0xd503209f)), - "sevl" => Ok(EncodeResult::Word(0xd50320bf)), - "eret" => Ok(EncodeResult::Word(0xd69f03e0)), - "clrex" => Ok(EncodeResult::Word(0xd503305f)), - "dc" => encode_dc(operands, raw_operands), - "tlbi" => encode_tlbi(operands, raw_operands), - "ic" => encode_ic(raw_operands), - "dmb" => encode_dmb(operands), - "dsb" => encode_dsb(operands), - "isb" => Ok(EncodeResult::Word(0xd5033fdf)), - "mrs" => encode_mrs(operands), - "msr" => encode_msr(operands), - "svc" => encode_svc(operands), - "hvc" => encode_hvc(operands), - "smc" => encode_smc(operands), - "at" => encode_at(operands, raw_operands), - "sys" => encode_sys(raw_operands), - "brk" => encode_brk(operands), - - // Bitfield extract/insert - "ubfx" => encode_ubfx(operands), - "sbfx" => encode_sbfx(operands), - "ubfm" => encode_ubfm(operands), - "sbfm" => encode_sbfm(operands), - "ubfiz" => encode_ubfiz(operands), - "sbfiz" => encode_sbfiz(operands), - "bfm" => encode_bfm(operands), - "bfi" => encode_bfi(operands), - "bfxil" => encode_bfxil(operands), - "extr" => encode_extr(operands), - - // Additional conditional operations - "cneg" => encode_cneg(operands), - "cinc" => encode_cinc(operands), - "cinv" => encode_cinv(operands), - - // Bit manipulation - "rbit" => { - // RBIT has both scalar and NEON forms - if matches!(operands.first(), Some(Operand::RegArrangement { .. 
})) { - encode_neon_rbit(operands) - } else { - encode_rbit(operands) - } - } - "rev" => encode_rev(operands), - - // CRC32 - "crc32b" | "crc32h" | "crc32w" | "crc32x" - | "crc32cb" | "crc32ch" | "crc32cw" | "crc32cx" => encode_crc32(mnemonic, operands), - - // Prefetch - "prfm" => encode_prfm(operands), - - // LSE atomics - "cas" | "casa" | "casal" | "casl" - | "casb" | "casab" | "casalb" | "caslb" - | "cash" | "casah" | "casalh" | "caslh" => encode_cas(mnemonic, operands), - "swp" | "swpa" | "swpal" | "swpl" - | "swpb" | "swpab" | "swpalb" | "swplb" - | "swph" | "swpah" | "swpalh" | "swplh" => encode_swp(mnemonic, operands), - "ldadd" | "ldadda" | "ldaddal" | "ldaddl" - | "ldaddb" | "ldaddab" | "ldaddalb" | "ldaddlb" - | "ldaddh" | "ldaddah" | "ldaddalh" | "ldaddlh" - | "ldclr" | "ldclra" | "ldclral" | "ldclrl" - | "ldclrb" | "ldclrab" | "ldclralb" | "ldclrlb" - | "ldclrh" | "ldclrah" | "ldclralh" | "ldclrlh" - | "ldeor" | "ldeora" | "ldeoral" | "ldeorl" - | "ldeorb" | "ldeorab" | "ldeoralb" | "ldeorlb" - | "ldeorh" | "ldeorah" | "ldeoralh" | "ldeorlh" - | "ldset" | "ldseta" | "ldsetal" | "ldsetl" - | "ldsetb" | "ldsetab" | "ldsetalb" | "ldsetlb" - | "ldseth" | "ldsetah" | "ldsetalh" | "ldsetlh" => encode_ldop(mnemonic, operands), - // LSE atomic store aliases (Rt=XZR, discard result) - "stadd" | "staddl" | "staddb" | "staddlb" | "staddh" | "staddlh" - | "stclr" | "stclrl" | "stclrb" | "stclrlb" | "stclrh" | "stclrlh" - | "steor" | "steorl" | "steorb" | "steorlb" | "steorh" | "steorlh" - | "stset" | "stsetl" | "stsetb" | "stsetlb" | "stseth" | "stsetlh" => encode_stop(mnemonic, operands), - - // NEON move-not-immediate - "mvni" => encode_neon_mvni(operands), - - _ => { - // TODO: handle remaining instructions - Err(format!("unsupported instruction: {} {}", mnemonic, raw_operands)) - } - } -} - -// ── Encoding helpers ────────────────────────────────────────────────────── - -fn get_reg(operands: &[Operand], idx: usize) -> Result<(u32, bool), String> { - match 
operands.get(idx) { - Some(Operand::Reg(name)) => { - let num = parse_reg_num(name) - .ok_or_else(|| format!("invalid register: {}", name))?; - let is_64 = is_64bit_reg(name); - Ok((num, is_64)) - } - other => Err(format!("expected register at operand {}, got {:?}", idx, other)), - } -} - -fn get_imm(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Imm(v)) => Ok(*v), - other => Err(format!("expected immediate at operand {}, got {:?}", idx, other)), - } -} - -fn get_symbol(operands: &[Operand], idx: usize) -> Result<(String, i64), String> { - match operands.get(idx) { - Some(Operand::Symbol(s)) => Ok((s.clone(), 0)), - Some(Operand::Label(s)) => Ok((s.clone(), 0)), - Some(Operand::SymbolOffset(s, off)) => Ok((s.clone(), *off)), - Some(Operand::Modifier { symbol, .. }) => Ok((symbol.clone(), 0)), - Some(Operand::ModifierOffset { symbol, offset, .. }) => Ok((symbol.clone(), *offset)), - // The parser misclassifies symbol names that collide with register names, - // condition codes, or barrier names. These are valid symbols in context. 
- Some(Operand::Reg(name)) => Ok((name.clone(), 0)), - Some(Operand::Cond(name)) => Ok((name.clone(), 0)), - Some(Operand::Barrier(name)) => Ok((name.clone(), 0)), - other => Err(format!("expected symbol at operand {}, got {:?}", idx, other)), - } -} - -fn sf_bit(is_64: bool) -> u32 { - if is_64 { 1 } else { 0 } -} diff --git a/src/backend/arm/assembler/encoder/neon.rs b/src/backend/arm/assembler/encoder/neon.rs deleted file mode 100644 index d4a2fc31f6..0000000000 --- a/src/backend/arm/assembler/encoder/neon.rs +++ /dev/null @@ -1,1854 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── NEON/SIMD ──────────────────────────────────────────────────────────── - -/// Helper to extract register number from a RegArrangement operand -pub(crate) fn get_neon_reg(operands: &[Operand], idx: usize) -> Result<(u32, String), String> { - match operands.get(idx) { - Some(Operand::RegArrangement { reg, arrangement }) => { - let num = parse_reg_num(reg) - .ok_or_else(|| format!("invalid NEON register: {}", reg))?; - Ok((num, arrangement.clone())) - } - Some(Operand::Reg(name)) => { - let num = parse_reg_num(name) - .ok_or_else(|| format!("invalid register: {}", name))?; - Ok((num, String::new())) - } - other => Err(format!("expected NEON register at operand {}, got {:?}", idx, other)), - } -} - -pub(crate) fn encode_cnt(operands: &[Operand]) -> Result { - // CNT Vd., Vn. 
- // Encoding: 0 Q 00 1110 size 10 0000 0101 10 Rn Rd - // Only valid for .8b (Q=0) and .16b (Q=1) - if operands.len() < 2 { - return Err("cnt requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _arr_n) = get_neon_reg(operands, 1)?; - - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; // .8b -> Q=0, .16b -> Q=1 - - // 0 Q 00 1110 00 10 0000 0101 10 Rn Rd - let word = ((q << 30) | (0b001110 << 24)) | (0b100000 << 16) - | (0b010110 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON three-same register operations ────────────────────────────────── - -/// Get Q bit and size from arrangement specifier. -pub(crate) fn neon_arr_to_q_size(arr: &str) -> Result<(u32, u32), String> { - match arr { - "8b" => Ok((0, 0b00)), - "16b" => Ok((1, 0b00)), - "4h" => Ok((0, 0b01)), - "8h" => Ok((1, 0b01)), - "2s" => Ok((0, 0b10)), - "4s" => Ok((1, 0b10)), - "1d" => Ok((0, 0b11)), - "2d" => Ok((1, 0b11)), - _ => Err(format!("unsupported NEON arrangement: {}", arr)), - } -} - -/// Encode NEON three-same-register instructions: CMEQ, UQSUB, SQSUB, CMHI, etc. 
-/// -/// Layout: 0 Q U 01110 size 1 Rm opcode 1 Rn Rd -/// 31 30 29 28-24 23-22 21 20-16 15-11 10 9-5 4-0 -/// -/// `u_bit`: U field (bit 29) - 0 for signed, 1 for unsigned -/// `opcode`: instruction opcode (bits 15-11) -pub(crate) fn encode_neon_three_same(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 3 { - return Err("NEON three-same requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _arr_n) = get_neon_reg(operands, 1)?; - let (rm, _arr_m) = get_neon_reg(operands, 2)?; - - let (q, size) = neon_arr_to_q_size(&arr_d)?; - - // 0 Q U 01110 size 1 Rm opcode 1 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (opcode << 11) | (1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON three-different instructions: USUBL, SSUBL, UADDL, SADDL, etc. -/// -/// These instructions have wider destination than source operands. -/// Format: 0 Q U 01110 size 1 Rm opcode 00 Rn Rd -/// -/// `u_bit`: 0 for signed, 1 for unsigned -/// `opcode`: 4-bit opcode (bits 15-12) -/// `is_high`: true for the "2" variant (upper half, Q=1) -pub(crate) fn encode_neon_three_diff(operands: &[Operand], u_bit: u32, opcode: u32, is_high: bool) -> Result { - if operands.len() < 3 { - return Err("NEON three-different requires 3 operands".to_string()); - } - let (rd, _arr_d) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let (rm, _arr_m) = get_neon_reg(operands, 2)?; - - // Size is determined from the source (narrow) arrangement - let (q, size) = match arr_n.as_str() { - "8b" => (0u32, 0b00u32), // base - "16b" => (1, 0b00), // "2" variant - "4h" => (0, 0b01), - "8h" => (1, 0b01), - "2s" => (0, 0b10), - "4s" => (1, 0b10), - _ => return Err(format!("unsupported source arrangement for three-diff: {}", arr_n)), - }; - - // For the "2" variant, override Q - let q = if is_high { 1 } else { q }; - - // 0 Q U 01110 size 
1 Rm opcode 00 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) - | (1 << 21) | (rm << 16) | (opcode << 12) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON SQSHRUN/SQSHRUN2: Signed saturating shift right unsigned narrow -/// Format: 0 Q 1 011110 immh immb 100011 Rn Rd -pub(crate) fn encode_neon_sqshrun(operands: &[Operand], is_rounding: bool, is_high: bool) -> Result { - if operands.len() < 3 { - return Err("sqshrun requires 3 operands".to_string()); - } - let (rd, _arr_d) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let shift = match &operands[2] { - Operand::Imm(v) => *v as u32, - _ => return Err("sqshrun: expected immediate shift".to_string()), - }; - - // immh:immb encode element size and shift amount - // For source .4s (dest .4h or .8h): immh=001x, shift_amount = 32 - (immh:immb) - // For source .8h (dest .8b or .16b): immh=0001, shift_amount = 16 - (immh:immb) - // For source .2d (dest .2s or .4s): immh=01xx, shift_amount = 64 - (immh:immb) - let (element_bits, immh_base) = match arr_n.as_str() { - "8h" => (16u32, 0b0001u32), - "4s" => (32, 0b0010), - "2d" => (64, 0b0100), - _ => return Err(format!("sqshrun: unsupported source arrangement: {}", arr_n)), - }; - - if shift == 0 || shift > element_bits { - return Err(format!("sqshrun: shift {} out of range for {}-bit elements", shift, element_bits)); - } - - let immhb = (element_bits - shift) & 0x7F; // immh:immb combined - let immh = (immhb >> 3) | immh_base; - let immb = immhb & 0x7; - - let q = if is_high { 1u32 } else { 0 }; - - // 0 Q 1 011110 immh immb opcode 1 Rn Rd - // SQSHRUN: opcode = 100001, SQRSHRUN: opcode = 100011 - let opcode_bits: u32 = if is_rounding { 0b100011 } else { 0b100001 }; - let word = (q << 30) | (1 << 29) | (0b011110 << 23) | (immh << 19) | (immb << 16) - | (opcode_bits << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON UXTL/SXTL (unsigned/signed extend long). 
-/// These are aliases for USHLL/SSHLL with shift #0. -/// -/// Format: 0 Q U 011110 immh immb 10100 1 Rn Rd -pub(crate) fn encode_neon_xtl(operands: &[Operand], u_bit: u32, is_high: bool) -> Result { - if operands.len() < 2 { - return Err("NEON uxtl/sxtl requires 2 operands".to_string()); - } - let (rd, _arr_d) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - - // immh encodes the source element size, immb=0 (shift=0) - let immh = match arr_n.as_str() { - "8b" | "16b" => 0b0001u32, - "4h" | "8h" => 0b0010, - "2s" | "4s" => 0b0100, - _ => return Err(format!("uxtl/sxtl: unsupported source arrangement: {}", arr_n)), - }; - - let q = if is_high { 1u32 } else { 0 }; - - // 0 Q U 011110 immh immb 10100 1 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b011110 << 23) | (immh << 19) - | (0b101001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON compare-to-zero: CMEQ Vd, Vn, #0, CMGE Vd, Vn, #0, etc. -/// -/// Format: 0 Q U 01110 size 10000 opcode 10 Rn Rd -pub(crate) fn encode_neon_cmp_zero(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 2 { - return Err("NEON compare-zero requires at least 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - - // 0 Q U 01110 size 10000 opcode 10 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) - | (0b10000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON two-register miscellaneous narrowing: UQXTN, SQXTN, XTN -/// -/// Format: 0 Q U 01110 size 10000 opcode 10 Rn Rd -pub(crate) fn encode_neon_two_misc_narrow(operands: &[Operand], u_bit: u32, opcode: u32, is_high: bool) -> Result { - if operands.len() < 2 { - return Err("NEON two-reg narrow requires 2 operands".to_string()); - } - let (rd, _arr_d) = get_neon_reg(operands, 0)?; - let (rn, 
arr_n) = get_neon_reg(operands, 1)?; - - // Size from source (wider) arrangement - let size = match arr_n.as_str() { - "8h" => 0b00u32, - "4s" => 0b01, - "2d" => 0b10, - _ => return Err(format!("unsupported source arrangement for narrow: {}", arr_n)), - }; - - let q = if is_high { 1u32 } else { 0 }; - - // 0 Q U 01110 size 10000 opcode 10 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) - | (0b10000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON vector-by-element long instructions: SMULL/UMULL/SMLAL/UMLAL/SMLSL/UMLSL (elem) -/// -/// Format: 0 Q U 01111 size L M Rm opcode H 0 Rn Rd -/// -/// These are the widening multiply-by-element forms where the third operand -/// is a register lane (e.g., v0.h[2]). -pub(crate) fn encode_neon_elem_long(operands: &[Operand], u_bit: u32, opcode: u32, is_high: bool) -> Result { - if operands.len() < 3 { - return Err("NEON elem-long requires 3 operands".to_string()); - } - let (rd, _arr_d) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - - // Third operand is RegLane: v0.h[2] - let (rm, index) = match &operands[2] { - Operand::RegLane { reg, elem_size: _, index } => { - let rm = parse_reg_num(reg).ok_or("invalid NEON register")?; - (rm, *index) - } - _ => return Err(format!("expected register lane operand, got {:?}", operands[2])), - }; - - // Determine size and Q from source arrangement - let (q, size) = match arr_n.as_str() { - "4h" => (0u32, 0b01u32), - "8h" => (1, 0b01), - "2s" => (0, 0b10), - "4s" => (1, 0b10), - _ => return Err(format!("unsupported source arrangement for elem-long: {}", arr_n)), - }; - let q = if is_high { 1 } else { q }; - - // Encode index into H:L:M bits depending on element size - let (h, l, m) = match size { - 0b01 => { - // Half-word: index = H:L:M (3 bits), Rm limited to v0-v15 - if index > 7 { - return Err(format!("element index {} out of range for .h", index)); - } - let h = 
(index >> 2) & 1; - let l = (index >> 1) & 1; - let m = index & 1; - (h, l, m) - } - 0b10 => { - // Word: index = H:L (2 bits), M=Rm[4] - if index > 3 { - return Err(format!("element index {} out of range for .s", index)); - } - let h = (index >> 1) & 1; - let l = index & 1; - let m = (rm >> 4) & 1; // M bit from Rm[4] - (h, l, m) - } - _ => return Err("unsupported element size for by-element".to_string()), - }; - - // Limit Rm for half-word indexing (only v0-v15) - let rm_enc = if size == 0b01 { rm & 0xF } else { rm & 0x1F }; - - // 0 Q U 01111 size L M Rm opcode H 0 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b01111 << 24) | (size << 22) - | (l << 21) | (m << 20) | (rm_enc << 16) | (opcode << 12) - | (h << 11) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON logical operations: ORR/AND/EOR Vd.T, Vn.T, Vm.T -pub(crate) fn encode_neon_logical(operands: &[Operand], opc: u32) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _arr_n) = get_neon_reg(operands, 1)?; - let (rm, _arr_m) = get_neon_reg(operands, 2)?; - - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - // NEON logical three-same: - // ORR: 0 Q 0 01110 10 1 Rm 000111 Rn Rd (opc=0b01 -> size=10) - // AND: 0 Q 0 01110 00 1 Rm 000111 Rn Rd (opc=0b00 -> size=00) - // EOR: 0 Q 1 01110 00 1 Rm 000111 Rn Rd (opc=0b10 -> size=00, U=1) - // BIC: 0 Q 0 01110 01 1 Rm 000111 Rn Rd (would be opc=0b01 with N=1... 
but not needed) - let (u_bit, size_bits): (u32, u32) = match opc { - 0b00 => (0, 0b00), // AND - 0b01 => (0, 0b10), // ORR - 0b10 => (1, 0b00), // EOR - 0b11 => (1, 0b00), // ANDS - not valid for NEON, fall back - _ => return Err("unsupported NEON logical opc".to_string()), - }; - - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size_bits << 22) - | (1 << 21) | (rm << 16) | (0b000111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON MUL Vd.T, Vn.T, Vm.T -pub(crate) fn encode_neon_mul(operands: &[Operand]) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - - // MUL (vector): 0 Q 0 01110 size 1 Rm 10011 1 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (0b100111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON PMUL Vd.T, Vn.T, Vm.T (polynomial multiply, bytes only) -pub(crate) fn encode_neon_pmul(operands: &[Operand]) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - // PMUL: 0 Q 1 01110 00 1 Rm 10011 1 Rn Rd (size=00 for bytes, U=1) - // PMUL encoding: size=00 (bytes) is implicit (zero bits at [23:22]) - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (1 << 21) - | (rm << 16) | (0b100111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON MLA Vd.T, Vn.T, Vm.T (multiply-accumulate) -pub(crate) fn encode_neon_mla(operands: &[Operand]) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - // MLA: 0 Q 0 01110 size 1 Rm 10010 1 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (size << 22) | (1 << 
21) - | (rm << 16) | (0b100101 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON MLS Vd.T, Vn.T, Vm.T (multiply-subtract) -pub(crate) fn encode_neon_mls(operands: &[Operand]) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - // MLS: 0 Q 1 01110 size 1 Rm 10010 1 Rn Rd (U=1) - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (0b100101 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON USHR Vd.T, Vn.T, #shift (unsigned shift right immediate) -pub(crate) fn encode_neon_shift_imm(operands: &[Operand], _is_unsigned: bool) -> Result { - if operands.len() < 3 { - return Err("ushr requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)?; - - let (q, _size) = neon_arr_to_q_size(&arr_d)?; - - // USHR: 0 Q 1 011110 immh:immb 00000 1 Rn Rd - // For .16b (bytes, size=8): immh = 0001, immb = 8-shift (3 bits) - // For .8h (halfwords, size=16): immh = 001x - // For .4s (words, size=32): immh = 01xx - // For .2d (doublewords, size=64): immh = 1xxx - // immh:immb = (element_size * 2 - shift) - let (elem_bits, immh_immb) = match arr_d.as_str() { - "8b" | "16b" => (8u32, (16 - shift as u32) & 0xF), // immh:immb is 4 bits for 8-bit elems - "4h" | "8h" => (16, (32 - shift as u32) & 0x1F), - "2s" | "4s" => (32, (64 - shift as u32) & 0x3F), - "2d" => (64, (128 - shift as u32) & 0x7F), - _ => return Err(format!("unsupported USHR arrangement: {}", arr_d)), - }; - let _ = elem_bits; - - // Full encoding: 0 Q 1 011110 immh:immb 000001 Rn Rd - let word = (q << 30) | (1 << 29) | (0b011110 << 23) | (immh_immb << 16) - | (0b000001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON EXT Vd.T, Vn.T, Vm.T, #index 
-pub(crate) fn encode_neon_ext(operands: &[Operand]) -> Result { - if operands.len() < 4 { - return Err("ext requires 4 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let index = get_imm(operands, 3)? as u32; - - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - // EXT Vd.T, Vn.T, Vm.T, #index - // Encoding: 0 Q 10 1110 00 0 Rm 0 imm4 0 Rn Rd - let word = ((((q << 30) | (0b101110 << 24)) - | (rm << 16)) | ((index & 0xF) << 11)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON ADDV: add across vector lanes -pub(crate) fn encode_neon_addv(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("addv requires 2 operands".to_string()); - } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - - let (q, size) = neon_arr_to_q_size(&arr_n)?; - - // ADDV: 0 Q 0 01110 size 11000 11011 10 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (size << 22) | (0b11000 << 17) - | (0b110111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON across-vector instructions: UMAXV, UMINV, SMAXV, SMINV -/// -/// Format: 0 Q U 01110 size 11000 opcode 10 Rn Rd -/// -/// `u_bit`: 0 for signed, 1 for unsigned -/// `opcode`: 5-bit opcode (bits 16-12) -pub(crate) fn encode_neon_across(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 2 { - return Err("NEON across-vector requires 2 operands".to_string()); - } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - - let (q, size) = neon_arr_to_q_size(&arr_n)?; - - // 0 Q U 01110 size 11000 opcode 10 Rn Rd - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) | (0b11000 << 17) - | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON UMOV: move element to GP register -pub(crate) fn 
encode_neon_umov(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("umov requires 2 operands".to_string()); - } - let (rd, is_64) = get_reg(operands, 0)?; - - // Second operand should be a RegLane (v0.b[0]) - match operands.get(1) { - Some(Operand::RegLane { reg, elem_size, index }) => { - let rn = parse_reg_num(reg).ok_or("invalid NEON register")?; - let q = if is_64 { 1u32 } else { 0 }; - - let imm5 = match elem_size.as_str() { - "b" => ((*index & 0xF) << 1) | 0b00001, - "h" => ((*index & 0x7) << 2) | 0b00010, - "s" => ((*index & 0x3) << 3) | 0b00100, - "d" => ((*index & 0x1) << 4) | 0b01000, - _ => return Err(format!("unsupported umov element size: {}", elem_size)), - }; - - // UMOV Rd, Vn.Ts[index]: 0 Q 0 01110 000 imm5 0 0111 1 Rn Rd - let word = (q << 30) | (0b001110000u32 << 21) | (imm5 << 16) - | (0b001111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) - } - _ => Err("umov: expected register lane operand".to_string()), - } -} - -/// Encode NEON DUP: broadcast GP register to all vector lanes -pub(crate) fn encode_neon_dup(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("dup requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - - // DUP Vd.T, Rn (general form - broadcast GP reg to vector) - if let Some(Operand::Reg(rn_name)) = operands.get(1) { - let rn = parse_reg_num(rn_name).ok_or("invalid rn")?; - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // imm5 encoding for element size: - // .8b/.16b: imm5 = 00001 - // .4h/.8h: imm5 = 00010 - // .2s/.4s: imm5 = 00100 - // .2d: imm5 = 01000 - let imm5 = match arr_d.as_str() { - "8b" | "16b" => 0b00001u32, - "4h" | "8h" => 0b00010, - "2s" | "4s" => 0b00100, - "2d" => 0b01000, - _ => return Err(format!("unsupported dup arrangement: {}", arr_d)), - }; - - // DUP Vd.T, Rn: 0 Q 0 01110 000 imm5 0 0001 1 Rn Rd - let word = (q << 30) | (0b001110000u32 << 21) | (imm5 << 16) - | (0b000011 << 10) | (rn << 5) | rd; - return 
Ok(EncodeResult::Word(word)); - } - - // DUP Vd.T, Vn.Ts[index] (broadcast element to all lanes) - if let Some(Operand::RegLane { reg, elem_size, index }) = operands.get(1) { - let rn = parse_reg_num(reg).ok_or("invalid NEON register")?; - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // imm5 encodes both element size and index: - // .b[i]: imm5 = (i << 1) | 0b00001 - // .h[i]: imm5 = (i << 2) | 0b00010 - // .s[i]: imm5 = (i << 3) | 0b00100 - // .d[i]: imm5 = (i << 4) | 0b01000 - let imm5 = match elem_size.as_str() { - "b" => ((*index & 0xF) << 1) | 0b00001, - "h" => ((*index & 0x7) << 2) | 0b00010, - "s" => ((*index & 0x3) << 3) | 0b00100, - "d" => ((*index & 0x1) << 4) | 0b01000, - _ => return Err(format!("unsupported dup element size: {}", elem_size)), - }; - - // DUP Vd.T, Vn.Ts[i]: 0 Q 0 01110 000 imm5 0 0000 1 Rn Rd - let word = (q << 30) | (0b001110000u32 << 21) | (imm5 << 16) - | (0b000001 << 10) | (rn << 5) | rd; - return Ok(EncodeResult::Word(word)); - } - - Err("unsupported dup operands".to_string()) -} - -/// Encode NEON INS (insert element from GP register): INS Vd.Ts[index], Xn -pub(crate) fn encode_neon_ins(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("ins requires 2 operands".to_string()); - } - match (&operands[0], &operands[1]) { - // INS Vd.Ts[dst_idx], Xn (general register to element) - (Operand::RegLane { reg, elem_size, index }, Operand::Reg(rn_name)) => { - let rd = parse_reg_num(reg).ok_or("invalid NEON register")?; - let rn = parse_reg_num(rn_name).ok_or("invalid register")?; - - let imm5 = match elem_size.as_str() { - "b" => ((*index & 0xF) << 1) | 0b00001, - "h" => ((*index & 0x7) << 2) | 0b00010, - "s" => ((*index & 0x3) << 3) | 0b00100, - "d" => ((*index & 0x1) << 4) | 0b01000, - _ => return Err(format!("unsupported ins element size: {}", elem_size)), - }; - - // INS Vd.Ts[i], Xn: 0 1 0 01110 000 imm5 0 0011 1 Rn Rd - let word = (0b01001110000u32 << 21) | (imm5 << 16) - | (0b000111 << 10) | (rn << 5) | rd; - 
Ok(EncodeResult::Word(word)) - } - // INS Vd.Ts[dst_idx], Vn.Ts[src_idx] (element to element) - (Operand::RegLane { reg: rd_name, elem_size: dst_size, index: dst_idx }, - Operand::RegLane { reg: rn_name, elem_size: _src_size, index: src_idx }) => { - let rd = parse_reg_num(rd_name).ok_or("invalid NEON rd")?; - let rn = parse_reg_num(rn_name).ok_or("invalid NEON rn")?; - - let (imm5, imm4) = match dst_size.as_str() { - "b" => ( - ((*dst_idx & 0xF) << 1) | 0b00001, - *src_idx & 0xF, - ), - "h" => ( - ((*dst_idx & 0x7) << 2) | 0b00010, - (*src_idx & 0x7) << 1, - ), - "s" => ( - ((*dst_idx & 0x3) << 3) | 0b00100, - (*src_idx & 0x3) << 2, - ), - "d" => ( - ((*dst_idx & 0x1) << 4) | 0b01000, - (*src_idx & 0x1) << 3, - ), - _ => return Err(format!("unsupported ins element size: {}", dst_size)), - }; - - // INS Vd.Ts[dst], Vn.Ts[src]: 0 1 1 01110 000 imm5 0 imm4 1 Rn Rd - let word = (0b01101110000u32 << 21) | (imm5 << 16) - | (imm4 << 11) | (1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) - } - _ => Err("ins: expected (RegLane, Reg) or (RegLane, RegLane) operands".to_string()), - } -} - -/// Encode NEON NOT (bitwise NOT): NOT Vd.T, Vn.T -pub(crate) fn encode_neon_not(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("not requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - // NOT Vd.T, Vn.T (alias of MVN): 0 Q 1 01110 00 10000 00101 10 Rn Rd - let word = ((q << 30) | (1 << 29) | (0b01110 << 24)) - | (0b10000 << 17) | (0b00101 << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON MOVI (move immediate to vector) -pub(crate) fn encode_neon_movi(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("movi requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let imm = get_imm(operands, 1)?; - - match arr_d.as_str() { - 
"16b" | "8b" => { - // MOVI Vd.16b, #imm8 - // Encoding: 0 Q 00 1111 00000 abc 1110 01 defgh Rd - // where imm8 = abc:defgh - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - let imm8 = imm as u32 & 0xFF; - let abc = (imm8 >> 5) & 0x7; - let defgh = imm8 & 0x1F; - // 0 Q op 0 1111 0 a b c cmode(1110) o2(0) 1 defgh Rd - let word = (q << 30) | (0b0011110 << 23) | ((abc >> 2) << 18) | (((abc >> 1) & 1) << 17) - | ((abc & 1) << 16) | (0b1110 << 12) | (0b01 << 10) | (defgh << 5) | rd; - Ok(EncodeResult::Word(word)) - } - "2d" => { - // MOVI Vd.2d, #imm - // The 64-bit immediate is encoded as 8 bits, where each bit expands - // to 8 bits (0x00 or 0xFF) in the result. - // Convert the 64-bit value to the 8-bit encoding. - let imm64 = imm as u64; - let mut imm8 = 0u32; - for i in 0..8 { - let byte_val = (imm64 >> (i * 8)) & 0xFF; - if byte_val == 0xFF { - imm8 |= 1 << i; - } else if byte_val != 0 { - return Err(format!("movi .2d: each byte of immediate must be 0x00 or 0xFF, got 0x{:02x} at byte {}", byte_val, i)); - } - } - let abc = (imm8 >> 5) & 0x7; - let defgh = imm8 & 0x1F; - // MOVI Vd.2d, #imm: 0 1 1 0 1111 00 abc 1110 01 defgh Rd (op=1, Q=1) - let word = (0b01101111 << 24) | ((abc >> 2) << 18) | (((abc >> 1) & 1) << 17) - | ((abc & 1) << 16) | (0b111001 << 10) | (defgh << 5) | rd; - Ok(EncodeResult::Word(word)) - } - "2s" | "4s" => { - // MOVI Vd.2s/4s, #imm8 (32-bit element, no shift) - // Encoding: 0 Q op(0) 0 1111 0 abc cmode(0000) o2(0) 1 defgh Rd - let q: u32 = if arr_d == "4s" { 1 } else { 0 }; - let imm8 = imm as u32 & 0xFF; - let abc = (imm8 >> 5) & 0x7; - let defgh = imm8 & 0x1F; - - // Check for optional LSL shift operand - let (cmode, shift_val) = if operands.len() > 2 { - if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - if kind == "lsl" { - match amount { - 0 => (0b0000u32, 0), - 8 => (0b0010, 8), - 16 => (0b0100, 16), - 24 => (0b0110, 24), - _ => return Err(format!("movi: unsupported shift amount: {}", amount)), - } - } else { - 
(0b0000, 0) - } - } else { - (0b0000, 0) - } - } else { - (0b0000, 0) - }; - let _ = shift_val; - - let word = (q << 30) | (0b0011110 << 23) | ((abc >> 2) << 18) | (((abc >> 1) & 1) << 17) - | ((abc & 1) << 16) | (cmode << 12) | (0b01 << 10) | (defgh << 5) | rd; - Ok(EncodeResult::Word(word)) - } - "4h" | "8h" => { - // MOVI Vd.4h/8h, #imm8 - let q: u32 = if arr_d == "8h" { 1 } else { 0 }; - let imm8 = imm as u32 & 0xFF; - let abc = (imm8 >> 5) & 0x7; - let defgh = imm8 & 0x1F; - // cmode=1000 for .4h/.8h with no shift - let word = (q << 30) | (0b0011110 << 23) | ((abc >> 2) << 18) | (((abc >> 1) & 1) << 17) - | ((abc & 1) << 16) | (0b1000 << 12) | (0b01 << 10) | (defgh << 5) | rd; - Ok(EncodeResult::Word(word)) - } - _ => Err(format!("movi: unsupported arrangement: {}", arr_d)), - } -} - - -/// Encode NEON BIC (bitwise clear vector): BIC Vd.T, Vn.T, Vm.T -pub(crate) fn encode_neon_bic(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("bic requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - // BIC Vd.T, Vn.T, Vm.T: 0 Q 0 01110 01 1 Rm 000111 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (0b01 << 22) | (1 << 21) - | (rm << 16) | (0b000111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON BSL (bitwise select): BSL Vd.T, Vn.T, Vm.T -pub(crate) fn encode_neon_bsl(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("bsl requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - // BSL Vd.T, Vn.T, Vm.T: 0 Q 1 01110 01 1 Rm 000111 Rn Rd - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (0b01 << 22) | (1 << 21) - | (rm << 16) | (0b000111 << 10) | 
(rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON REV64: reverse elements within 64-bit doublewords -pub(crate) fn encode_neon_rev64(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("rev64 requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - - let (q, size) = neon_arr_to_q_size(&arr_d)?; - - // REV64 Vd.T, Vn.T: 0 Q 0 01110 size 10 0000 0000 10 Rn Rd - let word = (q << 30) | (0b001110 << 24) | (size << 22) - | (0b100000 << 16) | (0b000010 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON TBL: table vector lookup -pub(crate) fn encode_neon_tbl(operands: &[Operand]) -> Result { - // TBL Vd.T, {Vn.T}, Vm.T (single register table) - // TBL Vd.T, {Vn.T, Vn+1.T}, Vm.T (two register table) - // etc. - if operands.len() < 3 { - return Err("tbl requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - // The second operand is a register list - let (rn, num_regs) = match &operands[1] { - Operand::RegList(regs) => { - let first_reg = match ®s[0] { - Operand::RegArrangement { reg, .. 
} => parse_reg_num(reg).ok_or("invalid reg")?, - Operand::Reg(name) => parse_reg_num(name).ok_or("invalid reg")?, - _ => return Err("tbl: expected register in list".to_string()), - }; - (first_reg, regs.len() as u32) - } - _ => return Err("tbl: expected register list as second operand".to_string()), - }; - - let (rm, _) = get_neon_reg(operands, 2)?; - - // len field: 1 reg -> 00, 2 -> 01, 3 -> 10, 4 -> 11 - let len = (num_regs - 1) & 0x3; - - // TBL: 0 Q 00 1110 000 Rm 0 len 0 00 Rn Rd - let word = ((((q << 30) | (0b001110 << 24)) - | (rm << 16)) | (len << 13)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON TBX: table vector lookup with insert (preserves out-of-range lanes) -pub(crate) fn encode_neon_tbx(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("tbx requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - - let (rn, num_regs) = match &operands[1] { - Operand::RegList(regs) => { - let first_reg = match ®s[0] { - Operand::RegArrangement { reg, .. 
} => parse_reg_num(reg).ok_or("invalid reg")?, - Operand::Reg(name) => parse_reg_num(name).ok_or("invalid reg")?, - _ => return Err("tbx: expected register in list".to_string()), - }; - (first_reg, regs.len() as u32) - } - _ => return Err("tbx: expected register list as second operand".to_string()), - }; - - let (rm, _) = get_neon_reg(operands, 2)?; - let len = (num_regs - 1) & 0x3; - - // TBX: 0 Q 00 1110 000 Rm 0 len 1 00 Rn Rd (op=1 for TBX vs op=0 for TBL) - let word = (q << 30) | (0b001110 << 24) | (rm << 16) | (len << 13) - | (1 << 12) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON LD1R: load single structure and replicate to all lanes -pub(crate) fn encode_neon_ld1r(operands: &[Operand]) -> Result { - // LD1R {Vt.T}, [Xn] - if operands.len() < 2 { - return Err("ld1r requires 2 operands".to_string()); - } - - let (rt, arr) = match &operands[0] { - Operand::RegList(regs) => { - if regs.len() != 1 { - return Err("ld1r expects exactly one register in list".to_string()); - } - match ®s[0] { - Operand::RegArrangement { reg, arrangement } => { - let num = parse_reg_num(reg).ok_or("invalid reg")?; - (num, arrangement.clone()) - } - _ => return Err("ld1r: expected register with arrangement".to_string()), - } - } - _ => return Err("ld1r: expected register list as first operand".to_string()), - }; - - let (q, size) = match arr.as_str() { - "8b" => (0u32, 0b00u32), - "16b" => (1, 0b00), - "4h" => (0, 0b01), - "8h" => (1, 0b01), - "2s" => (0, 0b10), - "4s" => (1, 0b10), - "1d" => (0, 0b11), - "2d" => (1, 0b11), - _ => return Err(format!("ld1r: unsupported arrangement: {}", arr)), - }; - - match &operands[1] { - Operand::Mem { base, offset: 0 } => { - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - // LD1R: 0 Q 0 01101 0 1 0 00000 110 0 size Rn Rt (no post-index) - let word = (q << 30) | (0b001101 << 24) | (1 << 22) | (0b110 << 13) - | (size << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) - } - Operand::MemPostIndex { base, 
offset } => { - let rn = parse_reg_num(base).ok_or("invalid base reg")?; - // LD1R post-index (immediate): 0 Q 0 01101 1 1 0 11111 110 0 size Rn Rt - // Rm=11111 means post-index by element size - let _ = offset; // offset must match element size, not encoded separately - let word = (q << 30) | (0b001101 << 24) | (1 << 23) | (1 << 22) - | (0b11111 << 16) | (0b110 << 13) | (size << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) - } - _ => Err("ld1r: expected [Xn] or [Xn], #imm memory operand".to_string()), - } -} - -/// Encode NEON LD1 (vector load, multiple structures) -/// Dispatch LD/ST1-4: choose between "multiple structures" and "single structure (element)" encoding. -pub(crate) fn encode_neon_ld_st_dispatch(operands: &[Operand], is_load: bool, num_structs: u32) -> Result { - // If the first operand is a RegListIndexed, use single-element encoding - if let Some(Operand::RegListIndexed { .. }) = operands.first() { - return encode_neon_ld_st_single(operands, is_load, num_structs); - } - // Multiple-structures encoding for ld1-4/st1-4 - encode_neon_ld_st_multi(operands, is_load, num_structs) -} - -/// Encode NEON LD/ST single structure (element): -/// st1 {v0.s}[0], [x3] -/// st2 {v0.s, v1.s}[0], [x3] -/// st4 {v0.s, v1.s, v2.s, v3.s}[0], [x3] -/// ld2 {v0.s, v1.s}[0], [x3] -// TODO: add post-index form [Xn], #imm -pub(crate) fn encode_neon_ld_st_single(operands: &[Operand], is_load: bool, num_structs: u32) -> Result { - if operands.len() < 2 { - return Err(format!("ld/st{} single element requires at least 2 operands", num_structs)); - } - - let (regs, index) = match &operands[0] { - Operand::RegListIndexed { regs, index } => (regs, *index), - _ => return Err("expected register list with index".to_string()), - }; - - if regs.len() as u32 != num_structs { - return Err(format!("expected {} registers in list, got {}", num_structs, regs.len())); - } - // TODO: validate that registers in the list are consecutive (ARM ISA requirement) - - // Get element size and 
first register from the list - let (rt, elem_size) = match ®s[0] { - Operand::RegArrangement { reg, arrangement } => { - (parse_reg_num(reg).ok_or("invalid register in list")?, arrangement.clone()) - } - _ => return Err("expected register with arrangement in list".to_string()), - }; - - // Get base register and check for post-index - let (rn, post_index) = match &operands[1] { - Operand::Mem { base, offset: 0 } => { - let rn = parse_reg_num(base).ok_or_else(|| format!("invalid base register: {}", base))?; - // Check for post-index immediate: operands[2] is the post-index offset - let pi = if operands.len() > 2 { - match &operands[2] { - Operand::Imm(off) => Some(*off), - _ => None, - } - } else { - None - }; - (rn, pi) - } - Operand::MemPostIndex { base, offset } => { - let rn = parse_reg_num(base).ok_or_else(|| format!("invalid base register: {}", base))?; - (rn, Some(*offset)) - } - _ => return Err("expected [Xn] memory operand".to_string()), - }; - - let l_bit = if is_load { 1u32 } else { 0u32 }; - - // R bit: 0 for 1,3 registers; 1 for 2,4 registers - let r_bit = match num_structs { - 1 | 3 => 0u32, - 2 | 4 => 1u32, - _ => return Err(format!("unsupported struct count: {}", num_structs)), - }; - - // Compute opcode, S, Q, size based on element size and index - let (opcode, s_bit, q_bit, size_field) = match elem_size.as_str() { - "b" => { - // opcode = 000 (1,2 regs) or 001 (3,4 regs) - let base_opc = if num_structs <= 2 { 0b000u32 } else { 0b001u32 }; - // index bits: Q:S:size[1]:size[0] = 4 bits for 0-15 - let q = (index >> 3) & 1; - let s = (index >> 2) & 1; - let sz = index & 3; - (base_opc, s, q, sz) - } - "h" => { - let base_opc = if num_structs <= 2 { 0b010u32 } else { 0b011u32 }; - // index bits: Q:S:size[1] = 3 bits for 0-7, size[0]=0 - let q = (index >> 2) & 1; - let s = (index >> 1) & 1; - let sz = (index & 1) << 1; - (base_opc, s, q, sz) - } - "s" => { - let base_opc = if num_structs <= 2 { 0b100u32 } else { 0b101u32 }; - // index bits: Q:S = 2 bits 
for 0-3, size=00 - let q = (index >> 1) & 1; - let s = index & 1; - (base_opc, s, q, 0b00u32) - } - "d" => { - let base_opc = if num_structs <= 2 { 0b100u32 } else { 0b101u32 }; - // index bits: Q = 1 bit for 0-1, S=0, size=01 - let q = index & 1; - (base_opc, 0u32, q, 0b01u32) - } - _ => return Err(format!("unsupported element size for ld/st single: {}", elem_size)), - }; - - if let Some(_offset) = post_index { - // Post-index form: Q 0011011 L R 11111 opcode S size Rn Rt - // (Rm=11111 means immediate post-index, the amount is implicit from element size) - let word = (q_bit << 30) | (0b0011011 << 23) | (l_bit << 22) | (r_bit << 21) - | (0b11111 << 16) | (opcode << 13) | (s_bit << 12) | (size_field << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) - } else { - // No post-index: Q 0011010 L R 00000 opcode S size Rn Rt - let word = (q_bit << 30) | (0b0011010 << 23) | (l_bit << 22) | (r_bit << 21) - | (opcode << 13) | (s_bit << 12) | (size_field << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) - } -} - -/// Common encoder for LD1/ST1 (multiple structures) -pub(crate) fn encode_neon_ld_st_multi(operands: &[Operand], is_load: bool, num_structs: u32) -> Result { - if operands.len() < 2 { - return Err(format!("ld{}/st{} requires at least 2 operands", num_structs, num_structs)); - } - - // First operand: register list {Vt.T} or {Vt.T, Vt+1.T, ...} - let (rt, arr, num_regs) = match &operands[0] { - Operand::RegList(regs) => { - let (first_reg, arrangement) = match ®s[0] { - Operand::RegArrangement { reg, arrangement } => { - (parse_reg_num(reg).ok_or("invalid reg")?, arrangement.clone()) - } - _ => return Err(format!("ld{}/st{}: expected RegArrangement in list", num_structs, num_structs)), - }; - (first_reg, arrangement, regs.len() as u32) - } - _ => return Err(format!("ld{}/st{}: expected register list", num_structs, num_structs)), - }; - - let (q, size) = neon_arr_to_q_size(&arr)?; - - // Second operand: [Xn] memory base or [Xn], #imm (post-index, merged by 
parser) - let (rn, post_index) = match &operands[1] { - Operand::Mem { base, offset: 0 } => { - let r = parse_reg_num(base).ok_or_else(|| format!("invalid base register: {}", base))?; - (r, None) - } - Operand::MemPostIndex { base, offset } => { - let r = parse_reg_num(base).ok_or_else(|| format!("invalid base register: {}", base))?; - (r, Some(*offset)) - } - _ => return Err(format!("ld{}/st{}: expected [Xn] memory operand", num_structs, num_structs)), - }; - - // opcode field based on structure count and number of registers: - // LD1/ST1: 1 reg=0111, 2 reg=1010, 3 reg=0110, 4 reg=0010 - // LD2/ST2: 2 reg=1000 - // LD3/ST3: 3 reg=0100 - // LD4/ST4: 4 reg=0000 - let opcode = match num_structs { - 1 => match num_regs { - 1 => 0b0111u32, - 2 => 0b1010, - 3 => 0b0110, - 4 => 0b0010, - _ => return Err(format!("ld1/st1: unsupported register count: {}", num_regs)), - }, - 2 => 0b1000u32, - 3 => 0b0100, - 4 => 0b0000, - _ => return Err(format!("unsupported structure count: {}", num_structs)), - }; - - let l_bit = if is_load { 1u32 } else { 0u32 }; - - // Handle post-index form from merged MemPostIndex - if let Some(_imm) = post_index { - // Post-index with immediate: use Rm=11111 (0x1F) - let word = ((q << 30) | (0b001100 << 24) | (1 << 23) | (l_bit << 22)) | (0b11111 << 16) | (opcode << 12) | (size << 10) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - - // Check for post-index form via separate operands: [Xn], Xm - if operands.len() > 2 { - match &operands[2] { - Operand::Imm(_) => { - // Post-index with immediate: use Rm=11111 - let word = ((q << 30) | (0b001100 << 24) | (1 << 23) | (l_bit << 22)) | (0b11111 << 16) | (opcode << 12) | (size << 10) | (rn << 5) | rt; - return Ok(EncodeResult::Word(word)); - } - Operand::Reg(rm_name) => { - let rm = parse_reg_num(rm_name).ok_or("invalid rm")?; - let word = ((q << 30) | (0b001100 << 24) | (1 << 23) | (l_bit << 22)) | (rm << 16) | (opcode << 12) | (size << 10) | (rn << 5) | rt; - return 
Ok(EncodeResult::Word(word)); - } - _ => {} - } - } - - // No post-index: LD1/ST1 {Vt.T...}, [Xn] - // 0 Q 001100 0 L 0 00000 opcode size Rn Rt - let word = (((q << 30) | (0b001100 << 24)) | (l_bit << 22)) | (opcode << 12) | (size << 10) | (rn << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON UZP1/UZP2/ZIP1/ZIP2 -pub(crate) fn encode_neon_zip_uzp(operands: &[Operand], op_bits: u32, _is_zip: bool) -> Result { - if operands.len() < 3 { - return Err("uzp/zip requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - - // UZP1: 0 Q 0 01110 size 0 Rm 0 001 10 Rn Rd (op_bits=001) - // UZP2: 0 Q 0 01110 size 0 Rm 0 101 10 Rn Rd (op_bits=101) - // ZIP1: 0 Q 0 01110 size 0 Rm 0 011 10 Rn Rd (op_bits=011) - // ZIP2: 0 Q 0 01110 size 0 Rm 0 111 10 Rn Rd (op_bits=111) - let word = (((q << 30) | (0b001110 << 24) | (size << 22)) | (rm << 16)) | (op_bits << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON EOR3 (three-way XOR, SHA3 extension): EOR3 Vd.16b, Vn.16b, Vm.16b, Vk.16b -pub(crate) fn encode_neon_eor3(operands: &[Operand]) -> Result { - if operands.len() < 4 { - return Err("eor3 requires 4 operands".to_string()); - } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (rk, _) = get_neon_reg(operands, 3)?; - - // EOR3 Vd.16b, Vn.16b, Vm.16b, Vk.16b - // Encoding: 11001110 000 Rm 0 Rk(4:0) 00 Rn Rd - let word = ((0b11001110u32 << 24) | (rm << 16)) | (rk << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON PMULL/PMULL2 (polynomial multiply long) -pub(crate) fn encode_neon_pmull(operands: &[Operand], is_pmull2: bool) -> Result { - if operands.len() < 3 { - return Err("pmull requires 3 operands".to_string()); - } - let (rd, _) = get_neon_reg(operands, 
0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - - let q = if is_pmull2 { 1u32 } else { 0 }; - - // PMULL Vd.1q, Vn.1d, Vm.1d: 0 0 00 1110 11 1 Rm 11100 0 Rn Rd (size=11) - // PMULL2 Vd.1q, Vn.2d, Vm.2d: 0 1 00 1110 11 1 Rm 11100 0 Rn Rd - let word = ((q << 30) | (0b001110 << 24) | (0b11 << 22) | (1 << 21) - | (rm << 16) | (0b11100 << 11)) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON AES instructions (AESE, AESD, AESMC, AESIMC) -pub(crate) fn encode_neon_aes(operands: &[Operand], opcode: u32) -> Result { - if operands.len() < 2 { - return Err("aes instruction requires 2 operands".to_string()); - } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - - // AES instructions: 0100 1110 0010 1000 opcode 10 Rn Rd - // AESE: opcode = 00100 (0x4) - // AESD: opcode = 00101 (0x5) - // AESMC: opcode = 00110 (0x6) - // AESIMC:opcode = 00111 (0x7) - let word = (0b01001110 << 24) | (0b0010100 << 17) | (opcode << 12) - | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON ADD/SUB (vector integer): ADD/SUB Vd.T, Vn.T, Vm.T -pub(crate) fn encode_neon_add_sub(operands: &[Operand], is_sub: bool) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - let u = if is_sub { 1u32 } else { 0u32 }; - - // ADD: 0 Q 0 01110 size 1 Rm 10000 1 Rn Rd - // SUB: 0 Q 1 01110 size 1 Rm 10000 1 Rn Rd - let word = (q << 30) | (u << 29) | (0b01110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (0b10000 << 11) | (1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON USHR (unsigned shift right immediate) -pub(crate) fn encode_neon_ushr(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("ushr requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; 
- let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? as u32; - - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // USHR Vd.T, Vn.T, #shift - // 0 Q 1 0 11110 immh:immb 000001 Rn Rd - let immh_immb = match arr_d.as_str() { - "8b" | "16b" => (16 - shift) & 0xF, - "4h" | "8h" => (32 - shift) & 0x1F, - "2s" | "4s" => (64 - shift) & 0x3F, - "2d" => (128 - shift) & 0x7F, - _ => return Err(format!("unsupported ushr arrangement: {}", arr_d)), - }; - - let word = (q << 30) | (1 << 29) | (0b011110 << 23) | (immh_immb << 16) - | (0b000001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON SSHR (signed shift right immediate) -pub(crate) fn encode_neon_sshr(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("sshr requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? as u32; - - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // SSHR Vd.T, Vn.T, #shift - // 0 Q 0 0 11110 immh:immb 000001 Rn Rd (U=0) - let immh_immb = match arr_d.as_str() { - "8b" | "16b" => (16 - shift) & 0xF, - "4h" | "8h" => (32 - shift) & 0x1F, - "2s" | "4s" => (64 - shift) & 0x3F, - "2d" => (128 - shift) & 0x7F, - _ => return Err(format!("unsupported sshr arrangement: {}", arr_d)), - }; - - let word = (q << 30) | (0b011110 << 23) | (immh_immb << 16) - | (0b000001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON SHL (shift left immediate) -pub(crate) fn encode_neon_shl(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("shl requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? 
as u32; - - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // SHL Vd.T, Vn.T, #shift - // 0 Q 0 0 11110 immh:immb 010101 Rn Rd - // immh:immb = element_size + shift - let immh_immb = match arr_d.as_str() { - "8b" | "16b" => (8 + shift) & 0xF, - "4h" | "8h" => (16 + shift) & 0x1F, - "2s" | "4s" => (32 + shift) & 0x3F, - "2d" => (64 + shift) & 0x7F, - _ => return Err(format!("unsupported shl arrangement: {}", arr_d)), - }; - - let word = (q << 30) | (0b011110 << 23) | (immh_immb << 16) - | (0b010101 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode NEON SLI (shift left and insert) -pub(crate) fn encode_neon_sli(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("sli requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? as u32; - - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // SLI Vd.T, Vn.T, #shift - // 0 Q 1 0 11110 immh:immb 010101 Rn Rd (U=1) - let immh_immb = match arr_d.as_str() { - "8b" | "16b" => (8 + shift) & 0xF, - "4h" | "8h" => (16 + shift) & 0x1F, - "2s" | "4s" => (32 + shift) & 0x3F, - "2d" => (64 + shift) & 0x7F, - _ => return Err(format!("unsupported sli arrangement: {}", arr_d)), - }; - - let word = (q << 30) | (1 << 29) | (0b011110 << 23) | (immh_immb << 16) - | (0b010101 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// Encode SRI (Shift Right and Insert) immediate. -/// SRI Vd.T, Vn.T, #shift: 0 Q 1 0 11110 immh:immb 010001 Rn Rd (U=1) -pub(crate) fn encode_neon_sri(operands: &[Operand]) -> Result { - if operands.len() < 3 { - return Err("sri requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? 
as u32; - - let (q, _) = neon_arr_to_q_size(&arr_d)?; - - // immh:immb = (2*esize - shift) for right shift - let immh_immb = match arr_d.as_str() { - "8b" | "16b" => (16 - shift) & 0xF, - "4h" | "8h" => (32 - shift) & 0x1F, - "2s" | "4s" => (64 - shift) & 0x3F, - "2d" => (128 - shift) & 0x7F, - _ => return Err(format!("unsupported sri arrangement: {}", arr_d)), - }; - - let word = (q << 30) | (1 << 29) | (0b011110 << 23) | (immh_immb << 16) - | (0b010001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON RBIT (vector bit reverse) ─────────────────────────────────────── - -/// Encode NEON RBIT Vd.T, Vn.T (per-byte bit reversal in each element). -pub(crate) fn encode_neon_rbit(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("neon rbit requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - - // Only .8b and .16b arrangements are valid for NEON RBIT - if arr_d != "8b" && arr_d != "16b" { - return Err(format!("neon rbit: unsupported arrangement .{}, expected .8b or .16b", arr_d)); - } - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - // RBIT Vd.T, Vn.T: 0 Q 1 01110 01 10000 00101 10 Rn Rd - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (0b01 << 22) - | (0b10000 << 17) | (0b00101 << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON MVNI (move NOT immediate) ─────────────────────────────────────── - -/// Encode NEON MVNI Vd.T, #imm (move bitwise NOT immediate to vector). 
-pub(crate) fn encode_neon_mvni(operands: &[Operand]) -> Result { - if operands.len() < 2 { - return Err("mvni requires 2 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let imm = get_imm(operands, 1)?; - let imm8 = imm as u32 & 0xFF; - - // Extract abc:defgh for encoding - let abc = (imm8 >> 5) & 0x7; - let defgh = imm8 & 0x1f; - - match arr_d.as_str() { - "2s" | "4s" => { - let q: u32 = if arr_d == "4s" { 1 } else { 0 }; - // Check for optional shift - let cmode = if let Some(Operand::Shift { kind, amount }) = operands.get(2) { - if kind.to_lowercase() == "lsl" { - match *amount { - 0 => 0b0000u32, - 8 => 0b0010, - 16 => 0b0100, - 24 => 0b0110, - _ => return Err(format!("mvni: unsupported shift amount: {}", amount)), - } - } else if kind.to_lowercase() == "msl" { - match *amount { - 8 => 0b1100u32, - 16 => 0b1101, - _ => return Err(format!("mvni: unsupported MSL shift: {}", amount)), - } - } else { - 0b0000 - } - } else { - 0b0000 - }; - // MVNI: 0 Q 1 0 1111 00 abc cmode 01 defgh Rd (op=1) - let word = (q << 30) | (1 << 29) | (0b0111100 << 22) - | (abc << 16) | (cmode << 12) | (0b01 << 10) | (defgh << 5) | rd; - Ok(EncodeResult::Word(word)) - } - "4h" | "8h" => { - let q: u32 = if arr_d == "8h" { 1 } else { 0 }; - // MVNI 16-bit: cmode=1000, op=1 - let word = (q << 30) | (1 << 29) | (0b0111100 << 22) - | (abc << 16) | (0b1000 << 12) | (0b01 << 10) | (defgh << 5) | rd; - Ok(EncodeResult::Word(word)) - } - _ => Err(format!("mvni: unsupported arrangement: {}", arr_d)), - } -} - -// ── NEON float three-same ──────────────────────────────────────────────── -/// Encode NEON float three-same: FADD, FSUB, FMUL, FDIV, FMLA, FMLS, etc. 
-/// Format: 0 Q U 01110 size 1 Rm opcode 1 Rn Rd -/// size[1]=size_hi (0 or 1), size[0]=sz (0=single, 1=double) -pub(crate) fn encode_neon_float_three_same(operands: &[Operand], u_bit: u32, size_hi: u32, opcode: u32) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, sz) = match arr_d.as_str() { - "2s" => (0u32, 0u32), "4s" => (1, 0), "2d" => (1, 1), - _ => return Err(format!("float three-same: unsupported arrangement: {}", arr_d)), - }; - let size = (size_hi << 1) | sz; - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (opcode << 11) | (1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON two-register misc (integer) ───────────────────────────────────── -/// Encode NEON two-reg misc: ABS, NEG, CLS, CLZ, etc. -/// Format: 0 Q U 01110 size 10000 opcode 10 Rn Rd -pub(crate) fn encode_neon_two_misc(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) - | (0b10000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON float two-register misc ───────────────────────────────────────── -/// Encode NEON float two-reg misc: UCVTF, SCVTF, FCVTZS, FCVTZU, FNEG, FABS, etc. 
(vector) -/// Format: 0 Q U 01110 size 10000 opcode 10 Rn Rd -/// size[1]=size_hi, size[0]=sz (0=single, 1=double) -pub(crate) fn encode_neon_float_two_misc(operands: &[Operand], u_bit: u32, size_hi: u32, opcode: u32) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (q, sz) = match arr_d.as_str() { - "2s" => (0u32, 0u32), "4s" => (1, 0), "2d" => (1, 1), - _ => return Err(format!("float two-misc: unsupported arrangement: {}", arr_d)), - }; - let size = (size_hi << 1) | sz; - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) - | (0b10000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON shift right narrow (SHRN/RSHRN) ───────────────────────────────── -/// Format: 0 Q 0 01111 0 immh immb opcode 1 Rn Rd -/// SHRN opcode=10000, RSHRN opcode=10001 -pub(crate) fn encode_neon_shrn(operands: &[Operand], opcode: u32, is_high: bool) -> Result { - if operands.len() < 3 { return Err("shrn/rshrn requires 3 operands".to_string()); } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? 
as u32; - let element_bits = match arr_n.as_str() { "8h" => 16u32, "4s" => 32, "2d" => 64, - _ => return Err(format!("shrn: unsupported source: {}", arr_n)), }; - let half_bits = element_bits / 2; - if shift == 0 || shift > half_bits { return Err(format!("shrn: shift {} out of range", shift)); } - let immhb = element_bits - shift; - let q = if is_high { 1u32 } else { 0 }; - let word = (q << 30) | (0b011110 << 23) | ((immhb >> 3) << 19) | ((immhb & 7) << 16) - | (opcode << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON shift right accumulate (SSRA/USRA/SRSHR/URSHR) ───────────────── -/// Format: 0 Q U 01111 0 immh immb opcode 1 Rn Rd -pub(crate) fn encode_neon_shift_right(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 3 { return Err("shift-right requires 3 operands".to_string()); } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? as u32; - let (q, _) = neon_arr_to_q_size(&arr_d)?; - let element_bits: u32 = match arr_d.as_str() { - "8b" | "16b" => 8, "4h" | "8h" => 16, "2s" | "4s" => 32, "2d" => 64, - _ => return Err(format!("shift-right: unsupported: {}", arr_d)), }; - if shift == 0 || shift > element_bits { return Err(format!("shift {} out of range", shift)); } - let immhb = (element_bits * 2) - shift; - let word = (q << 30) | (u_bit << 29) | (0b011110 << 23) | ((immhb >> 3) << 19) | ((immhb & 7) << 16) - | (opcode << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON SSHLL/USHLL (shift left long) ─────────────────────────────────── -/// Format: 0 Q U 011110 immh immb 10100 1 Rn Rd -pub(crate) fn encode_neon_shll(operands: &[Operand], u_bit: u32, is_high: bool) -> Result { - if operands.len() < 3 { return Err("sshll/ushll requires 3 operands".to_string()); } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? 
as u32; - let base_val = match arr_n.as_str() { - "8b" | "16b" => 8u32, "4h" | "8h" => 16, "2s" | "4s" => 32, - _ => return Err(format!("sshll/ushll: unsupported source: {}", arr_n)), }; - let immhb = base_val + shift; - let q = if is_high { 1u32 } else { 0 }; - let word = (q << 30) | (u_bit << 29) | (0b011110 << 23) | ((immhb >> 3) << 19) | ((immhb & 7) << 16) - | (0b101001 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON pairwise add (UADDLP/SADDLP/UADALP/SADALP) ──────────────────── - -// ── NEON three-different extras: UABAL/SABAL/ADDHN/RADDHN/SUBHN/RSUBHN ── -// Already have encode_neon_three_diff which handles these opcodes. - -// ── NEON SQXTUN ────────────────────────────────────────────────────────── -// Two-reg misc with U=1, opcode=10010. Reuse encode_neon_two_misc_narrow. - -// ── NEON shift right narrow saturating (SQSHRN/UQSHRN/SQRSHRN/UQRSHRN) ─ -pub(crate) fn encode_neon_qshrn(operands: &[Operand], u_bit: u32, is_rounding: bool, is_high: bool) -> Result { - if operands.len() < 3 { return Err("qshrn requires 3 operands".to_string()); } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? 
as u32; - let element_bits = match arr_n.as_str() { "8h" => 16u32, "4s" => 32, "2d" => 64, - _ => return Err(format!("qshrn: unsupported source: {}", arr_n)), }; - if shift == 0 || shift > element_bits { return Err(format!("qshrn: shift {} out of range for {}-bit elements", shift, element_bits)); } - let immhb = element_bits - shift; - let q = if is_high { 1u32 } else { 0 }; - let opcode_bits: u32 = if is_rounding { 0b100111 } else { 0b100101 }; - let word = (q << 30) | (u_bit << 29) | (0b011110 << 23) | ((immhb >> 3) << 19) | ((immhb & 7) << 16) - | (opcode_bits << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON ADDHN/RADDHN/SUBHN/RSUBHN ────────────────────────────────────── -/// Three-different narrowing high: Format: 0 Q U 01110 size 1 Rm opcode 00 Rn Rd -pub(crate) fn encode_neon_three_diff_narrow(operands: &[Operand], u_bit: u32, opcode: u32, is_high: bool) -> Result { - if operands.len() < 3 { return Err("addhn/subhn requires 3 operands".to_string()); } - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let size = match arr_n.as_str() { "8h" => 0b00u32, "4s" => 0b01, "2d" => 0b10, - _ => return Err(format!("addhn: unsupported source: {}", arr_n)), }; - let q = if is_high { 1u32 } else { 0 }; - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (opcode << 12) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON LD2R/LD3R/LD4R ────────────────────────────────────────────────── -pub(crate) fn encode_neon_ldnr(operands: &[Operand], num_structs: u32) -> Result { - if operands.len() < 2 { return Err(format!("ld{}r requires 2 operands", num_structs)); } - let (rt, arr, num_regs) = match &operands[0] { - Operand::RegList(regs) => { - let (first_reg, arrangement) = match ®s[0] { - Operand::RegArrangement { reg, arrangement } => - (parse_reg_num(reg).ok_or("invalid reg")?, arrangement.clone()), - _ => 
return Err("expected RegArrangement in list".to_string()), - }; - (first_reg, arrangement, regs.len() as u32) - } - _ => return Err("expected register list".to_string()), - }; - if num_regs != num_structs { return Err(format!("ld{}r: expected {} regs, got {}", num_structs, num_structs, num_regs)); } - let (q, size) = match arr.as_str() { - "8b" => (0u32, 0b00u32), "16b" => (1, 0b00), - "4h" => (0, 0b01), "8h" => (1, 0b01), - "2s" => (0, 0b10), "4s" => (1, 0b10), - "1d" => (0, 0b11), "2d" => (1, 0b11), - _ => return Err(format!("ld{}r: unsupported arrangement: {}", num_structs, arr)), - }; - // opcode: ld1r=110, ld2r=110(S=1), ld3r=111, ld4r=111(S=1) - let (opcode, s_bit) = match num_structs { - 1 => (0b110u32, 0u32), - 2 => (0b110, 1), - 3 => (0b111, 0), - 4 => (0b111, 1), - _ => return Err(format!("unsupported: ld{}r", num_structs)), - }; - let base = match &operands[1] { - Operand::Mem { base, .. } => parse_reg_num(base).ok_or("invalid base")?, - Operand::MemPostIndex { base, .. } => parse_reg_num(base).ok_or("invalid base")?, - _ => return Err("expected memory operand".to_string()), - }; - // check for post-index - let rm = match &operands[1] { - Operand::MemPostIndex { .. 
} => 0b11111u32, // immediate post-index - _ => 0u32, - }; - let has_post = rm != 0; - let word = (q << 30) | (0b001101 << 24) | (if has_post { 1u32 } else { 0 } << 23) - | (1 << 22) | (if has_post { rm } else { 0 } << 16) | (opcode << 13) | (s_bit << 12) | (size << 10) | (base << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -// ── NEON float compare-to-zero ─────────────────────────────────────────── -/// FCMEQ/FCMLE/FCMLT/FCMGE/FCMGT to zero -/// Format: 0 Q U 01110 size 10000 opcode 10 Rn Rd (float, size = 0sz) -pub(crate) fn encode_neon_float_cmp_zero(operands: &[Operand], u_bit: u32, size_hi: u32, opcode: u32) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (q, sz) = match arr_d.as_str() { - "2s" => (0u32, 0u32), "4s" => (1, 0), "2d" => (1, 1), - _ => return Err(format!("float cmp zero: unsupported: {}", arr_d)), - }; - let size = (size_hi << 1) | sz; - let word = (q << 30) | (u_bit << 29) | (0b01110 << 24) | (size << 22) - | (0b10000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON by-element (non-long) ─────────────────────────────────────────── -/// MUL/MLA/MLS by element: 0 Q U 01111 size L M Rm opcode H 0 Rn Rd -pub(crate) fn encode_neon_elem(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 3 { return Err("NEON by-element requires 3 operands".to_string()); } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, index) = match &operands[2] { - Operand::RegLane { reg, index, .. 
} => (parse_reg_num(reg).ok_or("invalid reg")?, *index), - _ => return Err(format!("expected register lane, got {:?}", operands[2])), - }; - let (q, size) = neon_arr_to_q_size(&arr_d)?; - let (h, l, m_bit) = match size { - 0b01 => ((index >> 2) & 1, (index >> 1) & 1, index & 1), - 0b10 => ((index >> 1) & 1, index & 1, (rm >> 4) & 1), - _ => return Err("unsupported element size for by-element".to_string()), - }; - let rm_enc = if size == 0b01 { rm & 0xF } else { rm & 0x1F }; - let word = (q << 30) | (u_bit << 29) | (0b01111 << 24) | (size << 22) - | (l << 21) | (m_bit << 20) | (rm_enc << 16) | (opcode << 12) - | (h << 11) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON float by-element ──────────────────────────────────────────────── -pub(crate) fn encode_neon_float_elem(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 3 { return Err("NEON float by-element requires 3 operands".to_string()); } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, index) = match &operands[2] { - Operand::RegLane { reg, index, .. 
} => (parse_reg_num(reg).ok_or("invalid reg")?, *index), - _ => return Err(format!("expected register lane, got {:?}", operands[2])), - }; - let (q, sz) = match arr_d.as_str() { - "2s" => (0u32, 0u32), "4s" => (1, 0), "2d" => (1, 1), - _ => return Err(format!("float by-element: unsupported: {}", arr_d)), - }; - let (h, l, m_bit) = if sz == 0 { - ((index >> 1) & 1, index & 1, (rm >> 4) & 1) - } else { - (index & 1, 0u32, (rm >> 4) & 1) - }; - let rm_enc = rm & 0x1F; - let word = (q << 30) | (u_bit << 29) | (0b01111 << 24) | (sz << 22) - | (l << 21) | (m_bit << 20) | (rm_enc << 16) | (opcode << 12) - | (h << 11) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON FCVTL/FCVTN ──────────────────────────────────────────────────── -/// FCVTL: half→single or single→double widening float convert -/// Format: 0 Q 0 01110 0 sz 10000 10111 10 Rn Rd -pub(crate) fn encode_neon_fcvtl(operands: &[Operand], is_high: bool) -> Result { - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let sz = match arr_d.as_str() { "4s" | "2s" => 0u32, "2d" => 1, - _ => return Err(format!("fcvtl: unsupported dest: {}", arr_d)), }; - let q = if is_high { 1u32 } else { 0 }; - let word = (q << 30) | (0b01110 << 24) | (sz << 22) | (0b10000 << 17) - | (0b10111 << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -/// FCVTN: single→half or double→single narrowing float convert -pub(crate) fn encode_neon_fcvtn(operands: &[Operand], is_high: bool) -> Result { - let (rd, _) = get_neon_reg(operands, 0)?; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let sz = match arr_n.as_str() { "4s" | "2s" => 0u32, "2d" => 1, - _ => return Err(format!("fcvtn: unsupported source: {}", arr_n)), }; - let q = if is_high { 1u32 } else { 0 }; - let word = (q << 30) | (0b01110 << 24) | (sz << 22) | (0b10000 << 17) - | (0b10110 << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── BIT/BIF (bitwise insert if 
true/false) ────────────────────────────── -/// Encodes BIT (size=10) and BIF (size=11) instructions. -/// Same format as BSL but with different size field. -/// Format: 0 Q 1 01110 ss 1 Rm 000111 Rn Rd -pub(crate) fn encode_neon_bitwise_insert(operands: &[Operand], size: u32) -> Result { - if operands.len() < 3 { - return Err("bit/bif requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let q: u32 = if arr_d == "16b" { 1 } else { 0 }; - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (0b000111 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── FADDP (float pairwise add) ────────────────────────────────────────── -/// FADDP — float add pairwise -/// Vector form: FADDP Vd.T, Vn.T, Vm.T -/// Format: 0 Q 1 01110 0 sz 1 Rm 110101 Rn Rd -/// Scalar form: FADDP Sd, Vn.2S or FADDP Dd, Vn.2D -/// Format: 01 1 11110 0 sz 11000 01101 10 Rn Rd -pub(crate) fn encode_neon_faddp(operands: &[Operand]) -> Result { - if operands.len() >= 3 { - // Vector form: 3 operands - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let (rm, _) = get_neon_reg(operands, 2)?; - let (q, sz) = match arr_d.as_str() { - "2s" => (0u32, 0u32), - "4s" => (1, 0), - "2d" => (1, 1), - _ => return Err(format!("faddp: unsupported arrangement: {}", arr_d)), - }; - let word = (q << 30) | (1 << 29) | (0b01110 << 24) | (sz << 22) | (1 << 21) - | (rm << 16) | (0b110101 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) - } else if operands.len() == 2 { - // Scalar form: FADDP Sd, Vn.2S or FADDP Dd, Vn.2D - let rd = match &operands[0] { - Operand::Reg(r) => parse_reg_num(r).ok_or("invalid dest reg")?, - _ => return Err("faddp scalar: expected register".to_string()), - }; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let sz = match arr_n.as_str() { - "2s" => 0u32, - "2d" => 1, - 
_ => return Err(format!("faddp scalar: unsupported source: {}", arr_n)), - }; - // 01 1 11110 0 sz 11000 01101 10 Rn Rd - let word = (0b01 << 30) | (1 << 29) | (0b11110 << 24) | (sz << 22) - | (0b11000 << 17) | (0b01101 << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) - } else { - Err("faddp requires 2 or 3 operands".to_string()) - } -} - -// ── SADDLV/UADDLV (signed/unsigned add long across vector) ───────────── -/// Format: 0 Q U 01110 size 11000 00011 10 Rn Rd -pub(crate) fn encode_neon_across_long(operands: &[Operand], u: u32, opcode: u32) -> Result { - if operands.len() < 2 { - return Err("saddlv/uaddlv requires 2 operands".to_string()); - } - // Destination is a scalar register (e.g., s16), source is a vector arrangement - let rd = match &operands[0] { - Operand::Reg(r) => parse_reg_num(r).ok_or("invalid dest reg")?, - Operand::RegArrangement { reg, .. } => parse_reg_num(reg).ok_or("invalid dest reg")?, - _ => return Err("saddlv: expected register".to_string()), - }; - let (rn, arr_n) = get_neon_reg(operands, 1)?; - let (q, size) = neon_arr_to_q_size(&arr_n)?; - let word = (q << 30) | (u << 29) | (0b01110 << 24) | (size << 22) - | (0b11000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON shift left by immediate (SQSHL, UQSHL, SHL, etc.) ───────────── -/// Format: 0 Q U 011110 immh:immb opcode 1 Rn Rd -/// immh:immb encodes both the element size and the shift amount. -pub(crate) fn encode_neon_shift_left_imm(operands: &[Operand], u: u32, opcode: u32) -> Result { - if operands.len() < 3 { - return Err("shift left immediate requires 3 operands".to_string()); - } - let (rd, arr_d) = get_neon_reg(operands, 0)?; - let (rn, _) = get_neon_reg(operands, 1)?; - let shift = get_imm(operands, 2)? 
as u32; - - let (q, _immh_base, esize) = match arr_d.as_str() { - "8b" => (0u32, 0b0001u32, 8u32), - "16b" => (1, 0b0001, 8), - "4h" => (0, 0b0010, 16), - "8h" => (1, 0b0010, 16), - "2s" => (0, 0b0100, 32), - "4s" => (1, 0b0100, 32), - "2d" => (1, 0b1000, 64), - _ => return Err(format!("shift left imm: unsupported arrangement: {}", arr_d)), - }; - - // immh:immb = esize + shift_amount - // For 8-bit: immh=0001, shift in 0..7 => immh:immb = 8 + shift - // For 16-bit: immh=001x, shift in 0..15 => immh:immb = 16 + shift - // For 32-bit: immh=01xx, shift in 0..31 => immh:immb = 32 + shift - // For 64-bit: immh=1xxx, shift in 0..63 => immh:immb = 64 + shift - let immhb = esize + shift; - let immh = (immhb >> 3) & 0xF; - let immb = immhb & 0x7; - - let word = (q << 30) | (u << 29) | (0b011110 << 23) | (immh << 19) | (immb << 16) - | (opcode << 11) | (1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── Helper: detect scalar d-register 3-operand NEON operations ────────────── -pub(crate) fn is_neon_scalar_d_reg_op(operands: &[Operand]) -> bool { - if operands.len() < 3 { return false; } - match &operands[0] { - Operand::Reg(r) => { - let r = r.to_lowercase(); - r.starts_with('d') && r[1..].parse::().is_ok() - } - _ => false, - } -} - -// ── NEON scalar three-same: ADD/SUB Dd, Dn, Dm ──────────────────────────── -/// Encode scalar NEON three-same: 01 U 11110 size 1 Rm opcode 1 Rn Rd -pub(crate) fn encode_neon_scalar_three_same(operands: &[Operand], u_bit: u32, opcode: u32, size: u32) -> Result { - if operands.len() < 3 { return Err("scalar three-same requires 3 operands".to_string()); } - let rd = match &operands[0] { Operand::Reg(r) => parse_reg_num(r).ok_or("invalid reg")?, _ => return Err("expected register".to_string()) }; - let rn = match &operands[1] { Operand::Reg(r) => parse_reg_num(r).ok_or("invalid reg")?, _ => return Err("expected register".to_string()) }; - let rm = match &operands[2] { Operand::Reg(r) => parse_reg_num(r).ok_or("invalid reg")?, 
_ => return Err("expected register".to_string()) }; - let word = (0b01 << 30) | (u_bit << 29) | (0b11110 << 24) | (size << 22) | (1 << 21) - | (rm << 16) | (opcode << 11) | (1 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON scalar ADDP: addp Dd, Vn.2d ────────────────────────────────────── -pub(crate) fn encode_neon_scalar_addp(operands: &[Operand]) -> Result { - if operands.len() < 2 { return Err("scalar addp requires 2 operands".to_string()); } - let rd = match &operands[0] { Operand::Reg(r) => parse_reg_num(r).ok_or("invalid reg")?, _ => return Err("expected d register".to_string()) }; - let rn = match &operands[1] { - Operand::RegArrangement { reg, arrangement } => { - if arrangement != "2d" { return Err(format!("scalar addp requires .2d source, got .{}", arrangement)); } - parse_reg_num(reg).ok_or("invalid reg")? - } - _ => return Err("scalar addp: expected Vn.2d source".to_string()), - }; - // Scalar ADDP: 01 0 11110 11 11000 11011 10 Rn Rd - let word = (0b01 << 30) | (0b011110 << 24) | (0b11 << 22) | (0b11000 << 17) - | (0b11011 << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON scalar two-reg misc: SQABS/SQNEG Hd,Hn / Sd,Sn / Dd,Dn ────────── -pub(crate) fn encode_neon_scalar_two_misc(operands: &[Operand], u_bit: u32, opcode: u32) -> Result { - if operands.len() < 2 { return Err("scalar two-misc requires 2 operands".to_string()); } - let (rd, rd_name) = match &operands[0] { Operand::Reg(r) => (parse_reg_num(r).ok_or("invalid reg")?, r.to_lowercase()), _ => return Err("expected register".to_string()) }; - let rn = match &operands[1] { Operand::Reg(r) => parse_reg_num(r).ok_or("invalid reg")?, _ => return Err("expected register".to_string()) }; - let size = if rd_name.starts_with('b') { 0b00u32 } - else if rd_name.starts_with('h') { 0b01 } - else if rd_name.starts_with('s') { 0b10 } - else if rd_name.starts_with('d') { 0b11 } - else { return Err(format!("scalar two-misc: unsupported register type: 
{}", rd_name)); }; - // 01 U 11110 size 10000 opcode 10 Rn Rd - let word = (0b01 << 30) | (u_bit << 29) | (0b11110 << 24) | (size << 22) - | (0b10000 << 17) | (opcode << 12) | (0b10 << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON scalar SQSHRN: sqshrn Hd,Sn,#shift / sqshrn Sd,Dn,#shift ──────── -pub(crate) fn encode_neon_scalar_qshrn(operands: &[Operand], u_bit: u32, is_rounding: bool) -> Result { - if operands.len() < 3 { return Err("scalar qshrn requires 3 operands".to_string()); } - let (rd, rd_name) = match &operands[0] { Operand::Reg(r) => (parse_reg_num(r).ok_or("invalid reg")?, r.to_lowercase()), _ => return Err("expected register".to_string()) }; - let rn = match &operands[1] { Operand::Reg(r) => parse_reg_num(r).ok_or("invalid reg")?, _ => return Err("expected register".to_string()) }; - let shift = get_imm(operands, 2)? as u32; - // Determine element bits from destination register type - let element_bits = if rd_name.starts_with('b') { 8u32 } // b <- h (narrow from 16-bit) - else if rd_name.starts_with('h') { 16 } // h <- s (narrow from 32-bit), immh base = 16 - else if rd_name.starts_with('s') { 32 } // s <- d (narrow from 64-bit), immh base = 32 - else { return Err(format!("scalar qshrn: unsupported dest: {}", rd_name)); }; - if shift == 0 || shift > element_bits { return Err(format!("scalar qshrn: shift {} out of range", shift)); } - let immhb = (element_bits * 2) - shift; // source element bits - shift - let opcode_bits: u32 = if is_rounding { 0b100111 } else { 0b100101 }; - // 01 U 11110 immh:immb opcode 1 Rn Rd - let word = (0b01 << 30) | (u_bit << 29) | (0b011110 << 23) | ((immhb >> 3) << 19) | ((immhb & 7) << 16) - | (opcode_bits << 10) | (rn << 5) | rd; - Ok(EncodeResult::Word(word)) -} - -// ── NEON addp (integer pairwise add) — already handled in three-same as addp ── diff --git a/src/backend/arm/assembler/encoder/system.rs b/src/backend/arm/assembler/encoder/system.rs deleted file mode 100644 index 
17e3a864f4..0000000000 --- a/src/backend/arm/assembler/encoder/system.rs +++ /dev/null @@ -1,613 +0,0 @@ -use super::*; -use crate::backend::arm::assembler::parser::Operand; - -// ── System instructions ────────────────────────────────────────────────── - -pub(crate) fn encode_dmb(operands: &[Operand]) -> Result { - let option = match operands.first() { - Some(Operand::Barrier(b)) | Some(Operand::Symbol(b)) => match b.to_lowercase().as_str() { - "sy" => 0b1111u32, - "st" => 0b1110, - "ld" => 0b1101, - "ish" => 0b1011, - "ishst" => 0b1010, - "ishld" => 0b1001, - "nsh" => 0b0111, - "nshst" => 0b0110, - "nshld" => 0b0101, - "osh" => 0b0011, - "oshst" => 0b0010, - "oshld" => 0b0001, - _ => return Err(format!("unknown dmb option: {}", b)), - }, - _ => 0b1111, - }; - // DMB: 0xD50330BF | (CRm << 8) - let word = 0xd50330bf | (option << 8); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_dsb(operands: &[Operand]) -> Result { - let option = match operands.first() { - Some(Operand::Barrier(b)) | Some(Operand::Symbol(b)) => match b.to_lowercase().as_str() { - "sy" => 0b1111u32, - "st" => 0b1110, - "ld" => 0b1101, - "ish" => 0b1011, - "ishst" => 0b1010, - "ishld" => 0b1001, - "nsh" => 0b0111, - "nshst" => 0b0110, - "nshld" => 0b0101, - "osh" => 0b0011, - "oshst" => 0b0010, - "oshld" => 0b0001, - _ => return Err(format!("unknown dsb option: {}", b)), - }, - _ => 0b1111, - }; - // DSB: 0xD503309F | (option << 8) - let word = 0xd503309f | (option << 8); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_mrs(operands: &[Operand]) -> Result { - // MRS Xt, system_reg - let (rt, _) = get_reg(operands, 0)?; - let sysreg = match operands.get(1) { - Some(Operand::Symbol(s)) => s.to_lowercase(), - _ => return Err("mrs needs system register name".to_string()), - }; - - let encoding = match sysreg.as_str() { - "sp_el0" => 0xc208u32, - "tpidr_el0" => 0xde82, - "tpidr_el1" => 0xc684, - "tpidr_el2" => 0xe682, - "tpidrro_el0" => 0xde83, - "tcr_el1" => 0xc102, - "ttbr0_el1" => 
0xc100, - "sctlr_el1" => 0xc080, - "mdscr_el1" => 0x8012, - "id_aa64mmfr0_el1" => 0xc038, - "id_aa64mmfr1_el1" => 0xc039, - "cpacr_el1" => 0xc082, - "par_el1" => 0xc3a0, - "osdlr_el1" => 0x809c, - "currentel" => 0xc212, - "elr_el1" => 0xc201, - "spsr_el1" => 0xc200, - "esr_el1" => 0xc290, - "far_el1" => 0xc300, - "vbar_el1" => 0xc600, - "mpidr_el1" => 0xc005, - "contextidr_el1" => 0xc681, - "mair_el1" => 0xc510, - "isr_el1" => 0xc608, - "oslsr_el1" => 0x808c, - "midr_el1" => 0xc000, - "revidr_el1" => 0xc006, - "id_aa64pfr0_el1" => 0xc020, - "id_aa64pfr1_el1" => 0xc021, - "id_aa64isar0_el1" => 0xc030, - "id_aa64isar1_el1" => 0xc031, - "id_aa64isar2_el1" => 0xc032, - "amair_el1" => 0xc518, - "hcr_el2" => 0xe088, - "cptr_el2" => 0xe08a, - "hstr_el2" => 0xe08b, - "hacr_el2" => 0xe08f, - "vpidr_el2" => 0xe000, - "vmpidr_el2" => 0xe005, - "actlr_el2" => 0xe081, - "elr_el2" => 0xe201, - "esr_el2" => 0xe290, - "afsr0_el2" => 0xe288, - "afsr1_el2" => 0xe289, - "far_el2" => 0xe300, - "hpfar_el2" => 0xe304, - "spsr_el2" => 0xe200, - "sctlr_el2" => 0xe080, - "mdcr_el2" => 0xe089, - "tcr_el2" => 0xe102, - "ttbr0_el2" => 0xe100, - "vttbr_el2" => 0xe108, - "vtcr_el2" => 0xe10a, - "vbar_el2" => 0xe600, - "mair_el2" => 0xe510, - "amair_el2" => 0xe518, - "sp_el1" => 0xe208, - "pmuserenr_el0" => 0xdcf0, - "cntfrq_el0" => 0xdf00, - "cntpct_el0" => 0xdf01, - "cntv_ctl_el0" => 0xdf19, - "cntp_ctl_el0" => 0xdf11, - "cntv_cval_el0" => 0xdf1c, - "cntp_cval_el0" => 0xdf12, - "ctr_el0" => 0xd801, - "ttbr1_el1" => 0xc101, - "cntkctl_el1" => 0xc708, - "id_aa64dfr0_el1" => 0xc028, - "oslar_el1" => 0x8084, - "cntvct_el0" => 0xdf02, - "clidr_el1" => 0xc801, - "ccsidr_el1" => 0xc800, - "csselr_el1" => 0xd000, - "id_aa64mmfr2_el1" => 0xc03a, - "id_aa64dfr1_el1" => 0xc029, - "actlr_el1" => 0xc081, - "afsr0_el1" => 0xc288, - "afsr1_el1" => 0xc289, - "id_pfr0_el1" => 0xc008, - "id_pfr1_el1" => 0xc009, - "cnthctl_el2" => 0xe708, - "cntvoff_el2" => 0xe703, - "sp_el2" => 0xf208, - "pmintenset_el1" => 
0xc4f1, - "pmintenclr_el1" => 0xc4f2, - "pmcr_el0" => 0xdce0, - "pmcntenset_el0" => 0xdce1, - "pmcntenclr_el0" => 0xdce2, - "pmovsclr_el0" => 0xdce3, - "pmselr_el0" => 0xdce5, - "pmceid0_el0" => 0xdce6, - "pmceid1_el0" => 0xdce7, - "pmccntr_el0" => 0xdce8, - "pmxevtyper_el0" => 0xdce9, - "pmxevcntr_el0" => 0xdcea, - "pmccfiltr_el0" => 0xdf7f, - "dczid_el0" => 0xd807, - "daif" => 0xda11, - "fpcr" => 0xda20, - "fpsr" => 0xda21, - "nzcv" => 0xda10, - "spsel" => 0xc210, - "mdccint_el1" => 0x8010, - "fpexc32_el2" => 0xe298, - "dbgauthstatus_el1" => 0x83f6, - "spsr_abt" => 0xe219, - "spsr_und" => 0xe21a, - "spsr_irq" => 0xe218, - "spsr_fiq" => 0xe21b, - "ifsr32_el2" => 0xe281, - "dacr32_el2" => 0xe180, - _ => parse_generic_sysreg(&sysreg)?, - }; - - // MRS encoding: 0xd520_0000 has L=1 (bit 21) for read. - // Bits [20:19] = op0, supplied entirely by the sysreg encoding field. - let word = 0xd5200000 | (encoding << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Compute sysreg encoding from (op0, op1, CRn, CRm, op2) fields. -pub(crate) fn sysreg_encoding(op0: u32, op1: u32, crn: u32, crm: u32, op2: u32) -> u32 { - ((op0 & 3) << 14) | ((op1 & 7) << 11) | ((crn & 0xF) << 7) | ((crm & 0xF) << 3) | (op2 & 7) -} - -/// Try to parse a numbered debug/performance register family name like -/// `dbgbcr15_el1` or `dbgwvr0_el1` into its encoding. Returns None if not matched. 
-pub(crate) fn parse_numbered_sysreg(name: &str) -> Option { - // Debug breakpoint/watchpoint registers: dbg{b,w}{c,v}r_el1 - // dbgbcr_el1: op0=2, op1=0, CRn=0, CRm=n, op2=5 - // dbgbvr_el1: op0=2, op1=0, CRn=0, CRm=n, op2=4 - // dbgwcr_el1: op0=2, op1=0, CRn=0, CRm=n, op2=7 - // dbgwvr_el1: op0=2, op1=0, CRn=0, CRm=n, op2=6 - let prefixes: &[(&str, &str, u32)] = &[ - ("dbgbcr", "_el1", 5), - ("dbgbvr", "_el1", 4), - ("dbgwcr", "_el1", 7), - ("dbgwvr", "_el1", 6), - ]; - for &(prefix, suffix, op2) in prefixes { - if let Some(rest) = name.strip_prefix(prefix) { - if let Some(num_str) = rest.strip_suffix(suffix) { - if let Ok(n) = num_str.parse::() { - if n <= 15 { - return Some(sysreg_encoding(2, 0, 0, n, op2)); - } - } - } - } - } - - // Performance monitor event count registers: pmevcntr_el0, pmevtyper_el0 - // pmevcntr_el0: op0=3, op1=3, CRn=14, CRm=8+n/8, op2=n%8 - // pmevtyper_el0: op0=3, op1=3, CRn=14, CRm=12+n/8, op2=n%8 - if let Some(rest) = name.strip_prefix("pmevcntr") { - if let Some(num_str) = rest.strip_suffix("_el0") { - if let Ok(n) = num_str.parse::() { - if n <= 30 { - return Some(sysreg_encoding(3, 3, 14, 8 + n / 8, n % 8)); - } - } - } - } - if let Some(rest) = name.strip_prefix("pmevtyper") { - if let Some(num_str) = rest.strip_suffix("_el0") { - if let Ok(n) = num_str.parse::() { - if n <= 30 { - return Some(sysreg_encoding(3, 3, 14, 12 + n / 8, n % 8)); - } - } - } - } - - None -} - -/// Parse generic system register name like `s3_0_c1_c0_1` into encoding bits. -/// Also handles numbered register families like `dbgbcr15_el1`. 
-pub(crate) fn parse_generic_sysreg(name: &str) -> Result { - // Try numbered register families first - if let Some(enc) = parse_numbered_sysreg(name) { - return Ok(enc); - } - - // Format: s__c_c_ - let parts: Vec<&str> = name.split('_').collect(); - if parts.len() == 5 && parts[0].starts_with('s') && parts[2].starts_with('c') && parts[3].starts_with('c') { - let op0: u32 = parts[0][1..].parse().map_err(|_| format!("unsupported system register: {}", name))?; - let op1: u32 = parts[1].parse().map_err(|_| format!("unsupported system register: {}", name))?; - let crn: u32 = parts[2][1..].parse().map_err(|_| format!("unsupported system register: {}", name))?; - let crm: u32 = parts[3][1..].parse().map_err(|_| format!("unsupported system register: {}", name))?; - let op2: u32 = parts[4].parse().map_err(|_| format!("unsupported system register: {}", name))?; - let enc = sysreg_encoding(op0, op1, crn, crm, op2); - Ok(enc) - } else { - Err(format!("unsupported system register: {}", name)) - } -} - -pub(crate) fn encode_msr(operands: &[Operand]) -> Result { - let sysreg = match operands.first() { - Some(Operand::Symbol(s)) => s.to_lowercase(), - _ => return Err("msr needs system register name".to_string()), - }; - - // MSR (immediate): msr , #imm - // Encoding: 1101_0101_0000_0 op1[18:16] 0100 CRm[11:8] op2[7:5] 11111[4:0] - // daifset: op1=3, op2=6; daifclr: op1=3, op2=7; spsel: op1=0, op2=5 - match sysreg.as_str() { - "daifset" => { - let imm = get_imm(operands, 1)? as u32 & 0xF; - let word = 0xd5034000 | (imm << 8) | (0b110 << 5) | 0x1F; - return Ok(EncodeResult::Word(word)); - } - "daifclr" => { - let imm = get_imm(operands, 1)? 
as u32 & 0xF; - let word = 0xd5034000 | (imm << 8) | (0b111 << 5) | 0x1F; - return Ok(EncodeResult::Word(word)); - } - "spsel" => { - // SPSel: op1=0, op2=5 (MSR immediate form) - // If the second operand is a register, fall through to MSR register form - if let Ok(imm) = get_imm(operands, 1) { - let imm = imm as u32 & 0xF; - let word = 0xd5004000 | (imm << 8) | (0b101 << 5) | 0x1F; - return Ok(EncodeResult::Word(word)); - } - } - _ => {} - } - - // MSR (register): msr sysreg, Xt - let (rt, _) = get_reg(operands, 1)?; - - let encoding = match sysreg.as_str() { - "sp_el0" => 0xc208u32, - "tpidr_el0" => 0xde82, - "tpidr_el1" => 0xc684, - "tpidr_el2" => 0xe682, - "tpidrro_el0" => 0xde83, - "tcr_el1" => 0xc102, - "ttbr0_el1" => 0xc100, - "sctlr_el1" => 0xc080, - "mdscr_el1" => 0x8012, - "cpacr_el1" => 0xc082, - "par_el1" => 0xc3a0, - "osdlr_el1" => 0x809c, - "oslar_el1" => 0x8084, - "oslsr_el1" => 0x808c, - "elr_el1" => 0xc201, - "spsr_el1" => 0xc200, - "esr_el1" => 0xc290, - "far_el1" => 0xc300, - "vbar_el1" => 0xc600, - "contextidr_el1" => 0xc681, - "mair_el1" => 0xc510, - "amair_el1" => 0xc518, - "hcr_el2" => 0xe088, - "cptr_el2" => 0xe08a, - "hstr_el2" => 0xe08b, - "elr_el2" => 0xe201, - "esr_el2" => 0xe290, - "far_el2" => 0xe300, - "spsr_el2" => 0xe200, - "sctlr_el2" => 0xe080, - "mdcr_el2" => 0xe089, - "tcr_el2" => 0xe102, - "ttbr0_el2" => 0xe100, - "vttbr_el2" => 0xe108, - "vtcr_el2" => 0xe10a, - "vbar_el2" => 0xe600, - "mair_el2" => 0xe510, - "sp_el1" => 0xe208, - "csselr_el1" => 0xd000, - "actlr_el1" => 0xc081, - "cnthctl_el2" => 0xe708, - "cntvoff_el2" => 0xe703, - "sp_el2" => 0xf208, - "vpidr_el2" => 0xe000, - "vmpidr_el2" => 0xe005, - "hacr_el2" => 0xe08f, - "actlr_el2" => 0xe081, - "afsr0_el2" => 0xe288, - "afsr1_el2" => 0xe289, - "amair_el2" => 0xe518, - "hpfar_el2" => 0xe304, - "pmintenset_el1" => 0xc4f1, - "pmintenclr_el1" => 0xc4f2, - "pmcr_el0" => 0xdce0, - "pmcntenset_el0" => 0xdce1, - "pmcntenclr_el0" => 0xdce2, - "pmovsclr_el0" => 0xdce3, - 
"pmselr_el0" => 0xdce5, - "pmccntr_el0" => 0xdce8, - "pmxevtyper_el0" => 0xdce9, - "pmxevcntr_el0" => 0xdcea, - "pmuserenr_el0" => 0xdcf0, - "pmccfiltr_el0" => 0xdf7f, - "cntv_ctl_el0" => 0xdf19, - "cntp_ctl_el0" => 0xdf11, - "cntp_cval_el0" => 0xdf12, - "cntv_cval_el0" => 0xdf1c, - "ttbr1_el1" => 0xc101, - "cntkctl_el1" => 0xc708, - "daif" => 0xda11, - "fpcr" => 0xda20, - "fpsr" => 0xda21, - "nzcv" => 0xda10, - "spsel" => 0xc210, - "mdccint_el1" => 0x8010, - "fpexc32_el2" => 0xe298, - "spsr_abt" => 0xe219, - "spsr_und" => 0xe21a, - "spsr_irq" => 0xe218, - "spsr_fiq" => 0xe21b, - "ifsr32_el2" => 0xe281, - "dacr32_el2" => 0xe180, - _ => parse_generic_sysreg(&sysreg)?, - }; - - // MSR encoding: 0xd500_0000 has L=0 (bit 21) for write. - // Bits [20:19] = op0, supplied entirely by the sysreg encoding field. - let word = 0xd5000000 | (encoding << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_svc(operands: &[Operand]) -> Result { - let imm = get_imm(operands, 0)?; - let word = 0xd4000001 | ((imm as u32 & 0xFFFF) << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_hvc(operands: &[Operand]) -> Result { - let imm = get_imm(operands, 0)?; - let word = 0xd4000002 | ((imm as u32 & 0xFFFF) << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_ic(raw_operands: &str) -> Result { - let parts: Vec<&str> = raw_operands.splitn(2, ',').collect(); - let op_name = parts[0].trim().to_lowercase(); - let rt = if parts.len() > 1 { - let reg_str = parts[1].trim(); - parse_reg_num(reg_str).ok_or_else(|| format!("ic: invalid register '{}'", reg_str))? 
- } else { - 31 // xzr - }; - let base = match op_name.as_str() { - "ialluis" => 0xd508711fu32, - "iallu" => 0xd508751f, - "ivau" => 0xd50b7520, - _ => return Err(format!("unsupported ic operation: {}", op_name)), - }; - let word = (base & !0x1F) | rt; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_smc(operands: &[Operand]) -> Result { - let imm = get_imm(operands, 0)?; - let word = 0xd4000003 | ((imm as u32 & 0xFFFF) << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_at(_operands: &[Operand], raw_operands: &str) -> Result { - let parts: Vec<&str> = raw_operands.splitn(2, ',').collect(); - let op_name = parts[0].trim().to_lowercase(); - let rt = if parts.len() > 1 { - let reg_str = parts[1].trim(); - parse_reg_num(reg_str).ok_or_else(|| format!("at: invalid register '{}'", reg_str))? - } else { - 31 - }; - // AT encoding: SYS instruction. Base words from GCC: - let base = match op_name.as_str() { - "s1e1r" => 0xd5087800u32, - "s1e1w" => 0xd5087820, - "s1e0r" => 0xd5087840, - "s1e0w" => 0xd5087860, - _ => return Err(format!("unsupported at operation: {}", op_name)), - }; - let word = (base & !0x1F) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode `sys #op1, Cn, Cm, #op2, Xt` instruction. 
-pub(crate) fn encode_sys(raw_operands: &str) -> Result { - let parts: Vec<&str> = raw_operands.split(',').map(|s| s.trim()).collect(); - if parts.len() < 4 { - return Err(format!("sys needs at least 4 operands, got: {}", raw_operands)); - } - let op1: u32 = parts[0].trim_start_matches('#').trim().parse() - .map_err(|_| format!("sys: invalid op1: {}", parts[0]))?; - let crn: u32 = parts[1].trim().to_lowercase().trim_start_matches('c').parse() - .map_err(|_| format!("sys: invalid CRn: {}", parts[1]))?; - let crm: u32 = parts[2].trim().to_lowercase().trim_start_matches('c').parse() - .map_err(|_| format!("sys: invalid CRm: {}", parts[2]))?; - let op2: u32 = parts[3].trim_start_matches('#').trim().parse() - .map_err(|_| format!("sys: invalid op2: {}", parts[3]))?; - let rt = if parts.len() >= 5 { - let reg = parts[4].trim().to_lowercase(); - parse_reg_num(®).ok_or_else(|| format!("sys: invalid register: {}", parts[4]))? - } else { - 31 // xzr if no register specified - }; - let word = 0xd5080000 | ((op1 & 7) << 16) | ((crn & 0xF) << 12) | ((crm & 0xF) << 8) | ((op2 & 7) << 5) | rt; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_brk(operands: &[Operand]) -> Result { - let imm = get_imm(operands, 0)?; - let word = 0xd4200000 | ((imm as u32 & 0xFFFF) << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_tlbi(_operands: &[Operand], raw_operands: &str) -> Result { - let parts: Vec<&str> = raw_operands.splitn(2, ',').collect(); - let op_name = parts[0].trim().to_lowercase(); - let rt = if parts.len() > 1 { - let reg_str = parts[1].trim(); - parse_reg_num(reg_str).ok_or_else(|| format!("tlbi: invalid register '{}'", reg_str))? 
- } else { - 31 // xzr - }; - // TLBI encoding: SYS instruction with fixed fields - // Full word from GCC objdump for known ops (with Rt=x0): - let base = match op_name.as_str() { - // Standard ARMv8.0 TLBI operations - "vmalle1is" => 0xd508831fu32, - "vmalle1" => 0xd508871f, - "alle1is" => 0xd50c839f, - "alle1" => 0xd50c879f, - "alle2is" => 0xd50c831f, - "vale1is" => 0xd50883a0, - "vale1" => 0xd50887a0, - "vale2is" => 0xd50c83a0, - "vale2" => 0xd50c87a0, - "vaae1is" => 0xd5088360, - "vaae1" => 0xd5088760, - "vaale1is" => 0xd50883e0, - "vaale1" => 0xd50887e0, - "vae1is" => 0xd5088320, - "vae1" => 0xd5088720, - "vae2is" => 0xd50c8320, - "vae2" => 0xd50c8720, - "aside1is" => 0xd5088340, - "aside1" => 0xd5088740, - "vmalls12e1is" => 0xd50c83df, - "vmalls12e1" => 0xd50c87df, - "ipas2e1is" => 0xd50c8020, - "ipas2e1" => 0xd50c8420, - "ipas2le1is" => 0xd50c80a0, - "ipas2le1" => 0xd50c84a0, - // FEAT_TLBIRANGE: range TLBI operations (ARMv8.4-A) - "rvae1is" => 0xd5088220, - "rvale1is" => 0xd50882a0, - "rvaae1is" => 0xd5088260, - "rvaale1is" => 0xd50882e0, - "rvae1" => 0xd5088620, - "rvale1" => 0xd50886a0, - "rvaae1" => 0xd5088660, - "rvaale1" => 0xd50886e0, - "rvae1os" => 0xd5088520, - "rvale1os" => 0xd50885a0, - "rvaae1os" => 0xd5088560, - "rvaale1os" => 0xd50885e0, - "ripas2e1is" => 0xd50c8040, - "ripas2e1" => 0xd50c8440, - "ripas2e1os" => 0xd50c8460, - "ripas2le1is" => 0xd50c80c0, - "ripas2le1" => 0xd50c84c0, - "ripas2le1os" => 0xd50c84e0, - _ => return Err(format!("unsupported tlbi operation: {}", op_name)), - }; - // Replace Rt field (bits 4:0) - let word = (base & !0x1F) | rt; - Ok(EncodeResult::Word(word)) -} - -/// Encode HINT #imm (system hint instruction) -pub(crate) fn encode_bti(raw_operands: &str) -> Result { - let target = raw_operands.trim().to_lowercase(); - let word = match target.as_str() { - "" => 0xd503241f, // bti (no target) - "c" => 0xd503245f, // bti c - "j" => 0xd503249f, // bti j - "jc" => 0xd50324df, // bti jc - _ => return 
Err(format!("unsupported bti target: {}", target)), - }; - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_hint(operands: &[Operand]) -> Result { - let imm = get_imm(operands, 0)?; - // HINT: 11010101 00000011 0010 CRm op2 11111 - // CRm = imm >> 3, op2 = imm & 7 - let crm = ((imm as u32) >> 3) & 0xF; - let op2 = (imm as u32) & 0x7; - let word = 0xd503201f | (crm << 8) | (op2 << 5); - Ok(EncodeResult::Word(word)) -} - -pub(crate) fn encode_dc(operands: &[Operand], raw_operands: &str) -> Result { - // Check for the operation type in the operands or raw string - let op = match operands.first() { - Some(Operand::Symbol(s)) => s.to_lowercase(), - _ => raw_operands.to_lowercase(), - }; - - // Find the register operand (second operand or last operand) - let rt = match operands.get(1) { - Some(Operand::Reg(name)) => parse_reg_num(name).ok_or("invalid register for dc")?, - _ => { - if let Some(Operand::Reg(name)) = operands.last() { - parse_reg_num(name).ok_or("invalid register for dc")? 
- } else { - 0 - } - } - }; - - if op.contains("civac") { - // DC CIVAC: sys #3, c7, c14, #1, Xt - let word = 0xd50b7e20 | rt; - return Ok(EncodeResult::Word(word)); - } - if op.contains("cvac") { - // DC CVAC: sys #3, c7, c10, #1, Xt - let word = 0xd50b7a20 | rt; - return Ok(EncodeResult::Word(word)); - } - if op.contains("cvap") { - // DC CVAP: sys #3, c7, c12, #1, Xt - let word = 0xd50b7c20 | rt; - return Ok(EncodeResult::Word(word)); - } - if op.contains("cvau") { - let word = 0xd50b7b20 | rt; - return Ok(EncodeResult::Word(word)); - } - if op.contains("ivac") { - let word = 0xd5087620 | rt; - return Ok(EncodeResult::Word(word)); - } - if op.contains("zva") { - // DC ZVA: sys #3, c7, c4, #1, Xt - let word = 0xd50b7420 | rt; - return Ok(EncodeResult::Word(word)); - } - - Err(format!("unsupported dc variant: {}", raw_operands)) -} diff --git a/src/backend/arm/assembler/mod.rs b/src/backend/arm/assembler/mod.rs deleted file mode 100644 index 26fcb9e1b3..0000000000 --- a/src/backend/arm/assembler/mod.rs +++ /dev/null @@ -1,381 +0,0 @@ -//! Native AArch64 assembler. -//! -//! Parses `.s` assembly text (as emitted by the AArch64 codegen) and produces -//! ELF `.o` object files, removing the dependency on `aarch64-linux-gnu-gcc` -//! for assembly. -//! -//! Architecture: -//! - `parser.rs` – Tokenize + parse assembly text into `AsmStatement` items -//! - `encoder.rs` – Encode AArch64 instructions into 32-bit machine words -//! - `elf_writer.rs` – Write ELF object files with sections, symbols, and relocations - -pub mod parser; -pub mod encoder; -pub mod elf_writer; - -use std::collections::HashMap; -use parser::{parse_asm, AsmStatement, Operand, AsmDirective, DataValue}; -use elf_writer::ElfWriter; - -/// Assemble AArch64 assembly text into an ELF object file. -/// -/// This is the default assembler (used when the `gcc_assembler` feature is disabled). 
-pub fn assemble(asm_text: &str, output_path: &str) -> Result<(), String> { - let statements = parse_asm(asm_text)?; - let statements = expand_literal_pools(&statements); - let statements = resolve_numeric_labels(&statements); - let mut writer = ElfWriter::new(); - writer.process_statements(&statements)?; - writer.write_elf(output_path)?; - Ok(()) -} - -/// Expand `LdrLiteralPool` pseudo-instructions into literal pool loads. -/// -/// `ldr Xn, =symbol` must load the exact linker-resolved address via a literal -/// pool, not via `adrp+add` (which gives a PC-relative address). This matters -/// for code that runs at a different address than its linked address (e.g., -/// early boot code running at physical addresses before MMU setup). -/// -/// This pass replaces each `LdrLiteralPool` with a `ldr Xn, .Llpool_N` -/// instruction and accumulates pool entries. Pool entries are flushed (emitted -/// as `.quad` data with labels) when: -/// - A `.ltorg` / `.pool` directive is encountered -/// - A section-changing directive is encountered (`.section`, `.pushsection`, `.text`, `.data`) -/// - The end of the statement list is reached -/// -/// Each pool entry is 8-byte aligned and consists of: -/// .Llpool_N: .quad symbol[+addend] -fn expand_literal_pools(statements: &[AsmStatement]) -> Vec { - struct PoolEntry { - label: String, - symbol: String, - addend: i64, - } - - let mut result = Vec::with_capacity(statements.len()); - let mut pending_pool: Vec = Vec::new(); - let mut pool_counter: usize = 0; - - // TODO: Track W vs X register to emit .long (4-byte) entries for W registers - // instead of always emitting .quad (8-byte). Currently safe on little-endian - // (loads low 32 bits) but wrong on big-endian and wastes 4 bytes per W entry. - let flush_pool = |pool: &mut Vec, out: &mut Vec| { - if pool.is_empty() { - return; - } - // Align pool to 8 bytes - out.push(AsmStatement::Directive(AsmDirective::Balign(8))); - for entry in pool.drain(..) 
{ - out.push(AsmStatement::Label(entry.label)); - if entry.addend != 0 { - out.push(AsmStatement::Directive(AsmDirective::Quad(vec![ - DataValue::SymbolOffset(entry.symbol, entry.addend), - ]))); - } else { - out.push(AsmStatement::Directive(AsmDirective::Quad(vec![ - DataValue::Symbol(entry.symbol), - ]))); - } - } - }; - - for stmt in statements { - match stmt { - AsmStatement::LdrLiteralPool { reg, symbol, addend } => { - let pool_label = format!(".Llpool_{}", pool_counter); - pool_counter += 1; - - // Emit: ldr reg, .Llpool_N (PC-relative literal load) - result.push(AsmStatement::Instruction { - mnemonic: "ldr".to_string(), - operands: vec![ - Operand::Reg(reg.clone()), - Operand::Symbol(pool_label.clone()), - ], - raw_operands: format!("{}, {}", reg, pool_label), - }); - - pending_pool.push(PoolEntry { - label: pool_label, - symbol: symbol.clone(), - addend: *addend, - }); - } - AsmStatement::Directive(AsmDirective::Ltorg) => { - flush_pool(&mut pending_pool, &mut result); - } - AsmStatement::Directive(AsmDirective::Section(_)) - | AsmStatement::Directive(AsmDirective::PushSection(_)) - | AsmStatement::Directive(AsmDirective::PopSection) - | AsmStatement::Directive(AsmDirective::Previous) => { - // Flush pool before section change (pool must be in the same section) - flush_pool(&mut pending_pool, &mut result); - result.push(stmt.clone()); - } - _ => { - result.push(stmt.clone()); - } - } - } - - // Flush any remaining pool entries at the end - flush_pool(&mut pending_pool, &mut result); - - result -} - -/// Check if a label name is a GNU assembler numeric label (e.g., "1", "42"). -fn is_numeric_label(name: &str) -> bool { - !name.is_empty() && name.bytes().all(|b| b.is_ascii_digit()) -} - -/// Check if a string is a numeric forward/backward reference like "1f" or "2b". -/// Returns Some((number_str, is_forward)) if it is, None otherwise. 
-fn parse_numeric_ref(name: &str) -> Option<(&str, bool)> { - if name.len() < 2 { - return None; - } - let last = name.as_bytes()[name.len() - 1]; - let num_part = &name[..name.len() - 1]; - if !num_part.bytes().all(|b| b.is_ascii_digit()) { - return None; - } - match last { - b'f' | b'F' => Some((num_part, true)), - b'b' | b'B' => Some((num_part, false)), - _ => None, - } -} - -/// Resolve a numeric label reference to its unique name. -fn resolve_numeric_name( - name: &str, - current_idx: usize, - defs: &HashMap>, -) -> Option { - let (num, is_forward) = parse_numeric_ref(name)?; - let def_list = defs.get(num)?; - - if is_forward { - def_list.iter() - .find(|(idx, _)| *idx > current_idx) - .map(|(_, name)| name.clone()) - } else { - def_list.iter() - .rev() - .find(|(idx, _)| *idx < current_idx) - .map(|(_, name)| name.clone()) - } -} - -/// Resolve numeric local labels (1:, 2:, etc.) and their references (1f, 1b) -/// into unique internal label names. -/// -/// GNU assembler numeric labels can be defined multiple times. Each forward -/// reference `Nf` refers to the next definition of `N`, and each backward -/// reference `Nb` refers to the most recent definition of `N`. -fn resolve_numeric_labels(statements: &[AsmStatement]) -> Vec { - // First pass: find all numeric label definitions and assign unique names. 
- let mut defs: HashMap> = HashMap::new(); - let mut unique_counter: HashMap = HashMap::new(); - - for (i, stmt) in statements.iter().enumerate() { - if let AsmStatement::Label(name) = stmt { - if is_numeric_label(name) { - let count = unique_counter.entry(name.clone()).or_insert(0); - let unique_name = format!(".Lnum_{}_{}", name, *count); - *count += 1; - defs.entry(name.clone()).or_default().push((i, unique_name)); - } - } - } - - // If no numeric labels found, return original - if defs.is_empty() { - return statements.to_vec(); - } - - // Second pass: resolve all references - let mut result = Vec::with_capacity(statements.len()); - for (i, stmt) in statements.iter().enumerate() { - match stmt { - AsmStatement::Label(name) if is_numeric_label(name) => { - if let Some(def_list) = defs.get(name) { - if let Some((_, unique_name)) = def_list.iter().find(|(idx, _)| *idx == i) { - result.push(AsmStatement::Label(unique_name.clone())); - continue; - } - } - result.push(stmt.clone()); - } - AsmStatement::Instruction { mnemonic, operands, raw_operands } => { - let new_ops: Vec = operands.iter().map(|op| { - resolve_numeric_operand(op, i, &defs) - }).collect(); - result.push(AsmStatement::Instruction { - mnemonic: mnemonic.clone(), - operands: new_ops, - raw_operands: raw_operands.clone(), - }); - } - AsmStatement::Directive(dir) => { - result.push(AsmStatement::Directive(resolve_numeric_directive(dir, i, &defs))); - } - _ => result.push(stmt.clone()), - } - } - - result -} - -/// Resolve numeric label references in a single operand. 
-fn resolve_numeric_operand( - op: &Operand, - current_idx: usize, - defs: &HashMap>, -) -> Operand { - match op { - Operand::Symbol(name) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - Operand::Symbol(resolved) - } else { - op.clone() - } - } - Operand::Label(name) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - Operand::Label(resolved) - } else { - op.clone() - } - } - Operand::SymbolOffset(name, off) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - Operand::SymbolOffset(resolved, *off) - } else { - op.clone() - } - } - Operand::MemExpr { base, expr, writeback } => { - let resolved_expr = resolve_numeric_refs_in_expr(expr, current_idx, defs); - Operand::MemExpr { base: base.clone(), expr: resolved_expr, writeback: *writeback } - } - Operand::Expr(expr) => { - let resolved_expr = resolve_numeric_refs_in_expr(expr, current_idx, defs); - Operand::Expr(resolved_expr) - } - _ => op.clone(), - } -} - -/// Resolve numeric label references (e.g., `1b`, `2f`) within an expression string. -/// Replaces each occurrence with the resolved unique label name (e.g., `.Lnum_1_3`). -fn resolve_numeric_refs_in_expr( - expr: &str, - current_idx: usize, - defs: &HashMap>, -) -> String { - let bytes = expr.as_bytes(); - let mut result = String::with_capacity(expr.len()); - let mut i = 0; - while i < bytes.len() { - if bytes[i].is_ascii_digit() { - let start = i; - // Skip hex literals (0x..., 0X...) and binary literals (0b..., 0B...) 
- // to avoid misinterpreting hex digits as label refs (e.g., 0x1b) - if bytes[i] == b'0' && i + 1 < bytes.len() - && (bytes[i + 1] == b'x' || bytes[i + 1] == b'X' - || bytes[i + 1] == b'b' || bytes[i + 1] == b'B') - { - // Consume the entire hex/binary literal - i += 2; // skip 0x or 0b - while i < bytes.len() && bytes[i].is_ascii_alphanumeric() { - i += 1; - } - result.push_str(&expr[start..i]); - continue; - } - while i < bytes.len() && bytes[i].is_ascii_digit() { - i += 1; - } - // Check if followed by 'b' or 'f' (not part of a longer identifier) - if i < bytes.len() - && (bytes[i] == b'b' || bytes[i] == b'f' || bytes[i] == b'B' || bytes[i] == b'F') - && (i + 1 >= bytes.len() || !bytes[i + 1].is_ascii_alphanumeric()) - { - let label_ref = &expr[start..=i]; - i += 1; - if let Some(resolved) = resolve_numeric_name(label_ref, current_idx, defs) { - result.push_str(&resolved); - } else { - result.push_str(label_ref); - } - } else { - // Regular number - result.push_str(&expr[start..i]); - } - } else { - result.push(bytes[i] as char); - i += 1; - } - } - result -} - -/// Resolve numeric label references in data directives. -fn resolve_numeric_directive( - dir: &AsmDirective, - current_idx: usize, - defs: &HashMap>, -) -> AsmDirective { - match dir { - AsmDirective::Byte(vals) => { - AsmDirective::Byte(resolve_numeric_data_values(vals, current_idx, defs)) - } - AsmDirective::Long(vals) => { - AsmDirective::Long(resolve_numeric_data_values(vals, current_idx, defs)) - } - AsmDirective::Quad(vals) => { - AsmDirective::Quad(resolve_numeric_data_values(vals, current_idx, defs)) - } - _ => dir.clone(), - } -} - -/// Resolve numeric label references in data values. 
-fn resolve_numeric_data_values( - vals: &[DataValue], - current_idx: usize, - defs: &HashMap>, -) -> Vec { - vals.iter().map(|v| { - match v { - DataValue::Symbol(name) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - DataValue::Symbol(resolved) - } else { - v.clone() - } - } - DataValue::SymbolOffset(name, off) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - DataValue::SymbolOffset(resolved, *off) - } else { - v.clone() - } - } - DataValue::SymbolDiff(a, b) => { - let new_a = resolve_numeric_name(a, current_idx, defs).unwrap_or_else(|| a.clone()); - let new_b = resolve_numeric_name(b, current_idx, defs).unwrap_or_else(|| b.clone()); - DataValue::SymbolDiff(new_a, new_b) - } - DataValue::SymbolDiffAddend(a, b, add) => { - let new_a = resolve_numeric_name(a, current_idx, defs).unwrap_or_else(|| a.clone()); - let new_b = resolve_numeric_name(b, current_idx, defs).unwrap_or_else(|| b.clone()); - DataValue::SymbolDiffAddend(new_a, new_b, *add) - } - _ => v.clone(), - } - }).collect() -} diff --git a/src/backend/arm/assembler/parser.rs b/src/backend/arm/assembler/parser.rs deleted file mode 100644 index 521c05af32..0000000000 --- a/src/backend/arm/assembler/parser.rs +++ /dev/null @@ -1,2644 +0,0 @@ -//! AArch64 assembly parser. -//! -//! Parses the textual assembly format emitted by our AArch64 codegen into -//! structured `AsmStatement` values. The parser handles: -//! - Labels (global and local) -//! - Directives (.section, .globl, .type, .align, .byte, .long, .xword, etc.) -//! - AArch64 instructions (mov, add, sub, ldr, str, bl, ret, etc.) -//! - CFI directives (passed through as-is for DWARF unwind info) - -// Some parser helper functions and enum variants are defined for completeness -// and used only by the encoder or ELF writer, not the parser entry point itself. 
-#![allow(dead_code)] - -use crate::backend::asm_expr; -use crate::backend::asm_preprocess; -use crate::backend::elf; - -/// A parsed assembly operand. -#[derive(Debug, Clone)] -pub enum Operand { - /// Register: x0-x30, w0-w30, sp, xzr, wzr, d0-d31, s0-s31, q0-q31, v0-v31 - Reg(String), - /// Immediate value: #42, #-1, #0x1000 - Imm(i64), - /// Symbol reference: function name, label, etc. - Symbol(String), - /// Symbol with addend: symbol+offset or symbol-offset - SymbolOffset(String, i64), - /// Memory operand: [base] or [base, #offset] - Mem { base: String, offset: i64 }, - /// Memory operand with symbolic offset expression: [base, #(sym_expr)] or [base, #(sym_expr)]! - /// Used when the offset is a label/symbol expression that can't be resolved at parse time. - MemExpr { base: String, expr: String, writeback: bool }, - /// Memory operand with pre-index writeback: [base, #offset]! - MemPreIndex { base: String, offset: i64 }, - /// Memory operand with post-index writeback: [base], #offset - MemPostIndex { base: String, offset: i64 }, - /// Memory operand with register offset: [base, Xm] - MemRegOffset { base: String, index: String, extend: Option, shift: Option }, - /// :lo12:symbol or :got_lo12:symbol modifier - Modifier { kind: String, symbol: String }, - /// :lo12:symbol+offset - ModifierOffset { kind: String, symbol: String, offset: i64 }, - /// Shift: lsl #N, lsr #N, asr #N - Shift { kind: String, amount: u32 }, - /// Extend: sxtw, uxtw, sxtx, etc. with optional shift amount - Extend { kind: String, amount: u32 }, - /// Condition code for csel etc.: eq, ne, lt, gt, ... - Cond(String), - /// Barrier option for dmb/dsb: ish, ishld, ishst, sy, etc. - Barrier(String), - /// Label reference for branches - Label(String), - /// Raw expression (for things we can't fully parse yet) - Expr(String), - /// NEON register with arrangement specifier: v0.8b, v0.16b, v0.4s, etc. 
- RegArrangement { reg: String, arrangement: String }, - /// NEON register with lane index: v0.d[1], v0.b[0], v0.s[2], etc. - RegLane { reg: String, elem_size: String, index: u32 }, - /// NEON register list: {v0.16b}, {v0.16b, v1.16b}, etc. - RegList(Vec), - /// NEON register list with element index: {v0.s, v1.s}[0], {v0.d, v1.d}[1], etc. - RegListIndexed { regs: Vec, index: u32 }, -} - -/// Section directive with optional flags and type. -#[derive(Debug, Clone)] -pub struct SectionDirective { - pub name: String, - pub flags: Option, - pub section_type: Option, -} - -/// Symbol kind from `.type` directive. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum SymbolKind { - Function, - Object, - TlsObject, - NoType, -} - -/// Size expression: either a constant or `.-name` (current position minus symbol). -#[derive(Debug, Clone)] -pub enum SizeExpr { - Constant(u64), - CurrentMinusSymbol(String), -} - -/// A data value that can be a constant, a symbol, or a symbol expression. -#[derive(Debug, Clone)] -pub enum DataValue { - Integer(i64), - Symbol(String), - /// symbol + offset (e.g., `.quad func+128`) - SymbolOffset(String, i64), - /// symbol - symbol (e.g., `.long .LBB3 - .Ljt_0`) - SymbolDiff(String, String), - /// symbol - symbol + addend - SymbolDiffAddend(String, String, i64), - /// Raw expression string for deferred evaluation (e.g., `(sym_a - sym_b) >> 5`) - Expr(String), -} - -/// A typed assembly directive, fully parsed at parse time. 
-#[derive(Debug, Clone)] -pub enum AsmDirective { - /// Switch to a named section: `.section .text,"ax",@progbits` - Section(SectionDirective), - /// Global symbol: `.globl name` - Global(String), - /// Weak symbol: `.weak name` - Weak(String), - /// Hidden visibility: `.hidden name` - Hidden(String), - /// Protected visibility: `.protected name` - Protected(String), - /// Internal visibility: `.internal name` - Internal(String), - /// Symbol type: `.type name, %function` - SymbolType(String, SymbolKind), - /// Symbol size: `.size name, expr` - Size(String, SizeExpr), - /// Alignment: `.align N` or `.p2align N` (stored as byte count, already converted from 2^N) - Align(u64), - /// Byte-alignment: `.balign N` (stored as byte count directly) - Balign(u64), - /// Emit bytes: `.byte val, val, ...` (can be symbol differences for size computations) - Byte(Vec), - /// Emit 16-bit values: `.short val, ...` - Short(Vec), - /// Emit 32-bit values: `.long val, ...` (can be symbol references) - Long(Vec), - /// Emit 64-bit values: `.quad val, ...` (can be symbol references) - Quad(Vec), - /// Emit zero bytes: `.zero N[, fill]` - Zero(usize, u8), - /// NUL-terminated string: `.asciz "str"` - Asciz(Vec), - /// String without NUL: `.ascii "str"` - Ascii(Vec), - /// Common symbol: `.comm name, size, align` - Comm(String, u64, u64), - /// Local symbol: `.local name` - Local(String), - /// Symbol alias: `.set name, value` - Set(String, String), - /// Push current section and switch to a new one: `.pushsection name,"flags",@type` - PushSection(SectionDirective), - /// Pop section stack and restore previous section: `.popsection` - PopSection, - /// `.previous` — swap current and previous sections - Previous, - /// `.subsection N` — switch to numbered subsection within the current section - Subsection(u64), - /// CFI directive (ignored for code generation) - Cfi, - /// `.incbin "file"[, skip[, count]]` — include binary file contents - Incbin { path: String, skip: u64, count: Option }, 
- /// Raw bytes emitted from .float, .double, etc. - RawBytes(Vec), - /// Literal pool dump: `.ltorg` or `.pool` - Ltorg, - /// Other ignored directives (.file, .loc, .ident, etc.) - Ignored, -} - -/// A parsed assembly statement. -#[derive(Debug, Clone)] -pub enum AsmStatement { - /// A label definition: "name:" - Label(String), - /// A typed directive, fully parsed - Directive(AsmDirective), - /// An AArch64 instruction with mnemonic and operands - Instruction { - mnemonic: String, - operands: Vec, - /// The raw text of the operand string (for fallback encoding) - raw_operands: String, - }, - /// An empty line or comment - Empty, - /// Literal pool load pseudo-instruction: `ldr Rd, =symbol[+offset]` - /// Will be expanded into `ldr Rd, .Llpool_N` + pool entries by a later pass. - LdrLiteralPool { - reg: String, - symbol: String, - addend: i64, - }, -} - -// C-style /* ... */ comment stripping is handled by asm_preprocess::strip_c_comments -// (shared, string-aware version that correctly handles `/*` inside quoted strings). - -/// Parse assembly text into a list of statements. -/// Expand .rept/.endr, .irp/.endr, and .irpc/.endr blocks by repeating contained lines. -// TODO: extract expand_rept_blocks to shared module (duplicated in ARM, RISC-V, x86 parsers) -fn is_rept_start(trimmed: &str) -> bool { - trimmed.starts_with(".rept ") || trimmed.starts_with(".rept\t") - || trimmed.starts_with(".rep ") || trimmed.starts_with(".rep\t") -} - -fn is_irp_start(trimmed: &str) -> bool { - trimmed.starts_with(".irp ") || trimmed.starts_with(".irp\t") -} - -fn is_irpc_start(trimmed: &str) -> bool { - trimmed.starts_with(".irpc ") || trimmed.starts_with(".irpc\t") -} - -fn is_block_start(trimmed: &str) -> bool { - is_rept_start(trimmed) || is_irp_start(trimmed) || is_irpc_start(trimmed) -} - -/// Collect the body lines of a .rept/.irp block, handling nesting. -/// Returns the body lines and advances i past the closing .endr. 
-fn collect_block_body<'a>(lines: &[&'a str], i: &mut usize) -> Result, String> { - let mut depth = 1; - let mut body = Vec::new(); - *i += 1; - while *i < lines.len() { - let inner = strip_comment(lines[*i]).trim().to_string(); - if is_block_start(&inner) { - depth += 1; - } else if inner == ".endr" { - depth -= 1; - if depth == 0 { - break; - } - } - body.push(lines[*i]); - *i += 1; - } - if depth != 0 { - return Err(".rept/.irp/.irpc without matching .endr".to_string()); - } - Ok(body) -} - -/// Estimate the byte size of a single assembly line for label position tracking. -/// Used to resolve backward label references in .rept count expressions. -/// AArch64 instructions are always 4 bytes; directives have known sizes. -fn estimate_line_bytes(trimmed: &str) -> u64 { - if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with('#') { - return 0; - } - // Label definitions don't add bytes - if trimmed.ends_with(':') && !trimmed.contains(' ') { - return 0; - } - // Strip leading labels like "661:" from lines like "661: bl foo" - let content = if let Some(pos) = trimmed.find(':') { - let before = &trimmed[..pos]; - if before.chars().all(|c| c.is_alphanumeric() || c == '_' || c == '.') { - trimmed[pos + 1..].trim() - } else { - trimmed - } - } else { - trimmed - }; - if content.is_empty() { - return 0; - } - // Directives - if content.starts_with('.') { - let lower = content.to_lowercase(); - if lower.starts_with(".byte ") { return 1; } - if lower.starts_with(".hword ") || lower.starts_with(".short ") || lower.starts_with(".2byte ") { return 2; } - if lower.starts_with(".word ") || lower.starts_with(".long ") || lower.starts_with(".4byte ") || lower.starts_with(".inst ") { return 4; } - if lower.starts_with(".quad ") || lower.starts_with(".xword ") || lower.starts_with(".8byte ") { return 8; } - // .zero N, .space N, .skip N - if lower.starts_with(".zero ") || lower.starts_with(".space ") || lower.starts_with(".skip ") { - let arg = 
content.split_whitespace().nth(1).unwrap_or("0"); - if let Ok(n) = arg.trim_end_matches(',').parse::() { return n; } - } - // .ascii "str" / .asciz "str" — approximate - if lower.starts_with(".ascii") || lower.starts_with(".asciz") || lower.starts_with(".string") { - // Don't try to be exact; these are rarely used in .rept context - return 0; - } - // Other directives (.align, .section, .globl, .type, etc.) — 0 bytes of code - return 0; - } - // Everything else is an instruction = 4 bytes for AArch64 - 4 -} - -/// Resolve backward numeric label references (like 662b, 661b) in a .rept count expression. -/// Substitutes each backward reference with its byte position, then evaluates the expression. -fn resolve_rept_label_expr( - count_str: &str, - label_positions: &std::collections::HashMap>, -) -> Result { - // First try direct evaluation (handles simple integer expressions) - if let Ok(val) = asm_expr::parse_integer_expr(count_str) { - return Ok(val); - } - - // Check if expression contains backward label references (e.g., 662b) - let mut resolved = count_str.to_string(); - let mut found_label_ref = false; - - // Find and replace backward label references: digits followed by 'b' or 'B' - // We need to scan for patterns like "662b" that are numeric label backward refs - loop { - let mut replaced = false; - // Use a simple scan to find numeric label backward references - let bytes = resolved.as_bytes(); - let len = bytes.len(); - let mut i = 0; - while i < len { - // Look for a digit sequence followed by 'b' or 'B' - if bytes[i].is_ascii_digit() { - let start = i; - while i < len && bytes[i].is_ascii_digit() { - i += 1; - } - if i < len && (bytes[i] == b'b' || bytes[i] == b'B') { - // Check that the next char (if any) is not alphanumeric - // (to avoid matching things like "0b1010" binary literals) - let after_ok = i + 1 >= len || !bytes[i + 1].is_ascii_alphanumeric(); - // Also ensure it's not a binary literal starting with 0b - let is_binary = start + 1 == i && 
bytes[start] == b'0'; - if after_ok && !is_binary { - let label_num = &resolved[start..i]; - let ref_end = i + 1; - // Look up the most recent definition of this label - if let Some(positions) = label_positions.get(label_num) { - if let Some(&pos) = positions.last() { - let before = &resolved[..start]; - let after = &resolved[ref_end..]; - resolved = format!("{}{}{}", before, pos, after); - found_label_ref = true; - replaced = true; - break; // restart scan since string changed - } - } - } - } - } - i += 1; - } - if !replaced { - break; - } - } - - if found_label_ref { - asm_expr::parse_integer_expr(&resolved) - } else { - Err(format!("cannot evaluate .rept count: {}", count_str)) - } -} - -fn expand_rept_blocks(lines: &[&str]) -> Result, String> { - let mut result = Vec::new(); - let mut i = 0; - // Track numeric label positions (byte offsets) for resolving backward refs - let mut label_positions: std::collections::HashMap> = std::collections::HashMap::new(); - let mut current_byte_pos: u64 = 0; - while i < lines.len() { - let trimmed = strip_comment(lines[i]).trim().to_string(); - if is_rept_start(&trimmed) { - let prefix_len = if trimmed.starts_with(".rept") { 5 } else { 4 }; - let count_str = trimmed[prefix_len..].trim(); - let count_val = resolve_rept_label_expr(count_str, &label_positions) - .unwrap_or(0); - // Treat negative counts as 0 (matches GNU as behavior) - let count = if count_val < 0 { 0usize } else { count_val as usize }; - let body = collect_block_body(lines, &mut i)?; - let expanded_body = expand_rept_blocks(&body)?; - for _ in 0..count { - result.extend(expanded_body.iter().cloned()); - } - } else if is_irp_start(&trimmed) { - // .irp var, val1, val2, ... 
- let args_str = trimmed[".irp".len()..].trim(); - // Split on first comma to get variable name and values - let (var, values_str) = match args_str.find(',') { - Some(pos) => (args_str[..pos].trim(), args_str[pos + 1..].trim()), - None => (args_str, ""), - }; - let values: Vec<&str> = values_str.split(',').map(|s| s.trim()).collect(); - let body = collect_block_body(lines, &mut i)?; - for val in &values { - // Substitute \var with val in each body line - let subst_body: Vec = body.iter().map(|line| { - let pattern = format!("\\{}", var); - asm_preprocess::replace_macro_param(line, &pattern, val) - }).collect(); - let subst_refs: Vec<&str> = subst_body.iter().map(|s| s.as_str()).collect(); - let expanded = expand_rept_blocks(&subst_refs)?; - result.extend(expanded); - } - } else if is_irpc_start(&trimmed) { - // .irpc var, string — iterate over each character in the string - let args_str = trimmed[".irpc".len()..].trim(); - // Split on first comma to get variable name and string - let (var, char_str) = match args_str.find(',') { - Some(pos) => (args_str[..pos].trim(), args_str[pos + 1..].trim()), - None => (args_str, ""), - }; - let body = collect_block_body(lines, &mut i)?; - for ch in char_str.chars() { - // Substitute \var with the current character in each body line - let ch_str = ch.to_string(); - let subst_body: Vec = body.iter().map(|line| { - let pattern = format!("\\{}", var); - let substituted = asm_preprocess::replace_macro_param(line, &pattern, &ch_str); - // Strip GAS macro argument delimiters: \() resolves to empty string - substituted.replace("\\()", "") - }).collect(); - let subst_refs: Vec<&str> = subst_body.iter().map(|s| s.as_str()).collect(); - let expanded = expand_rept_blocks(&subst_refs)?; - result.extend(expanded); - } - } else if trimmed == ".endr" { - // stray .endr without .rept - skip - } else { - // Track numeric label definitions and byte positions - // Check if this line defines a numeric label (e.g., "661:" or "661: instruction") - if 
let Some(colon_pos) = trimmed.find(':') { - let before = &trimmed[..colon_pos]; - if !before.is_empty() && before.chars().all(|c| c.is_ascii_digit()) { - label_positions - .entry(before.to_string()) - .or_default() - .push(current_byte_pos); - } - } - current_byte_pos += estimate_line_bytes(&trimmed); - result.push(lines[i].to_string()); - } - i += 1; - } - Ok(result) -} - -/// Evaluate a `.if` condition expression using the shared implementation. -/// Supports: `==`, `!=`, `>`, `>=`, `<`, `<=`, `||`, `&&`, parentheses. -fn eval_if_condition(cond: &str) -> bool { - asm_preprocess::eval_if_condition(cond) -} - -/// Split macro invocation arguments, separating on commas (preferred) or whitespace. -/// If the argument string contains commas, only commas are used as separators -/// (allowing spaces within arguments like `20 - 8`). If no commas are present, -/// whitespace acts as separator for backwards compatibility. -/// Quoted strings are kept as a single argument with quotes stripped. -/// Parenthesized groups like `0(a1)` are kept together. -fn split_macro_args(s: &str) -> Vec { - if s.is_empty() { - return Vec::new(); - } - // Determine if commas are present (outside parens) — if so, use comma-only splitting - let has_commas = { - let mut depth = 0i32; - s.bytes().any(|b| { - match b { - b'(' => { depth += 1; false } - b')' => { depth -= 1; false } - b',' if depth == 0 => true, - _ => false, - } - }) - }; - let mut args = Vec::new(); - let mut current = String::new(); - let bytes = s.as_bytes(); - let mut i = 0; - let mut paren_depth = 0i32; - while i < bytes.len() { - match bytes[i] { - b'(' => { - paren_depth += 1; - current.push('('); - } - b')' => { - paren_depth -= 1; - current.push(')'); - } - b',' if paren_depth == 0 => { - let trimmed = current.trim().to_string(); - if !trimmed.is_empty() { - args.push(trimmed); - } - current.clear(); - } - b' ' | b'\t' if paren_depth == 0 && !has_commas => { - // Whitespace acts as separator when no commas are present. 
- // However, we need to be careful with expressions like "20 - 8". - // Check if the whitespace is part of an arithmetic expression. - let trimmed_so_far = current.trim(); - let last_is_num_or_op = trimmed_so_far.is_empty() - || trimmed_so_far.chars().last().is_some_and(|c| c.is_ascii_digit() || c == ')'); - // Peek ahead: skip whitespace to see what follows - let mut peek = i + 1; - while peek < bytes.len() && (bytes[peek] == b' ' || bytes[peek] == b'\t') { - peek += 1; - } - let next_is_op = peek < bytes.len() && matches!(bytes[peek], b'+' | b'-' | b'*' | b'/' | b'|' | b'&' | b'^' | b'~'); - let next_is_num_after_op = if !current.is_empty() { - let last_ch = current.as_bytes()[current.len() - 1]; - matches!(last_ch, b'+' | b'-' | b'*' | b'/' | b'|' | b'&' | b'^' | b'~' | b'(') - && peek < bytes.len() && (bytes[peek].is_ascii_digit() || bytes[peek] == b'(') - } else { - false - }; - if (last_is_num_or_op && next_is_op) || next_is_num_after_op { - // Part of an arithmetic expression — keep as whitespace in current token - current.push(' '); - } else { - let trimmed = current.trim().to_string(); - if !trimmed.is_empty() { - args.push(trimmed); - current.clear(); - } - } - // Skip remaining whitespace - while i + 1 < bytes.len() && (bytes[i + 1] == b' ' || bytes[i + 1] == b'\t') { - i += 1; - } - } - b'"' => { - // Consume quoted string, stripping the outer quotes - i += 1; - while i < bytes.len() && bytes[i] != b'"' { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - current.push(bytes[i + 1] as char); - i += 2; - continue; - } - current.push(bytes[i] as char); - i += 1; - } - // Skip closing quote - } - _ => { - current.push(bytes[i] as char); - } - } - i += 1; - } - let trimmed = current.trim().to_string(); - if !trimmed.is_empty() { - args.push(trimmed); - } - args -} - -/// Macro definition: name, parameter list (with optional defaults), and body lines. -struct MacroDef { - /// Parameter names (without default values). 
- params: Vec, - /// Default values for parameters (indexed by param position). None = no default. - defaults: Vec>, - body: Vec, - /// Whether the last parameter is :vararg (receives all remaining args). - has_vararg: bool, -} - -/// Parse a .macro directive line, returning (name, params, defaults, has_vararg). -/// Handles GAS syntax: `.macro name param1, param2=default, param3` -fn parse_macro_directive(trimmed: &str) -> (String, Vec, Vec>, bool) { - let rest = trimmed[".macro".len()..].trim(); - let (name, params_str) = match rest.find([' ', '\t', ',']) { - Some(pos) => (rest[..pos].trim(), rest[pos..].trim().trim_start_matches(',')), - None => (rest, ""), - }; - let mut params = Vec::new(); - let mut defaults = Vec::new(); - let mut has_vararg = false; - if !params_str.is_empty() { - // GAS allows spaces around '=' in macro parameter defaults: - // .macro foo enable = 1 => param "enable", default "1" - // First try comma-separated specs, then handle space-separated with '=' merging. - let specs: Vec<&str> = if params_str.contains(',') { - params_str.split(',').map(|s| s.trim()).filter(|s| !s.is_empty()).collect() - } else { - // Space-separated: merge "name = value" triples into single specs. 
- let tokens: Vec<&str> = params_str.split_whitespace().collect(); - let mut merged: Vec = Vec::new(); - let mut i = 0; - while i < tokens.len() { - if i + 2 < tokens.len() && tokens[i + 1] == "=" { - merged.push(format!("{}={}", tokens[i], tokens[i + 2])); - i += 3; - } else if i + 1 < tokens.len() && tokens[i + 1].starts_with('=') { - merged.push(format!("{}{}", tokens[i], tokens[i + 1])); - i += 2; - } else { - merged.push(tokens[i].to_string()); - i += 1; - } - } - // Convert to a form we can iterate - // We'll handle this below using the merged vector directly - let mut specs_out = Vec::new(); - for m in &merged { - specs_out.push(m.as_str()); - } - // Since we can't return borrowed refs to local merged vec, - // process directly here - for m in &merged { - let s = m.trim(); - if s.is_empty() { continue; } - if let Some(eq_pos) = s.find('=') { - params.push(s[..eq_pos].to_string()); - defaults.push(Some(s[eq_pos + 1..].to_string())); - } else if let Some(colon_pos) = s.find(':') { - let qualifier = &s[colon_pos + 1..]; - if qualifier.eq_ignore_ascii_case("vararg") { - has_vararg = true; - } - params.push(s[..colon_pos].to_string()); - defaults.push(None); - } else { - params.push(s.to_string()); - defaults.push(None); - } - } - return (name.to_string(), params, defaults, has_vararg); - }; - for raw in &specs { - let s = raw.trim(); - if s.is_empty() { - continue; - } - // Handle param=default or param:req syntax - if let Some(eq_pos) = s.find('=') { - params.push(s[..eq_pos].trim().to_string()); - defaults.push(Some(s[eq_pos + 1..].trim().to_string())); - } else if let Some(colon_pos) = s.find(':') { - let qualifier = &s[colon_pos + 1..]; - if qualifier.eq_ignore_ascii_case("vararg") { - has_vararg = true; - } - params.push(s[..colon_pos].to_string()); - defaults.push(None); - } else { - params.push(s.to_string()); - defaults.push(None); - } - } - } - (name.to_string(), params, defaults, has_vararg) -} - -/// Collect a macro body from lines[i..], tracking 
nested .macro/.endm depth. -/// Returns the body lines and the index after the closing .endm. -fn collect_macro_body(lines: &[&str], start: usize) -> (Vec, usize) { - let mut body = Vec::new(); - let mut depth = 1; - let mut i = start; - while i < lines.len() { - let inner = strip_comment(lines[i]).trim().to_string(); - if inner.starts_with(".macro ") || inner.starts_with(".macro\t") { - depth += 1; - } else if inner == ".endm" || inner.starts_with(".endm ") || inner.starts_with(".endm\t") { - depth -= 1; - if depth == 0 { - return (body, i); - } - } - body.push(lines[i].to_string()); - i += 1; - } - (body, i) -} - -/// Substitute macro parameters in body lines. -/// -/// Handles both positional and named arguments (e.g., `shift=1`). -/// Falls back to default values when arguments are not provided. -fn substitute_params(body: &[String], params: &[String], defaults: &[Option], args: &[String], _has_vararg: bool) -> Vec { - // Build a map of param_name -> value, considering named args and defaults. - let mut param_values: Vec = Vec::with_capacity(params.len()); - // Start with defaults or "0" for each param - for (i, _param) in params.iter().enumerate() { - param_values.push(defaults.get(i).and_then(|d| d.clone()).unwrap_or_default()); - } - // Apply positional and named arguments - let mut pos_idx = 0; - for arg in args.iter() { - if let Some(eq_pos) = arg.find('=') { - // Named argument: "param=value" - let name = &arg[..eq_pos]; - let value = &arg[eq_pos + 1..]; - if let Some(pi) = params.iter().position(|p| p == name) { - param_values[pi] = value.to_string(); - } - } else { - // Positional argument - if pos_idx < params.len() { - param_values[pos_idx] = arg.clone(); - } - pos_idx += 1; - } - } - - // Sort parameter indices by name length (longest first) to avoid - // partial substitution: e.g., \x must not match before \xb. 
- let mut sorted_indices: Vec = (0..params.len()).collect(); - sorted_indices.sort_by(|&a, &b| params[b].len().cmp(¶ms[a].len())); - - body.iter().map(|body_line| { - let mut expanded = body_line.clone(); - for &pi in &sorted_indices { - let pattern = format!("\\{}", params[pi]); - expanded = asm_preprocess::replace_macro_param(&expanded, &pattern, ¶m_values[pi]); - } - // Strip GAS macro argument delimiters: \() resolves to empty string. - // Used in GAS macros to separate parameter names from adjacent text, - // e.g., \param\().suffix → value.suffix after substitution. - expanded = expanded.replace("\\()", ""); - expanded - }).collect() -} - -/// Expand .macro/.endm definitions and macro invocations in a single pass. -/// -/// Handles nested macros: when a macro body contains .macro/.endm definitions -/// (like FFmpeg's `function` macro which defines `endfunc` inside its body), -/// those definitions are registered in the macro table during expansion. -/// Also handles .purgem to remove macro definitions (used by FFmpeg to allow -/// each `function` invocation to redefine `endfunc`). -fn expand_macros(lines: &[&str]) -> Result, String> { - use std::collections::HashMap; - let mut macros: HashMap = HashMap::new(); - let mut counter = 0u64; - expand_macros_impl(lines, &mut macros, 0, &mut counter) -} - -/// Core macro expansion implementation with a shared, mutable macro table. -/// `depth` limits recursion to prevent infinite expansion. -/// `counter` is GAS's `\@` macro invocation counter. 
-fn expand_macros_impl( - lines: &[&str], - macros: &mut std::collections::HashMap, - depth: usize, - counter: &mut u64, -) -> Result, String> { - if depth > 64 { - return Err("Macro expansion depth limit exceeded (>64)".to_string()); - } - let mut result = Vec::new(); - let mut i = 0; - - while i < lines.len() { - let trimmed = strip_comment(lines[i]).trim().to_string(); - - if trimmed.starts_with(".macro ") || trimmed.starts_with(".macro\t") { - // Collect macro definition (with nested depth tracking) - let (name, params, defaults, has_vararg) = parse_macro_directive(&trimmed); - let (body, end_idx) = collect_macro_body(lines, i + 1); - macros.insert(name, MacroDef { params, defaults, body, has_vararg }); - i = end_idx + 1; - continue; - } else if trimmed == ".endm" || trimmed.starts_with(".endm ") || trimmed.starts_with(".endm\t") { - // Stray .endm — skip - i += 1; - continue; - } else if trimmed.starts_with(".purgem ") || trimmed.starts_with(".purgem\t") { - // Remove a macro definition - let name = trimmed[".purgem".len()..].trim(); - macros.remove(name); - i += 1; - continue; - } else if !trimmed.is_empty() && !trimmed.starts_with('.') && !trimmed.starts_with('#') { - // Could be a macro invocation - let first_word = trimmed.split([' ', '\t']).next().unwrap_or(""); - let potential_name = first_word.trim_end_matches(':'); - if potential_name != first_word { - // It's a label (has trailing colon) — check if followed by a macro invocation - let rest_after_label = trimmed[first_word.len()..].trim(); - let rest_first_word = rest_after_label.split([' ', '\t']).next().unwrap_or(""); - if !rest_first_word.is_empty() && macros.contains_key(rest_first_word) { - // Label followed by macro invocation: emit label, then expand macro - result.push(first_word.to_string()); - let mac_params = macros[rest_first_word].params.clone(); - let mac_defaults = macros[rest_first_word].defaults.clone(); - let mac_body = macros[rest_first_word].body.clone(); - let mac_vararg = 
macros[rest_first_word].has_vararg; - let macro_args_str = rest_after_label[rest_first_word.len()..].trim().to_string(); - let macro_args = split_macro_args(¯o_args_str); - let mut expanded_lines = substitute_params(&mac_body, &mac_params, &mac_defaults, ¯o_args, mac_vararg); - let ctr_str = counter.to_string(); - for line in &mut expanded_lines { *line = line.replace("\\@", &ctr_str); } - *counter += 1; - let refs: Vec<&str> = expanded_lines.iter().map(|s| s.as_str()).collect(); - let re_expanded = expand_macros_impl(&refs, macros, depth + 1, counter)?; - result.extend(re_expanded); - } else { - result.push(lines[i].to_string()); - } - } else if macros.contains_key(potential_name) { - // Clone what we need before mutably borrowing macros again - let mac_params = macros[potential_name].params.clone(); - let mac_defaults = macros[potential_name].defaults.clone(); - let mac_body = macros[potential_name].body.clone(); - let mac_vararg = macros[potential_name].has_vararg; - let args_str = trimmed[first_word.len()..].trim().to_string(); - let args = if mac_vararg && !mac_params.is_empty() { - // For vararg macros, split only the non-vararg params, - // and pass the remaining raw text as the vararg value - let all_args = split_macro_args(&args_str); - let non_vararg_count = mac_params.len() - 1; - if all_args.len() > non_vararg_count { - // Find where the vararg portion starts in the raw string - let mut vararg_args = all_args[..non_vararg_count].to_vec(); - // Reconstruct the raw vararg text by finding the position after - // the non-vararg args in the original string - let mut pos = 0; - for _ in 0..non_vararg_count { - // Skip whitespace/commas - while pos < args_str.len() && (args_str.as_bytes()[pos] == b' ' || args_str.as_bytes()[pos] == b'\t' || args_str.as_bytes()[pos] == b',') { - pos += 1; - } - // Skip the arg token - while pos < args_str.len() && args_str.as_bytes()[pos] != b',' && args_str.as_bytes()[pos] != b' ' && args_str.as_bytes()[pos] != b'\t' { - pos 
+= 1; - } - } - // Skip separator after last non-vararg arg - while pos < args_str.len() && (args_str.as_bytes()[pos] == b' ' || args_str.as_bytes()[pos] == b'\t' || args_str.as_bytes()[pos] == b',') { - pos += 1; - } - let raw_vararg = args_str[pos..].to_string(); - vararg_args.push(raw_vararg); - vararg_args - } else { - all_args - } - } else { - split_macro_args(&args_str) - }; - let mut expanded_lines = substitute_params(&mac_body, &mac_params, &mac_defaults, &args, mac_vararg); - let ctr_str = counter.to_string(); - for line in &mut expanded_lines { *line = line.replace("\\@", &ctr_str); } - *counter += 1; - // Recursively expand the result (may define new macros, invoke others) - let refs: Vec<&str> = expanded_lines.iter().map(|s| s.as_str()).collect(); - let re_expanded = expand_macros_impl(&refs, macros, depth + 1, counter)?; - result.extend(re_expanded); - } else { - result.push(lines[i].to_string()); - } - } else { - result.push(lines[i].to_string()); - } - i += 1; - } - Ok(result) -} - -/// Resolve .set/.equ constants with simple integer values in instruction lines. -/// -/// Sequential single-pass: as each `.set name, value` is encountered, update the -/// constant map. For non-.set lines, substitute whole-word occurrences of known -/// constant names with their current values. This correctly handles constants that -/// are reassigned (e.g., `.Lasm_alt_mode` in kernel ALTERNATIVE macros). -/// The `.set` directives themselves are preserved for the encoder to process. -fn resolve_set_constants(lines: &[String]) -> Vec { - use std::collections::HashMap; - let mut constants: HashMap = HashMap::new(); - let mut result = Vec::with_capacity(lines.len()); - - // Single sequential pass: update constants as .set directives are encountered, - // and substitute using the current map state at each line. This correctly handles - // constants that are reassigned (e.g., .Lasm_alt_mode in kernel ALTERNATIVE macros). 
- for line in lines { - let trimmed = strip_comment(line).trim().to_lowercase(); - if trimmed.starts_with(".set ") || trimmed.starts_with(".set\t") - || trimmed.starts_with(".equ ") || trimmed.starts_with(".equ\t") { - let rest = strip_comment(line).trim(); - let directive_len = 4; // both .set and .equ are 4 chars - let args = rest[directive_len..].trim(); - if let Some(comma_pos) = args.find(',') { - let name = args[..comma_pos].trim().to_string(); - let value_str = args[comma_pos + 1..].trim().to_string(); - // Only resolve simple integer values - if let Ok(_val) = asm_expr::parse_integer_expr(&value_str) { - constants.insert(name, value_str); - } - } - result.push(line.clone()); - } else if constants.is_empty() { - result.push(line.clone()); - } else { - // Substitute current constant values in this line - let mut substituted = line.clone(); - for (name, value) in &constants { - // Replace whole-word occurrences of the constant name - let mut new_result = String::with_capacity(substituted.len()); - let bytes = substituted.as_bytes(); - let name_bytes = name.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if i + name_bytes.len() <= bytes.len() - && &bytes[i..i + name_bytes.len()] == name_bytes - { - // Check word boundary before - let before_ok = i == 0 || !is_ident_char(bytes[i - 1]); - // Check word boundary after - let after_pos = i + name_bytes.len(); - let after_ok = after_pos >= bytes.len() || !is_ident_char(bytes[after_pos]); - if before_ok && after_ok { - new_result.push_str(value); - i += name_bytes.len(); - continue; - } - } - new_result.push(bytes[i] as char); - i += 1; - } - substituted = new_result; - } - result.push(substituted); - } - } - - result -} - -/// Check if a byte is a valid identifier character (alphanumeric, underscore, dot). -fn is_ident_char(b: u8) -> bool { - b.is_ascii_alphanumeric() || b == b'_' || b == b'.' -} - -/// Resolve `.req` / `.unreq` register aliases. 
-/// -/// First pass: collect `name .req register` definitions and `.unreq name` removals. -/// Second pass: substitute alias names with their register values in all non-directive lines. -/// The `.req` / `.unreq` lines themselves are preserved (they'll be parsed as Empty later). -fn resolve_register_aliases(lines: &[String]) -> Vec { - use std::collections::HashMap; - let mut aliases: HashMap = HashMap::new(); - - // First pass: collect alias definitions and removals in order - for line in lines { - let trimmed = strip_comment(line).trim(); - // Match "name .req register" - if let Some(req_pos) = trimmed.find(".req") { - // Ensure it's " .req " or "\t.req " (not part of a longer directive like .reqXYZ) - let after_req = req_pos + 4; - if req_pos > 0 - && (trimmed.as_bytes()[req_pos - 1] == b' ' || trimmed.as_bytes()[req_pos - 1] == b'\t') - && after_req < trimmed.len() - && (trimmed.as_bytes()[after_req] == b' ' || trimmed.as_bytes()[after_req] == b'\t') - { - let name = trimmed[..req_pos].trim(); - let register = trimmed[after_req..].trim(); - if !name.is_empty() && !register.is_empty() { - aliases.insert(name.to_string(), register.to_string()); - } - continue; - } - } - // Note: We intentionally DON'T process .unreq here. - // .unreq is used to allow re-definition of aliases in different scopes, - // but for our two-pass approach, we want ALL aliases available for substitution. - // The .unreq directive is handled by the parser as an ignored directive. 
- } - - if aliases.is_empty() { - return lines.to_vec(); - } - - // Sort aliases by name length (longest first) to avoid partial substitution - let mut sorted_aliases: Vec<(&String, &String)> = aliases.iter().collect(); - sorted_aliases.sort_by(|a, b| b.0.len().cmp(&a.0.len())); - - // Second pass: substitute aliases in all lines - lines.iter().map(|line| { - let trimmed = strip_comment(line).trim(); - // Skip .req/.unreq definition lines — leave them as-is - if trimmed.contains(" .req ") || trimmed.contains("\t.req\t") || trimmed.contains("\t.req ") - || trimmed.contains(" .unreq ") || trimmed.contains("\t.unreq\t") || trimmed.contains("\t.unreq ") - { - return line.clone(); - } - // Skip directives (starting with .) — aliases are only used in instructions - if trimmed.starts_with('.') { - return line.clone(); - } - // Skip empty lines and comments - if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("//") { - return line.clone(); - } - let mut result = line.clone(); - for (name, register) in &sorted_aliases { - let name_bytes = name.as_bytes(); - let mut new_result = String::with_capacity(result.len()); - let bytes = result.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if i + name_bytes.len() <= bytes.len() - && &bytes[i..i + name_bytes.len()] == name_bytes - { - // Check word boundary before - let before_ok = i == 0 || !is_alias_ident_char(bytes[i - 1]); - // Check word boundary after - let after_pos = i + name_bytes.len(); - let after_ok = after_pos >= bytes.len() || !is_alias_ident_char(bytes[after_pos]); - if before_ok && after_ok { - new_result.push_str(register); - i += name_bytes.len(); - continue; - } - } - new_result.push(bytes[i] as char); - i += 1; - } - result = new_result; - } - result - }).collect() -} - -/// Check if a byte is a valid identifier character for register alias matching. -/// Similar to is_ident_char but does NOT include '.' 
since register names like -/// "v0.8h" should allow alias substitution where the alias is followed by '.'. -fn is_alias_ident_char(b: u8) -> bool { - b.is_ascii_alphanumeric() || b == b'_' -} - -pub fn parse_asm(text: &str) -> Result, String> { - // Pre-process: strip C-style /* ... */ comments - let text = asm_preprocess::strip_c_comments(text); - - // Split lines on ';' (GAS statement separator) before macro expansion, - // so macro invocations after ';' on the same line get expanded correctly. - // Strip // and @ line comments BEFORE splitting on semicolons, so that - // semicolons inside comments (e.g. "// struct {int a;} *p;") don't cause - // spurious splits. - let presplit: Vec = text.lines().flat_map(|line| { - let line = strip_comment(line); - split_on_semicolons(line).into_iter().map(|s| s.to_string()).collect::>() - }).collect(); - let presplit_refs: Vec<&str> = presplit.iter().map(|s| s.as_str()).collect(); - - // Expand .macro/.endm definitions and invocations - let macro_expanded = expand_macros(&presplit_refs)?; - let macro_refs: Vec<&str> = macro_expanded.iter().map(|s| s.as_str()).collect(); - - // Expand .rept/.endr blocks - let expanded_lines = expand_rept_blocks(¯o_refs)?; - - // Resolve .set/.equ constants in expressions - let expanded_lines = resolve_set_constants(&expanded_lines); - - // Resolve .req/.unreq register aliases - let expanded_lines = resolve_register_aliases(&expanded_lines); - - let mut statements = Vec::new(); - // Stack for .if/.else/.endif conditional assembly. - // Each entry is (active, any_taken): active = current block emitting code, - // any_taken = whether any branch in this if/elseif/else chain was taken. 
- let mut if_stack: Vec<(bool, bool)> = Vec::new(); - // Track defined symbols for .ifdef/.ifndef - let mut defined_symbols: std::collections::HashSet = std::collections::HashSet::new(); - for (line_num, line) in expanded_lines.iter().enumerate() { - let line = line.trim(); - - // Skip empty lines - if line.is_empty() { - statements.push(AsmStatement::Empty); - continue; - } - - // Strip comments (// style) - let line = strip_comment(line); - let line = line.trim(); - if line.is_empty() { - statements.push(AsmStatement::Empty); - continue; - } - - // Handle .if/.else/.endif before anything else - let lower = line.to_ascii_lowercase(); - if lower.starts_with(".endif") { - if if_stack.pop().is_none() { - return Err(format!("Line {}: .endif without matching .if", line_num + 1)); - } - continue; - } - // .elseif / .elsif — else-if branch (must be checked BEFORE .else) - if lower.starts_with(".elseif ") || lower.starts_with(".elseif\t") - || lower.starts_with(".elsif ") || lower.starts_with(".elsif\t") { - let len = if_stack.len(); - if len > 0 { - let parent_active = if len >= 2 { if_stack[len - 2].0 } else { true }; - if if_stack[len - 1].1 || !parent_active { - if_stack[len - 1].0 = false; - } else { - let keyword_len = if lower.starts_with(".elseif") { 7 } else { 6 }; - let cond_str = line[keyword_len..].trim(); - let result = eval_if_condition(cond_str); - if_stack[len - 1].0 = result; - if result { if_stack[len - 1].1 = true; } - } - } - continue; - } - if lower == ".else" || (lower.starts_with(".else") && !lower.starts_with(".elseif") && !lower.starts_with(".elsif") - && lower.len() >= 5 && (lower.len() == 5 || lower.as_bytes()[5].is_ascii_whitespace())) { - let len = if_stack.len(); - if len > 0 { - let parent_active = if len >= 2 { if_stack[len - 2].0 } else { true }; - if if_stack[len - 1].1 || !parent_active { - if_stack[len - 1].0 = false; - } else { - if_stack[len - 1].0 = true; - if_stack[len - 1].1 = true; - } - } - continue; - } - if lower == ".if" || 
lower.starts_with(".if ") || lower.starts_with(".if\t") { - let cond_str = if line.len() > 3 { line[3..].trim() } else { "" }; - // Evaluate the condition: if we're already in a false block, push false - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - if cond_str.is_empty() { false } else { eval_if_condition(cond_str) } - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifc string1, string2 — conditional if strings are equal - if lower.starts_with(".ifc ") || lower.starts_with(".ifc\t") { - let args = line[4..].trim(); - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - if let Some((a, b)) = args.split_once(',') { - a.trim() == b.trim() - } else { - false - } - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifnc string1, string2 — conditional if strings are NOT equal - if lower.starts_with(".ifnc ") || lower.starts_with(".ifnc\t") { - let args = line[5..].trim(); - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - if let Some((a, b)) = args.split_once(',') { - a.trim() != b.trim() - } else { - true - } - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifb string — conditional if string is blank - if lower == ".ifb" || lower.starts_with(".ifb ") || lower.starts_with(".ifb\t") { - let arg = if line.len() > 4 { line[4..].trim() } else { "" }; - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - arg.is_empty() - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifnb string — conditional if string is NOT blank - if lower == ".ifnb" || lower.starts_with(".ifnb ") || lower.starts_with(".ifnb\t") { - let arg = if line.len() > 5 { line[5..].trim() } else { "" }; - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - !arg.is_empty() - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifdef symbol — conditional if symbol is defined - if 
lower.starts_with(".ifdef ") || lower.starts_with(".ifdef\t") { - let sym = line[6..].trim().to_string(); - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - defined_symbols.contains(&sym) - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifndef symbol — conditional if symbol is NOT defined - if lower.starts_with(".ifndef ") || lower.starts_with(".ifndef\t") { - let sym = line[7..].trim().to_string(); - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - !defined_symbols.contains(&sym) - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifeq expression — conditional if expression equals 0 - if lower.starts_with(".ifeq ") || lower.starts_with(".ifeq\t") { - let expr_str = line[5..].trim(); - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - asm_expr::parse_integer_expr(expr_str).unwrap_or(1) == 0 - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // .ifne expression — conditional if expression is not equal to 0 - if lower.starts_with(".ifne ") || lower.starts_with(".ifne\t") { - let expr_str = line[5..].trim(); - let active = if if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - asm_expr::parse_integer_expr(expr_str).unwrap_or(0) != 0 - } else { - false - }; - if_stack.push((active, active)); - continue; - } - // If we're inside a false .if block, skip this line - if !if_stack.last().map(|&(a, _)| a).unwrap_or(true) { - continue; - } - - // In AArch64 GAS, '#' at the start of a line is a comment character. - // This covers: C preprocessor line markers (# 123 "file"), and - // comment-producing macros (e.g., FFmpeg's FUNC expanding to '#'). - if line.starts_with('#') { - statements.push(AsmStatement::Empty); - continue; - } - - // Handle ';' as statement separator (GAS syntax). - // Split the line on ';' and parse each part independently. 
- let parts = split_on_semicolons(line); - for part in parts { - let part = part.trim(); - if part.is_empty() { - continue; - } - match parse_line(part) { - Ok(stmts) => { - for stmt in &stmts { - match stmt { - AsmStatement::Label(name) => { - defined_symbols.insert(name.clone()); - } - AsmStatement::Directive(AsmDirective::Set(name, _)) => { - defined_symbols.insert(name.clone()); - } - _ => {} - } - } - statements.extend(stmts); - } - Err(e) => return Err(format!("Line {}: {}: '{}'", line_num + 1, e, part)), - } - } - } - Ok(statements) -} - -/// Split a line on ';' characters, respecting strings and comments. -/// In GAS syntax, ';' separates multiple statements on the same line. -/// Stops splitting once a `//` or `@` line comment is encountered (outside strings), -/// so semicolons inside comments are not treated as statement separators. -fn split_on_semicolons(line: &str) -> Vec<&str> { - let mut parts = Vec::new(); - let mut in_string = false; - let mut escape = false; - let mut start = 0; - let bytes = line.as_bytes(); - for (i, c) in line.char_indices() { - if escape { - escape = false; - continue; - } - if c == '\\' && in_string { - escape = true; - continue; - } - if c == '"' { - in_string = !in_string; - continue; - } - if !in_string { - // Stop splitting at // comment start - if c == '/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' { - break; - } - // Stop splitting at @ comment start (but not @function, @object, etc.) 
- if c == '@' { - let after = &line[i + 1..]; - if !after.starts_with("object") - && !after.starts_with("function") - && !after.starts_with("progbits") - && !after.starts_with("nobits") - && !after.starts_with("tls_object") - && !after.starts_with("note") - { - break; - } - } - if c == ';' { - parts.push(&line[start..i]); - start = i + 1; - } - } - } - parts.push(&line[start..]); - parts -} - -fn strip_comment(line: &str) -> &str { - // Scan character by character, tracking string state to find comments - // outside of string literals. This correctly handles escaped quotes (\") - // inside strings (e.g. .asciz "a\"b//c" should not strip at //). - let bytes = line.as_bytes(); - let mut in_string = false; - let mut i = 0; - while i < bytes.len() { - if in_string { - if bytes[i] == b'\\' { - i += 2; // skip escaped character - continue; - } - if bytes[i] == b'"' { - in_string = false; - } - i += 1; - continue; - } - // Not in string - if bytes[i] == b'"' { - in_string = true; - i += 1; - continue; - } - // Check for // comment - if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' { - return &line[..i]; - } - // Check for @ comment (GAS ARM comment character). - // Skip \@ which is the GAS macro invocation counter, not a comment. - if bytes[i] == b'@' && !(i > 0 && bytes[i - 1] == b'\\') { - let after = &line[i + 1..]; - if !after.starts_with("object") - && !after.starts_with("function") - && !after.starts_with("progbits") - && !after.starts_with("nobits") - && !after.starts_with("tls_object") - && !after.starts_with("note") - { - return &line[..i]; - } - } - i += 1; - } - line -} - -/// Try to parse `ldr Rd, =symbol[+offset]` pseudo-instruction. -/// -/// In GAS, `ldr Rd, =expr` loads a 64-bit value via a literal pool. -/// Returns a `LdrLiteralPool` statement that will be expanded into -/// `ldr Rd, .Llpool_N` + pool entries by the `expand_literal_pools` pass. 
-fn try_expand_ldr_literal(line: &str) -> Option, String>> { - let lower = line.to_ascii_lowercase(); - // Match: ldr xN, =symbol or ldr wN, =symbol (32-bit registers use adrp too on AArch64) - if !lower.starts_with("ldr ") && !lower.starts_with("ldr\t") { - return None; - } - let rest = line[3..].trim(); - // Find the comma separating register from operand - let comma_pos = rest.find(',')?; - let reg = rest[..comma_pos].trim(); - let operand = rest[comma_pos + 1..].trim(); - if !operand.starts_with('=') { - return None; - } - let expr = operand[1..].trim(); - // Parse symbol and optional +offset, e.g. =coeffs+8 - let (symbol, addend) = if let Some(plus_pos) = expr.rfind('+') { - let sym = expr[..plus_pos].trim(); - let off_str = expr[plus_pos + 1..].trim(); - if let Ok(off) = if off_str.starts_with("0x") || off_str.starts_with("0X") { - i64::from_str_radix(&off_str[2..], 16) - } else { - off_str.parse::() - } { - (sym, off) - } else { - (expr, 0i64) - } - } else if let Some(minus_pos) = expr.rfind('-') { - // Only treat as symbol-offset if there's a valid symbol before the minus - let sym = expr[..minus_pos].trim(); - let off_str = expr[minus_pos + 1..].trim(); - if !sym.is_empty() && !sym.ends_with(['+', '-']) { - if let Ok(off) = if off_str.starts_with("0x") || off_str.starts_with("0X") { - i64::from_str_radix(&off_str[2..], 16) - } else { - off_str.parse::() - } { - (sym, -off) - } else { - (expr, 0i64) - } - } else { - (expr, 0i64) - } - } else { - (expr, 0i64) - }; - Some(Ok(vec![AsmStatement::LdrLiteralPool { - reg: reg.to_string(), - symbol: symbol.to_string(), - addend, - }])) -} - -fn parse_line(line: &str) -> Result, String> { - // Check for label definition (name:) - // Labels can be at the start of the line, possibly followed by an instruction - if let Some(colon_pos) = line.find(':') { - let potential_label = &line[..colon_pos].trim(); - // Verify it looks like a valid label (no spaces before colon, alphanumeric + _ + .) 
- if !potential_label.is_empty() - && !potential_label.contains(' ') - && !potential_label.contains('\t') - && !potential_label.starts_with('.') // Could be a directive - || potential_label.starts_with(".L") // Local labels start with .L - || potential_label.starts_with(".Lstr") // String labels - || potential_label.starts_with(".Lmemcpy") - || potential_label.starts_with(".Lskip") - { - // Check if this is actually a directive like ".section .rodata" - if potential_label.starts_with('.') - && !potential_label.starts_with(".L") - && !potential_label.starts_with(".l") - { - // This is a directive, not a label - } else { - let mut result = vec![AsmStatement::Label(potential_label.to_string())]; - // Check for instruction/directive after the label on the same line - let rest = line[colon_pos + 1..].trim(); - if !rest.is_empty() { - result.extend(parse_line(rest)?); - } - return Ok(result); - } - } - } - - let trimmed = line.trim(); - - // Register alias: "name .req register" or "name .unreq register" - // These define register aliases and can be safely ignored. - if trimmed.contains(" .req ") || trimmed.contains("\t.req\t") || trimmed.contains("\t.req ") - || trimmed.contains(" .unreq ") || trimmed.contains("\t.unreq\t") || trimmed.contains("\t.unreq ") - { - return Ok(vec![AsmStatement::Empty]); - } - - // Directive: starts with . 
- if trimmed.starts_with('.') { - return Ok(vec![parse_directive(trimmed)?]); - } - - // Handle ldr Rd, =symbol pseudo-instruction (creates LdrLiteralPool for later expansion) - if let Some(expanded) = try_expand_ldr_literal(trimmed) { - return expanded; - } - - // Instruction - Ok(vec![parse_instruction(trimmed)?]) -} - -fn parse_directive(line: &str) -> Result { - // Split directive name from arguments - let (name, args) = if line.starts_with(".inst") && line.len() > 5 && line.as_bytes()[5] == b'(' { - // Handle .inst(expr) without space: ".inst(0x...)" -> name=".inst", args="(0x...)" - (".inst", line[5..].trim()) - } else if let Some(space_pos) = line.find([' ', '\t']) { - let name = &line[..space_pos]; - let args = line[space_pos..].trim(); - (name, args) - } else { - (line, "") - }; - - let dir = match name { - ".section" => parse_section_directive(args)?, - ".text" => AsmDirective::Section(SectionDirective { - name: ".text".to_string(), - flags: None, - section_type: None, - }), - ".data" => AsmDirective::Section(SectionDirective { - name: ".data".to_string(), - flags: None, - section_type: None, - }), - ".bss" => AsmDirective::Section(SectionDirective { - name: ".bss".to_string(), - flags: None, - section_type: None, - }), - ".rodata" => AsmDirective::Section(SectionDirective { - name: ".rodata".to_string(), - flags: None, - section_type: None, - }), - ".globl" | ".global" => AsmDirective::Global(args.trim().to_string()), - ".weak" => AsmDirective::Weak(args.trim().to_string()), - ".hidden" => AsmDirective::Hidden(args.trim().to_string()), - ".protected" => AsmDirective::Protected(args.trim().to_string()), - ".internal" => AsmDirective::Internal(args.trim().to_string()), - ".type" => parse_type_directive(args)?, - ".size" => parse_size_directive(args)?, - ".align" | ".p2align" => { - let align_val: u64 = args.trim().split(',').next() - .and_then(|s| parse_int_literal(s.trim()).ok()) - .unwrap_or(0) as u64; - // AArch64 .align N means 2^N bytes (same as 
.p2align) - AsmDirective::Align(1u64 << align_val) - } - ".balign" => { - let align_val: u64 = args.trim().split(',').next() - .and_then(|s| parse_int_literal(s.trim()).ok()) - .unwrap_or(1) as u64; - AsmDirective::Balign(align_val) - } - ".byte" => { - let vals = parse_data_values(args)?; - AsmDirective::Byte(vals) - } - ".short" | ".hword" | ".2byte" | ".half" => { - let mut vals = Vec::new(); - for part in args.split(',') { - let val = parse_data_value(part.trim())? as i16; - vals.push(val); - } - AsmDirective::Short(vals) - } - ".long" | ".4byte" | ".word" | ".int" | ".inst" => { - let vals = parse_data_values(args)?; - AsmDirective::Long(vals) - } - ".quad" | ".8byte" | ".xword" | ".dword" => { - let vals = parse_data_values(args)?; - AsmDirective::Quad(vals) - } - ".zero" | ".space" => { - let parts: Vec<&str> = args.trim().split(',').collect(); - let size: usize = parse_int_literal(parts[0].trim()) - .map_err(|_| format!("invalid .zero size: {}", args))? as usize; - let fill: u8 = if parts.len() > 1 { - parse_data_value(parts[1].trim())? as u8 - } else { - 0 - }; - AsmDirective::Zero(size, fill) - } - ".fill" => { - // .fill repeat, size, value - let parts: Vec<&str> = args.splitn(3, ',').collect(); - let repeat = parse_int_literal(parts[0].trim()) - .map_err(|_| format!("bad .fill repeat: {}", parts[0].trim()))? as u64; - let size = if parts.len() > 1 { - parse_int_literal(parts[1].trim()) - .map_err(|_| format!("bad .fill size: {}", parts[1].trim()))? as u64 - } else { - 1 - }; - let value = if parts.len() > 2 { - parse_int_literal(parts[2].trim()) - .map_err(|_| format!("bad .fill value: {}", parts[2].trim()))? 
as u64 - } else { - 0 - }; - let total_bytes = (repeat * size.min(8)) as usize; - if value == 0 { - AsmDirective::Zero(total_bytes, 0) - } else { - let mut data = Vec::with_capacity(total_bytes); - let value_bytes = value.to_le_bytes(); - for _ in 0..repeat { - for j in 0..size.min(8) as usize { - data.push(value_bytes[j]); - } - } - AsmDirective::Ascii(data) - } - } - ".asciz" | ".string" => { - let s = elf::parse_string_literal(args)?; - let mut bytes = s; - bytes.push(0); // null terminator - AsmDirective::Asciz(bytes) - } - ".ascii" => { - let s = elf::parse_string_literal(args)?; - AsmDirective::Ascii(s) - } - ".comm" => parse_comm_directive(args)?, - ".local" => AsmDirective::Local(args.trim().to_string()), - ".set" | ".equ" => { - let parts: Vec<&str> = args.splitn(2, ',').collect(); - if parts.len() == 2 { - AsmDirective::Set( - parts[0].trim().to_string(), - parts[1].trim().to_string(), - ) - } else { - return Err(format!("malformed .set directive: expected 'name, value', got '{}'", args)); - } - } - ".symver" => { - // .symver name, alias@@VERSION -> treat as alias for default version - let parts: Vec<&str> = args.splitn(2, ',').collect(); - if parts.len() == 2 { - let name = parts[0].trim(); - let ver_string = parts[1].trim(); - if let Some(at_pos) = ver_string.find('@') { - let alias = &ver_string[..at_pos]; - if !alias.is_empty() { - AsmDirective::Set(alias.to_string(), name.to_string()) - } else { - AsmDirective::Ignored - } - } else { - AsmDirective::Ignored - } - } else { - AsmDirective::Ignored - } - } - // CFI directives - ".cfi_startproc" | ".cfi_endproc" | ".cfi_def_cfa_offset" - | ".cfi_offset" | ".cfi_def_cfa_register" | ".cfi_restore" - | ".cfi_remember_state" | ".cfi_restore_state" - | ".cfi_adjust_cfa_offset" | ".cfi_def_cfa" - | ".cfi_sections" | ".cfi_personality" | ".cfi_lsda" - | ".cfi_rel_offset" | ".cfi_register" | ".cfi_return_column" - | ".cfi_undefined" | ".cfi_same_value" | ".cfi_escape" => AsmDirective::Cfi, - ".pushsection" => { 
- // .pushsection name,"flags",@type - same syntax as .section - match parse_section_directive(args)? { - AsmDirective::Section(dir) => AsmDirective::PushSection(dir), - _ => AsmDirective::Ignored, - } - } - ".popsection" => AsmDirective::PopSection, - ".previous" => AsmDirective::Previous, - ".subsection" => { - let n: u64 = args.trim().parse().unwrap_or(0); - AsmDirective::Subsection(n) - } - ".purgem" => { - // .purgem name — remove a macro definition; ignore for now - AsmDirective::Ignored - } - ".org" => { - // .org expressions like ". - (X) + (Y)" are used as size assertions - // in kernel alternative macros. Silently ignore them. - AsmDirective::Ignored - } - ".incbin" => { - let parts: Vec<&str> = args.splitn(3, ',').collect(); - let path = elf::parse_string_literal(parts[0].trim()) - .map_err(|e| format!(".incbin path: {}", e))?; - let path = String::from_utf8(path) - .map_err(|_| ".incbin: invalid UTF-8 in path".to_string())?; - let skip = if parts.len() > 1 { - parts[1].trim().parse::().unwrap_or(0) - } else { 0 }; - let count = if parts.len() > 2 { - Some(parts[2].trim().parse::().unwrap_or(0)) - } else { None }; - AsmDirective::Incbin { path, skip, count } - } - ".unreq" => { - // .unreq name — remove register alias; ignore - AsmDirective::Ignored - } - ".req" => { - // .req register — register alias (standalone form); ignore - AsmDirective::Ignored - } - ".float" | ".single" => { - // .float val1, val2, ... — emit 32-bit IEEE floats - let mut bytes = Vec::new(); - for part in args.split(',') { - let val: f32 = part.trim().parse() - .map_err(|_| format!("invalid .float value: {}", part.trim()))?; - bytes.extend_from_slice(&val.to_le_bytes()); - } - AsmDirective::RawBytes(bytes) - } - ".double" => { - // .double val1, val2, ... 
— emit 64-bit IEEE floats - let mut bytes = Vec::new(); - for part in args.split(',') { - let val: f64 = part.trim().parse() - .map_err(|_| format!("invalid .double value: {}", part.trim()))?; - bytes.extend_from_slice(&val.to_le_bytes()); - } - AsmDirective::RawBytes(bytes) - } - // Other directives we can safely ignore - ".file" | ".loc" | ".ident" | ".addrsig" | ".addrsig_sym" - | ".build_attributes" | ".eabi_attribute" - | ".arch" | ".arch_extension" | ".cpu" - | ".ltorg" | ".pool" => AsmDirective::Ltorg, - _ => { - return Err(format!("unsupported AArch64 assembler directive: {} {}", name, args)); - } - }; - - Ok(AsmStatement::Directive(dir)) -} - -fn parse_instruction(line: &str) -> Result { - // Split mnemonic from operands - let (mnemonic, operands_str) = if let Some(space_pos) = line.find([' ', '\t']) { - (&line[..space_pos], line[space_pos..].trim()) - } else { - (line, "") - }; - - let mnemonic = mnemonic.to_lowercase(); - let operands = parse_operands(operands_str)?; - - Ok(AsmStatement::Instruction { - mnemonic, - operands, - raw_operands: operands_str.to_string(), - }) -} - -/// Parse an operand list separated by commas, handling brackets and nested expressions. -fn parse_operands(s: &str) -> Result, String> { - if s.is_empty() { - return Ok(Vec::new()); - } - - let mut operands = Vec::new(); - let mut current = String::new(); - let mut bracket_depth = 0; - let mut brace_depth = 0; - - let chars: Vec = s.chars().collect(); - let mut i = 0; - - while i < chars.len() { - match chars[i] { - '{' => { - brace_depth += 1; - current.push('{'); - } - '}' => { - brace_depth -= 1; - current.push('}'); - } - '[' => { - bracket_depth += 1; - current.push('['); - } - ']' => { - bracket_depth -= 1; - current.push(']'); - // Check for '!' (pre-index writeback) - if i + 1 < chars.len() && chars[i + 1] == '!' 
{ - current.push('!'); - i += 1; - } - } - ',' if bracket_depth == 0 && brace_depth == 0 => { - let op = parse_single_operand(current.trim())?; - operands.push(op); - current.clear(); - } - _ => { - current.push(chars[i]); - } - } - i += 1; - } - - // Last operand - let trimmed = current.trim().to_string(); - if !trimmed.is_empty() { - let op = parse_single_operand(&trimmed)?; - operands.push(op); - } - - // Handle memory operands with post-index: [base], #offset - // This looks like two operands: Mem{base, 0} and Imm(offset) - // We need to merge them into MemPostIndex - let mut merged = Vec::new(); - let mut skip_next = false; - for j in 0..operands.len() { - if skip_next { - skip_next = false; - continue; - } - if j + 1 < operands.len() { - if let (Operand::Mem { base, offset: 0 }, Operand::Imm(off)) = (&operands[j], &operands[j + 1]) { - merged.push(Operand::MemPostIndex { base: base.clone(), offset: *off }); - skip_next = true; - continue; - } - } - merged.push(operands[j].clone()); - } - - Ok(merged) -} - -fn parse_single_operand(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Err("empty operand".to_string()); - } - - // Register list: {v0.16b}, {v0.16b, v1.16b}, etc. 
- // Register list with optional element index: {v0.s, v1.s}[0] - if s.starts_with('{') { - if s.ends_with('}') { - return parse_register_list(s); - } - // Check for {regs}[index] form - if let Some(close_brace) = s.find('}') { - let rest = s[close_brace + 1..].trim(); - if rest.starts_with('[') && rest.ends_with(']') { - let idx_str = &rest[1..rest.len() - 1]; - // Use expression evaluator to handle arithmetic like [1 - 1] - let idx_result = idx_str.parse::() - .ok() - .or_else(|| asm_expr::parse_integer_expr(idx_str).ok().and_then(|v| u32::try_from(v).ok())); - if let Some(idx) = idx_result { - let list_str = &s[..close_brace + 1]; - let inner = &list_str[1..list_str.len() - 1]; - let mut regs = Vec::new(); - for part in inner.split(',') { - let part = part.trim(); - if !part.is_empty() { - let op = parse_single_operand(part)?; - regs.push(op); - } - } - if regs.is_empty() { - return Err("empty register list".to_string()); - } - return Ok(Operand::RegListIndexed { regs, index: idx }); - } - } - } - } - - // Memory operand: [base, #offset]! (pre-index) or [base, #offset] or [base] - if s.starts_with('[') { - return parse_memory_operand(s); - } - - // Immediate: #value - if let Some(rest) = s.strip_prefix('#') { - return parse_immediate(rest); - } - - // :modifier:symbol - if s.starts_with(':') { - return parse_modifier(s); - } - - // Shift: lsl, lsr, asr, ror - let lower = s.to_lowercase(); - if lower.starts_with("lsl ") || lower.starts_with("lsr ") || lower.starts_with("asr ") || lower.starts_with("ror ") { - let kind = &lower[..3]; - let amount_str = s[4..].trim(); - let amount = if let Some(stripped) = amount_str.strip_prefix('#') { - parse_int_literal(stripped)? - } else { - parse_int_literal(amount_str)? 
- }; - return Ok(Operand::Shift { kind: kind.to_string(), amount: amount as u32 }); - } - - // Extend specifiers: sxtw, uxtw, sxtx, uxtx, sxth, uxth, sxtb, uxtb - // May appear alone (sxtw) or with shift (sxtw #2) - { - let extend_prefixes = ["sxtw", "sxtx", "sxth", "sxtb", "uxtw", "uxtx", "uxth", "uxtb"]; - for prefix in &extend_prefixes { - if lower == *prefix { - return Ok(Operand::Extend { kind: prefix.to_string(), amount: 0 }); - } - if lower.starts_with(prefix) && lower.as_bytes().get(prefix.len()) == Some(&b' ') { - let amount_str = s[prefix.len()..].trim(); - let amount = if let Some(stripped) = amount_str.strip_prefix('#') { - parse_int_literal(stripped)? - } else { - parse_int_literal(amount_str)? - }; - return Ok(Operand::Extend { kind: prefix.to_string(), amount: amount as u32 }); - } - } - } - - // Barrier options - match lower.as_str() { - "ish" | "ishld" | "ishst" | "sy" | "ld" | "st" | "osh" | "oshld" | "oshst" - | "nsh" | "nshld" | "nshst" => { - // Store original case: this name may be a C symbol colliding with an ARM keyword - return Ok(Operand::Barrier(s.to_string())); - } - _ => {} - } - - // Condition codes (for csel, csinc, etc.) - match lower.as_str() { - "eq" | "ne" | "cs" | "hs" | "cc" | "lo" | "mi" | "pl" | "vs" | "vc" - | "hi" | "ls" | "ge" | "lt" | "gt" | "le" | "al" | "nv" => { - // Store original case: this name may be a C symbol colliding with an ARM keyword - return Ok(Operand::Cond(s.to_string())); - } - _ => {} - } - - // NEON register with lane index: v0.d[1], v0.b[0], v0.s[2], etc. 
- if let Some(dot_pos) = s.find('.') { - let reg_part = &s[..dot_pos]; - let arr_part = &s[dot_pos + 1..]; - if is_register(reg_part) { - if let Some(bracket_pos) = arr_part.find('[') { - if arr_part.ends_with(']') { - let elem_size = arr_part[..bracket_pos].to_lowercase(); - let idx_str = &arr_part[bracket_pos + 1..arr_part.len() - 1]; - // Use expression evaluator to handle arithmetic like [1 - 1] - let idx_result = idx_str.parse::() - .ok() - .or_else(|| asm_expr::parse_integer_expr(idx_str).ok().and_then(|v| u32::try_from(v).ok())); - if let Some(idx) = idx_result { - if matches!(elem_size.as_str(), "b" | "h" | "s" | "d") { - return Ok(Operand::RegLane { - reg: reg_part.to_string(), - elem_size, - index: idx, - }); - } - } - } - } - } - } - - // NEON register with arrangement: v0.8b, v0.16b, v0.4s, v0.2d, etc. - if let Some(dot_pos) = s.find('.') { - let reg_part = &s[..dot_pos]; - let arr_part = &s[dot_pos + 1..]; - if is_register(reg_part) { - let arr_lower = arr_part.to_lowercase(); - if matches!(arr_lower.as_str(), "8b" | "16b" | "4h" | "8h" | "2s" | "4s" | "1d" | "2d" | "1q" - | "b" | "h" | "s" | "d") { - return Ok(Operand::RegArrangement { - reg: reg_part.to_string(), - arrangement: arr_lower, - }); - } - } - } - - // Register - if is_register(s) { - return Ok(Operand::Reg(s.to_string())); - } - - // Bare integer (without # prefix) - some inline asm constraints emit these - // e.g., "eor w9, w10, 255" or "ccmp x10, x13, 0, eq" - // Also handles negative like -1, -2, etc. - if s.chars().next().is_some_and(|c| c.is_ascii_digit() || c == '-') { - if let Ok(val) = parse_int_literal(s) { - return Ok(Operand::Imm(val)); - } - } - - // Label/symbol reference (for branches, adrp, etc.) 
- // Could be: .LBB42, func_name, symbol+offset - if let Some(plus_pos) = s.find('+') { - let sym = s[..plus_pos].trim(); - let off_str = s[plus_pos + 1..].trim(); - if !sym.is_empty() { - if let Ok(off) = parse_int_literal(off_str) { - return Ok(Operand::SymbolOffset(sym.to_string(), off)); - } - } - } - if let Some(minus_pos) = s.find('-') { - // Careful: don't confuse with label names containing '-' in label diff expressions - if minus_pos > 0 { - let sym = s[..minus_pos].trim(); - let off_str = &s[minus_pos..]; // includes the '-' - if let Ok(off) = parse_int_literal(off_str) { - return Ok(Operand::SymbolOffset(sym.to_string(), off)); - } - } - } - - // Try evaluating as a constant expression (handles parenthesized expressions like (0 + ...)) - if s.starts_with('(') || s.starts_with('~') { - if let Ok(val) = parse_int_literal(s) { - return Ok(Operand::Imm(val)); - } - } - - // Plain symbol/label - Ok(Operand::Symbol(s.to_string())) -} - -/// Parse a register list like {v0.16b} or {v0.16b, v1.16b, v2.16b, v3.16b} -/// Also handles range syntax: {v0.8b-v3.8b} which means {v0.8b, v1.8b, v2.8b, v3.8b} -fn parse_register_list(s: &str) -> Result { - let inner = &s[1..s.len() - 1].trim(); // strip { and } - let mut regs = Vec::new(); - for part in inner.split(',') { - let part = part.trim(); - if part.is_empty() { - continue; - } - // Check for range syntax: v0.8b-v2.8b - if let Some(dash_pos) = part.find('-') { - let left = part[..dash_pos].trim(); - let right = part[dash_pos + 1..].trim(); - // Both must be register arrangements - if let (Some(ldot), Some(rdot)) = (left.find('.'), right.find('.')) { - let lreg = &left[..ldot]; - let larr = &left[ldot + 1..]; - let rreg = &right[..rdot]; - let rarr = &right[rdot + 1..]; - if is_register(lreg) && is_register(rreg) && larr.eq_ignore_ascii_case(rarr) { - let start = parse_reg_num_simple(lreg); - let end = parse_reg_num_simple(rreg); - if let (Some(s_num), Some(e_num)) = (start, end) { - let prefix = if 
lreg.starts_with('v') || lreg.starts_with('V') { "v".to_string() } else { lreg.chars().next().unwrap().to_string() }; - let count = if e_num >= s_num { e_num - s_num + 1 } else { (32 - s_num) + e_num + 1 }; - for i in 0..count { - let reg_num = (s_num + i) % 32; - regs.push(Operand::RegArrangement { - reg: format!("{}{}", prefix, reg_num), - arrangement: larr.to_lowercase(), - }); - } - continue; - } - } - } - } - let op = parse_single_operand(part)?; - regs.push(op); - } - if regs.is_empty() { - return Err("empty register list".to_string()); - } - Ok(Operand::RegList(regs)) -} - -fn parse_reg_num_simple(reg: &str) -> Option { - let s = reg.trim(); - if s.len() < 2 { return None; } - let first = s.chars().next()?; - if matches!(first, 'v' | 'V' | 'x' | 'X' | 'w' | 'W' | 'q' | 'Q' | 'd' | 'D' | 's' | 'S' | 'h' | 'H' | 'b' | 'B') { - s[1..].parse::().ok() - } else { - None - } -} - -fn parse_memory_operand(s: &str) -> Result { - let has_writeback = s.ends_with('!'); - let inner = if has_writeback { - &s[1..s.len() - 2] // strip [ and ]! 
- } else { - // Find the matching ] - let end = s.find(']').ok_or("missing ] in memory operand")?; - &s[1..end] - }; - - // Split on comma - let parts: Vec<&str> = inner.splitn(2, ',').collect(); - let base = parts[0].trim().to_string(); - - if parts.len() == 1 { - // [base] - if has_writeback { - return Ok(Operand::MemPreIndex { base, offset: 0 }); - } - return Ok(Operand::Mem { base, offset: 0 }); - } - - let second = parts[1].trim(); - - // [base, #imm] or [base, imm] (bare immediate without # prefix) - if let Some(imm_str) = second.strip_prefix('#') { - match parse_int_literal(imm_str) { - Ok(offset) => { - if has_writeback { - return Ok(Operand::MemPreIndex { base, offset }); - } - return Ok(Operand::Mem { base, offset }); - } - Err(_) => { - // Expression contains symbols/labels — defer resolution - return Ok(Operand::MemExpr { base, expr: imm_str.to_string(), writeback: has_writeback }); - } - } - } - - // Handle bare immediate without # prefix (e.g., [sp, -16]! or [x0, 8]) - // Check if the second operand starts with a digit or minus sign followed by a digit - if second.starts_with('-') || second.starts_with('+') || second.bytes().next().is_some_and(|b| b.is_ascii_digit()) { - if let Ok(offset) = parse_int_literal(second) { - if has_writeback { - return Ok(Operand::MemPreIndex { base, offset }); - } - return Ok(Operand::Mem { base, offset }); - } - } - - // [base, :lo12:symbol] - if second.starts_with(':') { - // Parse the modifier embedded in memory operand - // The ] is already stripped, so just parse the modifier - let mod_op = parse_modifier(second)?; - // Return a special memory operand - we'll handle this in the encoder - // For now, return it as a reg+symbol form - match mod_op { - Operand::Modifier { kind, symbol } => { - return Ok(Operand::MemRegOffset { - base, - index: format!(":{}:{}", kind, symbol), - extend: None, - shift: None, - }); - } - Operand::ModifierOffset { kind, symbol, offset } => { - return Ok(Operand::MemRegOffset { - base, - 
index: format!(":{}:{}+{}", kind, symbol, offset), - extend: None, - shift: None, - }); - } - _ => {} - } - } - - // [base, Xm] or [base, Xm, extend #shift] - // second may be "x0" or "x0, lsl #2" or "w0, sxtw" or "w0, sxtw #2" - let sub_parts: Vec<&str> = second.splitn(2, ',').collect(); - let index_str = sub_parts[0].trim(); - if is_register(index_str) { - let (extend, shift) = if sub_parts.len() > 1 { - parse_extend_shift(sub_parts[1].trim()) - } else { - (None, None) - }; - return Ok(Operand::MemRegOffset { - base, - index: index_str.to_string(), - extend, - shift, - }); - } - - // Fallback: treat as register offset - Ok(Operand::MemRegOffset { - base, - index: second.to_string(), - extend: None, - shift: None, - }) -} - -/// Parse an extend/shift specifier like "lsl #2", "sxtw", "sxtw #0", "uxtx #3" -fn parse_extend_shift(s: &str) -> (Option, Option) { - let s = s.trim().to_lowercase(); - let parts: Vec<&str> = s.split_whitespace().collect(); - if parts.is_empty() { - return (None, None); - } - let kind = parts[0]; - let shift = if parts.len() > 1 { - let shift_str = parts[1].trim_start_matches('#'); - shift_str.parse::().ok() - } else { - None - }; - match kind { - "lsl" | "lsr" | "asr" | "ror" | "sxtw" | "sxtx" | "sxth" | "sxtb" - | "uxtw" | "uxtx" | "uxth" | "uxtb" => { - (Some(kind.to_string()), shift) - } - _ => (None, None), - } -} - -fn parse_modifier(s: &str) -> Result { - // :kind:symbol or :kind:symbol+offset - let s = s.trim_start_matches(':'); - let colon_pos = s.find(':').ok_or("malformed modifier, expected :kind:symbol")?; - let kind = s[..colon_pos].to_string(); - let rest = &s[colon_pos + 1..]; - - // Check for symbol+offset or symbol-offset - if let Some(plus_pos) = rest.find('+') { - let symbol = rest[..plus_pos].trim().to_string(); - let offset_str = rest[plus_pos + 1..].trim(); - if let Ok(offset) = parse_int_literal(offset_str) { - return Ok(Operand::ModifierOffset { kind, symbol, offset }); - } - } - if let Some(minus_pos) = 
rest.rfind('-') { - if minus_pos > 0 { - let symbol = rest[..minus_pos].trim().to_string(); - let offset_str = &rest[minus_pos..]; // includes the '-' - if let Ok(offset) = parse_int_literal(offset_str) { - return Ok(Operand::ModifierOffset { kind, symbol, offset }); - } - } - } - - Ok(Operand::Modifier { kind, symbol: rest.trim().to_string() }) -} - -fn parse_immediate(s: &str) -> Result { - // Handle :modifier:symbol as immediate (e.g., #:lo12:symbol) - if s.starts_with(':') { - return parse_modifier(s); - } - - match parse_int_literal(s) { - Ok(val) => Ok(Operand::Imm(val)), - Err(_) => { - // Expression contains symbols/labels — store as raw expression for - // deferred resolution (e.g., #(1b - .Lvector_start + 4)) - Ok(Operand::Expr(s.to_string())) - } - } -} - -fn parse_int_literal(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Err("empty integer literal".to_string()); - } - - // Use the shared expression evaluator which handles parentheses, - // operator precedence, bitwise ops, and arithmetic expressions. - asm_expr::parse_integer_expr(s) -} - -fn is_register(s: &str) -> bool { - let s = s.to_lowercase(); - // General purpose: x0-x30, w0-w30 - if (s.starts_with('x') || s.starts_with('w')) && s.len() >= 2 { - let num = &s[1..]; - if let Ok(n) = num.parse::() { - return n <= 30; - } - } - // Special registers - matches!(s.as_str(), - "sp" | "wsp" | "xzr" | "wzr" | "lr" - ) - || - // FP/SIMD: d0-d31, s0-s31, q0-q31, v0-v31, h0-h31, b0-b31 - { - if (s.starts_with('d') || s.starts_with('s') || s.starts_with('q') - || s.starts_with('v') || s.starts_with('h') || s.starts_with('b')) - && s.len() >= 2 - { - let num = &s[1..]; - if let Ok(n) = num.parse::() { - return n <= 31; - } - } - false - } -} - -// ── Directive parsing helpers ────────────────────────────────────────── - -/// Parse a `.section name,"flags",@type` directive. 
-fn parse_section_directive(args: &str) -> Result { - let parts = split_section_args(args); - let name = parts.first() - .map(|s| s.trim().trim_matches('"').to_string()) - .unwrap_or_else(|| ".text".to_string()); - let flags = parts.get(1).map(|s| s.trim().trim_matches('"').to_string()); - let section_type = parts.get(2).map(|s| s.trim().to_string()); - Ok(AsmDirective::Section(SectionDirective { name, flags, section_type })) -} - -/// Split section directive args, respecting quoted strings. -fn split_section_args(s: &str) -> Vec { - let mut parts = Vec::new(); - let mut current = String::new(); - let mut in_quotes = false; - for c in s.chars() { - if c == '"' { - in_quotes = !in_quotes; - current.push(c); - } else if c == ',' && !in_quotes { - parts.push(current.clone()); - current.clear(); - } else { - current.push(c); - } - } - if !current.is_empty() { - parts.push(current); - } - parts -} - -/// Parse `.type name, %function` or `@object` etc. -/// Also accepts space-separated form: `.type name STT_NOTYPE`. -fn parse_type_directive(args: &str) -> Result { - let (sym, kind_str) = if let Some(comma_pos) = args.find(',') { - (args[..comma_pos].trim(), args[comma_pos + 1..].trim()) - } else { - // Space-separated fallback: ".type sym STT_NOTYPE" - let parts: Vec<&str> = args.split_whitespace().collect(); - if parts.len() >= 2 { - (parts[0], parts[1]) - } else { - (args.trim(), "") - } - }; - let kind = match kind_str { - "%function" | "@function" | "STT_FUNC" => SymbolKind::Function, - "%object" | "@object" | "STT_OBJECT" => SymbolKind::Object, - "@tls_object" => SymbolKind::TlsObject, - _ => SymbolKind::NoType, - }; - Ok(AsmDirective::SymbolType(sym.to_string(), kind)) -} - -/// Parse `.size name, expr`. 
-fn parse_size_directive(args: &str) -> Result { - let parts: Vec<&str> = args.splitn(2, ',').collect(); - if parts.len() != 2 { - return Err(format!("malformed .size directive: expected 'name, expr', got '{}'", args)); - } - let sym = parts[0].trim().to_string(); - let expr_str = parts[1].trim(); - if let Some(rest) = expr_str.strip_prefix(".-") { - let label = rest.trim().to_string(); - Ok(AsmDirective::Size(sym, SizeExpr::CurrentMinusSymbol(label))) - } else if let Ok(size) = expr_str.parse::() { - Ok(AsmDirective::Size(sym, SizeExpr::Constant(size))) - } else { - // Size expressions we can't evaluate (e.g. complex expressions) are non-fatal; - // the symbol size is not critical for code correctness in static linking - Ok(AsmDirective::Ignored) - } -} - -/// Parse `.comm name, size[, align]`. -fn parse_comm_directive(args: &str) -> Result { - let parts: Vec<&str> = args.split(',').collect(); - if parts.len() < 2 { - return Err(format!("malformed .comm directive: expected 'name, size[, align]', got '{}'", args)); - } - let sym = parts[0].trim().to_string(); - let size: u64 = parts[1].trim().parse().unwrap_or(0); - let align: u64 = if parts.len() > 2 { - parts[2].trim().parse().unwrap_or(1) - } else { - 1 - }; - Ok(AsmDirective::Comm(sym, size, align)) -} - -/// Parse comma-separated data values that may be integers, symbols, or symbol expressions. 
-fn parse_data_values(s: &str) -> Result, String> { - let mut vals = Vec::new(); - for part in s.split(',') { - let trimmed = part.trim(); - if trimmed.is_empty() { - continue; - } - // Check for symbol difference: A - B or A - B + C - if let Some(dv) = try_parse_symbol_diff(trimmed) { - vals.push(dv); - continue; - } - // Try integer - if let Ok(val) = parse_data_value(trimmed) { - vals.push(DataValue::Integer(val)); - continue; - } - // Check for symbol+offset or symbol-offset - if let Some(dv) = try_parse_symbol_offset(trimmed) { - vals.push(dv); - continue; - } - // Check if it looks like an expression with operators (for deferred evaluation) - if trimmed.contains('-') || trimmed.contains('+') || trimmed.contains(">>") || trimmed.contains("<<") || trimmed.starts_with('(') { - vals.push(DataValue::Expr(trimmed.to_string())); - } else { - // Symbol reference - vals.push(DataValue::Symbol(trimmed.to_string())); - } - } - Ok(vals) -} - -/// Check if a string looks like a GNU numeric label reference (e.g. "2f", "1b", "42f"). -fn is_numeric_label_ref(s: &str) -> bool { - if s.len() < 2 { - return false; - } - let last = s.as_bytes()[s.len() - 1]; - if last != b'f' && last != b'F' && last != b'b' && last != b'B' { - return false; - } - s[..s.len() - 1].bytes().all(|b| b.is_ascii_digit()) -} - -/// Strip balanced outer parentheses from an expression, recursively. -/// e.g. "((foo) - .)" => "(foo) - ." => "foo - ." (inner parens on individual terms -/// are handled by callers). 
-fn strip_outer_parens(s: &str) -> &str { - let s = s.trim(); - if !s.starts_with('(') || !s.ends_with(')') { - return s; - } - // Check if the outer parens are actually matched (not "(a)-(b)") - let inner = &s[1..s.len() - 1]; - let mut depth = 0i32; - for ch in inner.chars() { - match ch { - '(' => depth += 1, - ')' => { - if depth == 0 { - return s; // closing paren in middle means outer parens aren't a simple wrapper - } - depth -= 1; - } - _ => {} - } - } - if depth == 0 { - strip_outer_parens(inner) // recursively strip more layers - } else { - s - } -} - -/// Strip parentheses from a symbol name: "(9997f)" => "9997f", "(__label)" => "__label" -fn strip_sym_parens(s: &str) -> &str { - let s = s.trim(); - if s.starts_with('(') && s.ends_with(')') { - let inner = &s[1..s.len() - 1]; - // Make sure there are no unbalanced parens inside - let mut depth = 0i32; - for ch in inner.chars() { - match ch { - '(' => depth += 1, - ')' => { - if depth == 0 { return s; } - depth -= 1; - } - _ => {} - } - } - if depth == 0 { strip_sym_parens(inner) } else { s } - } else { - s - } -} - -/// Try to parse a symbol difference expression like "A - B" or "A - B + C". -/// Also handles numeric label references like "662b-661b". -/// Handles parenthesized expressions like "((9997f) - .)". 
-fn try_parse_symbol_diff(expr: &str) -> Option { - let expr = strip_outer_parens(expr); - if expr.is_empty() { - return None; - } - let first_char = expr.chars().next()?; - let is_sym_start = first_char.is_ascii_alphabetic() || first_char == '_' || first_char == '.'; - let could_be_numeric_ref = first_char.is_ascii_digit(); - let could_be_paren_sym = first_char == '('; - if !is_sym_start && !could_be_numeric_ref && !could_be_paren_sym { - return None; - } - let minus_pos = find_symbol_diff_minus(expr)?; - let sym_a_raw = strip_sym_parens(expr[..minus_pos].trim()).to_string(); - let rest = expr[minus_pos + 1..].trim(); - // rest might be "B" or "B + offset" - let (sym_b, extra_addend) = if let Some(plus_pos) = rest.find('+') { - let b = strip_sym_parens(rest[..plus_pos].trim()).to_string(); - let add_str = rest[plus_pos + 1..].trim(); - let add_val: i64 = add_str.parse().unwrap_or(0); - (b, add_val) - } else { - (strip_sym_parens(rest).to_string(), 0i64) - }; - // Decompose sym_a into symbol+offset if it contains a '+' or '-' with a numeric suffix. - // ELF relocations require separate symbol name and numeric addend, so composite - // names like "cgroup_bpf_enabled_key+48" must be split into ("cgroup_bpf_enabled_key", 48). 
- let (sym_a, extra_addend) = { - let mut sym = sym_a_raw.clone(); - let mut addend = extra_addend; - if let Some(plus_idx) = sym_a_raw.rfind('+') { - let left = sym_a_raw[..plus_idx].trim(); - let right = sym_a_raw[plus_idx + 1..].trim(); - if let Ok(val) = right.parse::() { - if !left.is_empty() { - addend += val; - sym = left.to_string(); - } - } - } else if let Some(minus_idx) = sym_a_raw.rfind('-') { - if minus_idx > 0 { - let left = sym_a_raw[..minus_idx].trim(); - let right = sym_a_raw[minus_idx + 1..].trim(); - if let Ok(val) = right.parse::() { - if !left.is_empty() { - addend -= val; - sym = left.to_string(); - } - } - } - } - (sym, addend) - }; - if sym_b.is_empty() { - return None; - } - let a_first = sym_a.chars().next()?; - let a_is_sym = a_first.is_ascii_alphabetic() || a_first == '_' || a_first == '.'; - let b_first = sym_b.chars().next().unwrap(); - let b_is_sym = b_first.is_ascii_alphabetic() || b_first == '_' || b_first == '.'; - if !b_is_sym && !is_numeric_label_ref(&sym_b) { - return None; - } - // Also verify sym_a is valid (symbol or numeric label ref) - if !a_is_sym && !is_numeric_label_ref(&sym_a) { - return None; - } - if extra_addend != 0 { - Some(DataValue::SymbolDiffAddend(sym_a, sym_b, extra_addend)) - } else { - Some(DataValue::SymbolDiff(sym_a, sym_b)) - } -} - -/// Try to parse symbol+offset or symbol-offset. -/// Also handles offset+symbol (e.g., 0x9b000000 + some_symbol). 
-fn try_parse_symbol_offset(s: &str) -> Option { - for (i, c) in s.char_indices().skip(1) { - if c == '+' || c == '-' { - let left = s[..i].trim(); - let right_with_sign = &s[i..]; // includes the sign - - // Case 1: symbol+offset or symbol-offset - if let Ok(offset) = parse_int_literal(right_with_sign) { - if !left.is_empty() && !left.contains(' ') { - return Some(DataValue::SymbolOffset(left.to_string(), offset)); - } - } - - // Case 2: offset+symbol (e.g., "0x9b000000 + some_symbol") - only for '+' - if c == '+' { - if let Ok(offset) = parse_int_literal(left) { - let sym = right_with_sign[1..].trim(); // skip the '+' - if !sym.is_empty() - && !sym.contains(' ') - && sym - .bytes() - .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.') - { - return Some(DataValue::SymbolOffset(sym.to_string(), offset)); - } - } - } - } - } - None -} - -/// Find the position of the '-' operator in a symbol difference expression. -/// Skips over parenthesized sub-expressions. -fn find_symbol_diff_minus(expr: &str) -> Option { - let bytes = expr.as_bytes(); - let len = bytes.len(); - let mut i = 1; - let mut depth = 0i32; - while i < len { - match bytes[i] { - b'(' => { depth += 1; i += 1; continue; } - b')' => { depth -= 1; i += 1; continue; } - _ => {} - } - if depth > 0 { - i += 1; - continue; - } - if bytes[i] == b'-' { - let left_char = bytes[i - 1]; - let left_ok = left_char.is_ascii_alphanumeric() || left_char == b'_' || left_char == b'.' || left_char == b' ' || left_char == b')'; - let right_start = expr[i + 1..].trim_start(); - if !right_start.is_empty() { - let right_char = right_start.as_bytes()[0]; - let right_ok = right_char.is_ascii_alphabetic() || right_char == b'_' || right_char == b'.' || right_char.is_ascii_digit() || right_char == b'('; - if left_ok && right_ok { - return Some(i); - } - } - } - i += 1; - } - None -} - -/// Parse a data value (integer literal, possibly negative). 
-fn parse_data_value(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Ok(0); - } - asm_expr::parse_integer_expr(s) -} diff --git a/src/backend/arm/codegen/README.md b/src/backend/arm/codegen/README.md deleted file mode 100644 index ac42f6ed5b..0000000000 --- a/src/backend/arm/codegen/README.md +++ /dev/null @@ -1,795 +0,0 @@ -# AArch64 (ARM64) Backend - -Code generation targeting the AArch64 architecture with the AAPCS64 (Arm Architecture -Procedure Call Standard for 64-bit) calling convention. The backend translates the -compiler's intermediate representation into AArch64 assembly text. A post-codegen -peephole optimizer cleans up redundant patterns inherent to the stack-based code -generation strategy. The backend includes a builtin assembler and linker -(enabled by default) supporting both static and dynamic linking with -IFUNC/IPLT and TLS support, producing ELF executables and shared -libraries directly without requiring an external toolchain. - ---- - -## Table of Contents - -1. [Architecture Overview](#architecture-overview) -2. [File Inventory](#file-inventory) -3. [Calling Convention (AAPCS64)](#calling-convention-aapcs64) -4. [Register Allocation](#register-allocation) -5. [Stack Frame Layout](#stack-frame-layout) -6. [Addressing Modes](#addressing-modes) -7. [F128 Quad-Precision Handling](#f128-quad-precision-handling) -8. [128-bit Integer Operations](#128-bit-integer-operations) -9. [Atomic Operations](#atomic-operations) -10. [NEON/SIMD Intrinsics](#neonsimd-intrinsics) -11. [Inline Assembly Support](#inline-assembly-support) -12. [Peephole Optimizer](#peephole-optimizer) -13. [Codegen Options](#codegen-options) -14. [Key Design Decisions](#key-design-decisions) - ---- - -## Architecture Overview - -The AArch64 backend is structured as an implementation of the `ArchCodegen` trait, -which defines the interface between the shared codegen framework and architecture-specific -emission. 
The central type is `ArmCodegen`, which carries all per-function state: -the current frame size, register assignments, variadic parameter metadata, and a -reference to the shared `CodegenState` that accumulates emitted assembly lines. - -The codegen follows a **stack-based accumulator model**: most values pass through the -`x0` register (the "accumulator") and are spilled to stack slots as needed. A -lightweight register allocator assigns frequently-used values to callee-saved physical -registers (`x20`-`x28`) and a small set of caller-saved registers (`x13`, `x14`) to -reduce stack traffic. The peephole optimizer runs as a final pass over the emitted -assembly text to eliminate redundant store/load pairs, dead moves, and other patterns -that the accumulator model produces. - ---- - -## File Inventory - -All codegen source files reside under `src/backend/arm/codegen/`. - -| File | Responsibility | -|------|---------------| -| `emit.rs` | Core `ArmCodegen` struct, register pool definitions (`ARM_CALLEE_SAVED`, `ARM_CALLER_SAVED`), ALU mnemonic mapping, condition code tables, immediate encoding helpers (`const_as_imm12`, `const_as_power_of_2`), prescan of inline asm for callee-saved register discovery, and integer comparison emission with optimized reg-vs-imm12 and reg-vs-reg paths. | -| `mod.rs` | Module declarations for the codegen submodules. | -| `prologue.rs` | Stack space calculation, prologue/epilogue emission (frame pointer/link register pair save, callee-saved register save/restore via `STP`/`LDP`), parameter store dispatch, and variadic register save area layout. | -| `calls.rs` | Call ABI configuration (`CallAbiConfig` with 8 integer regs, 8 float regs, I128 pair alignment), stack argument marshalling (scalars, I128, F128, structs by value), GP-to-temp staging, FP register argument loading, indirect call via `BLR x17`, and post-call stack cleanup. 
| -| `memory.rs` | Load/store emission with F128 specialization, typed slot access, pointer indirection, GEP (get-element-pointer) address computation, `memcpy` helpers, dynamic alloca support (`sub sp` / `mov sp`), and alignment rounding. | -| `alu.rs` | Integer ALU operations: binary ops (add, sub, mul, div, rem, and, or, xor, shifts) with strength-reduction for power-of-2 division/remainder, register-direct fast paths for callee-saved operands, and unary ops (neg, not, clz, ctz, bswap, popcount via NEON `CNT`/`UADDLV`). | -| `comparison.rs` | Integer comparison (`CMP` + `CSET`), floating-point comparison (`FCMP` + `CSET`), F128 comparison via soft-float libcalls, fused compare-and-branch emission, and `CSEL`-based select. | -| `float_ops.rs` | Floating-point binary operations (`FADD`, `FSUB`, `FMUL`, `FDIV`) for F32/F64 with `FMOV` transfers between GP and FP registers, and F128 negation. | -| `cast_ops.rs` | Type casts: float-to-int (`FCVTZS`/`FCVTZU`), int-to-float (`SCVTF`/`UCVTF`), float-to-float (`FCVT`), sign/zero extension, and truncation with appropriate `SXTB`/`SXTH`/`SXTW`/`AND` masking. | -| `f128.rs` | IEEE 754 binary128 (quad-precision) support implementing the `F128SoftFloat` trait. Provides AArch64-specific primitives for Q-register loads/stores, constant materialization into `v0`, and calls to compiler-rt/libgcc soft-float routines. | -| `i128_ops.rs` | 128-bit integer arithmetic: add/sub via `ADDS`/`ADC`/`SUBS`/`SBC`, multiplication via `MUL`/`UMULH`/`MADD`, bitwise ops, shifts (with >64 and ==64 special cases), division/remainder via `__divti3`/`__modti3` libcalls, float-to-i128 and i128-to-float conversions, and comparisons. | -| `atomics.rs` | Atomic operations: `LDXR`/`STXR` exclusive load/store loops for RMW (exchange, add, sub, and, or, xor, nand, test-and-set), compare-and-exchange with `CLREX` on failure, atomic loads (`LDAR`/`LDARB`/`LDARH`), atomic stores (`STLR`/`STLRB`/`STLRH`), and fences (`DMB ISH`/`ISHLD`/`ISHST`). 
| -| `intrinsics.rs` | NEON/SIMD intrinsic emission (SSE-equivalent operations), hardware builtins (CRC32, `fsqrt`, `fabs`), memory barriers (`DMB`), non-temporal stores, cache maintenance (`DC CIVAC`), `__builtin_frame_address`, `__builtin_return_address`, and `__builtin_thread_pointer`. | -| `globals.rs` | Global symbol addressing: `ADRP`+`ADD :lo12:` for direct access (used for all regular symbols, including in PIC/PIE mode), `ADRP`+`LDR :got:` for GOT-indirect access (weak extern symbols only), and TLS access via `MRS TPIDR_EL0` + `tprel_hi12`/`tprel_lo12_nc`. | -| `returns.rs` | Return value handling: integer returns in `x0`, F32 in `s0`, F64 in `d0`, F128 in `q0` (with `__extenddftf2` promotion), I128 in `x0:x1`, struct return via `x8` (sret pointer), and second-slot float returns in `d1`/`s1`. | -| `variadic.rs` | Variadic function support: `va_arg` implementation (GP register save area at offset 24, FP register save area at offset 28), `va_start` initialization of the `va_list` struct (stack pointer, `gr_top`, `vr_top`, `gr_offs`, `vr_offs`), and `va_copy`. | -| `inline_asm.rs` | Inline assembly template substitution: `%0`/`%[name]` positional and named operand references, `%w`/`%x`/`%s`/`%d`/`%q`/`%c`/`%a` register modifiers, GCC `r0`-`r30` alias normalization, and `%%` literal percent. | -| `asm_emitter.rs` | `InlineAsmEmitter` trait implementation: constraint classification (`r`=GP, `w`=FP, `m`/`Q`=memory, `i`/`n`=immediate, `{reg}`=specific), scratch register allocation from `ARM_GP_SCRATCH` and `ARM_FP_SCRATCH` pools, operand loading/storing, memory operand resolution, AArch64 logical immediate validation (`K`/`L` constraints), and output register writeback. | -| `peephole.rs` | Multi-phase post-codegen peephole optimizer operating on assembly text lines. 
| - ---- - -## Calling Convention (AAPCS64) - -The backend implements the standard AAPCS64 calling convention: - -### Parameter Passing - -| Category | Registers | Notes | -|----------|-----------|-------| -| Integer arguments | `x0`-`x7` | Up to 8 GP registers | -| Floating-point arguments | `d0`-`d7` (or `s0`-`s7` for F32) | Up to 8 FP/SIMD registers | -| F128 arguments | `q0`-`q7` | Passed in NEON Q registers | -| I128 arguments | Aligned register pair (e.g., `x0:x1`, `x2:x3`) | Must start on even-numbered register | -| Indirect result (sret) | `x8` | Pointer to caller-allocated return buffer | -| Stack arguments | `[sp, #0]`, `[sp, #8]`, ... | 8-byte aligned, 16-byte for I128/F128 | - -Because sret uses the dedicated `x8` register (not `x0`), it does not consume a GP -argument slot. The initial classification assigns sret to IntReg(0) like other targets, -but both caller (`emit_call` in `traits.rs`) and callee (`classify_params_full` in -`call_abi.rs`) then shift GP indices down by 1 and promote the first stack-overflow GP -argument to `x7`. The callee uses `sret_shift=1` in `emit_store_gp_params` to map the -promoted reg_idx back to physical registers. 
- -### Return Values - -| Type | Location | -|------|----------| -| Integer (up to 64-bit) | `x0` | -| I128 | `x0` (low), `x1` (high) | -| F32 | `s0` | -| F64 | `d0` | -| F128 | `q0` | -| Struct (small) | `x0`/`x0:x1` | -| Struct (large, >16 bytes) | Via `x8` sret pointer | - -### Special Registers - -| Register | Role | -|----------|------| -| `x29` | Frame pointer (FP) | -| `x30` | Link register (LR, return address) | -| `x8` | Indirect result location (sret) | -| `x9` | Primary address/scratch register | -| `x10` | memcpy source register, secondary scratch | -| `x11` | memcpy loop counter | -| `x12` | memcpy byte transfer | -| `x15` | F128 large-offset scratch | -| `x16`/`x17` | Intra-procedure-call scratch (IP0/IP1); `x17` used for indirect calls (`BLR x17`) | -| `x18` | Platform-reserved (not used) | -| `sp` | Stack pointer (must remain 16-byte aligned) | - -### Variadic Functions - -For variadic (varargs) functions, the prologue saves the remaining argument registers -to dedicated save areas on the stack: - -- **GP save area**: `x0`-`x7` saved to a 64-byte region (8 registers x 8 bytes) -- **FP save area**: `q0`-`q7` saved to a 128-byte region (8 registers x 16 bytes) - -The `va_list` struct is initialized by `va_start` with five fields: - -``` -struct va_list { - __stack: *void, // offset 0: pointer to next stack argument - __gr_top: *void, // offset 8: top of GP register save area - __vr_top: *void, // offset 16: top of FP register save area - __gr_offs: i32, // offset 24: negative offset from gr_top to next GP reg - __vr_offs: i32, // offset 28: negative offset from vr_top to next FP reg -} -``` - -Named (non-variadic) parameters that consume GP or FP registers are accounted for by -adjusting `__gr_offs` and `__vr_offs` so that `va_arg` skips them. When -`-mgeneral-regs-only` is active, the FP save area is skipped entirely and `__vr_offs` -is set to zero. 
- ---- - -## Register Allocation - -The backend uses a lightweight register allocator that runs before code emission. It -assigns IR values to physical registers to reduce stack traffic. Values that remain -unassigned use the accumulator (`x0`) with stack spill/reload. - -### Callee-Saved Pool - -Nine callee-saved registers are available for allocation: - -``` -x20, x21, x22, x23, x24, x25, x26, x27, x28 -``` - -These survive across function calls, so values assigned to them do not need spilling -around call sites. `x19` is excluded (reserved by some ABIs). `x29` is the frame -pointer and `x30` is the link register. - -### Caller-Saved Pool - -Two caller-saved registers are available for allocation: - -``` -x13, x14 -``` - -These are a subset of the AAPCS64 "corruptible" registers (`x9`-`x15`). The remaining -corruptible registers are excluded because they have dedicated scratch uses in the -codegen: - -| Register | Hardcoded Use | -|----------|---------------| -| `x9` | Primary address register | -| `x10` | memcpy source, secondary scratch | -| `x11` | memcpy loop counter | -| `x12` | memcpy byte transfer | -| `x15` | F128 large-offset scratch | - -Caller-saved allocation assigns values whose live ranges do **not** span any function -call. Additionally, functions containing inline assembly have the caller-saved pool -disabled entirely (inline asm uses `x13`/`x14` as part of `ARM_GP_SCRATCH`). - -### F128 Interaction - -When a function contains F128 (quad-precision) operations, the caller-saved register -pool is cleared. This is because F128 arithmetic requires soft-float library calls -(e.g., `__addtf3`), which clobber caller-saved registers unpredictably across the -F128 operation's live range. - -### Callee-Saved Register Save/Restore - -Used callee-saved registers are saved in the prologue and restored in the epilogue. -Pairs of registers are saved/restored with `STP`/`LDP` instructions for efficiency; -an odd register uses a single `STR`/`LDR`. 
- ---- - -## Stack Frame Layout - -Every function prologue establishes a standard AArch64 stack frame. The stack pointer -must remain 16-byte aligned at all times. - -### Prologue Sequence - -The prologue uses one of three code paths depending on frame size: - -**Small frame (≤504 bytes):** -```asm - stp x29, x30, [sp, #-FRAME_SIZE]! // save FP and LR, allocate frame - mov x29, sp // establish frame pointer -``` - -**Medium frame (505–4096 bytes):** -```asm - sub sp, sp, #FRAME_SIZE // allocate frame (via emit_sub_sp) - stp x29, x30, [sp] // save FP and LR at bottom of frame - mov x29, sp // establish frame pointer -``` - -**Large frame (>4096 bytes) — with stack probing:** -```asm - mov x17, #FRAME_SIZE // materialize frame size -.Lstack_probe_N: - sub sp, sp, #4096 // step down one page - str xzr, [sp] // probe (touch page to grow stack) - sub x17, x17, #4096 - cmp x17, #4096 - b.hi .Lstack_probe_N // repeat for remaining pages - sub sp, sp, x17 // allocate residual bytes - str xzr, [sp] // probe final page - stp x29, x30, [sp] // save FP and LR - mov x29, sp // establish frame pointer -``` - -Stack probing ensures the kernel can grow the stack mapping page-by-page. Without -probing, a single large `sub sp` can skip guard pages and cause a segfault. - -For variadic functions, the prologue additionally saves `x0`-`x7` (and optionally -`q0`-`q7` unless `-mgeneral-regs-only`) to the register save areas. - -### Frame Organization - -``` -High addresses (caller's frame) - +----------------------------------+ - | Caller's stack arguments | [x29 + frame_size + ...] 
- +----------------------------------+ <-- previous sp (before prologue) - | Callee-saved register save area | [sp + callee_save_offset] - | (x20-x28 as needed, via STP) | - +----------------------------------+ - | Variadic FP save area (128 bytes)| [sp + va_fp_save_offset] (if variadic) - | Variadic GP save area (64 bytes) | [sp + va_gp_save_offset] (if variadic) - +----------------------------------+ - | Local variables / spill slots | [sp + 16...] - | (8-byte minimum alignment, | - | respecting alloca alignment) | - +----------------------------------+ - | Saved x30 (LR) | [sp + 8] = [x29 + 8] - | Saved x29 (FP) | [sp + 0] = [x29] - +----------------------------------+ <-- sp = x29 (frame pointer) -Low addresses -``` - -### Frame Size Calculation - -The raw frame size is computed by summing: - -1. Space for all local variables and alloca slots (8-byte granularity, respecting - alignment requirements up to the alloca's specified alignment) -2. Variadic register save areas (64 bytes GP + 128 bytes FP, if applicable) -3. Callee-saved register save slots (8 bytes per register) - -The total is then rounded up to a 16-byte boundary (`(raw + 15) & !15`) to maintain -the AArch64 stack alignment requirement. - ---- - -## Addressing Modes - -### Global Symbols - -The backend supports three addressing modes for global symbols: - -**Direct (PC-relative, used for all regular symbols including PIC/PIE):** -```asm - adrp x0, symbol // load 4KB-aligned page address - add x0, x0, :lo12:symbol // add page offset -``` - -**GOT-indirect (weak extern symbols only):** -```asm - adrp x0, :got:symbol // load page of GOT entry - ldr x0, [x0, :got_lo12:symbol] // load address from GOT -``` - -On AArch64, GOT-indirect addressing is only used for weak extern symbols. -Regular extern symbols use direct PC-relative ADRP+ADD even in PIC/PIE mode, -since this is inherently position-independent and works correctly for -statically-linked executables and early boot code (pre-MMU). 
- -**Thread-Local Storage (TLS):** -```asm - mrs x0, tpidr_el0 // read thread pointer - add x0, x0, :tprel_hi12:sym // add high 12 bits of TLS offset - add x0, x0, :tprel_lo12_nc:sym // add low 12 bits of TLS offset -``` - -### Stack-Relative Addressing - -Local variables are accessed via SP-relative offsets: - -```asm - ldr x0, [sp, #offset] // load from stack slot - str x0, [sp, #offset] // store to stack slot -``` - -When offsets exceed the immediate encoding range (which varies by instruction), a -large-immediate helper materializes the offset into a scratch register: - -```asm - movz x17, #imm16 // low 16 bits - movk x17, #imm16, lsl #16 // (if needed) bits 16-31 - movk x17, #imm16, lsl #32 // (if needed) bits 32-47 - add x17, sp, x17 - ldr x0, [x17] -``` - -### Immediate Encoding - -The backend recognizes and exploits AArch64 immediate encoding constraints: - -- **imm12** (0-4095): used by `ADD`/`SUB`/`CMP` instructions. The `const_as_imm12` - helper detects operands that fit, avoiding a register load. -- **Logical immediates**: bitmask patterns encodable in the 13-bit `N:immr:imms` field - used by `AND`/`ORR`/`EOR`/`TST`. Validated by `is_valid_aarch64_logical_immediate`. -- **Power-of-2 strength reduction**: `UDiv` by a power of 2 is lowered to `LSR`, - and `URem` by a power of 2 is lowered to `AND` with a bitmask. - ---- - -## F128 Quad-Precision Handling - -On AArch64, `long double` is IEEE 754 binary128 (16 bytes). The hardware has no -quad-precision floating-point instructions, so all F128 arithmetic uses soft-float -library calls from compiler-rt or libgcc. - -### Storage - -F128 values are stored in 16-byte stack slots or NEON Q registers (`q0`-`q7`). The -backend tracks the "source" of each F128 value (which alloca/slot and offset it was -loaded from) to enable full-precision reloads for comparisons and conversions, avoiding -the lossy round-trip through a double-precision truncation. 
- -### Library Calls - -| Operation | Library Function | -|-----------|-----------------| -| Addition | `__addtf3` | -| Subtraction | `__subtf3` | -| Multiplication | `__multf3` | -| Division | `__divtf3` | -| Equality comparison | `__eqtf2` | -| Less-than | `__lttf2` | -| Less-or-equal | `__letf2` | -| Greater-than | `__gttf2` | -| Greater-or-equal | `__getf2` | -| F64 to F128 | `__extenddftf2` | -| F128 to F64 | `__trunctfdf2` | - -### ABI - -F128 values are passed and returned in Q registers per AAPCS64. Operands are loaded -into `q0` (and `q1` for binary operations) before the library call. The result is -returned in `q0`. For the compiler's internal accumulator representation (which is -64-bit), F128 results are truncated to `double` via `__trunctfdf2` after the operation, -with the full-precision value retained in its stack slot for subsequent F128 operations. - -### NEON Register Transfers - -Moving 128-bit values between Q registers uses the NEON bytewise move: - -```asm - mov v1.16b, v0.16b // q0 -> q1 -``` - -Loading F128 constants involves materializing the two 64-bit halves: - -```asm - fmov d0, x0 // load low 64 bits into d0 (lower half of q0) - mov v0.d[1], x1 // insert high 64 bits into upper half of q0 -``` - ---- - -## 128-bit Integer Operations - -I128 values are represented as register pairs (`x0:x1` where `x0` = low, `x1` = high) -or as two adjacent 8-byte stack slots. - -### Inline Arithmetic - -| Operation | Implementation | -|-----------|---------------| -| Add | `adds x0, x2, x4` / `adc x1, x3, x5` | -| Sub | `subs x0, x2, x4` / `sbc x1, x3, x5` | -| Mul | `mul x0, x2, x4` / `umulh x1, x2, x4` / `madd x1, x3, x4, x1` / `madd x1, x2, x5, x1` | -| Neg | `mvn` + `adds` + `adc` (two's complement) | -| Not | `mvn x0, x0` / `mvn x1, x1` | -| And/Or/Xor | Parallel per-half operations | - -### Shifts - -128-bit shifts use branching sequences that handle three cases: shift amount is zero, -less than 64, and 64 or greater. 
Constant shift amounts use branchless sequences with -`LSL`/`LSR`/`ASR` and `ORR` combined-shift operands. - -### Division and Remainder - -128-bit division and remainder call compiler-rt/libgcc library functions: - -| Operation | Library Function | -|-----------|-----------------| -| Signed division | `__divti3` | -| Unsigned division | `__udivti3` | -| Signed remainder | `__modti3` | -| Unsigned remainder | `__umodti3` | - -### Float Conversions - -| Conversion | Library Function | -|-----------|-----------------| -| F64 to I128 (signed) | `__fixdfti` | -| F32 to I128 (signed) | `__fixsfti` | -| F64 to I128 (unsigned) | `__fixunsdfti` | -| F32 to I128 (unsigned) | `__fixunssfti` | -| I128 (signed) to F64 | `__floattidf` | -| I128 (signed) to F32 | `__floattisf` | -| I128 (unsigned) to F64 | `__floatuntidf` | -| I128 (unsigned) to F32 | `__floatuntisf` | - -### Comparisons - -Equality/inequality uses XOR-and-OR reduction: - -```asm - eor x0, x2, x4 // XOR low halves - eor x1, x3, x5 // XOR high halves - orr x0, x0, x1 // combine - cmp x0, #0 - cset x0, eq/ne -``` - -Ordered comparisons compare high halves first, then low halves on equality. - ---- - -## Atomic Operations - -The backend implements atomics using the ARMv8 exclusive monitor mechanism -(load-exclusive / store-exclusive pairs). - -### Instruction Selection - -The exclusive instruction variant is selected based on type size and memory ordering: - -| Size | Load-Exclusive | Store-Exclusive | -|------|---------------|----------------| -| 8-bit | `ldxrb` / `ldaxrb` | `stxrb` / `stlxrb` | -| 16-bit | `ldxrh` / `ldaxrh` | `stxrh` / `stlxrh` | -| 32-bit | `ldxr` / `ldaxr` (w-reg) | `stxr` / `stlxr` (w-reg) | -| 64-bit | `ldxr` / `ldaxr` (x-reg) | `stxr` / `stlxr` (x-reg) | - -Acquire semantics use `LDAXR`; release semantics use `STLXR`. 
- -### Atomic RMW Pattern - -All atomic read-modify-write operations follow the LL/SC (load-linked / store-conditional) -loop pattern: - -```asm -.Latomic_N: - ldxr x0, [x1] // load-exclusive old value - <op> x3, x0, x2 // compute new value (<op> = add, sub, and, orr, eor, or the nand sequence) - stxr w4, x3, [x1] // store-exclusive new value - cbnz w4, .Latomic_N // retry if exclusive monitor lost -``` - -### Compare-and-Exchange - -```asm -.Lcas_loop_N: - ldxr x0, [x1] // load current value - cmp x0, x2 // compare with expected - b.ne .Lcas_fail_N // mismatch: fail - stxr w4, x3, [x1] // try to store desired - cbnz w4, .Lcas_loop_N // retry if monitor lost - b .Lcas_done_N -.Lcas_fail_N: - clrex // clear exclusive monitor -.Lcas_done_N: -``` - -### Atomic Loads and Stores - -Atomic loads use `LDAR` (acquire) or plain `LDR` (relaxed). Atomic stores use `STLR` -(release) or plain `STR` (relaxed). Byte and halfword variants (`LDARB`/`LDARH`/ -`STLRB`/`STLRH`) are used for sub-word types. - -### Fences - -| Ordering | Instruction | -|----------|------------| -| Acquire | `dmb ishld` | -| Release | `dmb ishst` | -| AcqRel / SeqCst | `dmb ish` | -| Relaxed | (none) | - ---- - -## NEON/SIMD Intrinsics - -The backend provides NEON equivalents for common SSE intrinsics, operating on 128-bit -vectors through Q registers. 
- -### Binary Vector Operations - -128-bit binary operations follow a common pattern: - -```asm - ldr q0, [x0] // load first operand - ldr q1, [x1] // load second operand - <op> v0.16b, v0.16b, v1.16b // apply operation <op> bytewise - str q0, [x0] // store result -``` - -Supported operations include: - -| Intrinsic | NEON Instruction | Description | -|-----------|-----------------|-------------| -| `pcmpeqb` | `cmeq v.16b` | Byte-wise equality comparison | -| `pcmpeqd` | `cmeq v.4s` | 32-bit lane equality | -| `psubusb` | `uqsub v.16b` | Unsigned saturating byte subtract | -| `psubsb` | `sqsub v.16b` | Signed saturating byte subtract | -| `por` | `orr v.16b` | Bitwise OR | -| `pand` | `and v.16b` | Bitwise AND | -| `pxor` | `eor v.16b` | Bitwise XOR | -| `loaddqu` | `ldr q` | 128-bit unaligned load | -| `storedqu` | `str q` | 128-bit unaligned store | - -### Scalar Operations - -| Intrinsic | NEON Instruction | -|-----------|-----------------| -| `sqrt(f64)` | `fsqrt d0, d0` | -| `sqrt(f32)` | `fsqrt s0, s0` | -| `fabs(f64)` | `fabs d0, d0` | -| `fabs(f32)` | `fabs s0, s0` | - -### Special Intrinsics - -- **`pmovmskb`**: Emulated via `USHR` + multiply-by-bit-position + `ADDV` reduction - (no direct NEON equivalent) -- **`set_epi8`/`set_epi32`**: Vector broadcast via `DUP v.16b`/`DUP v.4s` -- **`crc32c`**: Hardware CRC32C via `CRC32CB`/`CRC32CH`/`CRC32CW`/`CRC32CX` -- **Barriers**: `DMB ISH` (full), `DMB ISHST` (store), `YIELD` (pause hint) -- **Cache**: `DC CIVAC` (clean and invalidate by VA to Point of Coherency) -- **Non-temporal stores**: Mapped to regular `STR` (ARM has no direct NT store hint - for general-purpose data) -- **x86-only intrinsics** (AES-NI, CLMUL, `pslldq`, `pshufd`, `paddw`, `pmulhw`, - etc.): Stubbed to zero their output since no NEON equivalents are implemented - -### Popcount - -Integer popcount uses NEON byte-count instructions: - -```asm - fmov d0, x0 // move integer to FP register - cnt v0.8b, v0.8b // count set bits per byte - uaddlv h0, v0.8b // 
sum all byte counts - fmov w0, s0 // move result back to GP register -``` - ---- - -## Inline Assembly Support - -The backend implements the GCC-compatible inline assembly interface with AArch64-specific -constraint handling and register formatting. - -### Constraint Classification - -| Constraint | Kind | Description | -|-----------|------|-------------| -| `r` | GP register | General-purpose register (`x0`-`x30`) | -| `w` | FP register | Floating-point/SIMD register (`v0`-`v31`) | -| `m`, `Q` | Memory | Memory operand (stack slot or indirect pointer) | -| `i`, `n` | Immediate | Compile-time constant | -| `I` | Immediate | Unsigned 12-bit (0-4095) for `ADD`/`SUB` | -| `K` | Immediate | 64-bit logical immediate (bitmask pattern) | -| `L` | Immediate | 32-bit logical immediate | -| `g` | General | GP register, memory, or immediate | -| `{reg}` | Specific | Named register (e.g., `{x0}`, `{d5}`) | -| `0`-`9` | Tied | Tied to operand N | - -### Register Modifiers - -Template operands support modifiers that select the register view: - -| Modifier | Effect | Example | -|----------|--------|---------| -| `%w` | 32-bit GP | `%w0` produces `w20` | -| `%x` | 64-bit GP | `%x0` produces `x20` | -| `%s` | 32-bit FP scalar | `%s0` produces `s16` | -| `%d` | 64-bit FP scalar | `%d0` produces `d16` | -| `%q` | 128-bit FP vector | `%q0` produces `q16` | -| `%h` | 16-bit FP half | `%h0` produces `h16` (half-precision view) | -| `%b` | 8-bit FP byte | `%b0` produces `b16` (byte-width view) | -| `%c` | Raw constant | No `#` prefix (for data directives) | -| `%a` | Address reference | `[reg]` form for prefetch | - -### Scratch Register Pools - -The inline asm emitter allocates scratch registers from two pools: - -- **GP scratch**: `x9, x10, x11, x12, x13, x14, x15, x19, x20, x21` - (caller-saved first, then callee-saved as overflow) -- **FP scratch**: `v16, v17, v18, v19, v20, v21, v22, v23, v24, v25` - (caller-saved NEON registers; `d8`-`d15` are callee-saved and avoided) - -When 
inline asm clobbers enough caller-saved registers to force allocation into -callee-saved registers (`x19`-`x21`), the prologue prescan (`prescan_inline_asm_callee_saved`) -detects this ahead of time so the prologue can save/restore them. - -### GCC Register Aliases - -The backend normalizes GCC's `r0`-`r30` aliases to `x0`-`x30` for AArch64, matching -the convention used extensively in Linux kernel inline assembly (e.g., `register -unsigned long r0 asm("r0")` in arm-smccc.h). - ---- - -## Peephole Optimizer - -The peephole optimizer runs as a post-processing pass over the emitted assembly text. -It pre-parses every line into a `LineKind` enum for efficient pattern matching using -integer/enum comparisons rather than repeated string parsing. - -### Pass Structure - -The optimizer runs in three phases: - -1. **Iterative local passes** (up to 8 rounds): all core pattern elimination passes - (store/load elimination, redundant branches, self-moves, move chains, - branch-over-branch fusion) -2. **Global passes**: register copy propagation and dead store elimination -3. **Local cleanup** (up to 4 rounds): a subset of the local passes (same as Phase 1 - but without branch-over-branch fusion) to mop up patterns exposed by global passes - -### Optimization Catalog - -**1. Adjacent Store/Load Elimination** - -When a `str xN, [sp, #off]` is immediately followed by `ldr xN, [sp, #off]` (same -register, same offset), the load is redundant and eliminated. If the registers differ -(`str xN` then `ldr xM`), the load is replaced with `mov xM, xN`. - -Also handles `str wN, [sp, #off]` followed by `ldrsw xN, [sp, #off]` (sign-extending -load after 32-bit store). - -**2. Redundant Branch Elimination** - -An unconditional `b .label` where `.label:` is the immediately next non-empty line -is a no-op (natural fall-through) and is eliminated. - -**3. Self-Move Elimination** - -`mov xN, xN` (64-bit) is a no-op and removed. 
Importantly, `mov wN, wN` (32-bit) is -**not** eliminated because it zeros the upper 32 bits of the 64-bit register, which -is a meaningful operation. - -**4. Move Chain Optimization** - -The sequence `mov A, B; mov C, A` is transformed to `mov C, B`, which enables the -first `mov` to become dead if `A` has no other uses. - -**5. Branch-Over-Branch Fusion** - -The pattern: - -```asm - b.cc .Lskip - b .target -.Lskip: -``` - -is fused into a single inverted conditional branch: - -```asm - b.!cc .target -``` - -**6. Move-Immediate Chain** - -The sequence `mov xN, #imm; mov xM, xN` where `xN` is a scratch register is collapsed -to `mov xM, #imm` when safe. - -**7. Register Copy Propagation** (global) - -Propagates register copies across basic blocks, replacing uses of the copy destination -with the original source when the source is still live. - -**8. Dead Store Elimination** (global) - -Removes `str` instructions to stack slots that are overwritten before being read. - ---- - -## Codegen Options - -The backend supports several command-line-driven options: - -| Option | Effect | -|--------|--------| -| `-fPIC` / `-fpie` | Enable position-independent code generation. On AArch64, regular extern symbols use direct PC-relative addressing (`ADRP` + `ADD`), which works correctly for both PIC executables and early boot code (pre-MMU). Only weak extern symbols use GOT-indirect addressing (`ADRP` + `LDR :got:`). | -| `-mgeneral-regs-only` | Disable FP/SIMD register use in variadic prologues. The FP register save area is skipped, and `__vr_offs` is set to 0. | -| `-fno-jump-tables` | Disable jump table emission for `switch` statements. | -| Patchable function entry | Reserved NOP sled at function entry for runtime patching. | - ---- - -## Key Design Decisions - -### Accumulator-Based Code Generation - -The backend uses a stack-based accumulator model where `x0` is the primary working -register. 
This simplifies instruction selection (every operation follows a uniform -load-operate-store pattern) at the cost of generating redundant moves and stack -traffic. The peephole optimizer is specifically designed to recover much of this cost -by eliminating the redundant patterns in a post-processing pass. - -### Register Allocation Strategy - -Rather than implementing a full graph-coloring register allocator, the backend uses a -simple but effective two-pool approach: callee-saved registers for values that live -across calls, and a small set of caller-saved registers for call-free live ranges. -This keeps the allocator simple while providing meaningful speedups for hot variables. -The allocator is integrated with inline asm handling: the prescan mechanism ensures -callee-saved registers used by inline asm scratch allocation are properly saved in -the prologue, even though the prologue is emitted before inline asm codegen runs. - -### F128 Dual-Track Representation - -F128 values carry both a full-precision 16-byte representation (in stack slots and Q -registers) and a truncated `double` approximation in the accumulator. This -dual-track approach lets most of the codegen infrastructure (which assumes 64-bit -accumulator values) work unchanged, while precision-sensitive operations (comparisons, -stores, conversions) reload the full-precision value from its tracked source slot. - -### Text-Based Peephole Optimization - -The peephole optimizer operates on assembly text rather than a structured IR. This is -a pragmatic choice: it runs after all codegen is complete, so it can catch redundancies -introduced by any part of the pipeline (including inline asm, intrinsics, and library -call sequences). The pre-parsed `LineKind` classification ensures the text-based -approach does not become a performance bottleneck. 
- -### Conservative Scratch Register Partitioning - -The corruptible registers `x9`-`x15` are partitioned into dedicated roles (address -computation, memcpy, F128 scratch) rather than being treated as a general pool. This -eliminates the need for tracking register liveness within a single instruction's -emission sequence, which would add significant complexity. Only `x13` and `x14` have -no hardcoded uses and are offered to the register allocator as caller-saved registers. - -### Atomic Loop Structure - -Atomic operations use the LL/SC (LDXR/STXR) pattern rather than the newer LSE -(Large System Extension) atomics (`LDADD`, `SWPAL`, etc.). This ensures compatibility -with all ARMv8.0 implementations. The retry loop with `CBNZ` handles spurious -exclusive monitor failures that can occur on multiprocessor systems. diff --git a/src/backend/arm/codegen/alu.rs b/src/backend/arm/codegen/alu.rs deleted file mode 100644 index 7649cbb784..0000000000 --- a/src/backend/arm/codegen/alu.rs +++ /dev/null @@ -1,229 +0,0 @@ -//! ArmCodegen: ALU operations (integer arithmetic, bitwise, unary). 
- -use crate::ir::reexports::{IrBinOp, Operand, Value}; -use crate::common::types::IrType; -use super::emit::{ArmCodegen, callee_saved_name, callee_saved_name_32, arm_alu_mnemonic}; - -impl ArmCodegen { - pub(super) fn emit_float_neg_impl(&mut self, ty: IrType) { - if ty == IrType::F32 { - self.state.emit(" fmov s0, w0"); - self.state.emit(" fneg s0, s0"); - self.state.emit(" fmov w0, s0"); - self.state.emit(" mov w0, w0"); // zero-extend - } else { - self.state.emit(" fmov d0, x0"); - self.state.emit(" fneg d0, d0"); - self.state.emit(" fmov x0, d0"); - } - } - - pub(super) fn emit_int_neg_impl(&mut self, _ty: IrType) { - self.state.emit(" neg x0, x0"); - } - - pub(super) fn emit_int_not_impl(&mut self, _ty: IrType) { - self.state.emit(" mvn x0, x0"); - } - - pub(super) fn emit_int_clz_impl(&mut self, ty: IrType) { - if ty == IrType::I32 || ty == IrType::U32 { - self.state.emit(" clz w0, w0"); - } else { - self.state.emit(" clz x0, x0"); - } - } - - pub(super) fn emit_int_ctz_impl(&mut self, ty: IrType) { - if ty == IrType::I32 || ty == IrType::U32 { - self.state.emit(" rbit w0, w0"); - self.state.emit(" clz w0, w0"); - } else { - self.state.emit(" rbit x0, x0"); - self.state.emit(" clz x0, x0"); - } - } - - pub(super) fn emit_int_bswap_impl(&mut self, ty: IrType) { - if ty == IrType::I16 || ty == IrType::U16 { - self.state.emit(" rev w0, w0"); - self.state.emit(" lsr w0, w0, #16"); - } else if ty == IrType::I32 || ty == IrType::U32 { - self.state.emit(" rev w0, w0"); - } else { - self.state.emit(" rev x0, x0"); - } - } - - pub(super) fn emit_int_popcount_impl(&mut self, ty: IrType) { - if ty == IrType::I32 || ty == IrType::U32 { - self.state.emit(" fmov s0, w0"); - } else { - self.state.emit(" fmov d0, x0"); - } - self.state.emit(" cnt v0.8b, v0.8b"); - self.state.emit(" uaddlv h0, v0.8b"); - self.state.emit(" fmov w0, s0"); - } - - pub(super) fn emit_int_binop_impl(&mut self, dest: &Value, op: IrBinOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - let use_32bit 
= ty == IrType::I32 || ty == IrType::U32; - let is_unsigned = ty.is_unsigned(); - - // Strength reduction: UDiv/URem by power-of-2 constant - if let Some(shift) = Self::const_as_power_of_2(rhs) { - if op == IrBinOp::UDiv { - self.operand_to_x0(lhs); - if use_32bit { - self.state.emit_fmt(format_args!(" lsr w0, w0, #{}", shift)); - } else { - self.state.emit_fmt(format_args!(" lsr x0, x0, #{}", shift)); - } - self.store_x0_to(dest); - return; - } - if op == IrBinOp::URem { - self.operand_to_x0(lhs); - let mask = (1u64 << shift) - 1; - if use_32bit { - self.state.emit_fmt(format_args!(" and w0, w0, #{}", mask)); - } else { - self.state.emit_fmt(format_args!(" and x0, x0, #{}", mask)); - } - self.store_x0_to(dest); - return; - } - } - - // Register-direct path - if let Some(dest_phys) = self.dest_reg(dest) { - let dest_name = callee_saved_name(dest_phys); - let dest_name_32 = callee_saved_name_32(dest_phys); - - let is_simple_alu = matches!(op, IrBinOp::Add | IrBinOp::Sub | IrBinOp::And - | IrBinOp::Or | IrBinOp::Xor | IrBinOp::Mul); - if is_simple_alu { - let mnemonic = arm_alu_mnemonic(op); - - if matches!(op, IrBinOp::Add | IrBinOp::Sub) { - if let Some(imm) = Self::const_as_imm12(rhs) { - self.operand_to_callee_reg(lhs, dest_phys); - if use_32bit { - self.state.emit_fmt(format_args!(" {} {}, {}, #{}", mnemonic, dest_name_32, dest_name_32, imm)); - if !is_unsigned { self.state.emit_fmt(format_args!(" sxtw {}, {}", dest_name, dest_name_32)); } - } else { - self.state.emit_fmt(format_args!(" {} {}, {}, #{}", mnemonic, dest_name, dest_name, imm)); - } - self.state.reg_cache.invalidate_acc(); - return; - } - } - - let rhs_phys = self.operand_reg(rhs); - let rhs_conflicts = rhs_phys.is_some_and(|r| r.0 == dest_phys.0); - let rhs_reg: String = if rhs_conflicts { - self.operand_to_x0(rhs); - self.operand_to_callee_reg(lhs, dest_phys); - "x0".to_string() - } else { - self.operand_to_callee_reg(lhs, dest_phys); - if let Some(rhs_phys) = rhs_phys { - 
callee_saved_name(rhs_phys).to_string() - } else { - self.operand_to_x0(rhs); - "x0".to_string() - } - }; - let rhs_32: String = if rhs_reg == "x0" { "w0".to_string() } - else { rhs_reg.replace('x', "w") }; - - if use_32bit { - self.state.emit_fmt(format_args!(" {} {}, {}, {}", mnemonic, dest_name_32, dest_name_32, rhs_32)); - if !is_unsigned { self.state.emit_fmt(format_args!(" sxtw {}, {}", dest_name, dest_name_32)); } - } else { - self.state.emit_fmt(format_args!(" {} {}, {}, {}", mnemonic, dest_name, dest_name, rhs_reg)); - } - self.state.reg_cache.invalidate_acc(); - return; - } - } - - // Fallback: accumulator path - self.operand_to_x0(lhs); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(rhs); - self.state.emit(" mov x2, x0"); - - if use_32bit { - match op { - IrBinOp::Add => { - self.state.emit(" add w0, w1, w2"); - if !is_unsigned { self.state.emit(" sxtw x0, w0"); } - } - IrBinOp::Sub => { - self.state.emit(" sub w0, w1, w2"); - if !is_unsigned { self.state.emit(" sxtw x0, w0"); } - } - IrBinOp::Mul => { - self.state.emit(" mul w0, w1, w2"); - if !is_unsigned { self.state.emit(" sxtw x0, w0"); } - } - IrBinOp::SDiv => { - self.state.emit(" sdiv w0, w1, w2"); - self.state.emit(" sxtw x0, w0"); - } - IrBinOp::UDiv => self.state.emit(" udiv w0, w1, w2"), - IrBinOp::SRem => { - self.state.emit(" sdiv w3, w1, w2"); - self.state.emit(" msub w0, w3, w2, w1"); - self.state.emit(" sxtw x0, w0"); - } - IrBinOp::URem => { - self.state.emit(" udiv w3, w1, w2"); - self.state.emit(" msub w0, w3, w2, w1"); - } - IrBinOp::And => self.state.emit(" and w0, w1, w2"), - IrBinOp::Or => self.state.emit(" orr w0, w1, w2"), - IrBinOp::Xor => self.state.emit(" eor w0, w1, w2"), - IrBinOp::Shl => { - self.state.emit(" lsl w0, w1, w2"); - if !is_unsigned { self.state.emit(" sxtw x0, w0"); } - } - IrBinOp::AShr => { - self.state.emit(" asr w0, w1, w2"); - if !is_unsigned { self.state.emit(" sxtw x0, w0"); } - } - IrBinOp::LShr => self.state.emit(" lsr w0, w1, w2"), - } - } 
else { - match op { - IrBinOp::Add => self.state.emit(" add x0, x1, x2"), - IrBinOp::Sub => self.state.emit(" sub x0, x1, x2"), - IrBinOp::Mul => self.state.emit(" mul x0, x1, x2"), - IrBinOp::SDiv => self.state.emit(" sdiv x0, x1, x2"), - IrBinOp::UDiv => self.state.emit(" udiv x0, x1, x2"), - IrBinOp::SRem => { - self.state.emit(" sdiv x3, x1, x2"); - self.state.emit(" msub x0, x3, x2, x1"); - } - IrBinOp::URem => { - self.state.emit(" udiv x3, x1, x2"); - self.state.emit(" msub x0, x3, x2, x1"); - } - IrBinOp::And => self.state.emit(" and x0, x1, x2"), - IrBinOp::Or => self.state.emit(" orr x0, x1, x2"), - IrBinOp::Xor => self.state.emit(" eor x0, x1, x2"), - IrBinOp::Shl => self.state.emit(" lsl x0, x1, x2"), - IrBinOp::AShr => self.state.emit(" asr x0, x1, x2"), - IrBinOp::LShr => self.state.emit(" lsr x0, x1, x2"), - } - } - - self.store_x0_to(dest); - } - - pub(super) fn emit_copy_i128_impl(&mut self, dest: &Value, src: &Operand) { - self.operand_to_x0_x1(src); - self.store_x0_x1_to(dest); - } -} diff --git a/src/backend/arm/codegen/asm_emitter.rs b/src/backend/arm/codegen/asm_emitter.rs deleted file mode 100644 index 17a215c3d2..0000000000 --- a/src/backend/arm/codegen/asm_emitter.rs +++ /dev/null @@ -1,525 +0,0 @@ -//! AArch64 InlineAsmEmitter implementation: constraint classification, scratch -//! register allocation, operand loading/storing, and template substitution. - -use crate::ir::reexports::{ - BlockId, - IrConst, - Operand, - Value, -}; -use crate::common::types::IrType; -use crate::backend::state::CodegenState; -use crate::backend::inline_asm::{InlineAsmEmitter, AsmOperandKind, AsmOperand}; -use crate::backend::regalloc::PhysReg; -use super::emit::{ArmCodegen, is_arm_fp_reg}; - -/// AArch64 scratch registers for inline asm (caller-saved temporaries). 
-pub(super) const ARM_GP_SCRATCH: &[&str] = &["x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21"]; -/// AArch64 FP/SIMD scratch registers for inline asm (d8-d15 are callee-saved, -/// d16-d31 are caller-saved; we use v16+ as scratch to avoid save/restore). -/// We use the 'v' prefix so that unmodified %0 in templates like `eor %0.16b, %1.16b, %2.16b` -/// correctly produces `v16.16b` (GCC behavior). Modifiers (%d0, %s0, etc.) convert -/// to the appropriate scalar view in format_reg_static. -const ARM_FP_SCRATCH: &[&str] = &["v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25"]; - -/// Convert an IR constant to a 64-bit value appropriate for an inline asm operand. -/// On AArch64, 32-bit values in X registers must have upper 32 bits zeroed (zero-extended). -/// For 64-bit operand types, the constant is sign-extended to preserve its semantic value -/// (e.g., int64_t a = -128 stored as IrConst::I32(-128) must become 0xFFFFFFFF_FFFFFF80). -fn const_for_asm_operand(c: &IrConst, operand_type: &IrType) -> i64 { - let sext = c.to_i64().unwrap_or(0); - match operand_type.size() { - // 32-bit operand: zero-extend the 32-bit bit pattern. - // This ensures e.g. 0xF0000000u stays 0x00000000_F0000000 in the X register, - // not 0xFFFFFFFF_F0000000 (which would be sign-extended). - 1 => sext & 0xFF, - 2 => sext & 0xFFFF, - 4 => sext & 0xFFFF_FFFF, - // 64-bit or larger operand: sign-extend to preserve int64_t semantics. - _ => sext, - } -} - -impl InlineAsmEmitter for ArmCodegen { - fn asm_state(&mut self) -> &mut CodegenState { &mut self.state } - - // Multi-alternative constraint parsing (e.g., "rm", "ri") matching x86 behavior. - // Priority: specific register > GP register > FP register > memory > immediate. - // Registers are preferred over memory for performance. 
- fn classify_constraint(&self, constraint: &str) -> AsmOperandKind { - let c = constraint.trim_start_matches(['=', '+', '&', '%']); - // Explicit register constraint from register variable: {regname} - if c.starts_with('{') && c.ends_with('}') { - let reg_name = &c[1..c.len()-1]; - // On AArch64, GCC treats r0-r30 as aliases for x0-x30. - // The Linux kernel uses `register ... asm("r0")` extensively - // (e.g., arm-smccc.h). Normalize to the canonical x-register name. - let normalized = normalize_aarch64_register(reg_name); - return AsmOperandKind::Specific(normalized); - } - // TODO: ARM =@cc not fully implemented — needs CSET/CSINC in store_output_from_reg. - // Currently stores incorrect results (just a GP register value, no condition capture). - if let Some(cond) = c.strip_prefix("@cc") { - return AsmOperandKind::ConditionCode(cond.to_string()); - } - if !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit()) { - if let Ok(n) = c.parse::() { - return AsmOperandKind::Tied(n); - } - } - - // Parse multi-alternative constraints character by character. 
- let mut has_gp = false; - let mut has_fp = false; - let mut has_mem = false; - let mut has_imm = false; - - for ch in c.chars() { - match ch { - 'r' => has_gp = true, - 'g' => { has_gp = true; has_mem = true; has_imm = true; } - 'w' => has_fp = true, - 'm' | 'Q' | 'o' | 'V' | 'p' => has_mem = true, - 'i' | 'n' | 'I' | 'K' | 'L' => has_imm = true, - _ => {} - } - } - - if has_gp { - AsmOperandKind::GpReg - } else if has_fp { - AsmOperandKind::FpReg - } else if has_mem { - AsmOperandKind::Memory - } else if has_imm { - AsmOperandKind::Immediate - } else { - AsmOperandKind::GpReg - } - } - - fn setup_operand_metadata(&self, op: &mut AsmOperand, val: &Operand, _is_output: bool) { - if matches!(op.kind, AsmOperandKind::Memory) { - if let Operand::Value(v) = val { - if let Some(slot) = self.state.get_slot(v.0) { - if self.state.is_alloca(v.0) { - // Alloca: stack slot IS the memory location - op.mem_offset = slot.0; - } else { - // Non-alloca: slot holds a pointer that needs indirection. - // Mark with empty mem_addr; resolve_memory_operand will handle it. - op.mem_addr = String::new(); - op.mem_offset = 0; - } - } - } - } - } - - fn resolve_memory_operand(&mut self, op: &mut AsmOperand, val: &Operand, excluded: &[String]) -> bool { - if !op.mem_addr.is_empty() || op.mem_offset != 0 { - return false; - } - // Each memory operand gets its own unique register via assign_scratch_reg, - // so multiple "=m" outputs don't overwrite each other's addresses. - match val { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let tmp_reg = self.assign_scratch_reg(&AsmOperandKind::GpReg, excluded); - self.emit_load_from_sp(&tmp_reg, slot.0, "ldr"); - op.mem_addr = format!("[{}]", tmp_reg); - return true; - } - } - Operand::Const(c) => { - // Constant address (e.g., from MMIO reads at compile-time constant addresses). - // Copy propagation can replace Value operands with Const in inline asm inputs. 
- // Load the constant into a scratch register for indirect addressing. - if let Some(addr) = c.to_i64() { - let tmp_reg = self.assign_scratch_reg(&AsmOperandKind::GpReg, excluded); - self.emit_load_imm64(&tmp_reg, addr); - op.mem_addr = format!("[{}]", tmp_reg); - return true; - } - } - } - false - } - - fn assign_scratch_reg(&mut self, kind: &AsmOperandKind, excluded: &[String]) -> String { - if matches!(kind, AsmOperandKind::FpReg) { - // Safety: limit iterations to avoid infinite loop if all regs are excluded - for _ in 0..32 { - let idx = self.asm_fp_scratch_idx; - self.asm_fp_scratch_idx += 1; - let reg = if idx < ARM_FP_SCRATCH.len() { - ARM_FP_SCRATCH[idx].to_string() - } else { - format!("v{}", 16 + idx) - }; - if !excluded.iter().any(|e| e == ®) { - return reg; - } - } - // Fallback: return next register even if excluded - format!("v{}", 16 + self.asm_fp_scratch_idx) - } else { - loop { - let idx = self.asm_scratch_idx; - self.asm_scratch_idx += 1; - let reg = if idx < ARM_GP_SCRATCH.len() { - ARM_GP_SCRATCH[idx].to_string() - } else { - format!("x{}", 9 + idx) - }; - if !excluded.iter().any(|e| e == ®) { - // If this is a callee-saved register (x19-x28), ensure it is - // saved/restored in the prologue/epilogue. - let reg_num = reg.strip_prefix('x') - .and_then(|s| s.parse::().ok()); - if let Some(n) = reg_num { - if (19..=28).contains(&n) { - let phys = PhysReg(n); - if !self.used_callee_saved.contains(&phys) { - self.used_callee_saved.push(phys); - self.used_callee_saved.sort_by_key(|r| r.0); - } - } - } - return reg; - } - } - } - } - - fn load_input_to_reg(&mut self, op: &AsmOperand, val: &Operand, _constraint: &str) { - let reg = &op.reg; - let is_fp = is_arm_fp_reg(reg); - let is_sp = reg == "sp"; - match val { - Operand::Const(c) => { - if is_fp { - // Load FP constant: extract IEEE 754 bit pattern, move to GP reg, - // then fmov to FP reg. to_i64() returns None for floats, so we - // must use to_bits() to get the bit-level representation. 
- // fmov requires d/s register form, not v. - let bits = match c { - IrConst::F32(v) => v.to_bits() as i64, - IrConst::F64(v) => v.to_bits() as i64, - _ => c.to_i64().unwrap_or(0), - }; - self.emit_load_imm64("x9", bits); - if op.operand_type == IrType::F32 { - let s_reg = Self::fp_to_s_reg(reg); - self.state.emit_fmt(format_args!(" fmov {}, w9", s_reg)); - } else { - let d_reg = Self::fp_to_d_reg(reg); - self.state.emit_fmt(format_args!(" fmov {}, x9", d_reg)); - } - } else if is_sp { - // ARM64: can't use ldr/mov imm to sp directly in most cases. - // Load to scratch first, then mov to sp. - let val = const_for_asm_operand(c, &op.operand_type); - self.emit_load_imm64("x9", val); - self.state.emit(" mov sp, x9"); - } else { - let val = const_for_asm_operand(c, &op.operand_type); - self.emit_load_imm64(reg, val); - } - } - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - if is_fp { - // Load FP value from stack: use ldr with d/s register form. - // For SIMD vector types (>= 16 bytes), use ldr with q register. - let type_size = op.operand_type.size(); - if type_size == 16 { - // 128-bit vector: load directly with ldr qN - let q_reg = Self::fp_to_q_reg(reg); - self.emit_load_from_sp(&q_reg, slot.0, "ldr"); - } else if op.operand_type == IrType::F32 || type_size == 4 { - self.state.emit_fmt(format_args!(" ldr w9, [sp, #{}]", slot.0)); - let s_reg = Self::fp_to_s_reg(reg); - self.state.emit_fmt(format_args!(" fmov {}, w9", s_reg)); - } else { - self.state.emit_fmt(format_args!(" ldr x9, [sp, #{}]", slot.0)); - let d_reg = Self::fp_to_d_reg(reg); - self.state.emit_fmt(format_args!(" fmov {}, x9", d_reg)); - } - } else if is_sp { - // ARM64: can't use ldr to load directly into sp. - // Load to scratch first, then mov to sp. - self.emit_load_from_sp("x9", slot.0, "ldr"); - self.state.emit(" mov sp, x9"); - } else if self.state.is_alloca(v.0) { - // Alloca: the IR value represents the ADDRESS of the - // allocated memory. 
Compute its address instead of - // loading the contents. - self.emit_alloca_addr(reg, v.0, slot.0); - } else { - // Non-alloca: load the value from the stack slot. - self.emit_load_from_sp(reg, slot.0, "ldr"); - } - } - } - } - } - - fn preload_readwrite_output(&mut self, op: &AsmOperand, ptr: &Value) { - let reg = &op.reg; - let is_fp = is_arm_fp_reg(reg); - if let Some(slot) = self.state.get_slot(ptr.0) { - if is_fp { - // Load current FP value for read-write constraint. - // fmov requires d/s register form, not v. - let type_size = op.operand_type.size(); - if type_size == 16 { - let q_reg = Self::fp_to_q_reg(reg); - self.emit_load_from_sp(&q_reg, slot.0, "ldr"); - } else if op.operand_type == IrType::F32 || type_size == 4 { - self.state.emit_fmt(format_args!(" ldr w9, [sp, #{}]", slot.0)); - let s_reg = Self::fp_to_s_reg(reg); - self.state.emit_fmt(format_args!(" fmov {}, w9", s_reg)); - } else { - self.state.emit_fmt(format_args!(" ldr x9, [sp, #{}]", slot.0)); - let d_reg = Self::fp_to_d_reg(reg); - self.state.emit_fmt(format_args!(" fmov {}, x9", d_reg)); - } - } else if reg == "sp" { - // ARM64: can't use ldr to load directly into sp. - self.emit_load_from_sp("x9", slot.0, "ldr"); - self.state.emit(" mov sp, x9"); - } else { - self.emit_load_from_sp(reg, slot.0, "ldr"); - } - } - } - - fn substitute_template_line(&self, line: &str, operands: &[AsmOperand], gcc_to_internal: &[usize], _operand_types: &[IrType], goto_labels: &[(String, BlockId)]) -> String { - // For memory operands (Q/m constraints), use mem_addr (e.g., "[x9]") or - // format as [sp, #offset] for stack-based memory. For register operands, - // use the register name directly. 
- let op_regs: Vec = operands.iter().map(|o| { - if matches!(o.kind, AsmOperandKind::Memory) { - if !o.mem_addr.is_empty() { - // Non-alloca pointer: mem_addr already formatted as "[xN]" - o.mem_addr.clone() - } else if o.mem_offset != 0 { - // Alloca: stack-relative address - format!("[sp, #{}]", o.mem_offset) - } else { - // Fallback: wrap register in brackets - if o.reg.is_empty() { - "[sp]".to_string() - } else { - format!("[{}]", o.reg) - } - } - } else { - o.reg.clone() - } - }).collect(); - let op_names: Vec> = operands.iter().map(|o| o.name.clone()).collect(); - let op_imm_values: Vec> = operands.iter().map(|o| o.imm_value).collect(); - let op_imm_symbols: Vec> = operands.iter().map(|o| o.imm_symbol.clone()).collect(); - let mut result = Self::substitute_asm_operands_static(line, &op_regs, &op_names, gcc_to_internal, &op_imm_values, &op_imm_symbols); - // Substitute %l[name] goto label references - result = crate::backend::inline_asm::substitute_goto_labels(&result, goto_labels, operands.len()); - result - } - - fn store_output_from_reg(&mut self, op: &AsmOperand, ptr: &Value, _constraint: &str, all_output_regs: &[&str]) { - if matches!(op.kind, AsmOperandKind::Memory) { - return; - } - let reg = &op.reg; - let is_fp = is_arm_fp_reg(reg); - if let Some(slot) = self.state.get_slot(ptr.0) { - if is_fp { - // Store FP/SIMD register output. fmov requires d/s form, not v. 
- let type_size = op.operand_type.size(); - if type_size == 16 { - // 128-bit vector: store directly with str qN - let q_reg = Self::fp_to_q_reg(reg); - self.emit_store_to_sp(&q_reg, slot.0, "str"); - } else if op.operand_type == IrType::F32 || type_size == 4 { - let s_reg = Self::fp_to_s_reg(reg); - self.state.emit_fmt(format_args!(" fmov w9, {}", s_reg)); - self.state.emit_fmt(format_args!(" str w9, [sp, #{}]", slot.0)); - } else { - let d_reg = Self::fp_to_d_reg(reg); - self.state.emit_fmt(format_args!(" fmov x9, {}", d_reg)); - self.state.emit_fmt(format_args!(" str x9, [sp, #{}]", slot.0)); - } - } else if reg == "sp" { - // ARM64: sp (register 31) can't be used as str source operand directly. - // Move to a scratch register first, then store. - self.state.emit(" mov x9, sp"); - self.emit_store_to_sp("x9", slot.0, "str"); - } else if self.state.is_direct_slot(ptr.0) { - self.emit_store_to_sp(reg, slot.0, "str"); - } else { - // Non-alloca: slot holds a pointer, store through it. - // Pick a scratch register that doesn't conflict with ANY output register, - // not just the current one. This prevents clobbering other outputs that - // haven't been stored yet. - let candidates = ["x9", "x10", "x11", "x12", "x13", "x14", "x15"]; - let scratch = candidates.iter() - .find(|&&c| !all_output_regs.contains(&c)) - .copied() - .unwrap_or(if reg != "x9" { "x9" } else { "x10" }); - self.emit_load_from_sp(scratch, slot.0, "ldr"); - self.state.emit_fmt(format_args!(" str {}, [{}]", reg, scratch)); - } - } - } - - fn reset_scratch_state(&mut self) { - self.asm_scratch_idx = 0; - self.asm_fp_scratch_idx = 0; - } - - /// Override the default (x86) immediate constraint validation with AArch64 semantics. - /// - /// On AArch64: - /// 'K' - logical immediate: a bitmask value encodable in the N:immr:imms field - /// of AND/ORR/EOR/TST instructions. Excludes 0 and all-ones. 
- /// 'I' - unsigned 12-bit immediate (0..4095) for add/sub instructions - /// 'L' - logical immediate for 32-bit operations (32-bit bitmask pattern) - fn constant_fits_immediate(&self, constraint: &str, value: i64) -> bool { - let stripped = constraint.trim_start_matches(['=', '+', '&', '%']); - // If constraint has 'i' or 'n', any constant value is accepted - if stripped.contains('i') || stripped.contains('n') { - return true; - } - // Check each constraint letter with AArch64-specific ranges - for ch in stripped.chars() { - let fits = match ch { - // AArch64 'K': 64-bit logical immediate (bitmask encodable in N:immr:imms) - // Used by AND/ORR/EOR/TST instructions. 0 and all-ones are NOT valid. - 'K' => is_valid_aarch64_logical_immediate(value as u64), - // AArch64 'L': 32-bit logical immediate (validate in 32-bit context) - 'L' => is_valid_aarch64_logical_immediate_32(value as u32), - // AArch64 'I': unsigned 12-bit add/sub immediate - 'I' => (0..=4095).contains(&value), - _ => continue, - }; - if fits { - return true; - } - } - false - } -} - -/// Normalize AArch64 register name aliases. -/// -/// GCC treats `r0`-`r30` as aliases for `x0`-`x30` on AArch64. The Linux kernel -/// uses this convention extensively in inline assembly (e.g., `register unsigned long -/// r0 asm("r0")` in arm-smccc.h). This function maps these aliases to canonical -/// AArch64 register names so the assembler accepts them. -pub(super) fn normalize_aarch64_register(name: &str) -> String { - if let Some(suffix) = name.strip_prefix('r') { - if let Ok(n) = suffix.parse::() { - if n <= 30 { - return format!("x{}", n); - } - } - } - name.to_string() -} - -/// Check whether a 32-bit value is a valid AArch64 32-bit logical immediate. -/// Used for the 'L' constraint (32-bit logical operations like AND w0, w1, #imm). 
-fn is_valid_aarch64_logical_immediate_32(value: u32) -> bool { - if value == 0 || value == u32::MAX { - return false; - } - // Check element sizes: 2, 4, 8, 16, 32 bits within a 32-bit value - let mut size: u32 = 32; - while size >= 2 { - let mask = if size == 32 { u32::MAX } else { (1u32 << size) - 1 }; - let element = value & mask; - // Check if value is a repeating pattern of this element size - let mut check = element; - let mut s = size; - while s < 32 { - check |= check << s; - s *= 2; - } - if check == value { - let val = element & mask; - if val != 0 && val != mask { - let rotated = ((val >> 1) | ((val & 1) << (size - 1))) & mask; - let transitions = val ^ rotated; - if transitions.count_ones() == 2 { - return true; - } - } - } - size >>= 1; - } - false -} - -/// Check whether a 64-bit value is a valid AArch64 logical immediate. -/// -/// AArch64 logical immediates are bitmask patterns encodable in the 13-bit -/// N:immr:imms field of AND/ORR/EOR/TST instructions. Valid patterns consist -/// of a repeating element of size 2, 4, 8, 16, 32, or 64 bits, where each -/// element contains a contiguous (possibly rotated) run of set bits. -/// -/// The values 0 and all-ones (0xFFFFFFFF_FFFFFFFF) are NOT valid logical immediates. -fn is_valid_aarch64_logical_immediate(value: u64) -> bool { - // 0 and all-ones are never valid logical immediates - if value == 0 || value == u64::MAX { - return false; - } - - // Try each possible element size: 2, 4, 8, 16, 32, 64 bits. - // For each size, check if the value is a repeating pattern of that element, - // and if the element contains a contiguous (possibly rotated) run of 1-bits. 
- let mut size: u32 = 64; - while size >= 2 { - let mask = if size == 64 { u64::MAX } else { (1u64 << size) - 1 }; - let element = value & mask; - - // Check if value is a repeating pattern of this element size - if is_repeating_pattern(value, element, size) { - // Check if the element has a contiguous run of 1-bits (possibly rotated) - if has_contiguous_ones(element, size) { - return true; - } - } - size >>= 1; - } - false -} - -/// Check if `value` is composed of `element` repeated to fill 64 bits. -fn is_repeating_pattern(value: u64, element: u64, size: u32) -> bool { - let mut check = element; - let mut s = size; - while s < 64 { - check |= check << s; - s *= 2; - } - check == value -} - -/// Check if the lowest `size` bits of `element` contain a contiguous run of -/// set bits (possibly rotated). A contiguous-rotated pattern means there's -/// at most one 0->1 transition and one 1->0 transition in the circular bit sequence. -fn has_contiguous_ones(element: u64, size: u32) -> bool { - let mask = if size == 64 { u64::MAX } else { (1u64 << size) - 1 }; - let val = element & mask; - // All-zeros or all-ones within the element are not valid - if val == 0 || val == mask { - return false; - } - // A contiguous run of 1-bits (possibly rotated) has the property that - // val ^ (val rotated by 1) has exactly 2 set bits (the two transitions). - let rotated = ((val >> 1) | ((val & 1) << (size - 1))) & mask; - let transitions = val ^ rotated; - transitions.count_ones() == 2 -} diff --git a/src/backend/arm/codegen/atomics.rs b/src/backend/arm/codegen/atomics.rs deleted file mode 100644 index e552f78694..0000000000 --- a/src/backend/arm/codegen/atomics.rs +++ /dev/null @@ -1,146 +0,0 @@ -//! ArmCodegen: atomic operations. 
- -use crate::ir::reexports::{AtomicOrdering, AtomicRmwOp, Operand, Value}; -use crate::common::types::IrType; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_atomic_rmw_impl(&mut self, dest: &Value, op: AtomicRmwOp, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) { - self.operand_to_x0(ptr); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(val); - self.state.emit(" mov x2, x0"); - - let (ldxr, stxr, reg_prefix) = Self::exclusive_instrs(ty, ordering); - let val_reg = format!("{}2", reg_prefix); - let old_reg = format!("{}0", reg_prefix); - let tmp_reg = format!("{}3", reg_prefix); - - match op { - AtomicRmwOp::Xchg => { - let label_id = self.state.next_label_id(); - let loop_label = format!(".Latomic_{}", label_id); - self.state.emit_fmt(format_args!("{}:", loop_label)); - self.state.emit_fmt(format_args!(" {} {}, [x1]", ldxr, old_reg)); - self.state.emit_fmt(format_args!(" {} w4, {}, [x1]", stxr, val_reg)); - self.state.emit_fmt(format_args!(" cbnz w4, {}", loop_label)); - } - AtomicRmwOp::TestAndSet => { - let label_id = self.state.next_label_id(); - let loop_label = format!(".Latomic_{}", label_id); - self.state.emit_fmt(format_args!("{}:", loop_label)); - self.state.emit_fmt(format_args!(" {} {}, [x1]", ldxr, old_reg)); - self.state.emit(" mov w3, #1"); - self.state.emit_fmt(format_args!(" {} w4, w3, [x1]", stxr)); - self.state.emit_fmt(format_args!(" cbnz w4, {}", loop_label)); - } - _ => { - let label_id = self.state.next_label_id(); - let loop_label = format!(".Latomic_{}", label_id); - self.state.emit_fmt(format_args!("{}:", loop_label)); - self.state.emit_fmt(format_args!(" {} {}, [x1]", ldxr, old_reg)); - Self::emit_atomic_op_arm(&mut self.state, op, &tmp_reg, &old_reg, &val_reg); - self.state.emit_fmt(format_args!(" {} w4, {}, [x1]", stxr, tmp_reg)); - self.state.emit_fmt(format_args!(" cbnz w4, {}", loop_label)); - } - } - self.store_x0_to(dest); - } - - pub(super) fn emit_atomic_cmpxchg_impl(&mut self, 
dest: &Value, ptr: &Operand, expected: &Operand, desired: &Operand, ty: IrType, success_ordering: AtomicOrdering, _failure_ordering: AtomicOrdering, returns_bool: bool) { - self.operand_to_x0(ptr); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(desired); - self.state.emit(" mov x3, x0"); - self.operand_to_x0(expected); - self.state.emit(" mov x2, x0"); - - let (ldxr, stxr, reg_prefix) = Self::exclusive_instrs(ty, success_ordering); - let old_reg = format!("{}0", reg_prefix); - let desired_reg = format!("{}3", reg_prefix); - let expected_reg = format!("{}2", reg_prefix); - - let label_id = self.state.next_label_id(); - let loop_label = format!(".Lcas_loop_{}", label_id); - let fail_label = format!(".Lcas_fail_{}", label_id); - let done_label = format!(".Lcas_done_{}", label_id); - - self.state.emit_fmt(format_args!("{}:", loop_label)); - self.state.emit_fmt(format_args!(" {} {}, [x1]", ldxr, old_reg)); - self.state.emit_fmt(format_args!(" cmp {}, {}", old_reg, expected_reg)); - self.state.emit_fmt(format_args!(" b.ne {}", fail_label)); - self.state.emit_fmt(format_args!(" {} w4, {}, [x1]", stxr, desired_reg)); - self.state.emit_fmt(format_args!(" cbnz w4, {}", loop_label)); - if returns_bool { - self.state.emit(" mov x0, #1"); - } - self.state.emit_fmt(format_args!(" b {}", done_label)); - self.state.emit_fmt(format_args!("{}:", fail_label)); - if returns_bool { - self.state.emit(" mov x0, #0"); - self.state.emit(" clrex"); - } else { - self.state.emit(" clrex"); - } - self.state.emit_fmt(format_args!("{}:", done_label)); - self.store_x0_to(dest); - } - - pub(super) fn emit_atomic_load_impl(&mut self, dest: &Value, ptr: &Operand, ty: IrType, ordering: AtomicOrdering) { - self.operand_to_x0(ptr); - let need_acquire = matches!(ordering, AtomicOrdering::Acquire | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst); - let instr = match (ty, need_acquire) { - (IrType::I8 | IrType::U8, true) => "ldarb", - (IrType::I8 | IrType::U8, false) => "ldrb", - (IrType::I16 | 
IrType::U16, true) => "ldarh", - (IrType::I16 | IrType::U16, false) => "ldrh", - (IrType::I32 | IrType::U32, true) => "ldar", - (IrType::I32 | IrType::U32, false) => "ldr", - (_, true) => "ldar", - (_, false) => "ldr", - }; - let dest_reg = match ty { - IrType::I8 | IrType::U8 | IrType::I16 | IrType::U16 | IrType::I32 | IrType::U32 => "w0", - _ => "x0", - }; - self.state.emit_fmt(format_args!(" {} {}, [x0]", instr, dest_reg)); - match ty { - IrType::I8 => self.state.emit(" sxtb x0, w0"), - IrType::I16 => self.state.emit(" sxth x0, w0"), - IrType::I32 => self.state.emit(" sxtw x0, w0"), - _ => {} - } - self.store_x0_to(dest); - } - - pub(super) fn emit_atomic_store_impl(&mut self, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) { - self.operand_to_x0(val); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(ptr); - let need_release = matches!(ordering, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst); - let instr = match (ty, need_release) { - (IrType::I8 | IrType::U8, true) => "stlrb", - (IrType::I8 | IrType::U8, false) => "strb", - (IrType::I16 | IrType::U16, true) => "stlrh", - (IrType::I16 | IrType::U16, false) => "strh", - (IrType::I32 | IrType::U32, true) => "stlr", - (IrType::I32 | IrType::U32, false) => "str", - (_, true) => "stlr", - (_, false) => "str", - }; - let val_reg = match ty { - IrType::I8 | IrType::U8 | IrType::I16 | IrType::U16 | IrType::I32 | IrType::U32 => "w1", - _ => "x1", - }; - self.state.emit_fmt(format_args!(" {} {}, [x0]", instr, val_reg)); - } - - pub(super) fn emit_fence_impl(&mut self, ordering: AtomicOrdering) { - match ordering { - AtomicOrdering::Relaxed => {} - AtomicOrdering::Acquire => self.state.emit(" dmb ishld"), - AtomicOrdering::Release => self.state.emit(" dmb ishst"), - AtomicOrdering::AcqRel | AtomicOrdering::SeqCst => self.state.emit(" dmb ish"), - } - } -} diff --git a/src/backend/arm/codegen/calls.rs b/src/backend/arm/codegen/calls.rs deleted file mode 100644 index 
d732b77836..0000000000 --- a/src/backend/arm/codegen/calls.rs +++ /dev/null @@ -1,263 +0,0 @@ -//! ArmCodegen: function call operations. - -use crate::ir::reexports::{IrConst, Operand, Value}; -use crate::common::types::IrType; -use crate::backend::call_abi::{CallAbiConfig, CallArgClass, compute_stack_arg_space}; -use super::emit::{ArmCodegen, callee_saved_name}; - -impl ArmCodegen { - pub(super) fn call_abi_config_impl(&self) -> CallAbiConfig { - CallAbiConfig { - max_int_regs: 8, max_float_regs: 8, - align_i128_pairs: true, - f128_in_fp_regs: true, f128_in_gp_pairs: false, - variadic_floats_in_gp: false, - large_struct_by_ref: true, - use_sysv_struct_classification: false, - use_riscv_float_struct_classification: false, - allow_struct_split_reg_stack: false, - align_struct_pairs: false, - sret_uses_dedicated_reg: true, - } - } - - pub(super) fn emit_call_compute_stack_space_impl(&self, arg_classes: &[CallArgClass], _arg_types: &[IrType]) -> usize { - compute_stack_arg_space(arg_classes) - } - - pub(super) fn emit_call_f128_pre_convert_impl(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - _arg_types: &[IrType], _stack_arg_space: usize) -> usize { - let _ = (args, arg_classes); - 0 - } - - pub(super) fn emit_call_stack_args_impl(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - _arg_types: &[IrType], stack_arg_space: usize, fptr_spill: usize, _f128_temp_space: usize) -> i64 { - if stack_arg_space > 0 { - self.emit_sub_sp(stack_arg_space as i64); - let src_adjust = if self.state.has_dyn_alloca { 0 } else { stack_arg_space as i64 + fptr_spill as i64 }; - let mut stack_offset = 0i64; - for (arg_idx, arg) in args.iter().enumerate() { - if !arg_classes[arg_idx].is_stack() { continue; } - let cls = arg_classes[arg_idx]; - if matches!(cls, CallArgClass::F128Stack | CallArgClass::I128Stack) { - stack_offset = (stack_offset + 15) & !15; - } - match cls { - CallArgClass::StructByValStack { size } | CallArgClass::LargeStructStack { size } => { - let 
n_dwords = size.div_ceil(8); - match arg { - Operand::Value(v) => { - if let Some(®) = self.reg_assignments.get(&v.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else if let Some(slot) = self.state.get_slot(v.0) { - let adjusted = slot.0 + src_adjust; - if self.state.is_alloca(v.0) { - self.emit_alloca_addr("x0", v.0, adjusted); - } else { - self.emit_load_from_sp("x0", adjusted, "ldr"); - } - } else { - self.state.emit(" mov x0, #0"); - } - } - Operand::Const(_) => { self.operand_to_x0(arg); } - } - for qi in 0..n_dwords { - let src_off = (qi * 8) as i64; - self.emit_load_from_reg("x1", "x0", src_off, "ldr"); - self.emit_store_to_raw_sp("x1", stack_offset + src_off, "str"); - } - stack_offset += (n_dwords as i64) * 8; - } - CallArgClass::I128Stack => { - match arg { - Operand::Const(c) => { - if let IrConst::I128(v) = c { - self.emit_load_imm64("x0", *v as u64 as i64); - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - self.emit_load_imm64("x0", (*v >> 64) as u64 as i64); - self.emit_store_to_raw_sp("x0", stack_offset + 8, "str"); - } else { - self.operand_to_x0(arg); - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - self.state.emit(" mov x0, #0"); - self.emit_store_to_raw_sp("x0", stack_offset + 8, "str"); - } - } - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let adjusted = slot.0 + src_adjust; - if self.state.is_alloca(v.0) { - self.emit_alloca_addr("x0", v.0, adjusted); - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - self.state.emit(" mov x0, #0"); - self.emit_store_to_raw_sp("x0", stack_offset + 8, "str"); - } else { - self.emit_load_from_sp("x0", adjusted, "ldr"); - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - self.emit_load_from_sp("x0", adjusted + 8, "ldr"); - self.emit_store_to_raw_sp("x0", stack_offset + 8, "str"); - } - } else { - self.state.emit(" mov x0, #0"); - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - 
self.emit_store_to_raw_sp("x0", stack_offset + 8, "str"); - } - } - } - stack_offset += 16; - } - CallArgClass::F128Stack => { - match arg { - Operand::Const(c) => { - let bytes = match c { - IrConst::LongDouble(_, f128_bytes) => *f128_bytes, - _ => { - let f64_val = c.to_f64().unwrap_or(0.0); - crate::ir::reexports::f64_to_f128_bytes(f64_val) - } - }; - let lo = u64::from_le_bytes(bytes[0..8].try_into().unwrap()); - let hi = u64::from_le_bytes(bytes[8..16].try_into().unwrap()); - self.emit_load_imm64("x0", lo as i64); - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - self.emit_load_imm64("x0", hi as i64); - self.emit_store_to_raw_sp("x0", stack_offset + 8, "str"); - } - Operand::Value(v) => { - let mut loaded_full = false; - if let Some((src_id, offset, is_indirect)) = self.state.get_f128_source(v.0) { - if !is_indirect { - if let Some(src_slot) = self.state.get_slot(src_id) { - let adj = src_slot.0 + offset + src_adjust; - self.emit_load_from_sp("q0", adj, "ldr"); - self.emit_store_to_raw_sp("q0", stack_offset, "str"); - loaded_full = true; - } - } else if let Some(src_slot) = self.state.get_slot(src_id) { - let adj = src_slot.0 + src_adjust; - self.emit_load_from_sp("x17", adj, "ldr"); - if offset != 0 { - if offset > 0 && offset <= 4095 { - self.state.emit_fmt(format_args!(" add x17, x17, #{}", offset)); - } else { - self.load_large_imm("x16", offset); - self.state.emit(" add x17, x17, x16"); - } - } - self.state.emit(" ldr q0, [x17]"); - self.emit_store_to_raw_sp("q0", stack_offset, "str"); - loaded_full = true; - } - } - if !loaded_full { - if let Some(®) = self.reg_assignments.get(&v.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else if let Some(slot) = self.state.get_slot(v.0) { - let adjusted = slot.0 + src_adjust; - if self.state.is_alloca(v.0) { - self.emit_alloca_addr("x0", v.0, adjusted); - } else { - self.emit_load_from_sp("x0", adjusted, "ldr"); - } - } else { - 
self.state.emit(" mov x0, #0"); - } - self.state.emit(" fmov d0, x0"); - self.state.emit(" stp x9, x10, [sp, #-16]!"); - self.state.emit(" bl __extenddftf2"); - self.state.emit(" ldp x9, x10, [sp], #16"); - self.emit_store_to_raw_sp("q0", stack_offset, "str"); - } - } - } - stack_offset += 16; - } - CallArgClass::Stack => { - match arg { - Operand::Value(v) => { - if let Some(®) = self.reg_assignments.get(&v.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else if let Some(slot) = self.state.get_slot(v.0) { - let adjusted = slot.0 + src_adjust; - if self.state.is_alloca(v.0) { - self.emit_alloca_addr("x0", v.0, adjusted); - } else { - self.emit_load_from_sp("x0", adjusted, "ldr"); - } - } else { - self.state.emit(" mov x0, #0"); - } - } - Operand::Const(_) => { self.operand_to_x0(arg); } - } - self.emit_store_to_raw_sp("x0", stack_offset, "str"); - stack_offset += 8; - } - _ => {} - } - } - } - stack_arg_space as i64 + fptr_spill as i64 - } - - pub(super) fn emit_call_reg_args_impl(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - arg_types: &[IrType], total_sp_adjust: i64, _f128_temp_space: usize, _stack_arg_space: usize, - _struct_arg_riscv_float_classes: &[Option]) { - let slot_adjust = if self.state.has_dyn_alloca { 0 } else { total_sp_adjust }; - let needs_adjusted_load = total_sp_adjust > 0; - - self.emit_call_gp_to_temps(args, arg_classes, slot_adjust, needs_adjusted_load); - self.emit_call_fp_reg_args(args, arg_classes, arg_types, slot_adjust, needs_adjusted_load); - self.emit_call_move_temps_to_arg_regs(args, arg_classes); - self.emit_call_i128_reg_args(args, arg_classes, slot_adjust, needs_adjusted_load); - self.emit_call_struct_byval_reg_args(args, arg_classes, slot_adjust, needs_adjusted_load); - } - - pub(super) fn emit_call_instruction_impl(&mut self, direct_name: Option<&str>, _func_ptr: Option<&Operand>, indirect: bool, stack_arg_space: usize) { - if let Some(name) = 
direct_name { - self.state.emit_fmt(format_args!(" bl {}", name)); - } else if indirect { - let spill_offset = stack_arg_space as i64; - if Self::is_valid_imm_offset(spill_offset, "ldr", "x17") { - self.state.emit_fmt(format_args!(" ldr x17, [sp, #{}]", spill_offset)); - } else { - self.load_large_imm("x17", spill_offset); - self.state.emit(" add x17, sp, x17"); - self.state.emit(" ldr x17, [x17]"); - } - self.state.emit(" blr x17"); - } - } - - pub(super) fn emit_call_cleanup_impl(&mut self, stack_arg_space: usize, _f128_temp_space: usize, indirect: bool) { - let fptr_spill = if indirect { 16usize } else { 0 }; - let total = stack_arg_space + fptr_spill; - if total > 0 { - self.emit_add_sp(total as i64); - } - } - - pub(super) fn emit_call_store_i128_result_impl(&mut self, dest: &Value) { - self.store_x0_x1_to(dest); - } - - pub(super) fn emit_call_store_f128_result_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("q0", slot.0, "str"); - self.state.track_f128_self(dest.0); - } - self.state.emit(" sub sp, sp, #16"); - self.state.emit(" str q1, [sp]"); - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - self.state.emit(" ldr q1, [sp]"); - self.state.emit(" add sp, sp, #16"); - self.state.reg_cache.invalidate_all(); - self.state.reg_cache.set_acc(dest.0, false); - } -} diff --git a/src/backend/arm/codegen/cast_ops.rs b/src/backend/arm/codegen/cast_ops.rs deleted file mode 100644 index 1974529d43..0000000000 --- a/src/backend/arm/codegen/cast_ops.rs +++ /dev/null @@ -1,139 +0,0 @@ -//! ArmCodegen: cast operations. - -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::backend::cast::{CastKind, classify_cast}; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_cast_instrs_impl(&mut self, from_ty: IrType, to_ty: IrType) { - match classify_cast(from_ty, to_ty) { - CastKind::Noop | CastKind::UnsignedToSignedSameSize { .. 
} => {} - - CastKind::FloatToSigned { from_f64 } => { - if from_f64 { - self.state.emit(" fmov d0, x0"); - self.state.emit(" fcvtzs x0, d0"); - } else { - self.state.emit(" fmov s0, w0"); - self.state.emit(" fcvtzs x0, s0"); - } - match to_ty { - IrType::I8 => self.state.emit(" sxtb x0, w0"), - IrType::I16 => self.state.emit(" sxth x0, w0"), - IrType::I32 => self.state.emit(" sxtw x0, w0"), - _ => {} - } - } - - CastKind::FloatToUnsigned { from_f64, .. } => { - if from_f64 { - self.state.emit(" fmov d0, x0"); - self.state.emit(" fcvtzu x0, d0"); - } else { - self.state.emit(" fmov s0, w0"); - self.state.emit(" fcvtzu x0, s0"); - } - match to_ty { - IrType::U8 => self.state.emit(" and x0, x0, #0xff"), - IrType::U16 => self.state.emit(" and x0, x0, #0xffff"), - IrType::U32 => self.state.emit(" mov w0, w0"), - _ => {} - } - } - - CastKind::SignedToFloat { to_f64, from_ty } => { - match from_ty.size() { - 1 => self.state.emit(" sxtb x0, w0"), - 2 => self.state.emit(" sxth x0, w0"), - 4 => self.state.emit(" sxtw x0, w0"), - _ => {} - } - if to_f64 { - self.state.emit(" scvtf d0, x0"); - self.state.emit(" fmov x0, d0"); - } else { - self.state.emit(" scvtf s0, x0"); - self.state.emit(" fmov w0, s0"); - } - } - - CastKind::UnsignedToFloat { to_f64, .. 
} => { - if to_f64 { - self.state.emit(" ucvtf d0, x0"); - self.state.emit(" fmov x0, d0"); - } else { - self.state.emit(" ucvtf s0, x0"); - self.state.emit(" fmov w0, s0"); - } - } - - CastKind::FloatToFloat { widen } => { - if widen { - self.state.emit(" fmov s0, w0"); - self.state.emit(" fcvt d0, s0"); - self.state.emit(" fmov x0, d0"); - } else { - self.state.emit(" fmov d0, x0"); - self.state.emit(" fcvt s0, d0"); - self.state.emit(" fmov w0, s0"); - } - } - - CastKind::SignedToUnsignedSameSize { to_ty } => { - match to_ty { - IrType::U8 => self.state.emit(" and x0, x0, #0xff"), - IrType::U16 => self.state.emit(" and x0, x0, #0xffff"), - IrType::U32 => self.state.emit(" mov w0, w0"), - _ => {} - } - } - - CastKind::IntWiden { from_ty, .. } => { - if from_ty.is_unsigned() { - match from_ty { - IrType::U8 => self.state.emit(" and x0, x0, #0xff"), - IrType::U16 => self.state.emit(" and x0, x0, #0xffff"), - IrType::U32 => self.state.emit(" mov w0, w0"), - _ => {} - } - } else { - match from_ty { - IrType::I8 => self.state.emit(" sxtb x0, w0"), - IrType::I16 => self.state.emit(" sxth x0, w0"), - IrType::I32 => self.state.emit(" sxtw x0, w0"), - _ => {} - } - } - } - - CastKind::IntNarrow { to_ty } => { - match to_ty { - IrType::I8 => self.state.emit(" sxtb x0, w0"), - IrType::U8 => self.state.emit(" and x0, x0, #0xff"), - IrType::I16 => self.state.emit(" sxth x0, w0"), - IrType::U16 => self.state.emit(" and x0, x0, #0xffff"), - IrType::I32 => self.state.emit(" sxtw x0, w0"), - IrType::U32 => self.state.emit(" mov w0, w0"), - _ => {} - } - } - - CastKind::SignedToF128 { .. } - | CastKind::UnsignedToF128 { .. } - | CastKind::F128ToSigned { .. } - | CastKind::F128ToUnsigned { .. } - | CastKind::FloatToF128 { .. } - | CastKind::F128ToFloat { .. 
} => { - unreachable!("F128 cast variants not produced by classify_cast()"); - } - } - } - - pub(super) fn emit_cast_impl(&mut self, dest: &Value, src: &Operand, from_ty: IrType, to_ty: IrType) { - if crate::backend::f128_softfloat::f128_emit_cast(self, dest, src, from_ty, to_ty) { - return; - } - crate::backend::traits::emit_cast_default(self, dest, src, from_ty, to_ty); - } -} diff --git a/src/backend/arm/codegen/comparison.rs b/src/backend/arm/codegen/comparison.rs deleted file mode 100644 index 9a4c37599f..0000000000 --- a/src/backend/arm/codegen/comparison.rs +++ /dev/null @@ -1,75 +0,0 @@ -//! ArmCodegen: comparison operations. - -use crate::ir::reexports::{IrCmpOp, Operand, Value}; -use crate::common::types::IrType; -use super::emit::{ArmCodegen, arm_int_cond_code, arm_invert_cond_code}; - -impl ArmCodegen { - pub(super) fn emit_float_cmp_impl(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - self.operand_to_x0(lhs); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(rhs); - if ty == IrType::F32 { - self.state.emit(" fmov s0, w1"); - self.state.emit(" fmov s1, w0"); - self.state.emit(" fcmp s0, s1"); - } else { - self.state.emit(" fmov d0, x1"); - self.state.emit(" fmov d1, x0"); - self.state.emit(" fcmp d0, d1"); - } - let cond = match op { - IrCmpOp::Eq => "eq", - IrCmpOp::Ne => "ne", - IrCmpOp::Slt | IrCmpOp::Ult => "mi", - IrCmpOp::Sle | IrCmpOp::Ule => "ls", - IrCmpOp::Sgt | IrCmpOp::Ugt => "gt", - IrCmpOp::Sge | IrCmpOp::Uge => "ge", - }; - self.state.emit_fmt(format_args!(" cset x0, {}", cond)); - self.store_x0_to(dest); - } - - pub(super) fn emit_f128_cmp_impl(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand) { - crate::backend::f128_softfloat::f128_cmp(self, dest, op, lhs, rhs); - } - - pub(super) fn emit_int_cmp_impl(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - self.emit_int_cmp_insn(lhs, rhs, ty); - let cond = arm_int_cond_code(op); - 
self.state.emit_fmt(format_args!(" cset x0, {}", cond)); - self.store_x0_to(dest); - } - - pub(super) fn emit_fused_cmp_branch_impl( - &mut self, - op: IrCmpOp, - lhs: &Operand, - rhs: &Operand, - ty: IrType, - true_label: &str, - false_label: &str, - ) { - self.emit_int_cmp_insn(lhs, rhs, ty); - let cc = arm_int_cond_code(op); - let inv_cc = arm_invert_cond_code(cc); - let skip = self.state.fresh_label("skip"); - self.state.emit_fmt(format_args!(" b.{} {}", inv_cc, skip)); - self.state.emit_fmt(format_args!(" b {}", true_label)); - self.state.emit_fmt(format_args!("{}:", skip)); - self.state.emit_fmt(format_args!(" b {}", false_label)); - self.state.reg_cache.invalidate_all(); - } - - pub(super) fn emit_select_impl(&mut self, dest: &Value, cond: &Operand, true_val: &Operand, false_val: &Operand, _ty: IrType) { - self.operand_to_x0(false_val); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(true_val); - self.state.emit(" mov x2, x0"); - self.operand_to_x0(cond); - self.state.emit(" cmp x0, #0"); - self.state.emit(" csel x0, x2, x1, ne"); - self.state.reg_cache.invalidate_acc(); - self.store_x0_to(dest); - } -} diff --git a/src/backend/arm/codegen/emit.rs b/src/backend/arm/codegen/emit.rs deleted file mode 100644 index 9610e07429..0000000000 --- a/src/backend/arm/codegen/emit.rs +++ /dev/null @@ -1,1999 +0,0 @@ -use crate::delegate_to_impl; -use crate::ir::reexports::{ - AtomicOrdering, - AtomicRmwOp, - BlockId, - Instruction, - IntrinsicOp, - IrBinOp, - IrCmpOp, - IrConst, - IrFunction, - Operand, - Value, -}; -use crate::common::types::IrType; -use crate::common::fx_hash::FxHashMap; -use crate::backend::common::PtrDirective; -use crate::backend::state::{CodegenState, StackSlot}; -use crate::backend::traits::ArchCodegen; -use crate::backend::generation::find_param_alloca; -use crate::backend::call_abi::{CallAbiConfig, CallArgClass}; -use crate::backend::call_abi::ParamClass; -use crate::backend::inline_asm::emit_inline_asm_common; -use 
crate::backend::regalloc::PhysReg; -use super::asm_emitter::ARM_GP_SCRATCH; - -/// Callee-saved registers available for register allocation: x20-x28. -/// x19 is reserved (some ABIs use it), x29=fp, x30=lr. -pub(super) const ARM_CALLEE_SAVED: [PhysReg; 9] = [ - PhysReg(20), PhysReg(21), PhysReg(22), PhysReg(23), PhysReg(24), - PhysReg(25), PhysReg(26), PhysReg(27), PhysReg(28), -]; - -/// Caller-saved registers available for register allocation: x13, x14. -/// -/// These are a subset of the AAPCS64 "corruptible" registers (x9-x15). -/// We exclude x9 (primary address register), x10 (memcpy source, secondary -/// scratch), x11 (memcpy loop counter), x12 (memcpy byte transfer), x15 -/// (F128 large-offset scratch), and x16/x17/x18 (IP0/IP1/platform-reserved). -/// -/// x13 and x14 have NO hardcoded scratch uses in the codegen. They only -/// appear in ARM_TMP_REGS (call argument staging) and ARM_GP_SCRATCH -/// (inline assembly scratch pool). Since caller-saved allocation only assigns -/// values whose live ranges do NOT span any call, the call staging use is safe. -/// Functions with inline assembly have the caller-saved pool disabled entirely. 
-pub(super) const ARM_CALLER_SAVED: [PhysReg; 2] = [ - PhysReg(13), PhysReg(14), -]; - -pub(super) fn callee_saved_name(reg: PhysReg) -> &'static str { - match reg.0 { - // Caller-saved registers - 13 => "x13", 14 => "x14", - // Callee-saved registers - 19 => "x19", 20 => "x20", 21 => "x21", 22 => "x22", 23 => "x23", 24 => "x24", - 25 => "x25", 26 => "x26", 27 => "x27", 28 => "x28", - _ => unreachable!("invalid ARM register index"), - } -} - -pub(super) fn callee_saved_name_32(reg: PhysReg) -> &'static str { - match reg.0 { - // Caller-saved registers - 13 => "w13", 14 => "w14", - // Callee-saved registers - 19 => "w19", 20 => "w20", 21 => "w21", 22 => "w22", 23 => "w23", 24 => "w24", - 25 => "w25", 26 => "w26", 27 => "w27", 28 => "w28", - _ => unreachable!("invalid ARM register index"), - } -} - -/// Check if a register name is an AArch64 floating-point/SIMD register -/// (s0-s31, d0-d31, v0-v31, q0-q31). -/// This avoids false positives for "sp" (stack pointer) which starts with 's'. -pub(super) fn is_arm_fp_reg(reg: &str) -> bool { - if let Some(suffix) = reg.strip_prefix('d') - .or_else(|| reg.strip_prefix('s')) - .or_else(|| reg.strip_prefix('v')) - .or_else(|| reg.strip_prefix('q')) - { - !suffix.is_empty() && suffix.chars().all(|c| c.is_ascii_digit()) - } else { - false - } -} - -/// Map IrBinOp to AArch64 mnemonic for simple ALU ops. -pub(super) fn arm_alu_mnemonic(op: IrBinOp) -> &'static str { - match op { - IrBinOp::Add => "add", - IrBinOp::Sub => "sub", - IrBinOp::And => "and", - IrBinOp::Or => "orr", - IrBinOp::Xor => "eor", - IrBinOp::Mul => "mul", - _ => unreachable!("unsupported ALU op for arm_alu_mnemonic: {:?}", op), - } -} - -/// Map an IrCmpOp to its AArch64 integer condition code suffix. 
-pub(super) fn arm_int_cond_code(op: IrCmpOp) -> &'static str { - match op { - IrCmpOp::Eq => "eq", - IrCmpOp::Ne => "ne", - IrCmpOp::Slt => "lt", - IrCmpOp::Sle => "le", - IrCmpOp::Sgt => "gt", - IrCmpOp::Sge => "ge", - IrCmpOp::Ult => "lo", - IrCmpOp::Ule => "ls", - IrCmpOp::Ugt => "hi", - IrCmpOp::Uge => "hs", - } -} - -/// Return the inverted AArch64 condition code suffix. -pub(super) fn arm_invert_cond_code(cc: &str) -> &'static str { - match cc { - "eq" => "ne", - "ne" => "eq", - "lt" => "ge", - "ge" => "lt", - "gt" => "le", - "le" => "gt", - "lo" => "hs", - "hs" => "lo", - "hi" => "ls", - "ls" => "hi", - "mi" => "pl", - "pl" => "mi", - "vs" => "vc", - "vc" => "vs", - _ => unreachable!("unknown ARM condition code: {}", cc), - } -} - -/// AArch64 code generator. Implements the ArchCodegen trait for the shared framework. -/// Uses AAPCS64 calling convention with stack-based allocation. -pub struct ArmCodegen { - pub(crate) state: CodegenState, - /// Frame size for the current function (needed for epilogue in terminators). - pub(super) current_frame_size: i64, - pub(super) current_return_type: IrType, - /// For variadic functions: offset from SP where the GP register save area starts (x0-x7). - pub(super) va_gp_save_offset: i64, - /// For variadic functions: offset from SP where the FP register save area starts (q0-q7). - pub(super) va_fp_save_offset: i64, - /// Number of named (non-variadic) GP params for current variadic function. - pub(super) va_named_gp_count: usize, - /// Number of named (non-variadic) FP params for current variadic function. - pub(super) va_named_fp_count: usize, - /// Total bytes of named (non-variadic) params passed on the stack. - /// This includes all stack-passed scalars, F128, I128, and structs with alignment. 
- pub(super) va_named_stack_bytes: usize, - /// Scratch register index for inline asm GP register allocation - pub(super) asm_scratch_idx: usize, - /// Scratch register index for inline asm FP register allocation - pub(super) asm_fp_scratch_idx: usize, - /// Register allocator: value ID -> physical callee-saved register. - pub(super) reg_assignments: FxHashMap, - /// Which callee-saved registers are actually used (for save/restore). - pub(super) used_callee_saved: Vec, - /// SP offset where callee-saved registers are stored. - pub(super) callee_save_offset: i64, - /// For large stack frames: reserved for future x19 frame base optimization. - /// Currently always None (optimization disabled due to correctness issue). - pub(super) frame_base_offset: Option, - /// Whether -mgeneral-regs-only is set. When true, FP/SIMD registers (q0-q7) - /// must not be used. Variadic prologues skip saving q0-q7 and va_start - /// sets __vr_offs=0 (no FP register save area available). - pub(super) general_regs_only: bool, -} - -impl ArmCodegen { - pub fn new() -> Self { - Self { - state: CodegenState::new(), - current_frame_size: 0, - current_return_type: IrType::I64, - va_gp_save_offset: 0, - va_fp_save_offset: 0, - va_named_gp_count: 0, - va_named_fp_count: 0, - va_named_stack_bytes: 0, - asm_scratch_idx: 0, - asm_fp_scratch_idx: 0, - reg_assignments: FxHashMap::default(), - used_callee_saved: Vec::new(), - callee_save_offset: 0, - frame_base_offset: None, - general_regs_only: false, - } - } - - /// Disable jump table emission (-fno-jump-tables). - pub fn set_no_jump_tables(&mut self, enabled: bool) { - self.state.no_jump_tables = enabled; - } - - /// Enable position-independent code generation (-fPIC/-fpie). - pub fn set_pic(&mut self, pic: bool) { - self.state.pic_mode = pic; - } - - /// Set general-regs-only mode (-mgeneral-regs-only). - /// When true, FP/SIMD registers are not used in variadic prologues. 
- pub fn set_general_regs_only(&mut self, enabled: bool) { - self.general_regs_only = enabled; - } - - /// Apply all relevant options from a `CodegenOptions` struct. - pub fn apply_options(&mut self, opts: &crate::backend::CodegenOptions) { - self.set_pic(opts.pic); - self.set_no_jump_tables(opts.no_jump_tables); - self.set_general_regs_only(opts.general_regs_only); - self.state.emit_cfi = opts.emit_cfi; - } - - /// Get the physical register assigned to an operand (if it's a Value with a register). - pub(super) fn operand_reg(&self, op: &Operand) -> Option { - match op { - Operand::Value(v) => self.reg_assignments.get(&v.0).copied(), - _ => None, - } - } - - /// Get the physical register assigned to a destination value. - pub(super) fn dest_reg(&self, dest: &Value) -> Option { - self.reg_assignments.get(&dest.0).copied() - } - - /// Load an operand into a specific callee-saved register. - pub(super) fn operand_to_callee_reg(&mut self, op: &Operand, reg: PhysReg) { - let reg_name = callee_saved_name(reg); - match op { - Operand::Const(_) => { - self.operand_to_x0(op); - self.state.emit_fmt(format_args!(" mov {}, x0", reg_name)); - } - Operand::Value(v) => { - if let Some(&src_reg) = self.reg_assignments.get(&v.0) { - if src_reg.0 != reg.0 { - let src_name = callee_saved_name(src_reg); - self.state.emit_fmt(format_args!(" mov {}, {}", reg_name, src_name)); - } - } else { - self.operand_to_x0(op); - self.state.emit_fmt(format_args!(" mov {}, x0", reg_name)); - } - } - } - } - - /// Try to extract an immediate value suitable for ARM imm12 encoding. 
- pub(super) fn const_as_imm12(op: &Operand) -> Option { - match op { - Operand::Const(c) => { - let val = match c { - IrConst::I8(v) => *v as i64, - IrConst::I16(v) => *v as i64, - IrConst::I32(v) => *v as i64, - IrConst::I64(v) => *v, - IrConst::Zero => 0, - _ => return None, - }; - // ARM add/sub imm12: 0..4095 - if (0..=4095).contains(&val) { - Some(val) - } else { - None - } - } - _ => None, - } - } - - /// If `op` is a constant that is a power of two, return its log2 (shift amount). - pub(super) fn const_as_power_of_2(op: &Operand) -> Option { - match op { - Operand::Const(c) => { - let val: u64 = match c { - IrConst::I8(v) => *v as u8 as u64, - IrConst::I16(v) => *v as u16 as u64, - IrConst::I32(v) => *v as u32 as u64, - IrConst::I64(v) => *v as u64, - IrConst::Zero => return None, - _ => return None, - }; - if val > 0 && val.is_power_of_two() { - Some(val.trailing_zeros()) - } else { - None - } - } - _ => None, - } - } - - /// Pre-scan all inline asm instructions in a function to predict which - /// callee-saved registers will be needed as scratch registers. - /// - /// The inline asm scratch allocator (`assign_scratch_reg`) walks through - /// `ARM_GP_SCRATCH` = [x9..x15, x19, x20, x21], skipping registers that - /// appear in the clobber/excluded list. When enough caller-saved scratch regs - /// (x9-x15) are clobbered, the allocator falls through to callee-saved - /// registers (x19, x20, x21). These must be saved/restored in the prologue, - /// but the prologue is emitted before inline asm codegen runs. This function - /// simulates the allocation to discover the callee-saved registers early. - pub(super) fn prescan_inline_asm_callee_saved(func: &IrFunction, used_callee_saved: &mut Vec) { - for block in &func.blocks { - for instr in &block.instructions { - if let Instruction::InlineAsm { - outputs, inputs, clobbers, .. 
- } = instr { - // Build excluded set: clobber registers + specific constraint regs - let mut excluded: Vec = Vec::new(); - for clobber in clobbers { - if clobber == "cc" || clobber == "memory" { - continue; - } - excluded.push(clobber.clone()); - // Also exclude the alternate width alias (wN <-> xN) - // and normalize rN (GCC AArch64 alias for xN) - if let Some(suffix) = clobber.strip_prefix('w') { - if suffix.chars().all(|c| c.is_ascii_digit()) { - excluded.push(format!("x{}", suffix)); - } - } else if let Some(suffix) = clobber.strip_prefix('x') { - if suffix.chars().all(|c| c.is_ascii_digit()) { - excluded.push(format!("w{}", suffix)); - } - } else if let Some(suffix) = clobber.strip_prefix('r') { - if suffix.chars().all(|c| c.is_ascii_digit()) { - if let Ok(n) = suffix.parse::() { - if n <= 30 { - excluded.push(format!("x{}", n)); - excluded.push(format!("w{}", n)); - } - } - } - } - } - - // Count GP scratch registers needed: - // 1. GpReg operands (outputs + inputs that are "r" type, not tied, not specific) - // 2. Memory operands that need indirection (non-alloca pointers get a scratch reg) - let mut gp_scratch_needed = 0usize; - - for (constraint, _, _) in outputs { - let c = constraint.trim_start_matches(['=', '+', '&', '%']); - if c.starts_with('{') && c.ends_with('}') { - let reg_name = &c[1..c.len()-1]; - // Normalize rN -> xN (GCC AArch64 alias) - let normalized = super::asm_emitter::normalize_aarch64_register(reg_name); - excluded.push(normalized); - } else if c == "m" || c == "Q" || c.contains('Q') || c.contains('m') { - // Memory operands may need a scratch reg for indirection. - // Conservatively count each one. - gp_scratch_needed += 1; - } else if c == "w" { - // FP register, doesn't consume GP scratch - } else if !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit()) { - // Tied operand, doesn't need its own scratch - } else { - // GpReg - gp_scratch_needed += 1; - } - } - - // Count "+" read-write outputs that generate synthetic inputs. 
- // Synthetic inputs from "+r" have constraint "r" and consume a - // GP scratch slot in phase 1 (even though the register is later - // overwritten by copy_metadata_from). We must count these too. - let num_plus = outputs.iter().filter(|(c,_,_)| c.contains('+')).count(); - { - let mut plus_idx = 0; - for (constraint, _, _) in outputs.iter() { - if constraint.contains('+') { - let c = constraint.trim_start_matches(['=', '+', '&', '%']); - // Synthetic input inherits constraint with '+' stripped - // "+r" → "r" (GpReg, consumes scratch), "+m" → "m" (Memory, skip) - if c != "m" && c != "Q" && !c.contains('Q') && !c.contains('m') && c != "w" - && !(c.starts_with('{') && c.ends_with('}')) - && (!c.chars().all(|ch| ch.is_ascii_digit()) || c.is_empty()) - { - gp_scratch_needed += 1; - } - plus_idx += 1; - } - } - let _ = plus_idx; - } - - for (i, (constraint, val, _)) in inputs.iter().enumerate() { - // Skip synthetic inputs (they're already counted above) - if i < num_plus { - continue; - } - let c = constraint.trim_start_matches(['=', '+', '&', '%']); - if c.starts_with('{') && c.ends_with('}') { - let reg_name = &c[1..c.len()-1]; - // Normalize rN -> xN (GCC AArch64 alias) - let normalized = super::asm_emitter::normalize_aarch64_register(reg_name); - excluded.push(normalized); - } else if c == "m" || c == "Q" || c.contains('Q') || c.contains('m') { - gp_scratch_needed += 1; - } else if c == "w" { - // FP register - } else if !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit()) { - // Tied operand - } else { - // Check if constant input with immediate-capable constraint - // would be promoted to Immediate (no scratch needed) - let is_const = matches!(val, Operand::Const(_)); - let has_imm_alt = c.contains('i') || c.contains('I') || c.contains('n'); - if is_const && has_imm_alt { - // Would be promoted to Immediate, no GP scratch needed - } else { - gp_scratch_needed += 1; - } - } - } - - // Simulate walking through ARM_GP_SCRATCH, skipping excluded regs - let mut 
scratch_idx = 0; - let mut assigned = 0; - while assigned < gp_scratch_needed && scratch_idx < ARM_GP_SCRATCH.len() { - let reg = ARM_GP_SCRATCH[scratch_idx]; - scratch_idx += 1; - if excluded.iter().any(|e| e == reg) { - continue; - } - assigned += 1; - // Check if this is a callee-saved register - if let Some(num_str) = reg.strip_prefix('x') { - if let Ok(n) = num_str.parse::() { - if (19..=28).contains(&n) { - let phys = PhysReg(n); - if !used_callee_saved.contains(&phys) { - used_callee_saved.push(phys); - } - } - } - } - } - - // Also handle overflow beyond ARM_GP_SCRATCH (format!("x{}", 9 + idx)) - while assigned < gp_scratch_needed { - let idx = scratch_idx; - scratch_idx += 1; - let reg_num = 9 + idx; - let reg_name = format!("x{}", reg_num); - if excluded.iter().any(|e| e == ®_name) { - continue; - } - assigned += 1; - if (19..=28).contains(®_num) { - let phys = PhysReg(reg_num as u8); - if !used_callee_saved.contains(&phys) { - used_callee_saved.push(phys); - } - } - } - } - } - } - // Sort for deterministic prologue/epilogue emission - used_callee_saved.sort_by_key(|r| r.0); - } - - /// Restore callee-saved registers before epilogue. - pub(super) fn emit_restore_callee_saved(&mut self) { - let used_regs = self.used_callee_saved.clone(); - let base = self.callee_save_offset; - let n = used_regs.len(); - let mut i = 0; - while i + 1 < n { - let r1 = callee_saved_name(used_regs[i]); - let r2 = callee_saved_name(used_regs[i + 1]); - let offset = base + (i as i64) * 8; - self.emit_ldp_from_sp(r1, r2, offset); - i += 2; - } - if i < n { - let r = callee_saved_name(used_regs[i]); - let offset = base + (i as i64) * 8; - self.emit_load_from_sp(r, offset, "ldr"); - } - } - - /// Check if an IrConst is a small unsigned immediate that fits in AArch64 - /// `cmp Xn, #imm12` instruction (0..=4095). 
- fn const_as_cmp_imm12(c: &IrConst) -> Option { - let v = match c { - IrConst::I8(v) => *v as i64, - IrConst::I16(v) => *v as i64, - IrConst::I32(v) => *v as i64, - IrConst::I64(v) => *v, - IrConst::Zero => 0, - _ => return None, - }; - // AArch64 cmp (alias of subs) accepts unsigned 12-bit immediate (0..4095), - // optionally shifted left by 12. We only use the unshifted form. - if (0..=4095).contains(&v) { - Some(v as u64) - } else { - None - } - } - - /// Check if an IrConst is a small negative value that can use `cmn Xn, #imm12` - /// (i.e., the negated value fits in 0..=4095). - fn const_as_cmn_imm12(c: &IrConst) -> Option { - let v = match c { - IrConst::I8(v) => *v as i64, - IrConst::I16(v) => *v as i64, - IrConst::I32(v) => *v as i64, - IrConst::I64(v) => *v, - _ => return None, - }; - let neg = v.checked_neg()?; - if (1..=4095).contains(&neg) { - Some(neg as u64) - } else { - None - } - } - - /// Get the register name for a Value if it has a register assignment. - /// Returns (64-bit name, 32-bit name) pair. - fn value_reg_name(&self, v: &Value) -> Option<(&'static str, &'static str)> { - self.reg_assignments.get(&v.0).map(|®| { - (callee_saved_name(reg), callee_saved_name_32(reg)) - }) - } - - /// Emit the integer comparison preamble. - /// Optimized paths: - /// 1. reg vs #imm12 → `cmp wN/xN, #imm` (1 instruction) - /// 2. reg vs #neg_imm12 → `cmn wN/xN, #imm` (1 instruction) - /// 3. reg vs reg → `cmp wN/xN, wM/xM` (1 instruction) - /// 4. fallback → load lhs→x1, rhs→x0, `cmp w1/x1, w0/x0` - /// Used by both emit_cmp and emit_fused_cmp_branch. 
- pub(super) fn emit_int_cmp_insn(&mut self, lhs: &Operand, rhs: &Operand, ty: IrType) { - let use_32bit = ty == IrType::I32 || ty == IrType::U32 - || ty == IrType::I8 || ty == IrType::U8 - || ty == IrType::I16 || ty == IrType::U16; - - // Try optimized path: lhs in register, rhs is immediate - if let Operand::Value(lv) = lhs { - if let Some((lhs_x, lhs_w)) = self.value_reg_name(lv) { - let lhs_reg = if use_32bit { lhs_w } else { lhs_x }; - - // cmp reg, #imm12 - if let Operand::Const(c) = rhs { - if let Some(imm) = Self::const_as_cmp_imm12(c) { - self.state.emit_fmt(format_args!(" cmp {}, #{}", lhs_reg, imm)); - return; - } - // cmn reg, #imm12 (for negative constants) - if let Some(imm) = Self::const_as_cmn_imm12(c) { - self.state.emit_fmt(format_args!(" cmn {}, #{}", lhs_reg, imm)); - return; - } - } - - // cmp reg, reg - if let Operand::Value(rv) = rhs { - if let Some((rhs_x, rhs_w)) = self.value_reg_name(rv) { - let rhs_reg = if use_32bit { rhs_w } else { rhs_x }; - self.state.emit_fmt(format_args!(" cmp {}, {}", lhs_reg, rhs_reg)); - return; - } - } - - // lhs in register, rhs needs loading into x0 - self.operand_to_x0(rhs); - if use_32bit { - self.state.emit_fmt(format_args!(" cmp {}, w0", lhs_reg)); - } else { - self.state.emit_fmt(format_args!(" cmp {}, x0", lhs_reg)); - } - return; - } - } - - // Try: lhs needs loading, rhs in register - if let Operand::Value(rv) = rhs { - if let Some((rhs_x, rhs_w)) = self.value_reg_name(rv) { - self.operand_to_x0(lhs); - let rhs_reg = if use_32bit { rhs_w } else { rhs_x }; - if use_32bit { - self.state.emit_fmt(format_args!(" cmp w0, {}", rhs_reg)); - } else { - self.state.emit_fmt(format_args!(" cmp x0, {}", rhs_reg)); - } - return; - } - } - - // Try: lhs in x0 (accumulator), rhs is immediate - if let Operand::Const(c) = rhs { - if let Some(imm) = Self::const_as_cmp_imm12(c) { - self.operand_to_x0(lhs); - if use_32bit { - self.state.emit_fmt(format_args!(" cmp w0, #{}", imm)); - } else { - 
self.state.emit_fmt(format_args!(" cmp x0, #{}", imm)); - } - return; - } - if let Some(imm) = Self::const_as_cmn_imm12(c) { - self.operand_to_x0(lhs); - if use_32bit { - self.state.emit_fmt(format_args!(" cmn w0, #{}", imm)); - } else { - self.state.emit_fmt(format_args!(" cmn x0, #{}", imm)); - } - return; - } - } - - // Fallback: load both into x0/x1 - self.operand_to_x0(lhs); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(rhs); - if use_32bit { - self.state.emit(" cmp w1, w0"); - } else { - self.state.emit(" cmp x1, x0"); - } - } - - // --- AArch64 large-offset helpers --- - - /// Emit a large immediate subtraction from sp. Uses x17 (IP1) as scratch. - pub(super) fn emit_sub_sp(&mut self, n: i64) { - if n == 0 { return; } - if n <= 4095 { - self.state.emit_fmt(format_args!(" sub sp, sp, #{}", n)); - } else { - self.emit_load_imm64("x17", n); - self.state.emit(" sub sp, sp, x17"); - } - } - - /// Emit a large immediate addition to sp. Uses x17 (IP1) as scratch. - pub(super) fn emit_add_sp(&mut self, n: i64) { - if n == 0 { return; } - if n <= 4095 { - self.state.emit_fmt(format_args!(" add sp, sp, #{}", n)); - } else { - self.emit_load_imm64("x17", n); - self.state.emit(" add sp, sp, x17"); - } - } - - /// Get the access size in bytes for an AArch64 load/store instruction and register. - /// For str/ldr, the access size depends on the register: - /// w registers = 4 bytes, x registers = 8 bytes, - /// s (single-precision float) = 4 bytes, d (double-precision float) = 8 bytes, - /// q (SIMD/quad) = 16 bytes. 
- fn access_size_for_instr(instr: &str, reg: &str) -> i64 { - match instr { - "strb" | "ldrb" | "ldrsb" => 1, - "strh" | "ldrh" | "ldrsh" => 2, - "ldrsw" => 4, - "str" | "ldr" => { - if reg.starts_with('w') || reg.starts_with('s') { - 4 - } else if reg.starts_with('q') { - 16 - } else { - // x registers and d registers are both 8 bytes - 8 - } - } - _ => 1, // conservative default - } - } - - /// Check if an offset is valid for unsigned immediate addressing on AArch64. - /// The unsigned offset is a 12-bit field scaled by access size: max = 4095 * access_size. - /// The offset must also be naturally aligned to the access size. - pub(super) fn is_valid_imm_offset(offset: i64, instr: &str, reg: &str) -> bool { - if offset < 0 { return false; } - let access_size = Self::access_size_for_instr(instr, reg); - let max_offset = 4095 * access_size; - offset <= max_offset && offset % access_size == 0 - } - - /// Emit store to [base, #offset], handling large offsets. - /// For large frames with x19 as frame base register, tries x19-relative addressing - /// before falling back to the expensive movz+movk+add sequence. - pub(super) fn emit_store_to_sp(&mut self, reg: &str, offset: i64, instr: &str) { - // When DynAlloca is present, use x29 (frame pointer) as base. 
- let base = if self.state.has_dyn_alloca { "x29" } else { "sp" }; - if Self::is_valid_imm_offset(offset, instr, reg) { - self.state.emit_fmt(format_args!(" {} {}, [{}, #{}]", instr, reg, base, offset)); - } else if let Some(fb_offset) = self.frame_base_offset { - // Try x19-relative addressing (x19 = sp + frame_base_offset) - let rel_offset = offset - fb_offset; - if Self::is_valid_imm_offset(rel_offset, instr, reg) { - self.state.emit_fmt(format_args!(" {} {}, [x19, #{}]", instr, reg, rel_offset)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" {} {}, [x17]", instr, reg)); - } - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" {} {}, [x17]", instr, reg)); - } - } - - /// Emit load from [base, #offset], handling large offsets. - /// For large frames with x19 as frame base register, tries x19-relative addressing. 
- pub(super) fn emit_load_from_sp(&mut self, reg: &str, offset: i64, instr: &str) { - let base = if self.state.has_dyn_alloca { "x29" } else { "sp" }; - if Self::is_valid_imm_offset(offset, instr, reg) { - self.state.emit_fmt(format_args!(" {} {}, [{}, #{}]", instr, reg, base, offset)); - } else if let Some(fb_offset) = self.frame_base_offset { - let rel_offset = offset - fb_offset; - if Self::is_valid_imm_offset(rel_offset, instr, reg) { - self.state.emit_fmt(format_args!(" {} {}, [x19, #{}]", instr, reg, rel_offset)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" {} {}, [x17]", instr, reg)); - } - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" {} {}, [x17]", instr, reg)); - } - } - - /// Emit store to [sp, #offset] using the REAL sp register, even when alloca is present. - /// Used for storing into dynamically-allocated call stack arg areas that live at the - /// current sp, NOT in the frame (x29-relative). - pub(super) fn emit_store_to_raw_sp(&mut self, reg: &str, offset: i64, instr: &str) { - if Self::is_valid_imm_offset(offset, instr, reg) { - self.state.emit_fmt(format_args!(" {} {}, [sp, #{}]", instr, reg, offset)); - } else { - self.load_large_imm("x17", offset); - self.state.emit(" add x17, sp, x17"); - self.state.emit_fmt(format_args!(" {} {}, [x17]", instr, reg)); - } - } - - /// Emit `stp reg1, reg2, [base, #offset]` handling large offsets. - /// Uses x19 frame base for large frames when possible. 
- pub(super) fn emit_stp_to_sp(&mut self, reg1: &str, reg2: &str, offset: i64) { - let base = if self.state.has_dyn_alloca { "x29" } else { "sp" }; - // stp supports signed offsets in [-512, 504] range (multiples of 8) - if (-512..=504).contains(&offset) { - self.state.emit_fmt(format_args!(" stp {}, {}, [{}, #{}]", reg1, reg2, base, offset)); - } else if let Some(fb_offset) = self.frame_base_offset { - let rel = offset - fb_offset; - if (-512..=504).contains(&rel) { - self.state.emit_fmt(format_args!(" stp {}, {}, [x19, #{}]", reg1, reg2, rel)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" stp {}, {}, [x17]", reg1, reg2)); - } - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" stp {}, {}, [x17]", reg1, reg2)); - } - } - - pub(super) fn emit_ldp_from_sp(&mut self, reg1: &str, reg2: &str, offset: i64) { - let base = if self.state.has_dyn_alloca { "x29" } else { "sp" }; - if (-512..=504).contains(&offset) { - self.state.emit_fmt(format_args!(" ldp {}, {}, [{}, #{}]", reg1, reg2, base, offset)); - } else if let Some(fb_offset) = self.frame_base_offset { - let rel = offset - fb_offset; - if (-512..=504).contains(&rel) { - self.state.emit_fmt(format_args!(" ldp {}, {}, [x19, #{}]", reg1, reg2, rel)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" ldp {}, {}, [x17]", reg1, reg2)); - } - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" ldp {}, {}, [x17]", reg1, reg2)); - } - } - - /// Emit `add dest, sp, #offset` handling large offsets. - /// Uses x19 frame base when available, falls back to x17 scratch. 
- pub(super) fn emit_add_sp_offset(&mut self, dest: &str, offset: i64) { - let base = if self.state.has_dyn_alloca { "x29" } else { "sp" }; - if (0..=4095).contains(&offset) { - self.state.emit_fmt(format_args!(" add {}, {}, #{}", dest, base, offset)); - } else if let Some(fb_offset) = self.frame_base_offset { - let rel = offset - fb_offset; - if (0..=4095).contains(&rel) { - self.state.emit_fmt(format_args!(" add {}, x19, #{}", dest, rel)); - } else if (-4095..0).contains(&rel) { - self.state.emit_fmt(format_args!(" sub {}, x19, #{}", dest, -rel)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add {}, {}, x17", dest, base)); - } - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add {}, {}, x17", dest, base)); - } - } - - /// Compute the address of an alloca into `dest`, handling over-aligned allocas. - /// For normal allocas: `dest = sp + offset`. - /// For over-aligned allocas: `dest = (sp + offset + align-1) & -align`. - /// `offset` is the raw stack slot offset (already adjusted for call setup if needed). - pub(super) fn emit_alloca_addr(&mut self, dest: &str, val_id: u32, offset: i64) { - if let Some(align) = self.state.alloca_over_align(val_id) { - self.emit_add_sp_offset(dest, offset); - self.load_large_imm("x17", (align - 1) as i64); - self.state.emit_fmt(format_args!(" add {}, {}, x17", dest, dest)); - self.load_large_imm("x17", -(align as i64)); - self.state.emit_fmt(format_args!(" and {}, {}, x17", dest, dest)); - } else { - self.emit_add_sp_offset(dest, offset); - } - } - - /// Emit `add dest, x29, #offset` handling large offsets. - /// Uses x17 (IP1) as scratch for offsets > 4095. 
- pub(super) fn emit_add_fp_offset(&mut self, dest: &str, offset: i64) { - if (0..=4095).contains(&offset) { - self.state.emit_fmt(format_args!(" add {}, x29, #{}", dest, offset)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add {}, x29, x17", dest)); - } - } - - /// Emit load from an arbitrary base register with offset, handling large offsets via x17. - /// For offsets that exceed the ARM64 unsigned immediate range, materializes the - /// effective address into x17 and loads from [x17]. - pub(super) fn emit_load_from_reg(&mut self, dest: &str, base: &str, offset: i64, instr: &str) { - if Self::is_valid_imm_offset(offset, instr, dest) { - self.state.emit_fmt(format_args!(" {} {}, [{}, #{}]", instr, dest, base, offset)); - } else { - self.load_large_imm("x17", offset); - self.state.emit_fmt(format_args!(" add x17, {}, x17", base)); - self.state.emit_fmt(format_args!(" {} {}, [x17]", instr, dest)); - } - } - - /// Load an immediate into a register using the most efficient sequence. - /// Handles all 64-bit values including negatives via MOVZ/MOVK or MOVN/MOVK. - pub(super) fn load_large_imm(&mut self, reg: &str, val: i64) { - self.emit_load_imm64(reg, val); - } - - /// Load a 64-bit immediate value into a register using movz/movn + movk sequence. - /// Uses MOVN (move-not) for values where most halfwords are 0xFFFF, which - /// gives shorter sequences for negative numbers and large values. 
- pub(super) fn emit_load_imm64(&mut self, reg: &str, val: i64) { - let bits = val as u64; - if bits == 0 { - self.state.emit_fmt(format_args!(" mov {}, #0", reg)); - return; - } - if bits == 0xFFFFFFFF_FFFFFFFF { - // All-ones: MOVN reg, #0 produces NOT(0) = 0xFFFFFFFFFFFFFFFF - self.state.emit_fmt(format_args!(" movn {}, #0", reg)); - return; - } - - // Extract 16-bit halfwords - let hw: [u16; 4] = [ - (bits & 0xffff) as u16, - ((bits >> 16) & 0xffff) as u16, - ((bits >> 32) & 0xffff) as u16, - ((bits >> 48) & 0xffff) as u16, - ]; - - // Count how many halfwords are 0x0000 vs 0xFFFF to pick MOVZ vs MOVN - let zeros = hw.iter().filter(|&&h| h == 0x0000).count(); - let ones = hw.iter().filter(|&&h| h == 0xFFFF).count(); - - if ones > zeros { - // Use MOVN (move-not) strategy: start with all-ones, patch non-0xFFFF halfwords - // MOVN sets the register to NOT(imm16 << shift) - let mut first = true; - for (i, &h) in hw.iter().enumerate() { - if h != 0xFFFF { - let shift = i * 16; - let not_h = (!h) as u64 & 0xffff; - if first { - if shift == 0 { - self.state.emit_fmt(format_args!(" movn {}, #{}", reg, not_h)); - } else { - self.state.emit_fmt(format_args!(" movn {}, #{}, lsl #{}", reg, not_h, shift)); - } - first = false; - } else if shift == 0 { - self.state.emit_fmt(format_args!(" movk {}, #{}", reg, h as u64)); - } else { - self.state.emit_fmt(format_args!(" movk {}, #{}, lsl #{}", reg, h as u64, shift)); - } - } - } - } else { - // Use MOVZ (move-zero) strategy: start with all-zeros, patch non-0x0000 halfwords - let mut first = true; - for (i, &h) in hw.iter().enumerate() { - if h != 0x0000 { - let shift = i * 16; - if first { - if shift == 0 { - self.state.emit_fmt(format_args!(" movz {}, #{}", reg, h as u64)); - } else { - self.state.emit_fmt(format_args!(" movz {}, #{}, lsl #{}", reg, h as u64, shift)); - } - first = false; - } else if shift == 0 { - self.state.emit_fmt(format_args!(" movk {}, #{}", reg, h as u64)); - } else { - 
self.state.emit_fmt(format_args!(" movk {}, #{}, lsl #{}", reg, h as u64, shift)); - } - } - } - } - } - - /// Emit function prologue: allocate stack and save fp/lr. - pub(super) fn emit_prologue_arm(&mut self, frame_size: i64) { - const PAGE_SIZE: i64 = 4096; - if frame_size > 0 && frame_size <= 504 { - self.state.emit_fmt(format_args!(" stp x29, x30, [sp, #-{}]!", frame_size)); - if self.state.emit_cfi { - self.state.emit_fmt(format_args!(" .cfi_def_cfa_offset {}", frame_size)); - self.state.emit_fmt(format_args!(" .cfi_offset x29, -{}", frame_size)); - self.state.emit_fmt(format_args!(" .cfi_offset x30, -{}", frame_size - 8)); - } - } else if frame_size > PAGE_SIZE { - // Stack probing: for large frames, touch each page so the kernel - // can grow the stack mapping. Without this, a single large sub - // can skip guard pages and cause a segfault. - let probe_label = self.state.fresh_label("stack_probe"); - self.emit_load_imm64("x17", frame_size); - self.state.emit_fmt(format_args!("{}:", probe_label)); - self.state.emit_fmt(format_args!(" sub sp, sp, #{}", PAGE_SIZE)); - self.state.emit(" str xzr, [sp]"); - self.state.emit_fmt(format_args!(" sub x17, x17, #{}", PAGE_SIZE)); - self.state.emit_fmt(format_args!(" cmp x17, #{}", PAGE_SIZE)); - self.state.emit_fmt(format_args!(" b.hi {}", probe_label)); - self.state.emit(" sub sp, sp, x17"); - self.state.emit(" str xzr, [sp]"); - self.state.emit(" stp x29, x30, [sp]"); - if self.state.emit_cfi { - self.state.emit_fmt(format_args!(" .cfi_def_cfa_offset {}", frame_size)); - self.state.emit_fmt(format_args!(" .cfi_offset x29, -{}", frame_size)); - self.state.emit_fmt(format_args!(" .cfi_offset x30, -{}", frame_size - 8)); - } - } else { - self.emit_sub_sp(frame_size); - self.state.emit(" stp x29, x30, [sp]"); - if self.state.emit_cfi { - self.state.emit_fmt(format_args!(" .cfi_def_cfa_offset {}", frame_size)); - self.state.emit_fmt(format_args!(" .cfi_offset x29, -{}", frame_size)); - self.state.emit_fmt(format_args!(" 
.cfi_offset x30, -{}", frame_size - 8)); - } - } - self.state.emit(" mov x29, sp"); - if self.state.emit_cfi { - self.state.emit(" .cfi_def_cfa_register x29"); - } - } - - /// Emit function epilogue: restore fp/lr and deallocate stack. - pub(super) fn emit_epilogue_arm(&mut self, frame_size: i64) { - if self.state.has_dyn_alloca { - // DynAlloca modified SP at runtime; restore from frame pointer. - self.state.emit(" mov sp, x29"); - } - if frame_size > 0 && frame_size <= 504 { - self.state.emit_fmt(format_args!(" ldp x29, x30, [sp], #{}", frame_size)); - } else { - self.state.emit(" ldp x29, x30, [sp]"); - self.emit_add_sp(frame_size); - } - } - - /// Load an operand into x0. - pub(super) fn operand_to_x0(&mut self, op: &Operand) { - match op { - Operand::Const(c) => { - self.state.reg_cache.invalidate_acc(); - match c { - IrConst::I8(v) => self.state.emit_fmt(format_args!(" mov x0, #{}", v)), - IrConst::I16(v) => self.state.emit_fmt(format_args!(" mov x0, #{}", v)), - IrConst::I32(v) => { - if *v >= -65536 && *v <= 65535 { - self.state.emit_fmt(format_args!(" mov x0, #{}", v)); - } else { - // Sign-extend to 64-bit before loading into x0. - // Using the i64 path ensures negative I32 values get - // proper sign extension (upper 32 bits = 0xFFFFFFFF). 
- self.emit_load_imm64("x0", *v as i64); - } - } - IrConst::I64(v) => { - if *v >= -65536 && *v <= 65535 { - self.state.emit_fmt(format_args!(" mov x0, #{}", v)); - } else { - self.emit_load_imm64("x0", *v); - } - } - IrConst::F32(v) => self.emit_load_imm64("x0", v.to_bits() as i64), - IrConst::F64(v) => self.emit_load_imm64("x0", v.to_bits() as i64), - IrConst::LongDouble(v, _) => self.emit_load_imm64("x0", v.to_bits() as i64), - IrConst::I128(v) => self.emit_load_imm64("x0", *v as i64), // truncate to 64-bit - IrConst::Zero => self.state.emit(" mov x0, #0"), - } - } - Operand::Value(v) => { - let is_alloca = self.state.is_alloca(v.0); - if self.state.reg_cache.acc_has(v.0, is_alloca) { - return; // Cache hit — x0 already holds this value. - } - // Check for callee-saved register assignment. - if let Some(®) = self.reg_assignments.get(&v.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - self.state.reg_cache.set_acc(v.0, false); - return; - } - if let Some(slot) = self.state.get_slot(v.0) { - if is_alloca { - self.emit_alloca_addr("x0", v.0, slot.0); - } else { - self.emit_load_from_sp("x0", slot.0, "ldr"); - } - self.state.reg_cache.set_acc(v.0, is_alloca); - } else if self.state.reg_cache.acc_has(v.0, false) || self.state.reg_cache.acc_has(v.0, true) { - // Value has no slot or register but is in the accumulator cache - // (skip-slot optimization: immediately-consumed values stay in x0). - } else { - self.state.emit(" mov x0, #0"); - self.state.reg_cache.invalidate_acc(); - } - } - } - } - - /// Store x0 to a value's destination (register or stack slot). - pub(super) fn store_x0_to(&mut self, dest: &Value) { - if let Some(®) = self.reg_assignments.get(&dest.0) { - // Value has a callee-saved register: store only to register, skip stack. 
- let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov {}, x0", reg_name)); - } else if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("x0", slot.0, "str"); - } - self.state.reg_cache.set_acc(dest.0, false); - } - - // --- 128-bit integer helpers --- - // Convention: 128-bit values use x0 (low 64 bits) and x1 (high 64 bits). - // Stack slots for 128-bit values are 16 bytes: slot(sp) = low, slot+8(sp) = high. - - /// Load a 128-bit operand into x0 (low) : x1 (high). - pub(super) fn operand_to_x0_x1(&mut self, op: &Operand) { - match op { - Operand::Const(c) => { - match c { - IrConst::I128(v) => { - let low = *v as u64; - let high = (*v >> 64) as u64; - self.emit_load_imm64("x0", low as i64); - self.emit_load_imm64("x1", high as i64); - } - IrConst::Zero => { - self.state.emit(" mov x0, #0"); - self.state.emit(" mov x1, #0"); - } - _ => { - // Other consts: load into x0, zero-extend high half - self.operand_to_x0(op); - self.state.emit(" mov x1, #0"); - } - } - } - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - if self.state.is_alloca(v.0) { - // Alloca: address, not a 128-bit value itself - self.emit_alloca_addr("x0", v.0, slot.0); - self.state.emit(" mov x1, #0"); - } else if self.state.is_i128_value(v.0) { - // 128-bit value in 16-byte stack slot - self.emit_load_from_sp("x0", slot.0, "ldr"); - self.emit_load_from_sp("x1", slot.0 + 8, "ldr"); - } else { - // Non-i128 value (e.g. shift amount): load 8 bytes, zero high - // Check register allocation first, since register-allocated values - // may not have their stack slot written. 
- if let Some(®) = self.reg_assignments.get(&v.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else { - self.emit_load_from_sp("x0", slot.0, "ldr"); - } - self.state.emit(" mov x1, #0"); - } - } else { - // No stack slot: check register allocation - if let Some(®) = self.reg_assignments.get(&v.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - self.state.emit(" mov x1, #0"); - } else { - self.state.emit(" mov x0, #0"); - self.state.emit(" mov x1, #0"); - } - } - } - } - } - - /// Store x0 (low) : x1 (high) to a 128-bit value's stack slot. - pub(super) fn store_x0_x1_to(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("x0", slot.0, "str"); - self.emit_store_to_sp("x1", slot.0 + 8, "str"); - } - } - - /// Prepare a 128-bit binary operation: load lhs into x2:x3, rhs into x4:x5. - /// (Uses x0:x1 as temporaries during loading.) - pub(super) fn prep_i128_binop(&mut self, lhs: &Operand, rhs: &Operand) { - self.operand_to_x0_x1(lhs); - self.state.emit(" mov x2, x0"); - self.state.emit(" mov x3, x1"); - self.operand_to_x0_x1(rhs); - self.state.emit(" mov x4, x0"); - self.state.emit(" mov x5, x1"); - } - - // emit_i128_binop and emit_i128_cmp use the shared default implementations - // via ArchCodegen trait defaults, with per-op primitives defined in the trait impl above. - - pub(super) fn str_for_type(ty: IrType) -> &'static str { - match ty { - IrType::I8 | IrType::U8 => "strb", - IrType::I16 | IrType::U16 => "strh", - IrType::I32 | IrType::U32 | IrType::F32 => "str", // 32-bit store with w register - _ => "str", // 64-bit store with x register - } - } - - /// Get the appropriate register name for a given base and type. 
- pub(super) fn reg_for_type(base: &str, ty: IrType) -> &'static str { - let use_w = matches!(ty, - IrType::I8 | IrType::U8 | IrType::I16 | IrType::U16 | - IrType::I32 | IrType::U32 | IrType::F32 - ); - match base { - "x0" => if use_w { "w0" } else { "x0" }, - "x1" => if use_w { "w1" } else { "x1" }, - "x2" => if use_w { "w2" } else { "x2" }, - "x3" => if use_w { "w3" } else { "x3" }, - "x4" => if use_w { "w4" } else { "x4" }, - "x5" => if use_w { "w5" } else { "x5" }, - "x6" => if use_w { "w6" } else { "x6" }, - "x7" => if use_w { "w7" } else { "x7" }, - "x8" => if use_w { "w8" } else { "x8" }, - _ => "x0", - } - } - - /// Parse a load instruction token into the actual ARM instruction and destination register. - /// ARM's "ldr" instruction is width-polymorphic (the register determines access width), - /// so load_instr_for_type returns "ldr32"/"ldr64" tokens to distinguish 32-bit from 64-bit. - pub(super) fn arm_parse_load(instr: &'static str) -> (&'static str, &'static str) { - match instr { - "ldr32" => ("ldr", "w0"), - "ldr64" => ("ldr", "x0"), - "ldrb" | "ldrh" => (instr, "w0"), - // ldrsb, ldrsh, ldrsw all sign-extend into x0 - _ => (instr, "x0"), - } - } - - /// Like arm_parse_load but returns the w/x variant of the given register number - /// instead of hardcoded x0/w0. Used when x0 must not be clobbered. - pub(super) fn arm_parse_load_to_reg(instr: &'static str, xreg: &'static str, wreg: &'static str) -> (&'static str, &'static str) { - match instr { - "ldr32" => ("ldr", wreg), - "ldr64" => ("ldr", xreg), - "ldrb" | "ldrh" => (instr, wreg), - // ldrsb, ldrsh, ldrsw all sign-extend into the x-width register - _ => (instr, xreg), - } - } - - // --- Intrinsic helpers (NEON) --- - - /// Load the address represented by a pointer Value into the given register. - /// For alloca values, computes the address; for others, loads the stored pointer. 
- pub(super) fn load_ptr_to_reg(&mut self, ptr: &Value, reg: &str) { - if let Some(slot) = self.state.get_slot(ptr.0) { - if self.state.is_alloca(ptr.0) { - self.emit_alloca_addr(reg, ptr.0, slot.0); - } else { - self.emit_load_from_sp(reg, slot.0, "ldr"); - } - } - } - - // ── Call register arg helpers ─────────────────────────────────────────── - - /// Load an operand into the given destination register, accounting for SP adjustment. - /// When `needs_adjusted_load` is true, values must be loaded from adjusted stack offsets - /// or callee-saved registers (since SP has been modified for stack args). - pub(super) fn emit_load_arg_to_reg(&mut self, arg: &Operand, dest: &str, slot_adjust: i64, extra_sp_adj: i64, needs_adjusted_load: bool) { - if needs_adjusted_load || extra_sp_adj > 0 { - match arg { - Operand::Value(v) => { - if let Some(®) = self.reg_assignments.get(&v.0) { - self.state.emit_fmt(format_args!(" mov {}, {}", dest, callee_saved_name(reg))); - } else if let Some(slot) = self.state.get_slot(v.0) { - let adjusted = slot.0 + slot_adjust + extra_sp_adj; - if self.state.is_alloca(v.0) { - self.emit_alloca_addr(dest, v.0, adjusted); - } else { - self.emit_load_from_sp(dest, adjusted, "ldr"); - } - } else { - self.state.emit_fmt(format_args!(" mov {}, #0", dest)); - } - } - Operand::Const(_) => { - self.operand_to_x0(arg); - if dest != "x0" { - self.state.emit_fmt(format_args!(" mov {}, x0", dest)); - } - } - } - } else { - // For Value operands, load directly into dest to avoid clobbering x0. - // The operand_to_x0 path unconditionally uses x0 as scratch, which - // can destroy previously-loaded argument registers (e.g., when struct - // arguments are reordered in a call like check(y, x)). 
- match arg { - Operand::Value(v) => { - if let Some(®) = self.reg_assignments.get(&v.0) { - self.state.emit_fmt(format_args!(" mov {}, {}", dest, callee_saved_name(reg))); - } else if let Some(slot) = self.state.get_slot(v.0) { - if self.state.is_alloca(v.0) { - self.emit_alloca_addr(dest, v.0, slot.0); - } else { - self.emit_load_from_sp(dest, slot.0, "ldr"); - } - } else { - self.state.emit_fmt(format_args!(" mov {}, #0", dest)); - } - } - Operand::Const(_) => { - self.operand_to_x0(arg); - if dest != "x0" { - self.state.emit_fmt(format_args!(" mov {}, x0", dest)); - } - } - } - } - } - - /// Phase 2a: Load GP integer register args into temp registers (x9-x16). - pub(super) fn emit_call_gp_to_temps(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - slot_adjust: i64, needs_adjusted_load: bool) { - let mut gp_tmp_idx = 0usize; - for (i, arg) in args.iter().enumerate() { - if !matches!(arg_classes[i], CallArgClass::IntReg { .. }) { continue; } - if gp_tmp_idx >= 8 { break; } - self.emit_load_arg_to_reg(arg, "x0", slot_adjust, 0, needs_adjusted_load); - self.state.emit_fmt(format_args!(" mov {}, x0", ARM_TMP_REGS[gp_tmp_idx])); - gp_tmp_idx += 1; - } - } - - /// Phase 2b: Load FP register args, handling F128 via temp stack + __extenddftf2. - pub(super) fn emit_call_fp_reg_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - arg_types: &[IrType], slot_adjust: i64, needs_adjusted_load: bool) { - let fp_reg_assignments: Vec<(usize, usize)> = args.iter().enumerate() - .filter(|(i, _)| matches!(arg_classes[*i], CallArgClass::FloatReg { .. } | CallArgClass::F128Reg { .. })) - .map(|(i, _)| { - let reg_idx = match arg_classes[i] { - CallArgClass::FloatReg { reg_idx } | CallArgClass::F128Reg { reg_idx } => reg_idx, - _ => 0, - }; - (i, reg_idx) - }) - .collect(); - - let f128_var_count: usize = fp_reg_assignments.iter() - .filter(|&&(arg_i, _)| matches!(arg_classes[arg_i], CallArgClass::F128Reg { .. 
}) && matches!(&args[arg_i], Operand::Value(_))) - .count(); - let f128_temp_space_aligned = (f128_var_count * 16 + 15) & !15; - if f128_temp_space_aligned > 0 { - self.emit_sub_sp(f128_temp_space_aligned as i64); - } - - let extra_sp_adj = f128_temp_space_aligned as i64; - let f128_temp_slots = self.emit_call_f128_var_args( - args, arg_classes, &fp_reg_assignments, slot_adjust, extra_sp_adj, needs_adjusted_load, - ); - - self.emit_call_f128_const_args(args, arg_classes, &fp_reg_assignments); - - for &(reg_i, temp_off) in &f128_temp_slots { - self.state.emit_fmt(format_args!(" ldr q{}, [sp, #{}]", reg_i, temp_off)); - } - if f128_temp_space_aligned > 0 { - self.emit_add_sp(f128_temp_space_aligned as i64); - } - - for &(arg_i, reg_i) in &fp_reg_assignments { - if matches!(arg_classes[arg_i], CallArgClass::F128Reg { .. }) { continue; } - let arg_ty = if arg_i < arg_types.len() { Some(arg_types[arg_i]) } else { None }; - self.emit_load_arg_to_reg(&args[arg_i], "x0", slot_adjust, 0, needs_adjusted_load); - if arg_ty == Some(IrType::F32) { - self.state.emit_fmt(format_args!(" fmov s{}, w0", reg_i)); - } else { - self.state.emit_fmt(format_args!(" fmov d{}, x0", reg_i)); - } - } - } - - /// Convert F128 variable args to full-precision f128, saving to temp stack. - pub(super) fn emit_call_f128_var_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - fp_reg_assignments: &[(usize, usize)], - slot_adjust: i64, extra_sp_adj: i64, - needs_adjusted_load: bool) -> Vec<(usize, usize)> { - let mut f128_temp_idx = 0usize; - let mut f128_temp_slots: Vec<(usize, usize)> = Vec::new(); - for &(arg_i, reg_i) in fp_reg_assignments { - if !matches!(arg_classes[arg_i], CallArgClass::F128Reg { .. 
}) { continue; } - if let Operand::Value(v) = &args[arg_i] { - let temp_off = f128_temp_idx * 16; - let loaded_full = self.try_load_f128_full_precision(v.0, slot_adjust + extra_sp_adj, temp_off); - - if !loaded_full { - self.emit_load_arg_to_reg(&args[arg_i], "x0", slot_adjust, extra_sp_adj, - needs_adjusted_load || extra_sp_adj > 0); - self.state.emit(" fmov d0, x0"); - self.state.emit(" stp x9, x10, [sp, #-16]!"); - self.state.emit(" bl __extenddftf2"); - self.state.emit(" ldp x9, x10, [sp], #16"); - self.state.emit_fmt(format_args!(" str q0, [sp, #{}]", temp_off)); - } - - f128_temp_slots.push((reg_i, temp_off)); - f128_temp_idx += 1; - } - } - f128_temp_slots - } - - /// Try to load a full-precision f128 value via f128 tracking. Returns true if successful. - pub(super) fn try_load_f128_full_precision(&mut self, value_id: u32, adjusted_slot_base: i64, temp_off: usize) -> bool { - if let Some((src_id, offset, is_indirect)) = self.state.get_f128_source(value_id) { - if !is_indirect { - if let Some(src_slot) = self.state.get_slot(src_id) { - let adj = src_slot.0 + offset + adjusted_slot_base; - self.emit_load_from_sp("q0", adj, "ldr"); - self.state.emit_fmt(format_args!(" str q0, [sp, #{}]", temp_off)); - return true; - } - } else if let Some(src_slot) = self.state.get_slot(src_id) { - let adj = src_slot.0 + adjusted_slot_base; - self.emit_load_from_sp("x17", adj, "ldr"); - if offset != 0 { - if offset > 0 && offset <= 4095 { - self.state.emit_fmt(format_args!(" add x17, x17, #{}", offset)); - } else { - self.load_large_imm("x16", offset); - self.state.emit(" add x17, x17, x16"); - } - } - self.state.emit(" ldr q0, [x17]"); - self.state.emit_fmt(format_args!(" str q0, [sp, #{}]", temp_off)); - return true; - } - } - false - } - - /// Load F128 constants directly into target Q registers using full f128 bytes. 
- pub(super) fn emit_call_f128_const_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - fp_reg_assignments: &[(usize, usize)]) { - for &(arg_i, reg_i) in fp_reg_assignments { - if !matches!(arg_classes[arg_i], CallArgClass::F128Reg { .. }) { continue; } - if let Operand::Const(c) = &args[arg_i] { - let bytes = match c { - IrConst::LongDouble(_, f128_bytes) => *f128_bytes, - _ => { - let f64_val = c.to_f64().unwrap_or(0.0); - crate::ir::reexports::f64_to_f128_bytes(f64_val) - } - }; - let lo = u64::from_le_bytes(bytes[0..8].try_into().unwrap()); - let hi = u64::from_le_bytes(bytes[8..16].try_into().unwrap()); - self.emit_load_imm64("x0", lo as i64); - self.emit_load_imm64("x1", hi as i64); - self.state.emit(" stp x0, x1, [sp, #-16]!"); - self.state.emit_fmt(format_args!(" ldr q{}, [sp]", reg_i)); - self.state.emit(" add sp, sp, #16"); - } - } - } - - /// Phase 3: Move GP int args from temp regs to actual arg registers. - pub(super) fn emit_call_move_temps_to_arg_regs(&mut self, args: &[Operand], arg_classes: &[CallArgClass]) { - let mut int_reg_idx = 0usize; - let mut gp_tmp_idx = 0usize; - for (i, _) in args.iter().enumerate() { - match arg_classes[i] { - CallArgClass::I128RegPair { .. } => { - if !int_reg_idx.is_multiple_of(2) { int_reg_idx += 1; } - int_reg_idx += 2; - } - CallArgClass::StructByValReg { size, .. } => { - int_reg_idx += if size <= 8 { 1 } else { 2 }; - } - CallArgClass::IntReg { .. } => { - if gp_tmp_idx < 8 && int_reg_idx < 8 { - self.state.emit_fmt(format_args!(" mov {}, {}", ARM_ARG_REGS[int_reg_idx], ARM_TMP_REGS[gp_tmp_idx])); - int_reg_idx += 1; - } - gp_tmp_idx += 1; - } - _ => {} - } - } - } - - /// Phase 3b: Load i128 register pair args into paired arg registers. 
- pub(super) fn emit_call_i128_reg_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - slot_adjust: i64, needs_adjusted_load: bool) { - for (i, arg) in args.iter().enumerate() { - if let CallArgClass::I128RegPair { base_reg_idx } = arg_classes[i] { - match arg { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let adj = if needs_adjusted_load { slot.0 + slot_adjust } else { slot.0 }; - if self.state.is_alloca(v.0) { - self.emit_alloca_addr(ARM_ARG_REGS[base_reg_idx], v.0, adj); - self.state.emit_fmt(format_args!(" mov {}, #0", ARM_ARG_REGS[base_reg_idx + 1])); - } else { - self.emit_load_from_sp(ARM_ARG_REGS[base_reg_idx], adj, "ldr"); - self.emit_load_from_sp(ARM_ARG_REGS[base_reg_idx + 1], adj + 8, "ldr"); - } - } - } - Operand::Const(c) => { - if let IrConst::I128(v) = c { - self.emit_load_imm64(ARM_ARG_REGS[base_reg_idx], *v as u64 as i64); - self.emit_load_imm64(ARM_ARG_REGS[base_reg_idx + 1], (*v >> 64) as u64 as i64); - } else { - self.operand_to_x0(arg); - if base_reg_idx != 0 { - self.state.emit_fmt(format_args!(" mov {}, x0", ARM_ARG_REGS[base_reg_idx])); - } - self.state.emit_fmt(format_args!(" mov {}, #0", ARM_ARG_REGS[base_reg_idx + 1])); - } - } - } - } - } - } - - /// Phase 3c: Load struct-by-value register args. Loads pointer into x17, - /// then reads struct data from [x17] into arg regs. 
- pub(super) fn emit_call_struct_byval_reg_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], - slot_adjust: i64, needs_adjusted_load: bool) { - for (i, arg) in args.iter().enumerate() { - if let CallArgClass::StructByValReg { base_reg_idx, size } = arg_classes[i] { - let regs_needed = if size <= 8 { 1 } else { 2 }; - self.emit_load_arg_to_reg(arg, "x17", slot_adjust, 0, needs_adjusted_load); - self.state.emit_fmt(format_args!(" ldr {}, [x17]", ARM_ARG_REGS[base_reg_idx])); - if regs_needed > 1 { - self.state.emit_fmt(format_args!(" ldr {}, [x17, #8]", ARM_ARG_REGS[base_reg_idx + 1])); - } - } - } - } - - /// Resolve param alloca to (slot, type) for parameter `i`. - fn resolve_param_slot(&self, func: &IrFunction, i: usize) -> Option<(StackSlot, IrType, Value)> { - let (dest, ty) = find_param_alloca(func, i)?; - let slot = self.state.get_slot(dest.0)?; - Some((slot, ty, dest)) - } - - /// Save variadic function registers to save areas. - pub(super) fn emit_save_variadic_regs(&mut self) { - let gp_base = self.va_gp_save_offset; - for i in (0..8).step_by(2) { - let offset = gp_base + (i as i64) * 8; - self.emit_stp_to_sp(&format!("x{}", i), &format!("x{}", i + 1), offset); - } - if !self.general_regs_only { - let fp_base = self.va_fp_save_offset; - for i in (0..8).step_by(2) { - let offset = fp_base + (i as i64) * 16; - self.emit_stp_to_sp(&format!("q{}", i), &format!("q{}", i + 1), offset); - } - } - } - - /// Phase 1: Store GP register params to alloca slots. - pub(super) fn emit_store_gp_params(&mut self, func: &IrFunction, param_classes: &[ParamClass]) { - // AArch64 ABI: when a function uses sret, the hidden pointer comes in x8 - // (not x0). All other GP register params shift down by one so that the - // first real argument is in x0 instead of x1. 
- let sret_shift = if self.state.uses_sret { 1usize } else { 0 }; - - for (i, _) in func.params.iter().enumerate() { - let class = param_classes[i]; - if !class.uses_gp_reg() { continue; } - - let (slot, ty, _) = match self.resolve_param_slot(func, i) { - Some(v) => v, - None => continue, - }; - - match class { - ParamClass::IntReg { reg_idx } => { - if sret_shift > 0 && reg_idx == 0 && i == 0 { - // sret pointer: comes in x8 on AArch64 - self.emit_store_to_sp("x8", slot.0, "str"); - } else { - let actual_idx = if reg_idx >= sret_shift { reg_idx - sret_shift } else { reg_idx }; - let store_instr = Self::str_for_type(ty); - let reg = Self::reg_for_type(ARM_ARG_REGS[actual_idx], ty); - self.emit_store_to_sp(reg, slot.0, store_instr); - } - } - ParamClass::I128RegPair { base_reg_idx } => { - let actual_idx = if base_reg_idx >= sret_shift { base_reg_idx - sret_shift } else { base_reg_idx }; - self.emit_store_to_sp(ARM_ARG_REGS[actual_idx], slot.0, "str"); - self.emit_store_to_sp(ARM_ARG_REGS[actual_idx + 1], slot.0 + 8, "str"); - } - ParamClass::StructByValReg { base_reg_idx, size } => { - let actual_idx = if base_reg_idx >= sret_shift { base_reg_idx - sret_shift } else { base_reg_idx }; - self.emit_store_to_sp(ARM_ARG_REGS[actual_idx], slot.0, "str"); - if size > 8 { - self.emit_store_to_sp(ARM_ARG_REGS[actual_idx + 1], slot.0 + 8, "str"); - } - } - ParamClass::LargeStructByRefReg { reg_idx, size } => { - let actual_idx = if reg_idx >= sret_shift { reg_idx - sret_shift } else { reg_idx }; - let src_reg = ARM_ARG_REGS[actual_idx]; - let n_dwords = size.div_ceil(8); - for qi in 0..n_dwords { - let src_off = (qi * 8) as i64; - self.emit_load_from_reg("x9", src_reg, src_off, "ldr"); - self.emit_store_to_sp("x9", slot.0 + src_off, "str"); - } - } - _ => {} - } - } - } - - /// Phase 2: Store FP register params to alloca slots. 
- pub(super) fn emit_store_fp_params(&mut self, func: &IrFunction, param_classes: &[ParamClass]) { - let has_f128_fp_params = param_classes.iter().enumerate().any(|(i, c)| { - matches!(c, ParamClass::F128FpReg { .. }) && - find_param_alloca(func, i).is_some() - }); - - if has_f128_fp_params { - self.emit_store_fp_params_with_f128(func, param_classes); - } else { - self.emit_store_fp_params_simple(func, param_classes); - } - } - - /// Store FP params when F128 params are present (save/restore q0-q7). - fn emit_store_fp_params_with_f128(&mut self, func: &IrFunction, param_classes: &[ParamClass]) { - self.emit_sub_sp(128); - for i in 0..8usize { - self.state.emit_fmt(format_args!(" str q{}, [sp, #{}]", i, i * 16)); - } - - // Process non-F128 float params first (from saved Q area). - for (i, _) in func.params.iter().enumerate() { - let reg_idx = match param_classes[i] { - ParamClass::FloatReg { reg_idx } => reg_idx, - _ => continue, - }; - let (slot, ty, _) = match self.resolve_param_slot(func, i) { - Some(v) => v, - None => continue, - }; - let fp_reg_off = (reg_idx * 16) as i64; - if ty == IrType::F32 { - self.state.emit_fmt(format_args!(" ldr s0, [sp, #{}]", fp_reg_off)); - self.state.emit(" fmov w9, s0"); - } else { - self.state.emit_fmt(format_args!(" ldr d0, [sp, #{}]", fp_reg_off)); - self.state.emit(" fmov x9, d0"); - } - self.emit_store_to_sp("x9", slot.0 + 128, "str"); - } - - // Process F128 FP reg params: store full 16-byte f128, then f64 approx. 
- for (i, _) in func.params.iter().enumerate() { - let reg_idx = match param_classes[i] { - ParamClass::F128FpReg { reg_idx } => reg_idx, - _ => continue, - }; - let (slot, _, dest_val) = match self.resolve_param_slot(func, i) { - Some(v) => v, - None => continue, - }; - let fp_reg_off = (reg_idx * 16) as i64; - self.state.emit_fmt(format_args!(" ldr q0, [sp, #{}]", fp_reg_off)); - self.emit_store_to_sp("q0", slot.0 + 128, "str"); - self.state.track_f128_self(dest_val.0); - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - } - - self.emit_add_sp(128); - } - - /// Store FP params when no F128 params are present (simple path). - fn emit_store_fp_params_simple(&mut self, func: &IrFunction, param_classes: &[ParamClass]) { - // Use x9/w9 as scratch instead of x0/w0 to avoid clobbering GP argument - // registers (x0-x7) that may not have been spilled yet (e.g. when mem2reg - // promoted their allocas and emit_param_ref will read them later). - for (i, _) in func.params.iter().enumerate() { - let reg_idx = match param_classes[i] { - ParamClass::FloatReg { reg_idx } => reg_idx, - _ => continue, - }; - let (slot, ty, _) = match self.resolve_param_slot(func, i) { - Some(v) => v, - None => continue, - }; - if ty == IrType::F32 { - self.state.emit_fmt(format_args!(" fmov w9, s{}", reg_idx)); - } else { - self.state.emit_fmt(format_args!(" fmov x9, d{}", reg_idx)); - } - self.emit_store_to_sp("x9", slot.0, "str"); - } - } - - /// Phase 3: Store stack-passed params to alloca slots. - /// Uses x9/w9 as scratch instead of x0/w0 to avoid clobbering GP argument - /// registers (x0-x7) that may not have been spilled yet (e.g. when mem2reg - /// promoted their allocas and emit_param_ref will read them later). 
- pub(super) fn emit_store_stack_params(&mut self, func: &IrFunction, param_classes: &[ParamClass]) { - let frame_size = self.current_frame_size; - for (i, _) in func.params.iter().enumerate() { - let class = param_classes[i]; - if !class.is_stack() { continue; } - - let (slot, ty, dest_val) = match self.resolve_param_slot(func, i) { - Some(v) => v, - None => continue, - }; - - match class { - ParamClass::StructStack { offset, size } | ParamClass::LargeStructStack { offset, size } => { - let caller_offset = frame_size + offset; - for qi in 0..size.div_ceil(8) { - let off = qi as i64 * 8; - self.emit_load_from_sp("x9", caller_offset + off, "ldr"); - self.emit_store_to_sp("x9", slot.0 + off, "str"); - } - } - ParamClass::F128Stack { offset } => { - let caller_offset = frame_size + offset; - self.emit_load_from_sp("x9", caller_offset, "ldr"); - self.emit_store_to_sp("x9", slot.0, "str"); - self.emit_load_from_sp("x9", caller_offset + 8, "ldr"); - self.emit_store_to_sp("x9", slot.0 + 8, "str"); - self.state.track_f128_self(dest_val.0); - } - ParamClass::I128Stack { offset } => { - let caller_offset = frame_size + offset; - self.emit_load_from_sp("x9", caller_offset, "ldr"); - self.emit_store_to_sp("x9", slot.0, "str"); - self.emit_load_from_sp("x9", caller_offset + 8, "ldr"); - self.emit_store_to_sp("x9", slot.0 + 8, "str"); - } - ParamClass::StackScalar { offset } => { - let caller_offset = frame_size + offset; - // Load from caller stack with extending load, then store - // full 64 bits so the slot is valid for any later ldr. - // Use x9/w9 to avoid clobbering GP argument registers. 
- let load_instr = self.load_instr_for_type_impl(ty); - let (arm_load, dest_reg) = Self::arm_parse_load_to_reg(load_instr, "x9", "w9"); - self.emit_load_from_sp(dest_reg, caller_offset, arm_load); - self.emit_store_to_sp("x9", slot.0, "str"); - } - ParamClass::LargeStructByRefStack { offset, size } => { - let caller_offset = frame_size + offset; - self.emit_load_from_sp("x9", caller_offset, "ldr"); - for qi in 0..size.div_ceil(8) { - let off = (qi * 8) as i64; - self.emit_load_from_reg("x10", "x9", off, "ldr"); - self.emit_store_to_sp("x10", slot.0 + off, "str"); - } - } - _ => {} - } - } - } -} - -pub(super) const ARM_ARG_REGS: [&str; 8] = ["x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7"]; -const ARM_TMP_REGS: [&str; 8] = ["x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16"]; - -impl ArchCodegen for ArmCodegen { - fn state(&mut self) -> &mut CodegenState { &mut self.state } - fn state_ref(&self) -> &CodegenState { &self.state } - - fn get_phys_reg_for_value(&self, val_id: u32) -> Option { - self.reg_assignments.get(&val_id).copied() - } - - fn emit_reg_to_reg_move(&mut self, src: PhysReg, dest: PhysReg) { - let s_name = callee_saved_name(src); - let d_name = callee_saved_name(dest); - self.state.emit_fmt(format_args!(" mov {}, {}", d_name, s_name)); - } - - fn emit_acc_to_phys_reg(&mut self, dest: PhysReg) { - let d_name = callee_saved_name(dest); - self.state.emit_fmt(format_args!(" mov {}, x0", d_name)); - } - - fn jump_mnemonic(&self) -> &'static str { "b" } - fn trap_instruction(&self) -> &'static str { "brk #0" } - - fn emit_branch_nonzero(&mut self, label: &str) { - let skip = self.state.fresh_label("skip"); - self.state.emit_fmt(format_args!(" cbz x0, {}", skip)); - self.state.emit_fmt(format_args!(" b {}", label)); - self.state.emit_fmt(format_args!("{}:", skip)); - } - - fn emit_jump_indirect(&mut self) { - self.state.emit(" br x0"); - } - - fn emit_switch_case_branch(&mut self, case_val: i64, label: &str, ty: IrType) { - let use_32bit = matches!(ty, 
IrType::I32 | IrType::U32 | IrType::I16 | IrType::U16 | IrType::I8 | IrType::U8); - if use_32bit { - self.emit_load_imm64("w1", case_val as i32 as i64); - self.state.emit(" cmp w0, w1"); - } else { - self.emit_load_imm64("x1", case_val); - self.state.emit(" cmp x0, x1"); - } - let skip = self.state.fresh_label("skip"); - self.state.emit_fmt(format_args!(" b.ne {}", skip)); - self.state.emit_fmt(format_args!(" b {}", label)); - self.state.emit_fmt(format_args!("{}:", skip)); - } - - fn emit_switch_jump_table(&mut self, val: &Operand, cases: &[(i64, BlockId)], default: &BlockId, _ty: IrType) { - use crate::backend::traits::build_jump_table; - let (table, min_val, range) = build_jump_table(cases, default); - let table_label = self.state.fresh_label("jt"); - let default_label = default.as_label(); - self.operand_to_x0(val); - if min_val != 0 { - if min_val > 0 && min_val <= 4095 { - self.state.emit_fmt(format_args!(" sub x0, x0, #{}", min_val)); - } else if min_val < 0 && (-min_val) <= 4095 { - self.state.emit_fmt(format_args!(" add x0, x0, #{}", -min_val)); - } else { - self.load_large_imm("x17", min_val); - self.state.emit(" sub x0, x0, x17"); - } - } - if range <= 4095 { - self.state.emit_fmt(format_args!(" cmp x0, #{}", range)); - } else { - self.load_large_imm("x17", range as i64); - self.state.emit(" cmp x0, x17"); - } - let range_skip = self.state.fresh_label("range_ok"); - self.state.emit_fmt(format_args!(" b.lo {}", range_skip)); - self.state.emit_fmt(format_args!(" b {}", default_label)); - self.state.emit_fmt(format_args!("{}:", range_skip)); - self.state.emit_fmt(format_args!(" adrp x17, {}", table_label)); - self.state.emit_fmt(format_args!(" add x17, x17, :lo12:{}", table_label)); - self.state.emit(" ldr w16, [x17, x0, lsl #2]"); - self.state.emit(" add x17, x17, w16, sxtw"); - self.state.emit(" br x17"); - self.state.emit(".section .rodata"); - self.state.emit(".align 2"); - self.state.emit_fmt(format_args!("{}:", table_label)); - for target in &table { 
- let target_label = target.as_label(); - self.state.emit_fmt(format_args!(" .word {} - {}", target_label, table_label)); - } - let sect = self.state.current_text_section.clone(); - self.state.emit_fmt(format_args!(".section {},\"ax\",@progbits", sect)); - self.state.reg_cache.invalidate_all(); - } - - fn ptr_directive(&self) -> PtrDirective { PtrDirective::Xword } - fn function_type_directive(&self) -> &'static str { "%function" } - - // ---- Standard trait methods (kept inline - arch-specific) ---- - fn emit_load_operand(&mut self, op: &Operand) { self.operand_to_x0(op); } - fn emit_store_result(&mut self, dest: &Value) { self.store_x0_to(dest); } - fn emit_save_acc(&mut self) { self.state.emit(" mov x1, x0"); } - fn emit_add_secondary_to_acc(&mut self) { self.state.emit(" add x0, x1, x0"); } - fn emit_gep_add_const_to_acc(&mut self, offset: i64) { if offset != 0 { self.emit_add_imm_to_acc_impl(offset); } } - fn emit_acc_to_secondary(&mut self) { self.state.emit(" mov x1, x0"); } - fn emit_memcpy_store_dest_from_acc(&mut self) { } - fn emit_memcpy_store_src_from_acc(&mut self) { self.state.emit(" mov x10, x9"); } - fn emit_call_spill_fptr(&mut self, func_ptr: &Operand) { - self.operand_to_x0(func_ptr); - self.state.emit(" str x0, [sp, #-16]!"); - } - fn emit_call_fptr_spill_size(&self) -> usize { 16 } - fn emit_call_move_f32_to_acc(&mut self) { self.state.emit(" fmov w0, s0"); } - fn emit_call_move_f64_to_acc(&mut self) { self.state.emit(" fmov x0, d0"); } - - // AArch64 ABI: sret pointer goes in x8, not x0. 
- fn sret_uses_dedicated_reg(&self) -> bool { true } - fn emit_call_sret_setup(&mut self, sret_operand: &Operand, total_sp_adjust: i64) { - let slot_adjust = if self.state.has_dyn_alloca { 0 } else { total_sp_adjust }; - let needs_adjusted = total_sp_adjust > 0; - self.emit_load_arg_to_reg(sret_operand, "x8", slot_adjust, 0, needs_adjusted); - } - - // ---- Inline asm / intrinsics (kept inline - has extra logic) ---- - fn emit_inline_asm(&mut self, template: &str, outputs: &[(String, Value, Option)], inputs: &[(String, Operand, Option)], clobbers: &[String], operand_types: &[IrType], goto_labels: &[(String, BlockId)], input_symbols: &[Option]) { - emit_inline_asm_common(self, template, outputs, inputs, clobbers, operand_types, goto_labels, input_symbols); - } - fn emit_intrinsic(&mut self, dest: &Option, op: &IntrinsicOp, dest_ptr: &Option, args: &[Operand]) { - self.emit_intrinsic_arm(dest, op, dest_ptr, args); - } - - // ---- Float binop body uses different name ---- - fn emit_float_binop_impl(&mut self, mnemonic: &str, ty: IrType) { self.emit_float_binop_body(mnemonic, ty) } - - // All remaining methods delegate to self.method_name_impl(args...) - delegate_to_impl! 
{ - // prologue - fn calculate_stack_space(&mut self, func: &IrFunction) -> i64 => calculate_stack_space_impl; - fn aligned_frame_size(&self, raw_space: i64) -> i64 => aligned_frame_size_impl; - fn emit_prologue(&mut self, func: &IrFunction, frame_size: i64) => emit_prologue_impl; - fn emit_epilogue(&mut self, frame_size: i64) => emit_epilogue_impl; - fn emit_store_params(&mut self, func: &IrFunction) => emit_store_params_impl; - fn emit_param_ref(&mut self, dest: &Value, param_idx: usize, ty: IrType) => emit_param_ref_impl; - fn emit_epilogue_and_ret(&mut self, frame_size: i64) => emit_epilogue_and_ret_impl; - fn store_instr_for_type(&self, ty: IrType) -> &'static str => store_instr_for_type_impl; - fn load_instr_for_type(&self, ty: IrType) -> &'static str => load_instr_for_type_impl; - // memory - fn emit_store(&mut self, val: &Operand, ptr: &Value, ty: IrType) => emit_store_impl; - fn emit_load(&mut self, dest: &Value, ptr: &Value, ty: IrType) => emit_load_impl; - fn emit_store_with_const_offset(&mut self, val: &Operand, base: &Value, offset: i64, ty: IrType) => emit_store_with_const_offset_impl; - fn emit_load_with_const_offset(&mut self, dest: &Value, base: &Value, offset: i64, ty: IrType) => emit_load_with_const_offset_impl; - fn emit_typed_store_to_slot(&mut self, instr: &'static str, ty: IrType, slot: StackSlot) => emit_typed_store_to_slot_impl; - fn emit_typed_load_from_slot(&mut self, instr: &'static str, slot: StackSlot) => emit_typed_load_from_slot_impl; - fn emit_load_ptr_from_slot(&mut self, slot: StackSlot, val_id: u32) => emit_load_ptr_from_slot_impl; - fn emit_typed_store_indirect(&mut self, instr: &'static str, ty: IrType) => emit_typed_store_indirect_impl; - fn emit_typed_load_indirect(&mut self, instr: &'static str) => emit_typed_load_indirect_impl; - fn emit_add_offset_to_addr_reg(&mut self, offset: i64) => emit_add_offset_to_addr_reg_impl; - fn emit_slot_addr_to_secondary(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) => 
emit_slot_addr_to_secondary_impl; - fn emit_gep_direct_const(&mut self, slot: StackSlot, offset: i64) => emit_gep_direct_const_impl; - fn emit_gep_indirect_const(&mut self, slot: StackSlot, offset: i64, val_id: u32) => emit_gep_indirect_const_impl; - fn emit_add_imm_to_acc(&mut self, imm: i64) => emit_add_imm_to_acc_impl; - fn emit_round_up_acc_to_16(&mut self) => emit_round_up_acc_to_16_impl; - fn emit_sub_sp_by_acc(&mut self) => emit_sub_sp_by_acc_impl; - fn emit_mov_sp_to_acc(&mut self) => emit_mov_sp_to_acc_impl; - fn emit_mov_acc_to_sp(&mut self) => emit_mov_acc_to_sp_impl; - fn emit_align_acc(&mut self, align: usize) => emit_align_acc_impl; - fn emit_memcpy_load_dest_addr(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) => emit_memcpy_load_dest_addr_impl; - fn emit_memcpy_load_src_addr(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) => emit_memcpy_load_src_addr_impl; - fn emit_alloca_aligned_addr(&mut self, slot: StackSlot, val_id: u32) => emit_alloca_aligned_addr_impl; - fn emit_alloca_aligned_addr_to_acc(&mut self, slot: StackSlot, val_id: u32) => emit_alloca_aligned_addr_to_acc_impl; - fn emit_memcpy_impl(&mut self, size: usize) => emit_memcpy_impl_impl; - // alu - fn emit_float_neg(&mut self, ty: IrType) => emit_float_neg_impl; - fn emit_f128_neg(&mut self, dest: &Value, src: &Operand) => emit_f128_neg_impl; - fn emit_int_neg(&mut self, ty: IrType) => emit_int_neg_impl; - fn emit_int_not(&mut self, ty: IrType) => emit_int_not_impl; - fn emit_int_clz(&mut self, ty: IrType) => emit_int_clz_impl; - fn emit_int_ctz(&mut self, ty: IrType) => emit_int_ctz_impl; - fn emit_int_bswap(&mut self, ty: IrType) => emit_int_bswap_impl; - fn emit_int_popcount(&mut self, ty: IrType) => emit_int_popcount_impl; - fn emit_int_binop(&mut self, dest: &Value, op: IrBinOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_int_binop_impl; - fn emit_copy_i128(&mut self, dest: &Value, src: &Operand) => emit_copy_i128_impl; - // comparison - fn emit_float_cmp(&mut 
self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_float_cmp_impl; - fn emit_int_cmp(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_int_cmp_impl; - fn emit_f128_cmp(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand) => emit_f128_cmp_impl; - fn emit_fused_cmp_branch(&mut self, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType, true_label: &str, false_label: &str) => emit_fused_cmp_branch_impl; - fn emit_select(&mut self, dest: &Value, cond: &Operand, true_val: &Operand, false_val: &Operand, ty: IrType) => emit_select_impl; - // calls - fn call_abi_config(&self) -> CallAbiConfig => call_abi_config_impl; - fn emit_call_compute_stack_space(&self, arg_classes: &[CallArgClass], arg_types: &[IrType]) -> usize => emit_call_compute_stack_space_impl; - fn emit_call_f128_pre_convert(&mut self, args: &[Operand], arg_classes: &[CallArgClass], arg_types: &[IrType], stack_arg_space: usize) -> usize => emit_call_f128_pre_convert_impl; - fn emit_call_stack_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], arg_types: &[IrType], stack_arg_space: usize, fptr_spill: usize, f128_temp_space: usize) -> i64 => emit_call_stack_args_impl; - fn emit_call_reg_args(&mut self, args: &[Operand], arg_classes: &[CallArgClass], arg_types: &[IrType], total_sp_adjust: i64, f128_temp_space: usize, stack_arg_space: usize, struct_arg_riscv_float_classes: &[Option]) => emit_call_reg_args_impl; - fn emit_call_instruction(&mut self, direct_name: Option<&str>, func_ptr: Option<&Operand>, indirect: bool, stack_arg_space: usize) => emit_call_instruction_impl; - fn emit_call_cleanup(&mut self, stack_arg_space: usize, f128_temp_space: usize, indirect: bool) => emit_call_cleanup_impl; - fn emit_call_store_i128_result(&mut self, dest: &Value) => emit_call_store_i128_result_impl; - fn emit_call_store_f128_result(&mut self, dest: &Value) => emit_call_store_f128_result_impl; - // globals - fn 
emit_global_addr(&mut self, dest: &Value, name: &str) => emit_global_addr_impl; - fn emit_label_addr(&mut self, dest: &Value, label: &str) => emit_label_addr_impl; - fn emit_tls_global_addr(&mut self, dest: &Value, name: &str) => emit_tls_global_addr_impl; - // cast - fn emit_cast_instrs(&mut self, from_ty: IrType, to_ty: IrType) => emit_cast_instrs_impl; - fn emit_cast(&mut self, dest: &Value, src: &Operand, from_ty: IrType, to_ty: IrType) => emit_cast_impl; - // variadic - fn emit_va_arg(&mut self, dest: &Value, va_list_ptr: &Value, result_ty: IrType) => emit_va_arg_impl; - fn emit_va_start(&mut self, va_list_ptr: &Value) => emit_va_start_impl; - fn emit_va_copy(&mut self, dest_ptr: &Value, src_ptr: &Value) => emit_va_copy_impl; - fn emit_va_arg_struct(&mut self, dest_ptr: &Value, va_list_ptr: &Value, size: usize) => emit_va_arg_struct_impl; - // returns - fn current_return_type(&self) -> IrType => current_return_type_impl; - fn emit_return_i128_to_regs(&mut self) => emit_return_i128_to_regs_impl; - fn emit_return_f128_to_reg(&mut self) => emit_return_f128_to_reg_impl; - fn emit_return(&mut self, val: Option<&Operand>, frame_size: i64) => emit_return_impl; - fn emit_return_f32_to_reg(&mut self) => emit_return_f32_to_reg_impl; - fn emit_return_f64_to_reg(&mut self) => emit_return_f64_to_reg_impl; - fn emit_return_int_to_reg(&mut self) => emit_return_int_to_reg_impl; - fn emit_get_return_f64_second(&mut self, dest: &Value) => emit_get_return_f64_second_impl; - fn emit_set_return_f64_second(&mut self, src: &Operand) => emit_set_return_f64_second_impl; - fn emit_get_return_f32_second(&mut self, dest: &Value) => emit_get_return_f32_second_impl; - fn emit_set_return_f32_second(&mut self, src: &Operand) => emit_set_return_f32_second_impl; - fn emit_get_return_f128_second(&mut self, dest: &Value) => emit_get_return_f128_second_impl; - fn emit_set_return_f128_second(&mut self, src: &Operand) => emit_set_return_f128_second_impl; - // atomics - fn emit_atomic_rmw(&mut self, 
dest: &Value, op: AtomicRmwOp, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) => emit_atomic_rmw_impl; - fn emit_atomic_cmpxchg(&mut self, dest: &Value, ptr: &Operand, expected: &Operand, desired: &Operand, ty: IrType, success_ordering: AtomicOrdering, failure_ordering: AtomicOrdering, returns_bool: bool) => emit_atomic_cmpxchg_impl; - fn emit_atomic_load(&mut self, dest: &Value, ptr: &Operand, ty: IrType, ordering: AtomicOrdering) => emit_atomic_load_impl; - fn emit_atomic_store(&mut self, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) => emit_atomic_store_impl; - fn emit_fence(&mut self, ordering: AtomicOrdering) => emit_fence_impl; - // float binop - fn emit_float_binop(&mut self, dest: &Value, op: crate::backend::cast::FloatOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_float_binop_impl; - // i128 ops - fn emit_load_acc_pair(&mut self, op: &Operand) => emit_load_acc_pair_impl; - fn emit_store_acc_pair(&mut self, dest: &Value) => emit_store_acc_pair_impl; - fn emit_store_pair_to_slot(&mut self, slot: StackSlot) => emit_store_pair_to_slot_impl; - fn emit_load_pair_from_slot(&mut self, slot: StackSlot) => emit_load_pair_from_slot_impl; - fn emit_save_acc_pair(&mut self) => emit_save_acc_pair_impl; - fn emit_store_pair_indirect(&mut self) => emit_store_pair_indirect_impl; - fn emit_load_pair_indirect(&mut self) => emit_load_pair_indirect_impl; - fn emit_i128_neg(&mut self) => emit_i128_neg_impl; - fn emit_i128_not(&mut self) => emit_i128_not_impl; - fn emit_sign_extend_acc_high(&mut self) => emit_sign_extend_acc_high_impl; - fn emit_zero_acc_high(&mut self) => emit_zero_acc_high_impl; - fn emit_i128_prep_binop(&mut self, lhs: &Operand, rhs: &Operand) => emit_i128_prep_binop_impl; - fn emit_i128_add(&mut self) => emit_i128_add_impl; - fn emit_i128_sub(&mut self) => emit_i128_sub_impl; - fn emit_i128_mul(&mut self) => emit_i128_mul_impl; - fn emit_i128_and(&mut self) => emit_i128_and_impl; - fn emit_i128_or(&mut self) 
=> emit_i128_or_impl; - fn emit_i128_xor(&mut self) => emit_i128_xor_impl; - fn emit_i128_shl(&mut self) => emit_i128_shl_impl; - fn emit_i128_lshr(&mut self) => emit_i128_lshr_impl; - fn emit_i128_ashr(&mut self) => emit_i128_ashr_impl; - fn emit_i128_prep_shift_lhs(&mut self, lhs: &Operand) => emit_i128_prep_shift_lhs_impl; - fn emit_i128_shl_const(&mut self, amount: u32) => emit_i128_shl_const_impl; - fn emit_i128_lshr_const(&mut self, amount: u32) => emit_i128_lshr_const_impl; - fn emit_i128_ashr_const(&mut self, amount: u32) => emit_i128_ashr_const_impl; - fn emit_i128_divrem_call(&mut self, func_name: &str, lhs: &Operand, rhs: &Operand) => emit_i128_divrem_call_impl; - fn emit_i128_store_result(&mut self, dest: &Value) => emit_i128_store_result_impl; - fn emit_i128_to_float_call(&mut self, src: &Operand, from_signed: bool, to_ty: IrType) => emit_i128_to_float_call_impl; - fn emit_float_to_i128_call(&mut self, src: &Operand, to_signed: bool, from_ty: IrType) => emit_float_to_i128_call_impl; - fn emit_i128_cmp_eq(&mut self, is_ne: bool) => emit_i128_cmp_eq_impl; - fn emit_i128_cmp_ordered(&mut self, op: IrCmpOp) => emit_i128_cmp_ordered_impl; - fn emit_i128_cmp_store_result(&mut self, dest: &Value) => emit_i128_cmp_store_result_impl; - } -} - -impl Default for ArmCodegen { - fn default() -> Self { - Self::new() - } -} - diff --git a/src/backend/arm/codegen/f128.rs b/src/backend/arm/codegen/f128.rs deleted file mode 100644 index 95bfe2f78f..0000000000 --- a/src/backend/arm/codegen/f128.rs +++ /dev/null @@ -1,334 +0,0 @@ -//! AArch64 F128 (IEEE 754 binary128 / quad-precision) full-precision helpers. -//! -//! On AArch64, `long double` is IEEE 754 binary128 (16 bytes). Hardware has -//! no quad-precision FP ops, so all F128 arithmetic and conversion uses -//! compiler-rt / libgcc soft-float library calls. -//! -//! This file implements the `F128SoftFloat` trait for AArch64, providing the -//! 
arch-specific primitives (register names, instruction mnemonics, Q-register -//! representation). The shared orchestration logic lives in `backend/f128_softfloat.rs`. -//! -//! ABI: F128 values are passed/returned in Q registers (q0, q1). -//! Key design: -//! - Stack slots for F128 are 16 bytes (same as I128). -//! - f128_load_sources tracks which alloca/offset each F128 value was loaded -//! from, enabling full-precision reloads for comparisons and casts. - -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::backend::state::{StackSlot, SlotAddr}; -use crate::backend::traits::ArchCodegen; -use crate::backend::f128_softfloat::F128SoftFloat; -use super::emit::{ArmCodegen, callee_saved_name}; - -impl F128SoftFloat for ArmCodegen { - fn state(&mut self) -> &mut crate::backend::state::CodegenState { - &mut self.state - } - - fn f128_get_slot(&self, val_id: u32) -> Option { - self.state.get_slot(val_id) - } - - fn f128_get_source(&self, val_id: u32) -> Option<(u32, i64, bool)> { - self.state.get_f128_source(val_id) - } - - fn f128_resolve_slot_addr(&self, val_id: u32) -> Option { - self.state.resolve_slot_addr(val_id) - } - - fn f128_load_const_to_arg1(&mut self, lo: u64, hi: u64) { - self.emit_load_imm64("x0", lo as i64); - self.emit_load_imm64("x1", hi as i64); - self.state.emit(" fmov d0, x0"); - self.state.emit(" mov v0.d[1], x1"); - } - - fn f128_load_16b_from_addr_reg_to_arg1(&mut self) { - // x17 holds the address; load 16 bytes into q0 - self.state.emit(" ldr q0, [x17]"); - } - - fn f128_load_from_frame_offset_to_arg1(&mut self, offset: i64) { - self.emit_load_from_sp("q0", offset, "ldr"); - } - - fn f128_load_ptr_to_addr_reg(&mut self, slot: StackSlot, val_id: u32) { - if self.state.is_alloca(val_id) { - self.emit_alloca_addr("x17", val_id, slot.0); - } else { - self.emit_load_from_sp("x17", slot.0, "ldr"); - } - } - - fn f128_add_offset_to_addr_reg(&mut self, offset: i64) { - if offset > 0 && offset <= 4095 { - 
self.state.emit_fmt(format_args!(" add x17, x17, #{}", offset)); - } else { - self.load_large_imm("x16", offset); - self.state.emit(" add x17, x17, x16"); - } - } - - fn f128_alloca_aligned_addr(&mut self, slot: StackSlot, val_id: u32) { - self.emit_alloca_aligned_addr(slot, val_id); - } - - fn f128_load_operand_and_extend(&mut self, op: &Operand) { - self.operand_to_x0(op); - self.state.emit(" fmov d0, x0"); - self.state.emit(" bl __extenddftf2"); - // __extenddftf2 is a function call that clobbers x0 (and all - // caller-saved registers). Invalidate the cache so subsequent - // operand_to_x0 calls for the same value won't skip the reload. - self.state.reg_cache.invalidate_all(); - } - - fn f128_move_arg1_to_arg2(&mut self) { - // Move q0 -> q1 (128-bit NEON register move) - self.state.emit(" mov v1.16b, v0.16b"); - } - - fn f128_save_arg1_to_sp(&mut self) { - self.state.emit(" str q0, [sp]"); - } - - fn f128_reload_arg1_from_sp(&mut self) { - self.state.emit(" ldr q0, [sp]"); - } - - fn f128_alloc_temp_16(&mut self) { - self.emit_sub_sp(16); - } - - fn f128_free_temp_16(&mut self) { - self.emit_add_sp(16); - } - - fn f128_call(&mut self, name: &str) { - self.state.emit_fmt(format_args!(" bl {}", name)); - } - - fn f128_truncate_result_to_acc(&mut self) { - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - } - - fn f128_store_const_halves_to_slot(&mut self, lo: u64, hi: u64, slot: StackSlot) { - self.emit_load_imm64("x0", lo as i64); - self.emit_store_to_sp("x0", slot.0, "str"); - self.emit_load_imm64("x0", hi as i64); - self.emit_store_to_sp("x0", slot.0 + 8, "str"); - } - - fn f128_store_arg1_to_slot(&mut self, slot: StackSlot) { - // Store q0 (16 bytes) to slot - self.emit_store_to_sp("q0", slot.0, "str"); - } - - fn f128_copy_slot_to_slot(&mut self, src_offset: i64, dest_slot: StackSlot) { - // Load 16 bytes from source into q0, store to dest - self.emit_load_from_sp("q0", src_offset, "ldr"); - self.emit_store_to_sp("q0", dest_slot.0, 
"str"); - } - - fn f128_copy_addr_reg_to_slot(&mut self, dest_slot: StackSlot) { - // Load from x17 (addr reg) into q0, store to slot - self.state.emit(" ldr q0, [x17]"); - self.emit_store_to_sp("q0", dest_slot.0, "str"); - } - - fn f128_store_const_halves_to_addr(&mut self, lo: u64, hi: u64) { - // x17 holds dest address; use x16 as scratch - self.state.emit(" mov x16, x17"); - self.emit_load_imm64("x0", lo as i64); - self.state.emit(" str x0, [x16]"); - self.emit_load_imm64("x0", hi as i64); - self.state.emit(" str x0, [x16, #8]"); - } - - fn f128_save_addr_reg(&mut self) { - // Save x17 to x16 - self.state.emit(" mov x16, x17"); - } - - fn f128_copy_slot_to_saved_addr(&mut self, src_offset: i64) { - // Load 16 bytes from source slot, store to saved addr (x16) - self.emit_load_from_sp("q0", src_offset, "ldr"); - self.state.emit(" str q0, [x16]"); - } - - fn f128_copy_addr_reg_to_saved_addr(&mut self) { - // Load 16 bytes from x17, store to x16 - self.state.emit(" ldr q0, [x17]"); - self.state.emit(" str q0, [x16]"); - } - - fn f128_store_arg1_to_saved_addr(&mut self) { - // Store q0 (f128 in arg1) to saved addr (x16) - self.state.emit(" str q0, [x16]"); - } - - fn f128_flip_sign_bit(&mut self) { - // Extract high 64 bits of q0, XOR sign bit, reinsert - self.state.emit(" mov x0, v0.d[1]"); - self.state.emit(" eor x0, x0, #0x8000000000000000"); - self.state.emit(" mov v0.d[1], x0"); - } - - fn f128_cmp_result_to_bool(&mut self, kind: crate::backend::cast::F128CmpKind) { - use crate::backend::cast::F128CmpKind; - self.state.emit(" cmp w0, #0"); - let cond = match kind { - F128CmpKind::Eq => "eq", - F128CmpKind::Ne => "ne", - F128CmpKind::Lt => "lt", - F128CmpKind::Le => "le", - F128CmpKind::Gt => "gt", - F128CmpKind::Ge => "ge", - }; - self.state.emit_fmt(format_args!(" cset x0, {}", cond)); - } - - fn f128_store_acc_to_dest(&mut self, dest: &Value) { - self.store_x0_to(dest); - } - - fn f128_track_self(&mut self, dest_id: u32) { - 
self.state.track_f128_self(dest_id); - } - - fn f128_set_acc_cache(&mut self, dest_id: u32) { - self.state.reg_cache.set_acc(dest_id, false); - } - - fn f128_set_dyn_alloca(&mut self, val: bool) -> bool { - let saved = self.state.has_dyn_alloca; - self.state.has_dyn_alloca = val; - saved - } - - fn f128_move_callee_reg_to_addr_reg(&mut self, val_id: u32) -> bool { - if let Some(®) = self.reg_assignments.get(&val_id) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x17, {}", reg_name)); - true - } else { - false - } - } - - fn f128_move_aligned_to_addr_reg(&mut self) { - // x9 is the alloca-aligned address register, x17 is the F128 addr register - self.state.emit(" mov x17, x9"); - } - - fn f128_load_indirect_ptr_to_addr_reg(&mut self, slot: StackSlot, _val_id: u32) { - self.emit_load_from_sp("x17", slot.0, "ldr"); - } - - fn f128_load_from_addr_reg_to_acc(&mut self, dest: &Value) { - // Load 16 bytes from x17 into q0, convert to f64, store to dest - self.state.emit(" ldr q0, [x17]"); - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - self.state.reg_cache.invalidate_all(); - self.store_x0_to(dest); - } - - fn f128_load_from_direct_slot_to_acc(&mut self, slot: StackSlot) { - self.emit_load_from_sp("q0", slot.0, "ldr"); - } - - fn f128_store_result_and_truncate(&mut self, dest: &Value) { - // Store full f128 from arg1 (q0) to dest slot - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_f128_store_q0_to_slot(slot); - self.state.track_f128_self(dest.0); - } - // Produce f64 approximation (do NOT store to slot - that would overwrite f128) - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - self.state.reg_cache.set_acc(dest.0, false); - } - - fn f128_move_acc_to_arg0(&mut self) { - // On ARM, the accumulator is x0 which is already the first arg register - } - - fn f128_move_arg0_to_acc(&mut self) { - // On ARM, a0 = x0 = accumulator, so nothing to do - } - - fn 
f128_load_operand_to_acc(&mut self, op: &Operand) { - self.operand_to_x0(op); - } - - fn f128_sign_extend_acc(&mut self, from_size: usize) { - match from_size { - 1 => self.state.emit(" sxtb x0, w0"), - 2 => self.state.emit(" sxth x0, w0"), - 4 => self.state.emit(" sxtw x0, w0"), - _ => {} - } - } - - fn f128_zero_extend_acc(&mut self, from_size: usize) { - match from_size { - 1 => self.state.emit(" and x0, x0, #0xff"), - 2 => self.state.emit(" and x0, x0, #0xffff"), - 4 => self.state.emit(" mov w0, w0"), - _ => {} - } - } - - fn f128_narrow_acc(&mut self, to_ty: IrType) { - self.emit_cast_instrs(IrType::I64, to_ty); - } - - fn f128_extend_float_to_f128(&mut self, from_ty: IrType) { - if from_ty == IrType::F32 { - self.state.emit(" fmov s0, w0"); - self.state.emit(" bl __extendsftf2"); - } else { - self.state.emit(" fmov d0, x0"); - self.state.emit(" bl __extenddftf2"); - } - } - - fn f128_truncate_to_float_acc(&mut self, to_ty: IrType) { - if to_ty == IrType::F32 { - self.state.emit(" bl __trunctfsf2"); - self.state.emit(" fmov w0, s0"); - } else { - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - } - } - - fn f128_is_alloca(&self, val_id: u32) -> bool { - self.state.is_alloca(val_id) - } -} - -// ============================================================================= -// Public helpers that delegate to shared orchestration -// ============================================================================= - -impl ArmCodegen { - /// Load an F128 operand into Q0 with full precision. - pub(super) fn emit_f128_operand_to_q0_full(&mut self, op: &Operand) { - crate::backend::f128_softfloat::f128_operand_to_arg1(self, op); - } - - /// Store Q0 (16-byte f128) to a stack slot. - pub(super) fn emit_f128_store_q0_to_slot(&mut self, slot: StackSlot) { - self.emit_store_to_sp("q0", slot.0, "str"); - } - - /// Negate an F128 value with full precision. 
- pub(super) fn emit_f128_neg_full(&mut self, dest: &Value, src: &Operand) { - crate::backend::f128_softfloat::f128_neg(self, dest, src); - } -} diff --git a/src/backend/arm/codegen/float_ops.rs b/src/backend/arm/codegen/float_ops.rs deleted file mode 100644 index a1011a441b..0000000000 --- a/src/backend/arm/codegen/float_ops.rs +++ /dev/null @@ -1,43 +0,0 @@ -//! ArmCodegen: floating-point binary operations. - -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::backend::cast::FloatOp; -use crate::backend::traits::ArchCodegen; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_float_binop_impl(&mut self, dest: &Value, op: FloatOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - if ty == IrType::F128 { - crate::backend::f128_softfloat::f128_emit_binop(self, dest, op, lhs, rhs); - return; - } - // Non-F128: use default path. - let mnemonic = self.emit_float_binop_mnemonic(op); - self.operand_to_x0(lhs); - self.state.emit(" mov x1, x0"); - self.operand_to_x0(rhs); - self.emit_float_binop_body(mnemonic, ty); - self.store_x0_to(dest); - } - - pub(super) fn emit_float_binop_body(&mut self, mnemonic: &str, ty: IrType) { - self.state.emit(" mov x2, x0"); - if ty == IrType::F32 { - self.state.emit(" fmov s0, w1"); - self.state.emit(" fmov s1, w2"); - self.state.emit_fmt(format_args!(" {} s0, s0, s1", mnemonic)); - self.state.emit(" fmov w0, s0"); - self.state.emit(" mov w0, w0"); // zero-extend - } else { - self.state.emit(" fmov d0, x1"); - self.state.emit(" fmov d1, x2"); - self.state.emit_fmt(format_args!(" {} d0, d0, d1", mnemonic)); - self.state.emit(" fmov x0, d0"); - } - } - - pub(super) fn emit_f128_neg_impl(&mut self, dest: &Value, src: &Operand) { - self.emit_f128_neg_full(dest, src); - } -} diff --git a/src/backend/arm/codegen/globals.rs b/src/backend/arm/codegen/globals.rs deleted file mode 100644 index d56325e48f..0000000000 --- a/src/backend/arm/codegen/globals.rs +++ /dev/null @@ -1,30 +0,0 @@ -//! 
ArmCodegen: global address operations. - -use crate::ir::reexports::Value; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_global_addr_impl(&mut self, dest: &Value, name: &str) { - if self.state.needs_got_aarch64(name) { - self.state.emit_fmt(format_args!(" adrp x0, :got:{}", name)); - self.state.emit_fmt(format_args!(" ldr x0, [x0, :got_lo12:{}]", name)); - } else { - self.state.emit_fmt(format_args!(" adrp x0, {}", name)); - self.state.emit_fmt(format_args!(" add x0, x0, :lo12:{}", name)); - } - self.store_x0_to(dest); - } - - pub(super) fn emit_label_addr_impl(&mut self, dest: &Value, label: &str) { - self.state.emit_fmt(format_args!(" adrp x0, {}", label)); - self.state.emit_fmt(format_args!(" add x0, x0, :lo12:{}", label)); - self.store_x0_to(dest); - } - - pub(super) fn emit_tls_global_addr_impl(&mut self, dest: &Value, name: &str) { - self.state.emit(" mrs x0, tpidr_el0"); - self.state.emit_fmt(format_args!(" add x0, x0, :tprel_hi12:{}", name)); - self.state.emit_fmt(format_args!(" add x0, x0, :tprel_lo12_nc:{}", name)); - self.store_x0_to(dest); - } -} diff --git a/src/backend/arm/codegen/i128_ops.rs b/src/backend/arm/codegen/i128_ops.rs deleted file mode 100644 index aba386a48f..0000000000 --- a/src/backend/arm/codegen/i128_ops.rs +++ /dev/null @@ -1,322 +0,0 @@ -//! ArmCodegen: 128-bit integer operations. 
- -use crate::ir::reexports::{IrCmpOp, Operand, Value}; -use crate::common::types::IrType; -use crate::backend::state::StackSlot; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_load_acc_pair_impl(&mut self, op: &Operand) { - self.operand_to_x0_x1(op); - } - - pub(super) fn emit_store_acc_pair_impl(&mut self, dest: &Value) { - self.store_x0_x1_to(dest); - } - - pub(super) fn emit_store_pair_to_slot_impl(&mut self, slot: StackSlot) { - self.emit_store_to_sp("x0", slot.0, "str"); - self.emit_store_to_sp("x1", slot.0 + 8, "str"); - } - - pub(super) fn emit_load_pair_from_slot_impl(&mut self, slot: StackSlot) { - self.emit_load_from_sp("x0", slot.0, "ldr"); - self.emit_load_from_sp("x1", slot.0 + 8, "ldr"); - } - - pub(super) fn emit_save_acc_pair_impl(&mut self) { - self.state.emit(" mov x2, x0"); - self.state.emit(" mov x3, x1"); - } - - pub(super) fn emit_store_pair_indirect_impl(&mut self) { - self.state.emit(" str x2, [x9]"); - self.state.emit(" str x3, [x9, #8]"); - } - - pub(super) fn emit_load_pair_indirect_impl(&mut self) { - self.state.emit(" ldr x0, [x9]"); - self.state.emit(" ldr x1, [x9, #8]"); - } - - pub(super) fn emit_i128_neg_impl(&mut self) { - self.state.emit(" mvn x0, x0"); - self.state.emit(" mvn x1, x1"); - self.state.emit(" adds x0, x0, #1"); - self.state.emit(" adc x1, x1, xzr"); - } - - pub(super) fn emit_i128_not_impl(&mut self) { - self.state.emit(" mvn x0, x0"); - self.state.emit(" mvn x1, x1"); - } - - pub(super) fn emit_sign_extend_acc_high_impl(&mut self) { - self.state.emit(" asr x1, x0, #63"); - } - - pub(super) fn emit_zero_acc_high_impl(&mut self) { - self.state.emit(" mov x1, #0"); - } - - pub(super) fn emit_i128_prep_binop_impl(&mut self, lhs: &Operand, rhs: &Operand) { - self.prep_i128_binop(lhs, rhs); - } - - pub(super) fn emit_i128_add_impl(&mut self) { - self.state.emit(" adds x0, x2, x4"); - self.state.emit(" adc x1, x3, x5"); - } - - pub(super) fn emit_i128_sub_impl(&mut self) { - self.state.emit(" subs 
x0, x2, x4"); - self.state.emit(" sbc x1, x3, x5"); - } - - pub(super) fn emit_i128_mul_impl(&mut self) { - self.state.emit(" mul x0, x2, x4"); - self.state.emit(" umulh x1, x2, x4"); - self.state.emit(" madd x1, x3, x4, x1"); - self.state.emit(" madd x1, x2, x5, x1"); - } - - pub(super) fn emit_i128_and_impl(&mut self) { - self.state.emit(" and x0, x2, x4"); - self.state.emit(" and x1, x3, x5"); - } - - pub(super) fn emit_i128_or_impl(&mut self) { - self.state.emit(" orr x0, x2, x4"); - self.state.emit(" orr x1, x3, x5"); - } - - pub(super) fn emit_i128_xor_impl(&mut self) { - self.state.emit(" eor x0, x2, x4"); - self.state.emit(" eor x1, x3, x5"); - } - - pub(super) fn emit_i128_shl_impl(&mut self) { - let lbl = self.state.fresh_label("shl128"); - let done = self.state.fresh_label("shl128_done"); - let noop = self.state.fresh_label("shl128_noop"); - self.state.emit(" and x4, x4, #127"); - self.state.emit_fmt(format_args!(" cbz x4, {}", noop)); - self.state.emit(" cmp x4, #64"); - self.state.emit_fmt(format_args!(" b.ge {}", lbl)); - self.state.emit(" lsl x1, x3, x4"); - self.state.emit(" mov x5, #64"); - self.state.emit(" sub x5, x5, x4"); - self.state.emit(" lsr x6, x2, x5"); - self.state.emit(" orr x1, x1, x6"); - self.state.emit(" lsl x0, x2, x4"); - self.state.emit_fmt(format_args!(" b {}", done)); - self.state.emit_fmt(format_args!("{}:", lbl)); - self.state.emit(" sub x4, x4, #64"); - self.state.emit(" lsl x1, x2, x4"); - self.state.emit(" mov x0, #0"); - self.state.emit_fmt(format_args!(" b {}", done)); - self.state.emit_fmt(format_args!("{}:", noop)); - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - self.state.emit_fmt(format_args!("{}:", done)); - } - - pub(super) fn emit_i128_lshr_impl(&mut self) { - let lbl = self.state.fresh_label("lshr128"); - let done = self.state.fresh_label("lshr128_done"); - let noop = self.state.fresh_label("lshr128_noop"); - self.state.emit(" and x4, x4, #127"); - self.state.emit_fmt(format_args!(" cbz x4, 
{}", noop)); - self.state.emit(" cmp x4, #64"); - self.state.emit_fmt(format_args!(" b.ge {}", lbl)); - self.state.emit(" lsr x0, x2, x4"); - self.state.emit(" mov x5, #64"); - self.state.emit(" sub x5, x5, x4"); - self.state.emit(" lsl x6, x3, x5"); - self.state.emit(" orr x0, x0, x6"); - self.state.emit(" lsr x1, x3, x4"); - self.state.emit_fmt(format_args!(" b {}", done)); - self.state.emit_fmt(format_args!("{}:", lbl)); - self.state.emit(" sub x4, x4, #64"); - self.state.emit(" lsr x0, x3, x4"); - self.state.emit(" mov x1, #0"); - self.state.emit_fmt(format_args!(" b {}", done)); - self.state.emit_fmt(format_args!("{}:", noop)); - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - self.state.emit_fmt(format_args!("{}:", done)); - } - - pub(super) fn emit_i128_ashr_impl(&mut self) { - let lbl = self.state.fresh_label("ashr128"); - let done = self.state.fresh_label("ashr128_done"); - let noop = self.state.fresh_label("ashr128_noop"); - self.state.emit(" and x4, x4, #127"); - self.state.emit_fmt(format_args!(" cbz x4, {}", noop)); - self.state.emit(" cmp x4, #64"); - self.state.emit_fmt(format_args!(" b.ge {}", lbl)); - self.state.emit(" lsr x0, x2, x4"); - self.state.emit(" mov x5, #64"); - self.state.emit(" sub x5, x5, x4"); - self.state.emit(" lsl x6, x3, x5"); - self.state.emit(" orr x0, x0, x6"); - self.state.emit(" asr x1, x3, x4"); - self.state.emit_fmt(format_args!(" b {}", done)); - self.state.emit_fmt(format_args!("{}:", lbl)); - self.state.emit(" sub x4, x4, #64"); - self.state.emit(" asr x0, x3, x4"); - self.state.emit(" asr x1, x3, #63"); - self.state.emit_fmt(format_args!(" b {}", done)); - self.state.emit_fmt(format_args!("{}:", noop)); - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - self.state.emit_fmt(format_args!("{}:", done)); - } - - pub(super) fn emit_i128_prep_shift_lhs_impl(&mut self, lhs: &Operand) { - self.operand_to_x0_x1(lhs); - self.state.emit(" mov x2, x0"); - self.state.emit(" mov x3, x1"); - } - 
- pub(super) fn emit_i128_shl_const_impl(&mut self, amount: u32) { - let amount = amount & 127; - if amount == 0 { - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - } else if amount == 64 { - self.state.emit(" mov x1, x2"); - self.state.emit(" mov x0, #0"); - } else if amount > 64 { - self.state.emit_fmt(format_args!(" lsl x1, x2, #{}", amount - 64)); - self.state.emit(" mov x0, #0"); - } else { - self.state.emit_fmt(format_args!(" lsl x1, x3, #{}", amount)); - self.state.emit_fmt(format_args!(" orr x1, x1, x2, lsr #{}", 64 - amount)); - self.state.emit_fmt(format_args!(" lsl x0, x2, #{}", amount)); - } - } - - pub(super) fn emit_i128_lshr_const_impl(&mut self, amount: u32) { - let amount = amount & 127; - if amount == 0 { - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - } else if amount == 64 { - self.state.emit(" mov x0, x3"); - self.state.emit(" mov x1, #0"); - } else if amount > 64 { - self.state.emit_fmt(format_args!(" lsr x0, x3, #{}", amount - 64)); - self.state.emit(" mov x1, #0"); - } else { - self.state.emit_fmt(format_args!(" lsr x0, x2, #{}", amount)); - self.state.emit_fmt(format_args!(" orr x0, x0, x3, lsl #{}", 64 - amount)); - self.state.emit_fmt(format_args!(" lsr x1, x3, #{}", amount)); - } - } - - pub(super) fn emit_i128_ashr_const_impl(&mut self, amount: u32) { - let amount = amount & 127; - if amount == 0 { - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - } else if amount == 64 { - self.state.emit(" mov x0, x3"); - self.state.emit(" asr x1, x3, #63"); - } else if amount > 64 { - self.state.emit_fmt(format_args!(" asr x0, x3, #{}", amount - 64)); - self.state.emit(" asr x1, x3, #63"); - } else { - self.state.emit_fmt(format_args!(" lsr x0, x2, #{}", amount)); - self.state.emit_fmt(format_args!(" orr x0, x0, x3, lsl #{}", 64 - amount)); - self.state.emit_fmt(format_args!(" asr x1, x3, #{}", amount)); - } - } - - pub(super) fn emit_i128_divrem_call_impl(&mut self, func_name: &str, lhs: 
&Operand, rhs: &Operand) { - self.operand_to_x0_x1(lhs); - self.state.emit(" mov x2, x0"); - self.state.emit(" mov x3, x1"); - self.operand_to_x0_x1(rhs); - self.state.emit(" mov x4, x0"); - self.state.emit(" mov x5, x1"); - self.state.emit(" mov x0, x2"); - self.state.emit(" mov x1, x3"); - self.state.emit(" mov x2, x4"); - self.state.emit(" mov x3, x5"); - self.state.emit_fmt(format_args!(" bl {}", func_name)); - } - - pub(super) fn emit_i128_store_result_impl(&mut self, dest: &Value) { - self.store_x0_x1_to(dest); - } - - pub(super) fn emit_float_to_i128_call_impl(&mut self, src: &Operand, to_signed: bool, from_ty: IrType) { - self.operand_to_x0(src); - if from_ty == IrType::F32 { - self.state.emit(" fmov s0, w0"); - } else { - self.state.emit(" fmov d0, x0"); - } - let func_name = match (to_signed, from_ty) { - (true, IrType::F64) => "__fixdfti", - (true, IrType::F32) => "__fixsfti", - (false, IrType::F64) => "__fixunsdfti", - (false, IrType::F32) => "__fixunssfti", - _ => panic!("unsupported float-to-i128 conversion: {:?}", from_ty), - }; - self.state.emit_fmt(format_args!(" bl {}", func_name)); - self.state.reg_cache.invalidate_all(); - } - - pub(super) fn emit_i128_to_float_call_impl(&mut self, src: &Operand, from_signed: bool, to_ty: IrType) { - self.operand_to_x0_x1(src); - let func_name = match (from_signed, to_ty) { - (true, IrType::F64) => "__floattidf", - (true, IrType::F32) => "__floattisf", - (false, IrType::F64) => "__floatuntidf", - (false, IrType::F32) => "__floatuntisf", - _ => panic!("unsupported i128-to-float conversion: {:?}", to_ty), - }; - self.state.emit_fmt(format_args!(" bl {}", func_name)); - self.state.reg_cache.invalidate_all(); - if to_ty == IrType::F32 { - self.state.emit(" fmov w0, s0"); - } else { - self.state.emit(" fmov x0, d0"); - } - } - - pub(super) fn emit_i128_cmp_eq_impl(&mut self, is_ne: bool) { - self.state.emit(" eor x0, x2, x4"); - self.state.emit(" eor x1, x3, x5"); - self.state.emit(" orr x0, x0, x1"); - 
self.state.emit(" cmp x0, #0"); - if is_ne { - self.state.emit(" cset x0, ne"); - } else { - self.state.emit(" cset x0, eq"); - } - } - - pub(super) fn emit_i128_cmp_ordered_impl(&mut self, op: IrCmpOp) { - let done = self.state.fresh_label("cmp128_done"); - self.state.emit(" cmp x3, x5"); - let (hi_cond, lo_cond) = match op { - IrCmpOp::Slt | IrCmpOp::Sle => ("lt", if op == IrCmpOp::Slt { "lo" } else { "ls" }), - IrCmpOp::Sgt | IrCmpOp::Sge => ("gt", if op == IrCmpOp::Sgt { "hi" } else { "hs" }), - IrCmpOp::Ult | IrCmpOp::Ule => ("lo", if op == IrCmpOp::Ult { "lo" } else { "ls" }), - IrCmpOp::Ugt | IrCmpOp::Uge => ("hi", if op == IrCmpOp::Ugt { "hi" } else { "hs" }), - _ => unreachable!("i128 ordered cmp got equality op: {:?}", op), - }; - self.state.emit_fmt(format_args!(" cset x0, {}", hi_cond)); - self.state.emit_fmt(format_args!(" b.ne {}", done)); - self.state.emit(" cmp x2, x4"); - self.state.emit_fmt(format_args!(" cset x0, {}", lo_cond)); - self.state.emit_fmt(format_args!("{}:", done)); - } - - pub(super) fn emit_i128_cmp_store_result_impl(&mut self, dest: &Value) { - self.store_x0_to(dest); - } -} diff --git a/src/backend/arm/codegen/inline_asm.rs b/src/backend/arm/codegen/inline_asm.rs deleted file mode 100644 index 2171e64cea..0000000000 --- a/src/backend/arm/codegen/inline_asm.rs +++ /dev/null @@ -1,362 +0,0 @@ -//! AArch64 inline assembly template substitution and register formatting. -//! -//! This module handles operand substitution in inline assembly templates -//! (e.g., `%0`, `%[name]`, `%w0`, `%x0`) and register formatting with -//! w/x/s/d/q modifiers for ARM targets. It also contains helpers for -//! atomic exclusive access instructions (ldxr/stxr) and atomic RMW operations. 
- -use std::fmt::Write; -use crate::ir::reexports::{AtomicOrdering, AtomicRmwOp}; -use crate::common::types::IrType; -use crate::backend::state::CodegenState; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn substitute_asm_operands_static( - line: &str, - op_regs: &[String], - op_names: &[Option], - gcc_to_internal: &[usize], - op_imm_values: &[Option], - op_imm_symbols: &[Option], - ) -> String { - let mut result = String::new(); - let chars: Vec = line.chars().collect(); - let mut i = 0; - while i < chars.len() { - if chars[i] == '%' && i + 1 < chars.len() { - i += 1; - // Check for %% (literal %) - if chars[i] == '%' { - result.push('%'); - i += 1; - continue; - } - // Check for modifier: w, x, h, b, s, d, q, c, a - // 'c' = raw constant (no # prefix), used in ARM inline asm - // 'a' = memory address reference [reg], used for prfm/prefetch - let mut modifier = None; - if chars[i] == 'w' || chars[i] == 'x' || chars[i] == 'h' || chars[i] == 'b' - || chars[i] == 's' || chars[i] == 'd' || chars[i] == 'q' - || chars[i] == 'c' || chars[i] == 'a' - { - // Check if next char is digit or [, meaning this is a modifier - if i + 1 < chars.len() && (chars[i + 1].is_ascii_digit() || chars[i + 1] == '[') { - modifier = Some(chars[i]); - i += 1; - } - } - - if chars[i] == '[' { - // Named operand: %[name] or %w[name] - i += 1; - let name_start = i; - while i < chars.len() && chars[i] != ']' { - i += 1; - } - let name: String = chars[name_start..i].iter().collect(); - if i < chars.len() { i += 1; } // skip ] - - // Look up by name in operands - let mut found = false; - for (idx, op_name) in op_names.iter().enumerate() { - if let Some(ref n) = op_name { - if n == &name { - result.push_str(&Self::format_operand_static( - idx, modifier, op_regs, op_imm_values, op_imm_symbols, - )); - found = true; - break; - } - } - } - if !found { - // Fallback: emit raw - result.push('%'); - if let Some(m) = modifier { result.push(m); } - result.push('['); - 
result.push_str(&name); - result.push(']'); - } - } else if chars[i].is_ascii_digit() { - // Positional operand: %0, %1, %w2, etc. - // The number is a GCC operand number (outputs numbered first, - // then real inputs, skipping synthetic "+" inputs). - // Map through gcc_to_internal to get the internal operand index. - let mut num = 0usize; - while i < chars.len() && chars[i].is_ascii_digit() { - num = num * 10 + (chars[i] as usize - '0' as usize); - i += 1; - } - let internal_idx = if num < gcc_to_internal.len() { - gcc_to_internal[num] - } else { - num - }; - if internal_idx < op_regs.len() { - result.push_str(&Self::format_operand_static( - internal_idx, modifier, op_regs, op_imm_values, op_imm_symbols, - )); - } else { - let _ = write!(result, "x{}", num); - } - } else { - // Not a recognized pattern, emit as-is - result.push('%'); - if let Some(m) = modifier { result.push(m); } - result.push(chars[i]); - i += 1; - } - } else { - result.push(chars[i]); - i += 1; - } - } - result - } - - /// Format an operand for ARM inline assembly substitution. - /// For immediate operands (from "i"/"n" constraints), emit the raw value. - /// GCC on AArch64 emits raw integers for "i" constraints (no '#' prefix), - /// both in instruction and data directive contexts. - /// For register operands, apply the modifier and emit the register name. 
- fn format_operand_static( - idx: usize, - modifier: Option, - op_regs: &[String], - op_imm_values: &[Option], - op_imm_symbols: &[Option], - ) -> String { - // Check for immediate symbol first (e.g., function/variable name) - if let Some(Some(ref sym)) = op_imm_symbols.get(idx) { - return sym.clone(); - } - // Check for immediate value - emit raw (GCC AArch64 behavior) - if let Some(Some(imm)) = op_imm_values.get(idx) { - return imm.to_string(); - } - // Regular register operand - Self::format_reg_static(&op_regs[idx], modifier) - } - - pub(super) fn format_reg_static(reg: &str, modifier: Option) -> String { - // Normalize r-prefix registers (GCC AArch64 aliases: r0-r30 = x0-x30) - // before processing modifiers so %w0/%x0 work correctly on r-prefixed operands. - let normalized; - let reg = if reg.starts_with('r') { - if let Some(suffix) = reg.strip_prefix('r') { - if suffix.chars().all(|c| c.is_ascii_digit()) { - if let Ok(n) = suffix.parse::() { - if n <= 30 { - normalized = format!("x{}", n); - &normalized - } else { - reg - } - } else { - reg - } - } else { - reg - } - } else { - reg - } - } else { - reg - }; - // Extract the register number from any register form (x, w, d, s, q, v) - let reg_num = || -> Option<&str> { - if reg.starts_with('x') || reg.starts_with('w') || reg.starts_with('d') - || reg.starts_with('s') || reg.starts_with('q') || reg.starts_with('v') { - Some(®[1..]) - } else { - None - } - }; - // Check if this is a FP/SIMD register (d, s, q, v prefix) - let is_fp_reg = reg.starts_with('d') || reg.starts_with('s') - || reg.starts_with('q') || reg.starts_with('v'); - match modifier { - Some('w') => { - // Convert to w-register (32-bit GP) - if let Some(num) = reg_num() { - if reg.starts_with('x') || reg.starts_with('w') { - return format!("w{}", num); - } - } - reg.to_string() - } - Some('x') => { - // Convert to x-register (64-bit GP) - if let Some(num) = reg_num() { - if reg.starts_with('w') || reg.starts_with('x') { - return format!("x{}", 
num); - } - } - reg.to_string() - } - Some('d') => { - // Convert to d-register (64-bit FP/double) - if let Some(num) = reg_num() { - if is_fp_reg { - return format!("d{}", num); - } - } - reg.to_string() - } - Some('s') => { - // Convert to s-register (32-bit FP/float) - if let Some(num) = reg_num() { - if is_fp_reg { - return format!("s{}", num); - } - } - reg.to_string() - } - Some('q') => { - // Convert to q-register (128-bit SIMD) - if let Some(num) = reg_num() { - if is_fp_reg { - return format!("q{}", num); - } - } - reg.to_string() - } - Some('h') => { - // Convert to h-register (16-bit FP/half) - if let Some(num) = reg_num() { - if is_fp_reg { - return format!("h{}", num); - } - } - reg.to_string() - } - Some('b') => { - // Convert to b-register (8-bit) - if let Some(num) = reg_num() { - if is_fp_reg { - return format!("b{}", num); - } - } - reg.to_string() - } - Some('a') => { - // Memory address reference: wrap register in square brackets [reg] - // Used by prfm/prefetch instructions: "prfm pldl1keep, %a0" - // If the operand is already a memory reference (e.g., "[sp, #N]"), - // return it as-is. Otherwise, wrap the register in brackets. - if reg.starts_with('[') { - reg.to_string() - } else { - format!("[{}]", reg) - } - } - None => { - // No modifier: for FP/SIMD registers, emit as vN (GCC behavior). - // This ensures `%0.16b` produces `v16.16b` not `d16.16b`. - if is_fp_reg { - if let Some(num) = reg_num() { - return format!("v{}", num); - } - } - reg.to_string() - } - _ => reg.to_string(), - } - } - - /// Convert a FP/SIMD register name to its s-register counterpart (same register number). 
- /// e.g., "d16" -> "s16", "v16" -> "s16" - pub(super) fn fp_to_s_reg(reg: &str) -> String { - if let Some(rest) = reg.strip_prefix('d') - .or_else(|| reg.strip_prefix('v')) - .or_else(|| reg.strip_prefix('s')) - .or_else(|| reg.strip_prefix('q')) - { - format!("s{rest}") - } else { - reg.to_string() - } - } - - /// Convert a FP/SIMD register name to its d-register counterpart (same register number). - /// e.g., "v16" -> "d16", "s16" -> "d16" - pub(super) fn fp_to_d_reg(reg: &str) -> String { - if let Some(rest) = reg.strip_prefix('v') - .or_else(|| reg.strip_prefix('d')) - .or_else(|| reg.strip_prefix('s')) - .or_else(|| reg.strip_prefix('q')) - { - format!("d{rest}") - } else { - reg.to_string() - } - } - - /// Convert a FP/SIMD register name to its q-register counterpart (same register number). - /// e.g., "v16" -> "q16", "d16" -> "q16" - pub(super) fn fp_to_q_reg(reg: &str) -> String { - if let Some(rest) = reg.strip_prefix('v') - .or_else(|| reg.strip_prefix('d')) - .or_else(|| reg.strip_prefix('s')) - .or_else(|| reg.strip_prefix('q')) - { - format!("q{rest}") - } else { - reg.to_string() - } - } - - /// Get the exclusive load/store instructions and register prefix for a type, - /// with appropriate acquire/release semantics based on ordering. 
- /// - Relaxed: ldxr/stxr (no ordering) - /// - Acquire: ldaxr/stxr (acquire on load) - /// - Release: ldxr/stlxr (release on store) - /// - AcqRel/SeqCst: ldaxr/stlxr (acquire on load, release on store) - pub(super) fn exclusive_instrs(ty: IrType, ordering: AtomicOrdering) -> (&'static str, &'static str, &'static str) { - let need_acquire = matches!(ordering, AtomicOrdering::Acquire | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst); - let need_release = matches!(ordering, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst); - match ty { - IrType::I8 | IrType::U8 => ( - if need_acquire { "ldaxrb" } else { "ldxrb" }, - if need_release { "stlxrb" } else { "stxrb" }, - "w", - ), - IrType::I16 | IrType::U16 => ( - if need_acquire { "ldaxrh" } else { "ldxrh" }, - if need_release { "stlxrh" } else { "stxrh" }, - "w", - ), - IrType::I32 | IrType::U32 => ( - if need_acquire { "ldaxr" } else { "ldxr" }, - if need_release { "stlxr" } else { "stxr" }, - "w", - ), - _ => ( - if need_acquire { "ldaxr" } else { "ldxr" }, - if need_release { "stlxr" } else { "stxr" }, - "x", - ), - } - } - - /// Emit the arithmetic operation for an atomic RMW. 
- pub(super) fn emit_atomic_op_arm(state: &mut CodegenState, op: AtomicRmwOp, dest_reg: &str, old_reg: &str, val_reg: &str) { - match op { - AtomicRmwOp::Add => state.emit_fmt(format_args!(" add {}, {}, {}", dest_reg, old_reg, val_reg)), - AtomicRmwOp::Sub => state.emit_fmt(format_args!(" sub {}, {}, {}", dest_reg, old_reg, val_reg)), - AtomicRmwOp::And => state.emit_fmt(format_args!(" and {}, {}, {}", dest_reg, old_reg, val_reg)), - AtomicRmwOp::Or => state.emit_fmt(format_args!(" orr {}, {}, {}", dest_reg, old_reg, val_reg)), - AtomicRmwOp::Xor => state.emit_fmt(format_args!(" eor {}, {}, {}", dest_reg, old_reg, val_reg)), - AtomicRmwOp::Nand => { - state.emit_fmt(format_args!(" and {}, {}, {}", dest_reg, old_reg, val_reg)); - state.emit_fmt(format_args!(" mvn {}, {}", dest_reg, dest_reg)); - } - AtomicRmwOp::Xchg | AtomicRmwOp::TestAndSet => { - // Handled separately in emit_atomic_rmw - state.emit_fmt(format_args!(" mov {}, {}", dest_reg, val_reg)); - } - } - } -} diff --git a/src/backend/arm/codegen/intrinsics.rs b/src/backend/arm/codegen/intrinsics.rs deleted file mode 100644 index 046cd619e6..0000000000 --- a/src/backend/arm/codegen/intrinsics.rs +++ /dev/null @@ -1,297 +0,0 @@ -//! AArch64 NEON/SIMD intrinsic emission and F128 (quad-precision) soft-float helpers. -//! -//! NEON intrinsics: SSE-equivalent operations via 128-bit NEON instructions. -//! F128: IEEE 754 binary128 via compiler-rt/libgcc soft-float libcalls. 
- -use crate::ir::reexports::{IntrinsicOp, Operand, Value}; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_neon_binary_128(&mut self, dest_ptr: &Value, args: &[Operand], neon_inst: &str) { - // Load first 128-bit operand pointer into x0, then load q0 - self.operand_to_x0(&args[0]); - self.state.emit(" ldr q0, [x0]"); - // Load second 128-bit operand pointer into x1, then load q1 - match &args[1] { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - if self.state.is_alloca(v.0) { - self.emit_alloca_addr("x1", v.0, slot.0); - } else { - self.emit_load_from_sp("x1", slot.0, "ldr"); - } - } - } - Operand::Const(_) => { - self.operand_to_x0(&args[1]); - self.state.emit(" mov x1, x0"); - } - } - self.state.emit(" ldr q1, [x1]"); - // Apply the binary NEON operation - self.state.emit_fmt(format_args!(" {} v0.16b, v0.16b, v1.16b", neon_inst)); - // Store result to dest_ptr - self.load_ptr_to_reg(dest_ptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - - /// Store a scalar result from x0 (or w0) into the dest stack slot. - fn store_scalar_dest(&mut self, dest: &Option, reg: &str) { - if let Some(d) = dest { - if let Some(slot) = self.state.get_slot(d.0) { - self.emit_store_to_sp(reg, slot.0, "str"); - } - } - } - - /// Emit a unary F64 operation: fmov to d0, apply `op_inst`, fmov back, store result. - fn emit_f64_unary_neon(&mut self, dest: &Option, args: &[Operand], op_inst: &str) { - self.operand_to_x0(&args[0]); - self.state.emit(" fmov d0, x0"); - self.state.emit_fmt(format_args!(" {} d0, d0", op_inst)); - self.state.emit(" fmov x0, d0"); - self.store_scalar_dest(dest, "x0"); - } - - /// Emit a unary F32 operation: fmov to s0, apply `op_inst`, fmov back, store result. 
- fn emit_f32_unary_neon(&mut self, dest: &Option, args: &[Operand], op_inst: &str) { - self.operand_to_x0(&args[0]); - self.state.emit(" fmov s0, w0"); - self.state.emit_fmt(format_args!(" {} s0, s0", op_inst)); - self.state.emit(" fmov w0, s0"); - self.store_scalar_dest(dest, "w0"); - } - - /// Emit a non-temporal store: load value from args[0], store to dest_ptr. - fn emit_nontemporal_store(&mut self, dest_ptr: &Option, args: &[Operand], save_reg: &str, val_reg: &str) { - if let Some(ptr) = dest_ptr { - self.operand_to_x0(&args[0]); - self.state.emit_fmt(format_args!(" mov {}, {}", save_reg, val_reg)); - self.load_ptr_to_reg(ptr, "x0"); - self.state.emit_fmt(format_args!(" str {}, [x0]", save_reg)); - } - } - - pub(super) fn emit_intrinsic_arm(&mut self, dest: &Option, op: &IntrinsicOp, dest_ptr: &Option, args: &[Operand]) { - match op { - IntrinsicOp::Lfence | IntrinsicOp::Mfence => { - self.state.emit(" dmb ish"); - } - IntrinsicOp::Sfence => { - self.state.emit(" dmb ishst"); - } - IntrinsicOp::Pause => { - self.state.emit(" yield"); - } - IntrinsicOp::Clflush => { - // ARM has no direct clflush; use dc civac (clean+invalidate to PoC) - self.operand_to_x0(&args[0]); - self.state.emit(" dc civac, x0"); - } - IntrinsicOp::Movnti => { - self.emit_nontemporal_store(dest_ptr, args, "w9", "w0"); - } - IntrinsicOp::Movnti64 => { - self.emit_nontemporal_store(dest_ptr, args, "x9", "x0"); - } - IntrinsicOp::Movntdq | IntrinsicOp::Movntpd => { - // Non-temporal 128-bit store: dest_ptr = target, args[0] = source ptr - if let Some(ptr) = dest_ptr { - self.operand_to_x0(&args[0]); - self.state.emit(" ldr q0, [x0]"); - self.load_ptr_to_reg(ptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - } - IntrinsicOp::Loaddqu => { - // Load 128-bit unaligned: args[0] = source ptr, dest_ptr = result storage - if let Some(dptr) = dest_ptr { - self.operand_to_x0(&args[0]); - self.state.emit(" ldr q0, [x0]"); - self.load_ptr_to_reg(dptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - 
} - IntrinsicOp::Storedqu => { - // Store 128-bit unaligned: dest_ptr = target ptr, args[0] = source data ptr - if let Some(ptr) = dest_ptr { - self.operand_to_x0(&args[0]); - self.state.emit(" ldr q0, [x0]"); - self.load_ptr_to_reg(ptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - } - IntrinsicOp::Pcmpeqb128 => { - if let Some(dptr) = dest_ptr { - self.emit_neon_binary_128(dptr, args, "cmeq"); - } - } - IntrinsicOp::Pcmpeqd128 => { - if let Some(dptr) = dest_ptr { - // For 32-bit lane equality, load q regs, use cmeq with .4s arrangement - self.operand_to_x0(&args[0]); - self.state.emit(" ldr q0, [x0]"); - if let Operand::Value(v) = &args[1] { - self.load_ptr_to_reg(v, "x1"); - } else { - self.operand_to_x0(&args[1]); - self.state.emit(" mov x1, x0"); - } - self.state.emit(" ldr q1, [x1]"); - self.state.emit(" cmeq v0.4s, v0.4s, v1.4s"); - self.load_ptr_to_reg(dptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - } - IntrinsicOp::Psubusb128 => { - if let Some(dptr) = dest_ptr { - self.emit_neon_binary_128(dptr, args, "uqsub"); - } - } - IntrinsicOp::Psubsb128 => { - if let Some(dptr) = dest_ptr { - self.emit_neon_binary_128(dptr, args, "sqsub"); - } - } - IntrinsicOp::Por128 => { - if let Some(dptr) = dest_ptr { - self.emit_neon_binary_128(dptr, args, "orr"); - } - } - IntrinsicOp::Pand128 => { - if let Some(dptr) = dest_ptr { - self.emit_neon_binary_128(dptr, args, "and"); - } - } - IntrinsicOp::Pxor128 => { - if let Some(dptr) = dest_ptr { - self.emit_neon_binary_128(dptr, args, "eor"); - } - } - IntrinsicOp::Pmovmskb128 => { - // Extract the high bit of each byte in a 128-bit vector into a 16-bit mask. - // NEON has no pmovmskb equivalent, so we use a multi-step sequence: - // 1. Load 128-bit data into v0 - // 2. Shift right each byte by 7 to isolate the sign bit - // 3. 
Multiply by power-of-2 bit positions, then add across lanes - self.operand_to_x0(&args[0]); - self.state.emit(" ldr q0, [x0]"); - self.state.emit(" ushr v0.16b, v0.16b, #7"); - // Load bit position constants: [1,2,4,8,16,32,64,128] repeated - self.state.emit(" movz x0, #0x0201"); - self.state.emit(" movk x0, #0x0804, lsl #16"); - self.state.emit(" movk x0, #0x2010, lsl #32"); - self.state.emit(" movk x0, #0x8040, lsl #48"); - self.state.emit(" fmov d1, x0"); - self.state.emit(" mov v1.d[1], x0"); - self.state.emit(" mul v0.16b, v0.16b, v1.16b"); - // Split and sum each half - self.state.emit(" ext v1.16b, v0.16b, v0.16b, #8"); - self.state.emit(" addv b0, v0.8b"); - self.state.emit(" umov w0, v0.b[0]"); - self.state.emit(" addv b1, v1.8b"); - self.state.emit(" umov w1, v1.b[0]"); - self.state.emit(" orr w0, w0, w1, lsl #8"); - self.store_scalar_dest(dest, "x0"); - } - IntrinsicOp::SetEpi8 => { - if let Some(dptr) = dest_ptr { - self.operand_to_x0(&args[0]); - self.state.emit(" dup v0.16b, w0"); - self.load_ptr_to_reg(dptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - } - IntrinsicOp::SetEpi32 => { - if let Some(dptr) = dest_ptr { - self.operand_to_x0(&args[0]); - self.state.emit(" dup v0.4s, w0"); - self.load_ptr_to_reg(dptr, "x0"); - self.state.emit(" str q0, [x0]"); - } - } - IntrinsicOp::Crc32_8 | IntrinsicOp::Crc32_16 - | IntrinsicOp::Crc32_32 | IntrinsicOp::Crc32_64 => { - let is_64 = matches!(op, IntrinsicOp::Crc32_64); - let (save_reg, crc_inst) = match op { - IntrinsicOp::Crc32_8 => ("w9", "crc32cb w9, w9, w0"), - IntrinsicOp::Crc32_16 => ("w9", "crc32ch w9, w9, w0"), - IntrinsicOp::Crc32_32 => ("w9", "crc32cw w9, w9, w0"), - IntrinsicOp::Crc32_64 => ("x9", "crc32cx w9, w9, x0"), - _ => unreachable!(), - }; - self.operand_to_x0(&args[0]); - self.state.emit_fmt(format_args!(" mov {}, {}", save_reg, if is_64 { "x0" } else { "w0" })); - self.operand_to_x0(&args[1]); - self.state.emit_fmt(format_args!(" {}", crc_inst)); - self.state.emit(" mov x0, x9"); - 
self.store_scalar_dest(dest, "x0"); - } - IntrinsicOp::FrameAddress => { - self.state.emit(" mov x0, x29"); - self.store_scalar_dest(dest, "x0"); - } - IntrinsicOp::ReturnAddress => { - // x30 (lr) is clobbered by bl instructions, so read from stack - self.state.emit(" ldr x0, [x29, #8]"); - self.store_scalar_dest(dest, "x0"); - } - IntrinsicOp::ThreadPointer => { - // __builtin_thread_pointer(): read TLS base from tpidr_el0 - self.state.emit(" mrs x0, tpidr_el0"); - self.store_scalar_dest(dest, "x0"); - } - IntrinsicOp::SqrtF64 => self.emit_f64_unary_neon(dest, args, "fsqrt"), - IntrinsicOp::SqrtF32 => self.emit_f32_unary_neon(dest, args, "fsqrt"), - IntrinsicOp::FabsF64 => self.emit_f64_unary_neon(dest, args, "fabs"), - IntrinsicOp::FabsF32 => self.emit_f32_unary_neon(dest, args, "fabs"), - // x86-specific SSE/AES-NI/CLMUL intrinsics - these are x86-only and should - // not appear in ARM codegen in practice. Cross-compiled code that conditionally - // uses these behind #ifdef __x86_64__ will have the calls dead-code eliminated. 
- // TODO: consider emitting a runtime trap instead of silent zeros - IntrinsicOp::Aesenc128 | IntrinsicOp::Aesenclast128 - | IntrinsicOp::Aesdec128 | IntrinsicOp::Aesdeclast128 - | IntrinsicOp::Aesimc128 | IntrinsicOp::Aeskeygenassist128 - | IntrinsicOp::Pclmulqdq128 - | IntrinsicOp::Pslldqi128 | IntrinsicOp::Psrldqi128 - | IntrinsicOp::Psllqi128 | IntrinsicOp::Psrlqi128 - | IntrinsicOp::Pshufd128 | IntrinsicOp::Loadldi128 - | IntrinsicOp::Paddw128 | IntrinsicOp::Psubw128 - | IntrinsicOp::Pmulhw128 | IntrinsicOp::Pmaddwd128 - | IntrinsicOp::Pcmpgtw128 | IntrinsicOp::Pcmpgtb128 - | IntrinsicOp::Psllwi128 | IntrinsicOp::Psrlwi128 - | IntrinsicOp::Psrawi128 | IntrinsicOp::Psradi128 - | IntrinsicOp::Pslldi128 | IntrinsicOp::Psrldi128 - | IntrinsicOp::Paddd128 | IntrinsicOp::Psubd128 - | IntrinsicOp::Packssdw128 | IntrinsicOp::Packsswb128 | IntrinsicOp::Packuswb128 - | IntrinsicOp::Punpcklbw128 | IntrinsicOp::Punpckhbw128 - | IntrinsicOp::Punpcklwd128 | IntrinsicOp::Punpckhwd128 - | IntrinsicOp::SetEpi16 | IntrinsicOp::Pinsrw128 - | IntrinsicOp::Pextrw128 | IntrinsicOp::Storeldi128 - | IntrinsicOp::Cvtsi128Si32 | IntrinsicOp::Cvtsi32Si128 - | IntrinsicOp::Cvtsi128Si64 - | IntrinsicOp::Pshuflw128 | IntrinsicOp::Pshufhw128 - | IntrinsicOp::Pinsrd128 | IntrinsicOp::Pextrd128 - | IntrinsicOp::Pinsrb128 | IntrinsicOp::Pextrb128 - | IntrinsicOp::Pinsrq128 | IntrinsicOp::Pextrq128 => { - // x86-only: zero dest if present - if let Some(dptr) = dest_ptr { - if let Some(slot) = self.state.get_slot(dptr.0) { - self.state.emit_fmt(format_args!(" add x9, sp, #{}", slot.0)); - self.state.emit(" stp xzr, xzr, [x9]"); - } - } - } - } - } - - // ---- F128 (long double / IEEE quad precision) soft-float helpers ---- - // - // On AArch64, long double is IEEE 754 binary128 (16 bytes). 
- // Hardware has no quad-precision FP ops, so we use compiler-rt/libgcc soft-float: - // Comparison: __eqtf2, __lttf2, __letf2, __gttf2, __getf2 - // Arithmetic: __addtf3, __subtf3, __multf3, __divtf3 - // Conversion: __extenddftf2 (f64->f128), __trunctfdf2 (f128->f64) - // ABI: f128 passed/returned in Q registers (q0, q1). Int result in w0/x0. - -} diff --git a/src/backend/arm/codegen/memory.rs b/src/backend/arm/codegen/memory.rs deleted file mode 100644 index f2762a81a6..0000000000 --- a/src/backend/arm/codegen/memory.rs +++ /dev/null @@ -1,252 +0,0 @@ -//! ArmCodegen: memory operations (load, store, memcpy, GEP, stack). - -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::backend::state::{StackSlot, SlotAddr}; -use super::emit::{ArmCodegen, callee_saved_name}; - -impl ArmCodegen { - // ---- Store/Load overrides ---- - - pub(super) fn emit_store_impl(&mut self, val: &Operand, ptr: &Value, ty: IrType) { - if ty == IrType::F128 { - crate::backend::f128_softfloat::f128_emit_store(self, val, ptr); - return; - } - crate::backend::traits::emit_store_default(self, val, ptr, ty); - } - - pub(super) fn emit_load_impl(&mut self, dest: &Value, ptr: &Value, ty: IrType) { - if ty == IrType::F128 { - crate::backend::f128_softfloat::f128_emit_load(self, dest, ptr); - return; - } - crate::backend::traits::emit_load_default(self, dest, ptr, ty); - } - - pub(super) fn emit_store_with_const_offset_impl(&mut self, val: &Operand, base: &Value, offset: i64, ty: IrType) { - if ty == IrType::F128 { - crate::backend::f128_softfloat::f128_emit_store_with_offset(self, val, base, offset); - return; - } - self.operand_to_x0(val); - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - let store_instr = self.store_instr_for_type_impl(ty); - match addr { - SlotAddr::OverAligned(slot, id) => { - self.state.emit(" mov x1, x0"); - self.emit_alloca_aligned_addr_impl(slot, id); - self.emit_add_offset_to_addr_reg_impl(offset); - let 
reg = Self::reg_for_type("x1", ty); - self.state.emit_fmt(format_args!(" {} {}, [x9]", store_instr, reg)); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - let reg = Self::reg_for_type("x0", ty); - self.emit_store_to_sp(reg, folded_slot.0, store_instr); - } - SlotAddr::Indirect(slot) => { - self.state.emit(" mov x1, x0"); - self.emit_load_ptr_from_slot_impl(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg_impl(offset); - } - let reg = Self::reg_for_type("x1", ty); - self.state.emit_fmt(format_args!(" {} {}, [x9]", store_instr, reg)); - } - } - } - } - - pub(super) fn emit_load_with_const_offset_impl(&mut self, dest: &Value, base: &Value, offset: i64, ty: IrType) { - if ty == IrType::F128 { - crate::backend::f128_softfloat::f128_emit_load_with_offset(self, dest, base, offset); - return; - } - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - let load_instr = self.load_instr_for_type_impl(ty); - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr_impl(slot, id); - self.emit_add_offset_to_addr_reg_impl(offset); - let (actual_instr, dest_reg) = Self::arm_parse_load(load_instr); - self.state.emit_fmt(format_args!(" {} {}, [x9]", actual_instr, dest_reg)); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - let (actual_instr, dest_reg) = Self::arm_parse_load(load_instr); - self.emit_load_from_sp(dest_reg, folded_slot.0, actual_instr); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot_impl(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg_impl(offset); - } - let (actual_instr, dest_reg) = Self::arm_parse_load(load_instr); - self.state.emit_fmt(format_args!(" {} {}, [x9]", actual_instr, dest_reg)); - } - } - self.store_x0_to(dest); - } - } - - pub(super) fn emit_typed_store_to_slot_impl(&mut self, instr: &'static str, ty: IrType, slot: StackSlot) { - let reg = Self::reg_for_type("x0", ty); - 
self.emit_store_to_sp(reg, slot.0, instr); - } - - pub(super) fn emit_typed_load_from_slot_impl(&mut self, instr: &'static str, slot: StackSlot) { - let (actual_instr, dest_reg) = Self::arm_parse_load(instr); - self.emit_load_from_sp(dest_reg, slot.0, actual_instr); - } - - pub(super) fn emit_load_ptr_from_slot_impl(&mut self, slot: StackSlot, val_id: u32) { - if let Some(®) = self.reg_assignments.get(&val_id) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x9, {}", reg_name)); - } else { - self.emit_load_from_sp("x9", slot.0, "ldr"); - } - } - - pub(super) fn emit_typed_store_indirect_impl(&mut self, instr: &'static str, ty: IrType) { - let reg = Self::reg_for_type("x1", ty); - self.state.emit_fmt(format_args!(" {} {}, [x9]", instr, reg)); - } - - pub(super) fn emit_typed_load_indirect_impl(&mut self, instr: &'static str) { - let (actual_instr, dest_reg) = Self::arm_parse_load(instr); - self.state.emit_fmt(format_args!(" {} {}, [x9]", actual_instr, dest_reg)); - } - - pub(super) fn emit_add_offset_to_addr_reg_impl(&mut self, offset: i64) { - if (0..=4095).contains(&offset) { - self.state.emit_fmt(format_args!(" add x9, x9, #{}", offset)); - } else if offset < 0 && (-offset) <= 4095 { - self.state.emit_fmt(format_args!(" sub x9, x9, #{}", -offset)); - } else { - self.load_large_imm("x17", offset); - self.state.emit(" add x9, x9, x17"); - } - } - - pub(super) fn emit_slot_addr_to_secondary_impl(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) { - if is_alloca { - self.emit_alloca_addr("x1", val_id, slot.0); - } else if let Some(®) = self.reg_assignments.get(&val_id) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x1, {}", reg_name)); - } else { - self.emit_load_from_sp("x1", slot.0, "ldr"); - } - } - - pub(super) fn emit_gep_direct_const_impl(&mut self, slot: StackSlot, offset: i64) { - let folded = slot.0 + offset; - self.emit_add_sp_offset("x0", folded); - } - - pub(super) fn 
emit_gep_indirect_const_impl(&mut self, slot: StackSlot, offset: i64, val_id: u32) { - if let Some(®) = self.reg_assignments.get(&val_id) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else { - self.emit_load_from_sp("x0", slot.0, "ldr"); - } - if offset != 0 { - self.emit_add_imm_to_acc_impl(offset); - } - } - - pub(super) fn emit_add_imm_to_acc_impl(&mut self, imm: i64) { - if (0..=4095).contains(&imm) { - self.state.emit_fmt(format_args!(" add x0, x0, #{}", imm)); - } else if imm < 0 && (-imm) <= 4095 { - self.state.emit_fmt(format_args!(" sub x0, x0, #{}", -imm)); - } else { - self.emit_load_imm64("x1", imm); - self.state.emit(" add x0, x0, x1"); - } - } - - pub(super) fn emit_round_up_acc_to_16_impl(&mut self) { - self.state.emit(" add x0, x0, #15"); - self.state.emit(" and x0, x0, #-16"); - } - - pub(super) fn emit_sub_sp_by_acc_impl(&mut self) { - self.state.emit(" sub sp, sp, x0"); - } - - pub(super) fn emit_mov_sp_to_acc_impl(&mut self) { - self.state.emit(" mov x0, sp"); - } - - pub(super) fn emit_mov_acc_to_sp_impl(&mut self) { - self.state.emit(" mov sp, x0"); - } - - pub(super) fn emit_align_acc_impl(&mut self, align: usize) { - self.state.emit_fmt(format_args!(" add x0, x0, #{}", align - 1)); - self.state.emit_fmt(format_args!(" and x0, x0, #{}", -(align as i64))); - } - - pub(super) fn emit_memcpy_load_dest_addr_impl(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) { - if is_alloca { - self.emit_alloca_addr("x9", val_id, slot.0); - } else if let Some(®) = self.reg_assignments.get(&val_id) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x9, {}", reg_name)); - } else { - self.emit_load_from_sp("x9", slot.0, "ldr"); - } - } - - pub(super) fn emit_memcpy_load_src_addr_impl(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) { - if is_alloca { - self.emit_alloca_addr("x10", val_id, slot.0); - } else if let Some(®) = 
self.reg_assignments.get(&val_id) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x10, {}", reg_name)); - } else { - self.emit_load_from_sp("x10", slot.0, "ldr"); - } - } - - pub(super) fn emit_alloca_aligned_addr_impl(&mut self, slot: StackSlot, val_id: u32) { - let align = self.state.alloca_over_align(val_id) - .expect("alloca must have over-alignment for aligned addr emission"); - self.emit_add_sp_offset("x9", slot.0); - self.load_large_imm("x17", (align - 1) as i64); - self.state.emit(" add x9, x9, x17"); - self.load_large_imm("x17", -(align as i64)); - self.state.emit(" and x9, x9, x17"); - } - - pub(super) fn emit_alloca_aligned_addr_to_acc_impl(&mut self, slot: StackSlot, val_id: u32) { - let align = self.state.alloca_over_align(val_id) - .expect("alloca must have over-alignment for aligned addr emission"); - self.emit_add_sp_offset("x0", slot.0); - self.load_large_imm("x17", (align - 1) as i64); - self.state.emit(" add x0, x0, x17"); - self.load_large_imm("x17", -(align as i64)); - self.state.emit(" and x0, x0, x17"); - self.state.reg_cache.invalidate_acc(); - } - - pub(super) fn emit_memcpy_impl_impl(&mut self, size: usize) { - let label_id = self.state.next_label_id(); - let loop_label = format!(".Lmemcpy_loop_{}", label_id); - let done_label = format!(".Lmemcpy_done_{}", label_id); - self.load_large_imm("x11", size as i64); - self.state.emit_fmt(format_args!("{}:", loop_label)); - self.state.emit_fmt(format_args!(" cbz x11, {}", done_label)); - self.state.emit(" ldrb w12, [x10], #1"); - self.state.emit(" strb w12, [x9], #1"); - self.state.emit(" sub x11, x11, #1"); - self.state.emit_fmt(format_args!(" b {}", loop_label)); - self.state.emit_fmt(format_args!("{}:", done_label)); - } -} diff --git a/src/backend/arm/codegen/mod.rs b/src/backend/arm/codegen/mod.rs deleted file mode 100644 index be711940eb..0000000000 --- a/src/backend/arm/codegen/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -pub(crate) mod emit; -pub(crate) mod 
peephole; -mod asm_emitter; -mod f128; -mod inline_asm; -mod intrinsics; -mod prologue; -mod memory; -mod alu; -mod comparison; -mod calls; -mod globals; -mod cast_ops; -mod variadic; -mod returns; -mod atomics; -mod i128_ops; -mod float_ops; diff --git a/src/backend/arm/codegen/peephole.rs b/src/backend/arm/codegen/peephole.rs deleted file mode 100644 index d0fab5e431..0000000000 --- a/src/backend/arm/codegen/peephole.rs +++ /dev/null @@ -1,1223 +0,0 @@ -//! AArch64 peephole optimizer for assembly text. -//! -//! Operates on generated assembly text to eliminate redundant patterns from the -//! stack-based codegen. Lines are pre-parsed into `LineKind` enums so hot-path -//! pattern matching uses integer/enum comparisons instead of string parsing. -//! -//! ## Pass structure -//! -//! **Local passes** (iterative, up to 8 rounds): store/load elimination, -//! redundant branch removal, self-move elimination, move chain optimization, -//! branch-over-branch fusion, and move-immediate chain optimization. -//! -//! ## Optimizations -//! -//! 1. **Adjacent store/load elimination**: `str xN, [sp, #off]` followed by -//! `ldr xN, [sp, #off]` — the load is redundant since the value is -//! already in the register. -//! -//! 2. **Redundant branch elimination**: `b .LBBN` where `.LBBN:` is the -//! immediately next non-empty line — falls through naturally. -//! -//! 3. **Self-move elimination**: `mov xN, xN` (64-bit) is a no-op. -//! Note: `mov wN, wN` (32-bit) zeros upper 32 bits and is NOT eliminated. -//! -//! 4. **Move chain optimization**: `mov A, B; mov C, A` → `mov C, B`, -//! enabling the first mov to become dead if A is unused. -//! -//! 5. **Branch-over-branch fusion**: `b.cc .Lskip; b .target; .Lskip:` -//! → `b.!cc .target` (invert condition, eliminate skip label). -//! -//! 6. **Move-immediate chain**: `mov xN, #imm; mov xM, xN` where xN is a -//! scratch register (x0-x15) → `mov xM, #imm` when safe. 
- -// ── Line classification types ──────────────────────────────────────────────── - -/// Compact classification of an assembly line. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -enum LineKind { - /// Deleted / blank - Nop, - /// `str xN/wN, [sp, #off]` — store to stack (via sp) - StoreSp { reg: u8, offset: i32, is_word: bool }, - /// `ldr xN/wN, [sp, #off]` — load from stack (via sp) - LoadSp { reg: u8, offset: i32, is_word: bool }, - /// `ldrsw xN, [sp, #off]` — load signed word from stack - LoadswSp { reg: u8, offset: i32 }, - /// `ldrsb xN, [xM]` — load signed byte (general) - LoadsbReg, - /// `stp xN, xM, [sp, #off]` — store pair to stack - StorePairSp, - /// `ldp xN, xM, [sp, #off]` — load pair from stack - LoadPairSp, - /// `mov xN, xM` — register-to-register move. - /// `is_32bit` indicates whether this is a w-register (32-bit) move. - /// On AArch64, `mov wN, wM` zeros the upper 32 bits of the destination, - /// so it is NOT equivalent to `mov xN, xM`. - Move { dst: u8, src: u8, is_32bit: bool }, - /// `mov xN, #imm` — move immediate to register - MoveImm { dst: u8 }, - /// `movz/movn xN, #imm` — move wide immediate - MoveWide { dst: u8 }, - /// `sxtw xN, wM` — sign-extend word to doubleword - Sxtw { dst: u8, src: u8 }, - /// `b .label` — unconditional branch - Branch, - /// `b.cc .label` — conditional branch - CondBranch, - /// `cbz/cbnz xN, .label` — compare and branch on (non-)zero - CmpBranch, - /// Label (`.LBBx:` etc.) - Label, - /// `ret` - Ret, - /// `bl func` — branch with link (function call) - Call, - /// `cmp` or `cmn` instruction - Compare, - /// `add`, `sub`, or other ALU instruction - Alu, - /// `str`/`ldr` to non-sp addresses (e.g., `str w1, [x9]`) - MemOther, - /// Assembler directive (`.section`, `.globl`, etc.) - Directive, - /// Any other instruction - Other, -} - -/// AArch64 register IDs for pattern matching. -/// We map x0-x30, sp, and w0-w30 to the same set (register number). 
-const REG_NONE: u8 = 255; - -/// Parse an AArch64 register name to an internal ID (0-30, or special). -/// x0/w0 → 0, x1/w1 → 1, ..., x30/w30 → 30, sp → 31, xzr/wzr → 32. -fn parse_reg(name: &str) -> u8 { - let name = name.trim(); - if let Some(n) = name.strip_prefix('x').or_else(|| name.strip_prefix('w')) { - if let Ok(num) = n.parse::() { - if num <= 30 { - return num; - } - } - if n == "zr" { - return 32; // zero register - } - } - if name == "sp" { - return 31; - } - REG_NONE -} - -/// Return the x-register name for a given ID. -fn xreg_name(id: u8) -> &'static str { - match id { - 0 => "x0", 1 => "x1", 2 => "x2", 3 => "x3", - 4 => "x4", 5 => "x5", 6 => "x6", 7 => "x7", - 8 => "x8", 9 => "x9", 10 => "x10", 11 => "x11", - 12 => "x12", 13 => "x13", 14 => "x14", 15 => "x15", - 16 => "x16", 17 => "x17", 18 => "x18", 19 => "x19", - 20 => "x20", 21 => "x21", 22 => "x22", 23 => "x23", - 24 => "x24", 25 => "x25", 26 => "x26", 27 => "x27", - 28 => "x28", 29 => "x29", 30 => "x30", - 31 => "sp", 32 => "xzr", - _ => "??", - } -} - -/// Return the w-register name for a given ID. -fn wreg_name(id: u8) -> &'static str { - match id { - 0 => "w0", 1 => "w1", 2 => "w2", 3 => "w3", - 4 => "w4", 5 => "w5", 6 => "w6", 7 => "w7", - 8 => "w8", 9 => "w9", 10 => "w10", 11 => "w11", - 12 => "w12", 13 => "w13", 14 => "w14", 15 => "w15", - 16 => "w16", 17 => "w17", 18 => "w18", 19 => "w19", - 20 => "w20", 21 => "w21", 22 => "w22", 23 => "w23", - 24 => "w24", 25 => "w25", 26 => "w26", 27 => "w27", - 28 => "w28", 29 => "w29", 30 => "w30", - 32 => "wzr", - _ => "??", - } -} - -// ── Line classification ────────────────────────────────────────────────────── - -/// Classify a single assembly line into a LineKind. -fn classify_line(line: &str) -> LineKind { - let trimmed = line.trim(); - if trimmed.is_empty() { - return LineKind::Nop; - } - - // Labels end with ':' - if trimmed.ends_with(':') { - return LineKind::Label; - } - - // Directives start with '.' 
- if trimmed.starts_with('.') { - return LineKind::Directive; - } - - // str xN/wN, [sp, #off] - if let Some(rest) = trimmed.strip_prefix("str ") { - if let Some(info) = parse_sp_mem_op(rest) { - return LineKind::StoreSp { reg: info.0, offset: info.1, is_word: info.2 }; - } - return LineKind::MemOther; - } - - // strb wN, [xM] — byte store - if trimmed.starts_with("strb ") || trimmed.starts_with("strh ") { - return LineKind::MemOther; - } - - // ldr xN/wN, [sp, #off] - if let Some(rest) = trimmed.strip_prefix("ldr ") { - if let Some(info) = parse_sp_mem_op(rest) { - return LineKind::LoadSp { reg: info.0, offset: info.1, is_word: info.2 }; - } - return LineKind::MemOther; - } - - // ldrsw xN, [sp, #off] - if let Some(rest) = trimmed.strip_prefix("ldrsw ") { - if let Some((reg_str, addr)) = rest.split_once(", ") { - let reg = parse_reg(reg_str.trim()); - if reg != REG_NONE { - if let Some(offset) = parse_sp_offset(addr.trim()) { - return LineKind::LoadswSp { reg, offset }; - } - } - } - return LineKind::MemOther; - } - - // ldrsb - if trimmed.starts_with("ldrsb ") || trimmed.starts_with("ldrsh ") || - trimmed.starts_with("ldrb ") || trimmed.starts_with("ldrh ") { - return LineKind::LoadsbReg; - } - - // stp (store pair) - if trimmed.starts_with("stp ") { - if trimmed.contains("[sp") { - return LineKind::StorePairSp; - } - return LineKind::MemOther; - } - - // ldp (load pair) - if trimmed.starts_with("ldp ") { - if trimmed.contains("[sp") { - return LineKind::LoadPairSp; - } - return LineKind::MemOther; - } - - // mov xN, xM or mov xN, #imm or mov xN, :lo12:sym etc. 
- if let Some(rest) = trimmed.strip_prefix("mov ") { - // Avoid matching movz, movn, movk - if !trimmed.starts_with("movz") && !trimmed.starts_with("movn") && !trimmed.starts_with("movk") { - if let Some((dst_str, src_str)) = rest.split_once(", ") { - let dst_trimmed = dst_str.trim(); - let dst = parse_reg(dst_trimmed); - if dst != REG_NONE { - let src_trimmed = src_str.trim(); - if src_trimmed.starts_with('#') || src_trimmed.starts_with('-') { - return LineKind::MoveImm { dst }; - } - let src = parse_reg(src_trimmed); - if src != REG_NONE { - let is_32bit = dst_trimmed.starts_with('w'); - return LineKind::Move { dst, src, is_32bit }; - } - // mov xN, :lo12:symbol etc. - return LineKind::MoveImm { dst }; - } - } - } - } - - // movz / movn / movk - if trimmed.starts_with("movz ") || trimmed.starts_with("movn ") { - if let Some((_, rest)) = trimmed.split_once(' ') { - if let Some((dst_str, _)) = rest.split_once(", ") { - let dst = parse_reg(dst_str.trim()); - if dst != REG_NONE { - return LineKind::MoveWide { dst }; - } - } - } - return LineKind::Other; - } - - // movk is an update, not a fresh definition - if trimmed.starts_with("movk ") { - return LineKind::Other; - } - - // sxtw xN, wM - if let Some(rest) = trimmed.strip_prefix("sxtw ") { - if let Some((dst_str, src_str)) = rest.split_once(", ") { - let dst = parse_reg(dst_str.trim()); - let src = parse_reg(src_str.trim()); - if dst != REG_NONE && src != REG_NONE { - return LineKind::Sxtw { dst, src }; - } - } - } - - // Unconditional branch: b .label (but not bl, b.cc) - if trimmed.starts_with("b ") && !trimmed.starts_with("bl ") && !trimmed.starts_with("b.") { - return LineKind::Branch; - } - - // Conditional branch: b.eq, b.ne, b.lt, b.ge, b.gt, b.le, b.hi, b.ls, b.cs, b.cc, etc. 
- if trimmed.starts_with("b.") { - return LineKind::CondBranch; - } - - // cbz/cbnz/tbz/tbnz - if trimmed.starts_with("cbz ") || trimmed.starts_with("cbnz ") || - trimmed.starts_with("tbz ") || trimmed.starts_with("tbnz ") { - return LineKind::CmpBranch; - } - - // ret - if trimmed == "ret" { - return LineKind::Ret; - } - - // bl (branch and link = call) - if trimmed.starts_with("bl ") || trimmed.starts_with("blr ") { - return LineKind::Call; - } - - // br xN (indirect branch = control flow barrier) - if trimmed.starts_with("br ") { - return LineKind::Branch; - } - - // cmp/cmn - if trimmed.starts_with("cmp ") || trimmed.starts_with("cmn ") { - return LineKind::Compare; - } - - // ALU: add, sub, and, orr, eor, lsl, lsr, asr, mul, etc. - if trimmed.starts_with("add ") || trimmed.starts_with("sub ") || - trimmed.starts_with("and ") || trimmed.starts_with("orr ") || - trimmed.starts_with("eor ") || trimmed.starts_with("mul ") || - trimmed.starts_with("neg ") || trimmed.starts_with("mvn ") || - trimmed.starts_with("lsl ") || trimmed.starts_with("lsr ") || - trimmed.starts_with("asr ") || trimmed.starts_with("madd ") || - trimmed.starts_with("msub ") || trimmed.starts_with("sdiv ") || - trimmed.starts_with("udiv ") || trimmed.starts_with("adds ") || - trimmed.starts_with("subs ") { - return LineKind::Alu; - } - - LineKind::Other -} - -/// Parse `xN/wN, [sp, #off]` and return (reg_id, offset, is_word). -fn parse_sp_mem_op(rest: &str) -> Option<(u8, i32, bool)> { - let (reg_str, addr) = rest.split_once(", ")?; - let reg_str = reg_str.trim(); - let is_word = reg_str.starts_with('w'); - let reg = parse_reg(reg_str); - if reg == REG_NONE { - return None; - } - let offset = parse_sp_offset(addr.trim())?; - Some((reg, offset, is_word)) -} - -/// Parse `[sp, #off]` or `[sp]` and return the offset. 
-fn parse_sp_offset(addr: &str) -> Option { - // [sp] — zero offset - if addr == "[sp]" { - return Some(0); - } - // [sp, #N] or [sp, #-N] - if addr.starts_with("[sp, #") && addr.ends_with(']') { - let inner = &addr[6..addr.len() - 1]; // strip "[sp, #" and "]" - return inner.parse::().ok(); - } - // [sp, #N]! (pre-index) — not a simple stack slot access - None -} - -/// Extract branch target from `b .label` or `b label`. -fn branch_target(line: &str) -> Option<&str> { - let trimmed = line.trim(); - // Match "b .label" but not "br xN" (indirect branch) - if let Some(rest) = trimmed.strip_prefix("b ") { - let target = rest.trim(); - // Must start with '.' (a label), not a register - if target.starts_with('.') { - return Some(target); - } - } - None -} - -/// Extract the condition code and target from a conditional branch. -/// `b.eq .label` → Some(("eq", ".label")) -fn cond_branch_parts(line: &str) -> Option<(&str, &str)> { - let trimmed = line.trim(); - if let Some(rest) = trimmed.strip_prefix("b.") { - if let Some((cc, target)) = rest.split_once(' ') { - return Some((cc, target.trim())); - } - } - None -} - -/// Invert a condition code. -fn invert_condition(cc: &str) -> Option<&'static str> { - match cc { - "eq" => Some("ne"), - "ne" => Some("eq"), - "lt" => Some("ge"), - "ge" => Some("lt"), - "gt" => Some("le"), - "le" => Some("gt"), - "hi" => Some("ls"), - "ls" => Some("hi"), - "hs" | "cs" => Some("lo"), - "lo" | "cc" => Some("hs"), - "mi" => Some("pl"), - "pl" => Some("mi"), - "vs" => Some("vc"), - "vc" => Some("vs"), - _ => None, - } -} - -/// Extract label name from a label line (strip trailing `:`) -fn label_name(line: &str) -> Option<&str> { - let trimmed = line.trim(); - trimmed.strip_suffix(':') -} - - -// ── Main entry point ───────────────────────────────────────────────────────── - -/// Run peephole optimization on AArch64 assembly text. -/// Returns the optimized assembly string. 
-pub fn peephole_optimize(asm: String) -> String { - let mut lines: Vec = asm.lines().map(String::from).collect(); - let mut kinds: Vec = lines.iter().map(|l| classify_line(l)).collect(); - let n = lines.len(); - - if n == 0 { - return asm; - } - - // Phase 1: Iterative local passes (up to 8 rounds) - let mut changed = true; - let mut rounds = 0; - while changed && rounds < 8 { - changed = false; - changed |= eliminate_adjacent_store_load(&mut lines, &mut kinds, n); - changed |= eliminate_redundant_branches(&lines, &mut kinds, n); - changed |= eliminate_self_moves(&mut kinds, n); - changed |= eliminate_move_chains(&mut lines, &mut kinds, n); - changed |= fuse_branch_over_branch(&mut lines, &mut kinds, n); - rounds += 1; - } - - // Phase 2: Global passes - // - // Global store forwarding is disabled: same-register NOP elimination has a - // remaining correctness bug in complex float-array code (test 0036_0041). - // The root cause is not yet identified — it appears correct within a basic - // block but produces wrong output. Until this is fixed, GSF is skipped. - // - // Copy propagation and dead store elimination are independent and safe. 
- propagate_register_copies(&mut lines, &mut kinds, n); - global_dead_store_elimination(&lines, &mut kinds, n); - - // Phase 3: Local cleanup after global passes (up to 4 rounds) - { - let mut changed2 = true; - let mut rounds2 = 0; - while changed2 && rounds2 < 4 { - changed2 = false; - changed2 |= eliminate_adjacent_store_load(&mut lines, &mut kinds, n); - changed2 |= eliminate_redundant_branches(&lines, &mut kinds, n); - changed2 |= eliminate_self_moves(&mut kinds, n); - changed2 |= eliminate_move_chains(&mut lines, &mut kinds, n); - rounds2 += 1; - } - } - - // Build result, filtering out Nop lines - let mut result = String::with_capacity(asm.len()); - for i in 0..n { - if kinds[i] != LineKind::Nop { - result.push_str(&lines[i]); - result.push('\n'); - } - } - result -} - -// ── Pass 1: Adjacent store/load elimination ────────────────────────────────── -// -// Pattern: str xN, [sp, #off] → ldr xN, [sp, #off] (same reg, same offset) -// The load is redundant since the value is already in the register. 
-// Also: str xN, [sp, #off] → ldr xM, [sp, #off] → replace load with mov xM, xN -// Also handles: str wN, [sp, #off] → ldrsw xN, [sp, #off] - -fn eliminate_adjacent_store_load(lines: &mut [String], kinds: &mut [LineKind], n: usize) -> bool { - let mut changed = false; - let mut i = 0; - while i + 1 < n { - if let LineKind::StoreSp { reg: store_reg, offset: store_off, is_word: store_word } = kinds[i] { - // Look ahead for the matching load (skip Nops) - let mut j = i + 1; - while j < n && kinds[j] == LineKind::Nop { - j += 1; - } - if j < n { - match kinds[j] { - LineKind::LoadSp { reg: load_reg, offset: load_off, is_word: load_word } - if store_off == load_off && store_word == load_word => - { - if store_reg == load_reg { - // Same register: eliminate the load entirely - kinds[j] = LineKind::Nop; - changed = true; - } else { - // Different register: replace load with mov - let reg_fmt = if store_word { wreg_name } else { xreg_name }; - lines[j] = format!(" mov {}, {}", reg_fmt(load_reg), reg_fmt(store_reg)); - kinds[j] = LineKind::Move { dst: load_reg, src: store_reg, is_32bit: store_word }; - changed = true; - } - } - // str wN, [sp, #off] followed by ldrsw xM, [sp, #off] - // The value was just stored as a word; sign-extending load can be replaced - // with sxtw xM, wN (or eliminated if same reg). - LineKind::LoadswSp { reg: load_reg, offset: load_off } - if store_off == load_off && store_word => - { - lines[j] = format!(" sxtw {}, {}", xreg_name(load_reg), wreg_name(store_reg)); - kinds[j] = LineKind::Sxtw { dst: load_reg, src: store_reg }; - changed = true; - } - _ => {} - } - } - } - i += 1; - } - changed -} - -// ── Pass 2: Redundant branch elimination ───────────────────────────────────── -// -// Pattern: b .LBBN ; .LBBN: (branch to immediately next label) -// Falls through naturally, so the branch is redundant. 
- -fn eliminate_redundant_branches(lines: &[String], kinds: &mut [LineKind], n: usize) -> bool { - let mut changed = false; - for i in 0..n { - if kinds[i] == LineKind::Branch { - if let Some(target) = branch_target(&lines[i]) { - // Find next non-Nop line - let mut j = i + 1; - while j < n && kinds[j] == LineKind::Nop { - j += 1; - } - if j < n && kinds[j] == LineKind::Label { - if let Some(lbl) = label_name(&lines[j]) { - if target == lbl { - kinds[i] = LineKind::Nop; - changed = true; - } - } - } - } - } - } - changed -} - -// ── Pass 3: Self-move elimination ──────────────────────────────────────────── -// -// Pattern: mov xN, xN — no-op - -fn eliminate_self_moves(kinds: &mut [LineKind], n: usize) -> bool { - let mut changed = false; - for i in 0..n { - if let LineKind::Move { dst, src, is_32bit } = kinds[i] { - if dst == src && !is_32bit { - // Only eliminate 64-bit self-moves (mov xN, xN). - // On AArch64, `mov wN, wN` zeros the upper 32 bits of xN, - // so it is NOT a true no-op and must be preserved. - kinds[i] = LineKind::Nop; - changed = true; - } - } - } - changed -} - -// ── Pass 4: Move chain optimization ────────────────────────────────────────── -// -// Pattern: mov A, B ; mov C, A → mov C, B -// This allows the first mov to potentially be dead-eliminated later. - -fn eliminate_move_chains(lines: &mut [String], kinds: &mut [LineKind], n: usize) -> bool { - let mut changed = false; - let mut i = 0; - while i + 1 < n { - match kinds[i] { - LineKind::Move { dst: dst1, src: src1, is_32bit: is_32bit1 } => { - // Find next non-Nop instruction - let mut j = i + 1; - while j < n && kinds[j] == LineKind::Nop { - j += 1; - } - if j < n { - if let LineKind::Move { dst: dst2, src: src2, is_32bit: is_32bit2 } = kinds[j] { - // mov dst1, src1 ; mov dst2, dst1 → mov dst2, src1 - // Only safe when both moves use the same width. 
- if src2 == dst1 && dst2 != src1 && is_32bit1 == is_32bit2 { - let reg_fmt = if is_32bit2 { wreg_name } else { xreg_name }; - lines[j] = format!(" mov {}, {}", reg_fmt(dst2), reg_fmt(src1)); - kinds[j] = LineKind::Move { dst: dst2, src: src1, is_32bit: is_32bit2 }; - changed = true; - } - } - } - } - LineKind::MoveImm { dst: dst1 } | LineKind::MoveWide { dst: dst1 } => { - // mov xN, #imm ; mov xM, xN → mov xM, #imm (copy the immediate) - // Only when dst1 is a scratch register (x0-x15) not callee-saved - if dst1 <= 15 { - let mut j = i + 1; - while j < n && kinds[j] == LineKind::Nop { - j += 1; - } - if j < n { - if let LineKind::Move { dst: dst2, src: src2, is_32bit: _ } = kinds[j] { - if src2 == dst1 { - // Copy the immediate instruction, retargeted to dst2 - let old_line = lines[i].trim(); - // Replace the register in the first instruction - if let Some(new_line) = retarget_move_imm(old_line, dst2) { - lines[j] = format!(" {}", new_line); - kinds[j] = LineKind::MoveImm { dst: dst2 }; - changed = true; - } - } - } - } - } - } - _ => {} - } - i += 1; - } - changed -} - -/// Retarget a move-immediate instruction to a different destination register. 
-/// E.g., `mov x0, #5` with new dest x14 → `mov x14, #5` -fn retarget_move_imm(line: &str, new_dst: u8) -> Option { - // Handle: mov xN, #imm / mov xN, :lo12:sym / movz xN, #imm / movn xN, #imm - for prefix in &["mov ", "movz ", "movn "] { - if let Some(rest) = line.strip_prefix(prefix) { - if let Some((_old_reg, imm_part)) = rest.split_once(", ") { - let new_reg = if line.contains('w') && !imm_part.starts_with('w') { - // If original used w-register (e.g., mov w0, #5) - // check if the source had 'w' prefix - let old_first = rest.chars().next()?; - if old_first == 'w' { - wreg_name(new_dst) - } else { - xreg_name(new_dst) - } - } else { - xreg_name(new_dst) - }; - return Some(format!("{}{}, {}", prefix, new_reg, imm_part)); - } - } - } - None -} - -// ── Pass 5: Branch-over-branch fusion ──────────────────────────────────────── -// -// Pattern: -// b.cc .Lskip_N -// b .target -// .Lskip_N: -// -// Transform to: -// b.!cc .target -// -// This is a very common pattern from the codegen: it emits a conditional branch -// to skip over an unconditional branch. - -/// Estimate the distance (in instructions) from position `from` to the label -/// `target` in the assembly. Returns `None` if the label is not found. -fn estimate_branch_distance(lines: &[String], kinds: &[LineKind], from: usize, target: &str) -> Option { - // Search both forward and backward for the target label - let target_with_colon = format!("{}:", target); - for idx in 0..lines.len() { - if kinds[idx] == LineKind::Label && lines[idx].trim() == target_with_colon { - // Count non-Nop lines between from and idx (each is one 4-byte instruction) - let (lo, hi) = if from < idx { (from, idx) } else { (idx, from) }; - let count = (lo..hi).filter(|&p| kinds[p] != LineKind::Nop && kinds[p] != LineKind::Label).count(); - return Some(count); - } - } - None -} - -/// Maximum number of instructions a b.cond can reach (±1MB = ±262144 instructions). -/// Use a conservative threshold of 200,000 to leave margin. 
-const COND_BRANCH_SAFE_DISTANCE: usize = 200_000; - -fn fuse_branch_over_branch(lines: &mut [String], kinds: &mut [LineKind], n: usize) -> bool { - let mut changed = false; - let mut i = 0; - while i + 2 < n { - if kinds[i] == LineKind::CondBranch { - // Find the next two non-Nop instructions - let mut j = i + 1; - while j < n && kinds[j] == LineKind::Nop { - j += 1; - } - if j >= n { - i += 1; - continue; - } - let mut k = j + 1; - while k < n && kinds[k] == LineKind::Nop { - k += 1; - } - if k >= n { - i += 1; - continue; - } - - // Check pattern: b.cc .skip ; b .target ; .skip: - if kinds[j] == LineKind::Branch && kinds[k] == LineKind::Label { - if let (Some((cc, skip_target)), Some(real_target), Some(lbl)) = ( - cond_branch_parts(&lines[i]), - branch_target(&lines[j]), - label_name(&lines[k]), - ) { - if skip_target == lbl { - // Check if the real target is within safe b.cond range. - // If the label isn't found in this snippet (external target), - // assume it's in range since the unconditional branch already - // reached it, and b.cond has ±1MB range which is generous. - let in_range = estimate_branch_distance(lines, kinds, i, real_target) - .is_none_or(|d| d < COND_BRANCH_SAFE_DISTANCE); - if in_range { - if let Some(inv_cc) = invert_condition(cc) { - // Replace conditional branch with inverted condition to real target - lines[i] = format!(" b.{} {}", inv_cc, real_target); - // kinds[i] stays as CondBranch - // Remove the unconditional branch - kinds[j] = LineKind::Nop; - // Keep the label (might be targeted by other branches) - changed = true; - } - } - } - } - } - } - i += 1; - } - changed -} - -// ── Global register copy propagation ───────────────────────────────────────── -// -// After store forwarding converts loads into register moves, propagate those -// copies into subsequent instructions. For `mov xDST, xSRC`, replace -// references to xDST with xSRC in the immediately following instruction -// (within the same basic block). 
- -fn propagate_register_copies(lines: &mut [String], kinds: &mut [LineKind], n: usize) -> bool { - let mut changed = false; - - for i in 0..n { - // Only process 64-bit register-to-register moves - let (dst, src) = match kinds[i] { - LineKind::Move { dst, src, is_32bit: false } => { - // Don't propagate sp/fp moves - if dst >= 29 || src >= 29 { - continue; - } - (dst, src) - } - _ => continue, - }; - - // Find the next non-Nop instruction - let mut j = i + 1; - while j < n && kinds[j] == LineKind::Nop { - j += 1; - } - if j >= n { - continue; - } - - // Don't propagate across control flow boundaries or into instructions - // with multiple destination registers (like ldp which has two dest regs - // after the mnemonic, but our replace_source_reg_in_instruction only - // treats the first operand as dest). - match kinds[j] { - LineKind::Label | LineKind::Branch | LineKind::Ret | LineKind::Directive - | LineKind::LoadPairSp | LineKind::Call => continue, - _ => {} - } - // Also skip ldp/ldaxr/ldxr/stxr by checking instruction text, - // since these have multiple dest registers or complex operand semantics - let trimmed_j = lines[j].trim(); - if trimmed_j.starts_with("ldp ") - || trimmed_j.starts_with("ldaxr ") - || trimmed_j.starts_with("ldxr ") - || trimmed_j.starts_with("stxr ") - || trimmed_j.starts_with("ldaxp ") - || trimmed_j.starts_with("ldxp ") - || trimmed_j.starts_with("cas ") - { - continue; - } - - // Try to replace references to dst with src in line j - let old_line = &lines[j]; - let dst_name = xreg_name(dst); - if !old_line.contains(dst_name) { - continue; - } - - // Don't propagate into instructions that write to the same register - // as the source (would create a self-reference) - let src_name = xreg_name(src); - - // For move instructions, replace the source operand - match kinds[j] { - LineKind::Move { dst: dst2, src: src2, is_32bit: false } if src2 == dst => { - // mov X, dst -> mov X, src - if dst2 != src { - lines[j] = format!(" mov {}, {}", 
xreg_name(dst2), src_name); - kinds[j] = LineKind::Move { dst: dst2, src, is_32bit: false }; - changed = true; - } - } - _ => { - // General case: try to replace the register in the instruction text. - // Only replace in source operand positions (not destination). - if let Some(new_line) = replace_source_reg_in_instruction(old_line, dst_name, src_name) { - lines[j] = new_line; - kinds[j] = classify_line(&lines[j]); - changed = true; - } - } - } - } - changed -} - -// Shared peephole string utilities -- see backend/peephole_common.rs -use crate::backend::peephole_common::replace_source_reg_in_instruction; -#[cfg(test)] -use crate::backend::peephole_common::replace_whole_word; - -// ── Global dead store elimination ──────────────────────────────────────────── -// -// Scans the entire function to find stack slot offsets that are never loaded. -// Stores to such slots are dead and can be eliminated. -// This runs after global store forwarding, which may have converted many loads -// to register moves, leaving the original stores dead. - -fn global_dead_store_elimination(lines: &[String], kinds: &mut [LineKind], n: usize) -> bool { - // Safety check: if any instruction takes the address of sp (e.g., `add xN, sp, #off`), - // stack slots could be accessed through pointers, so we must not eliminate any stores. - // This is conservative but sound — it prevents miscompilation when arrays or structs - // are allocated on the stack and passed by pointer to callees. 
- for i in 0..n { - if kinds[i] == LineKind::Nop { - continue; - } - let trimmed = lines[i].trim(); - // Check for address-of-sp patterns: - // - `add xN, sp, #offset` (common for array/struct address) - // - `mov xN, sp` (copying stack pointer) - // - `sub xN, sp, #N` (stack address computation) - if (trimmed.starts_with("add ") || trimmed.starts_with("sub ")) && trimmed.contains(", sp,") { - return false; - } - if trimmed.starts_with("mov ") && trimmed.contains(", sp") { - // Check it's actually "mov xN, sp" not "mov sp, xN" - if let Some(rest) = trimmed.strip_prefix("mov ") { - if let Some((_, src)) = rest.split_once(", ") { - if src.trim() == "sp" { - return false; - } - } - } - } - } - - // Phase 1: Collect all (offset, size) byte ranges that are loaded from. - // We must use byte-range overlap (not exact offset match) because a wide - // store (e.g. `str x` at offset 16, 8 bytes) can be partially read by a - // narrower load at a different offset (e.g. `ldr w` at offset 20, 4 bytes). - let mut loaded_ranges: Vec<(i32, i32)> = Vec::new(); // (offset, size) - for i in 0..n { - match kinds[i] { - LineKind::LoadSp { offset, is_word, .. } => { - let size = if is_word { 4 } else { 8 }; - loaded_ranges.push((offset, size)); - } - LineKind::LoadswSp { offset, .. } => { - loaded_ranges.push((offset, 4)); - } - _ => { - // Check for loads in Other instructions (e.g., ldp, ldrb, ldrh, etc.) 
- let trimmed = lines[i].trim(); - let load_size = if trimmed.starts_with("ldp ") { - Some(16) // ldp loads two 8-byte registers - } else if trimmed.starts_with("ldr x") || trimmed.starts_with("ldur x") { - Some(8) - } else if trimmed.starts_with("ldr w") || trimmed.starts_with("ldur w") || - trimmed.starts_with("ldrsw ") { - Some(4) - } else if trimmed.starts_with("ldrh ") || trimmed.starts_with("ldrsh ") { - Some(2) - } else if trimmed.starts_with("ldrb ") || trimmed.starts_with("ldrsb ") { - Some(1) - } else if trimmed.starts_with("ldr ") || trimmed.starts_with("ldur ") { - Some(8) // default to 8 for unqualified ldr - } else { - None - }; - if let Some(sz) = load_size { - if trimmed.contains("[sp") { - if let Some(off) = extract_sp_offset(trimmed) { - loaded_ranges.push((off, sz)); - } - } - } - } - } - } - - // Phase 2: Remove stores whose byte range does not overlap any load range - let mut changed = false; - for i in 0..n { - if let LineKind::StoreSp { offset, is_word, .. } = kinds[i] { - let store_size = if is_word { 4 } else { 8 }; - let overlaps_any_load = loaded_ranges.iter().any(|&(load_off, load_sz)| { - // Two ranges [a, a+as) and [b, b+bs) overlap iff a < b+bs && b < a+as - offset < load_off + load_sz && load_off < offset + store_size - }); - if !overlaps_any_load { - kinds[i] = LineKind::Nop; - changed = true; - } - } - } - changed -} - -/// Extract the numeric offset from an instruction containing `[sp, #N]` or `[sp]`. 
-fn extract_sp_offset(line: &str) -> Option { - if let Some(start) = line.find("[sp") { - let rest = &line[start..]; - if rest.starts_with("[sp]") { - return Some(0); - } - if rest.starts_with("[sp, #") { - let num_start = start + 6; // skip "[sp, #" - let after = &line[num_start..]; - if let Some(end) = after.find(']') { - return after[..end].parse::().ok(); - } - } - } - None -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_classify_store() { - assert!(matches!( - classify_line(" str x0, [sp, #16]"), - LineKind::StoreSp { reg: 0, offset: 16, is_word: false } - )); - assert!(matches!( - classify_line(" str w1, [sp, #24]"), - LineKind::StoreSp { reg: 1, offset: 24, is_word: true } - )); - } - - #[test] - fn test_classify_load() { - assert!(matches!( - classify_line(" ldr x0, [sp, #16]"), - LineKind::LoadSp { reg: 0, offset: 16, is_word: false } - )); - } - - #[test] - fn test_classify_loadsw() { - assert!(matches!( - classify_line(" ldrsw x0, [sp, #24]"), - LineKind::LoadswSp { reg: 0, offset: 24 } - )); - } - - #[test] - fn test_classify_move() { - assert!(matches!( - classify_line(" mov x14, x0"), - LineKind::Move { dst: 14, src: 0, is_32bit: false } - )); - } - - #[test] - fn test_classify_move_imm() { - assert!(matches!( - classify_line(" mov x0, #0"), - LineKind::MoveImm { dst: 0 } - )); - assert!(matches!( - classify_line(" mov x0, #-1"), - LineKind::MoveImm { dst: 0 } - )); - } - - #[test] - fn test_classify_branch() { - assert_eq!(classify_line(" b .LBB1"), LineKind::Branch); - } - - #[test] - fn test_classify_cond_branch() { - assert_eq!(classify_line(" b.ge .Lskip_0"), LineKind::CondBranch); - assert_eq!(classify_line(" b.eq .LBB3"), LineKind::CondBranch); - } - - #[test] - fn test_classify_label() { - assert_eq!(classify_line(".LBB1:"), LineKind::Label); - assert_eq!(classify_line("sum_array:"), LineKind::Label); - } - - #[test] - fn test_classify_ret() { - assert_eq!(classify_line(" ret"), LineKind::Ret); - } - - #[test] - fn 
test_classify_sxtw() { - assert!(matches!( - classify_line(" sxtw x0, w0"), - LineKind::Sxtw { dst: 0, src: 0 } - )); - } - - #[test] - fn test_adjacent_store_load_same_reg() { - let input = " str x0, [sp, #16]\n ldr x0, [sp, #16]\n ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(result.contains("str x0, [sp, #16]")); - assert!(!result.contains("ldr x0, [sp, #16]")); - } - - #[test] - fn test_adjacent_store_load_diff_reg() { - let input = " str x0, [sp, #16]\n ldr x1, [sp, #16]\n ret\n"; - let result = peephole_optimize(input.to_string()); - // The load is replaced with mov, and then DSE removes the now-dead store - // (no remaining loads from offset 16), leaving just the mov and ret. - assert!(!result.contains("ldr x1, [sp, #16]")); - assert!(result.contains("mov x1, x0")); - } - - #[test] - fn test_redundant_branch() { - let input = " b .LBB1\n.LBB1:\n ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(!result.contains("b .LBB1")); - assert!(result.contains(".LBB1:")); - } - - #[test] - fn test_self_move() { - let input = " mov x0, x0\n ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(!result.contains("mov x0, x0")); - } - - #[test] - fn test_branch_over_branch_fusion() { - let input = " b.ge .Lskip_0\n b .LBB2\n.Lskip_0:\n b .LBB4\n"; - let result = peephole_optimize(input.to_string()); - // Should become: b.lt .LBB2 (inverted ge → lt) - assert!(result.contains("b.lt .LBB2")); - // The unconditional branch to LBB2 should be eliminated - assert!(!result.contains(" b .LBB2\n")); - } - - #[test] - fn test_move_chain() { - let input = " mov x0, x14\n mov x13, x0\n ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(result.contains("mov x13, x14")); - } - - #[test] - fn test_move_imm_chain() { - let input = " mov x0, #0\n mov x14, x0\n ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(result.contains("mov x14, #0")); - } - - #[test] - fn test_store_loadsw_fusion() 
{ - let input = " str w1, [sp, #24]\n ldrsw x0, [sp, #24]\n ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(!result.contains("ldrsw")); - assert!(result.contains("sxtw x0, w1")); - } - - // ── Global store forwarding tests ───────────────────────────────── - - #[test] - #[ignore] // GSF is disabled due to correctness bug in complex float-array code - fn test_gsf_same_reg_elimination() { - // Store x0 then load x0 from same slot (non-adjacent) — load is dead - let input = "\ - str x0, [sp, #16]\n\ - add x1, x2, x3\n\ - ldr x0, [sp, #16]\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(result.contains("str x0, [sp, #16]")); - assert!(!result.contains("ldr x0, [sp, #16]")); - } - - #[test] - #[ignore] // GSF is disabled due to correctness bug in complex float-array code - fn test_gsf_different_reg_forwarding() { - // Store x5 then load x10 from same slot — replace load with mov - let input = "\ - str x5, [sp, #32]\n\ - add x1, x2, x3\n\ - ldr x10, [sp, #32]\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(result.contains("str x5, [sp, #32]")); - assert!(!result.contains("ldr x10, [sp, #32]")); - assert!(result.contains("mov x10, x5")); - } - - #[test] - fn test_gsf_invalidation_on_reg_overwrite() { - // After x0 is overwritten, the mapping slot 16 → x0 is stale - let input = "\ - str x0, [sp, #16]\n\ - mov x0, #42\n\ - ldr x1, [sp, #16]\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - // The load should NOT be forwarded since x0 was overwritten - assert!(result.contains("ldr x1, [sp, #16]")); - } - - #[test] - fn test_gsf_invalidation_at_jump_target() { - // Mappings are invalidated at jump target labels - let input = "\ - str x5, [sp, #16]\n\ - b .LBB1\n\ -.LBB1:\n\ - ldr x5, [sp, #16]\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - // .LBB1 is a jump target, so mappings are invalidated - assert!(result.contains("ldr x5, [sp, #16]")); - } - - #[test] - 
#[ignore] // GSF is disabled due to correctness bug in complex float-array code - fn test_gsf_word_forwarding() { - // Word store forwarded to word load - let input = "\ - str w3, [sp, #24]\n\ - add x1, x2, x4\n\ - ldr w5, [sp, #24]\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(!result.contains("ldr w5, [sp, #24]")); - assert!(result.contains("mov w5, w3")); - } - - #[test] - #[ignore] // GSF is disabled due to correctness bug in complex float-array code - fn test_gsf_ldrsw_forwarding() { - // Word store forwarded to sign-extending load - let input = "\ - str w1, [sp, #24]\n\ - add x2, x3, x4\n\ - ldrsw x5, [sp, #24]\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - assert!(!result.contains("ldrsw x5, [sp, #24]")); - assert!(result.contains("sxtw x5, w1")); - } - - // ── Copy propagation tests ─────────────────────────────────────── - - #[test] - fn test_copy_prop_word_boundary() { - // Ensure "x1" doesn't match inside "x11" - assert_eq!( - replace_whole_word("x11, x1", "x1", "x5"), - "x11, x5" - ); - } - - #[test] - fn test_copy_prop_no_false_match() { - // x10 should not be affected when replacing x1 - assert_eq!( - replace_whole_word("x10", "x1", "x5"), - "x10" - ); - } - - // ── Dead store elimination tests ───────────────────────────────── - - #[test] - fn test_dse_with_address_taken() { - // When sp address is taken, no stores should be eliminated - let input = "\ - str w0, [sp, #16]\n\ - add x1, sp, #16\n\ - ret\n"; - let result = peephole_optimize(input.to_string()); - // Store must be preserved because address of stack slot is taken - assert!(result.contains("str w0, [sp, #16]")); - } -} diff --git a/src/backend/arm/codegen/prologue.rs b/src/backend/arm/codegen/prologue.rs deleted file mode 100644 index 0247007620..0000000000 --- a/src/backend/arm/codegen/prologue.rs +++ /dev/null @@ -1,339 +0,0 @@ -//! ArmCodegen: prologue/epilogue and stack frame operations. 
- -use crate::ir::reexports::{IrFunction, Instruction, Value}; -use crate::common::types::IrType; -use crate::backend::generation::{calculate_stack_space_common, find_param_alloca}; -use crate::backend::call_abi::{ParamClass, classify_params}; -use super::emit::{ - ArmCodegen, callee_saved_name, ARM_CALLEE_SAVED, ARM_CALLER_SAVED, ARM_ARG_REGS, -}; - -impl ArmCodegen { - // ---- calculate_stack_space ---- - - pub(super) fn calculate_stack_space_impl(&mut self, func: &IrFunction) -> i64 { - use crate::ir::reexports::Instruction; - use crate::backend::regalloc::PhysReg; - - let mut asm_clobbered_regs: Vec = Vec::new(); - Self::prescan_inline_asm_callee_saved(func, &mut asm_clobbered_regs); - let base_regs: &[PhysReg] = if func.is_variadic { &[] } else { &ARM_CALLEE_SAVED }; - let available_regs = crate::backend::generation::filter_available_regs(base_regs, &asm_clobbered_regs); - - let mut caller_saved_regs: Vec = if func.is_variadic { - Vec::new() - } else { - ARM_CALLER_SAVED.to_vec() - }; - let mut has_f128_ops = false; - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::BinOp { ty, .. } | Instruction::UnaryOp { ty, .. } - | Instruction::Cmp { ty, .. } | Instruction::Load { ty, .. } - | Instruction::Store { ty, .. } if *ty == IrType::F128 => { - has_f128_ops = true; - } - Instruction::Cast { to_ty, .. } if *to_ty == IrType::F128 => { - has_f128_ops = true; - } - Instruction::Cast { from_ty, .. 
} if *from_ty == IrType::F128 => { - has_f128_ops = true; - } - _ => {} - } - } - } - if has_f128_ops { - caller_saved_regs.clear(); - } - - let (reg_assigned, cached_liveness) = crate::backend::generation::run_regalloc_and_merge_clobbers( - func, available_regs, caller_saved_regs, &asm_clobbered_regs, - &mut self.reg_assignments, &mut self.used_callee_saved, - false, - ); - - let mut space = calculate_stack_space_common(&mut self.state, func, 16, |space, alloc_size, align| { - let effective_align = if align > 0 { align.max(8) } else { 8 }; - let slot = (space + effective_align - 1) & !(effective_align - 1); - let new_space = slot + ((alloc_size + 7) & !7).max(8); - (slot, new_space) - }, ®_assigned, &ARM_CALLEE_SAVED, cached_liveness, false); - - if func.is_variadic { - space = (space + 7) & !7; - self.va_gp_save_offset = space; - space += 64; - - if !self.general_regs_only { - space = (space + 15) & !15; - self.va_fp_save_offset = space; - space += 128; - } - - let config = self.call_abi_config_impl(); - let param_classes = crate::backend::call_abi::classify_params(func, &config); - let mut named_gp = 0usize; - let mut named_fp = 0usize; - for (i, class) in param_classes.iter().enumerate() { - // On ARM64, the sret pointer goes in x8 (a dedicated register), - // NOT in x0-x7. Don't count it as consuming a GP argument register, - // otherwise va_start computes the wrong __gr_offs and skips the - // first variadic argument. - if self.state.uses_sret && i == 0 { - continue; - } - named_gp += class.gp_reg_count(); - if matches!(class, crate::backend::call_abi::ParamClass::FloatReg { .. } - | crate::backend::call_abi::ParamClass::F128FpReg { .. 
}) { - named_fp += 1; - } - } - self.va_named_gp_count = named_gp.min(8); - self.va_named_fp_count = named_fp.min(8); - self.va_named_stack_bytes = crate::backend::call_abi::named_params_stack_bytes(¶m_classes); - } - - let save_count = self.used_callee_saved.len() as i64; - if save_count > 0 { - space = (space + 7) & !7; - self.callee_save_offset = space; - space += save_count * 8; - } - - space - } - - // ---- aligned_frame_size ---- - - pub(super) fn aligned_frame_size_impl(&self, raw_space: i64) -> i64 { - (raw_space + 15) & !15 - } - - // ---- emit_prologue ---- - - pub(super) fn emit_prologue_impl(&mut self, func: &IrFunction, frame_size: i64) { - self.current_return_type = func.return_type; - self.current_frame_size = frame_size; - self.frame_base_offset = None; - self.emit_prologue_arm(frame_size); - - let used_regs = self.used_callee_saved.clone(); - let base = self.callee_save_offset; - let n = used_regs.len(); - let mut i = 0; - while i + 1 < n { - let r1 = callee_saved_name(used_regs[i]); - let r2 = callee_saved_name(used_regs[i + 1]); - let offset = base + (i as i64) * 8; - self.emit_stp_to_sp(r1, r2, offset); - i += 2; - } - if i < n { - let r = callee_saved_name(used_regs[i]); - let offset = base + (i as i64) * 8; - self.emit_store_to_sp(r, offset, "str"); - } - } - - // ---- emit_epilogue ---- - - pub(super) fn emit_epilogue_impl(&mut self, frame_size: i64) { - self.emit_restore_callee_saved(); - self.emit_epilogue_arm(frame_size); - } - - // ---- emit_store_params ---- - - pub(super) fn emit_store_params_impl(&mut self, func: &IrFunction) { - if func.is_variadic { - self.emit_save_variadic_regs(); - } - - let config = self.call_abi_config_impl(); - let param_classes = classify_params(func, &config); - self.state.param_classes = param_classes.clone(); - self.state.num_params = func.params.len(); - self.state.func_is_variadic = func.is_variadic; - - self.state.param_alloca_slots = (0..func.params.len()).map(|i| { - find_param_alloca(func, 
i).and_then(|(dest, ty)| { - self.state.get_slot(dest.0).map(|slot| (slot, ty)) - }) - }).collect(); - - // Pre-store optimization: when a GP param's alloca is dead (promoted by - // mem2reg) but the ParamRef dest is register-allocated to a callee-saved - // register, store the ABI arg register directly to that callee-saved - // register in the prologue. This is critical because: - // 1. Dead alloca means no stack slot exists for this param - // 2. The ABI register (x0-x7) will be clobbered by subsequent codegen - // (ARM uses x0 as the universal scratch/result register) - // 3. We must save the value NOW, before any other code runs - // 4. emit_param_ref will see param_pre_stored and skip code generation - let sret_shift = if self.state.uses_sret { 1usize } else { 0 }; - let mut paramref_dests: Vec> = vec![None; func.params.len()]; - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::ParamRef { dest, param_idx, .. } = inst { - if *param_idx < paramref_dests.len() { - paramref_dests[*param_idx] = Some(*dest); - } - } - } - } - // Build a map from physical register -> list of param indices that use it, - // so we can detect when two params share the same callee-saved register. 
- let mut reg_to_params: crate::common::fx_hash::FxHashMap> = crate::common::fx_hash::FxHashMap::default(); - for (i, _) in func.params.iter().enumerate() { - if let Some(paramref_dest) = paramref_dests[i] { - if let Some(&phys_reg) = self.reg_assignments.get(¶mref_dest.0) { - reg_to_params.entry(phys_reg.0).or_default().push(i); - } - } - } - - for (i, _) in func.params.iter().enumerate() { - let class = param_classes[i]; - if !class.uses_gp_reg() { continue; } - // Skip params that have an alloca slot (they'll be handled by emit_store_gp_params) - let has_slot = self.state.param_alloca_slots.get(i) - .and_then(|opt| opt.as_ref()) - .is_some(); - if has_slot { continue; } - - if let Some(paramref_dest) = paramref_dests[i] { - if let Some(&phys_reg) = self.reg_assignments.get(¶mref_dest.0) { - // Only pre-store to callee-saved registers (x20-x28). - // Caller-saved registers (x13, x14) cannot be used because - // they may overlap with scratch registers. - let is_callee_saved = phys_reg.0 >= 20 && phys_reg.0 <= 28; - if is_callee_saved { - // Safety check: if another param's dest is also assigned - // to this register, skip pre-store to avoid conflicts. - // The register allocator may assign the same register to - // two params whose live ranges don't overlap, but pre-store - // extends the effective lifetime to function entry. 
- if let Some(users) = reg_to_params.get(&phys_reg.0) { - if users.len() > 1 { - continue; - } - } - let dest_reg = callee_saved_name(phys_reg); - if let ParamClass::IntReg { reg_idx } = class { - let actual_idx = if sret_shift > 0 && reg_idx == 0 && i == 0 { - // sret: the pointer comes in x8 - self.state.emit_fmt(format_args!( - " mov {}, x8", dest_reg)); - self.state.param_pre_stored.insert(i); - continue; - } else if reg_idx >= sret_shift { - reg_idx - sret_shift - } else { - reg_idx - }; - let src_reg = ARM_ARG_REGS[actual_idx]; - self.state.emit_fmt(format_args!( - " mov {}, {}", dest_reg, src_reg)); - self.state.param_pre_stored.insert(i); - } - } - } - } - } - - self.emit_store_gp_params(func, ¶m_classes); - self.emit_store_fp_params(func, ¶m_classes); - self.emit_store_stack_params(func, ¶m_classes); - } - - // ---- emit_param_ref ---- - - pub(super) fn emit_param_ref_impl(&mut self, dest: &Value, param_idx: usize, ty: IrType) { - if param_idx >= self.state.param_classes.len() { - return; - } - - // If this param was pre-stored directly to its register-allocated - // destination during emit_store_params, the value is already in place. - // No code needs to be emitted — the register already holds the value. 
- if self.state.param_pre_stored.contains(¶m_idx) { - return; - } - - if param_idx < self.state.param_alloca_slots.len() { - if let Some((slot, alloca_ty)) = self.state.param_alloca_slots[param_idx] { - let ldr_instr = self.load_instr_for_type_impl(alloca_ty); - let (actual_instr, reg) = Self::arm_parse_load(ldr_instr); - self.emit_load_from_sp(reg, slot.0, actual_instr); - self.store_x0_to(dest); - return; - } - } - - let class = self.state.param_classes[param_idx]; - let frame_size = self.current_frame_size; - - // AArch64 ABI: sret shifts GP register indices - let sret_shift = if self.state.uses_sret { 1usize } else { 0 }; - - match class { - ParamClass::IntReg { reg_idx } => { - let actual_reg = if sret_shift > 0 && reg_idx == 0 && param_idx == 0 { - Self::reg_for_type("x8", ty) - } else { - let actual_idx = if reg_idx >= sret_shift { reg_idx - sret_shift } else { reg_idx }; - Self::reg_for_type(ARM_ARG_REGS[actual_idx], ty) - }; - let dst = Self::reg_for_type("x0", ty); - if actual_reg != dst { - self.state.emit_fmt(format_args!(" mov {}, {}", dst, actual_reg)); - } - self.store_x0_to(dest); - } - ParamClass::FloatReg { reg_idx } => { - if ty == IrType::F32 { - self.state.emit_fmt(format_args!(" fmov w0, s{}", reg_idx)); - } else { - self.state.emit_fmt(format_args!(" fmov x0, d{}", reg_idx)); - } - self.store_x0_to(dest); - } - ParamClass::StackScalar { offset } => { - let src = frame_size + offset; - let ldr_instr = self.load_instr_for_type_impl(ty); - let (actual_instr, reg) = Self::arm_parse_load(ldr_instr); - self.emit_load_from_sp(reg, src, actual_instr); - self.store_x0_to(dest); - } - _ => {} - } - } - - // ---- emit_epilogue_and_ret ---- - - pub(super) fn emit_epilogue_and_ret_impl(&mut self, frame_size: i64) { - self.emit_restore_callee_saved(); - self.emit_epilogue_arm(frame_size); - self.state.emit(" ret"); - } - - // ---- store_instr_for_type / load_instr_for_type ---- - - pub(super) fn store_instr_for_type_impl(&self, ty: IrType) -> &'static str 
{ - Self::str_for_type(ty) - } - - pub(super) fn load_instr_for_type_impl(&self, ty: IrType) -> &'static str { - match ty { - IrType::I8 => "ldrsb", - IrType::U8 => "ldrb", - IrType::I16 => "ldrsh", - IrType::U16 => "ldrh", - IrType::I32 => "ldrsw", - IrType::U32 | IrType::F32 => "ldr32", - _ => "ldr64", - } - } -} diff --git a/src/backend/arm/codegen/returns.rs b/src/backend/arm/codegen/returns.rs deleted file mode 100644 index 443441566d..0000000000 --- a/src/backend/arm/codegen/returns.rs +++ /dev/null @@ -1,106 +0,0 @@ -//! ArmCodegen: return operations. - -use crate::ir::reexports::{IrConst, Operand, Value}; -use crate::common::types::IrType; -use super::emit::ArmCodegen; - -impl ArmCodegen { - pub(super) fn emit_return_impl(&mut self, val: Option<&Operand>, frame_size: i64) { - if let Some(val) = val { - let ret_ty = self.current_return_type; - if ret_ty.is_long_double() { - self.emit_f128_operand_to_q0_full(val); - self.emit_epilogue_and_ret_impl(frame_size); - return; - } - } - crate::backend::traits::emit_return_default(self, val, frame_size); - } - - pub(super) fn emit_return_i128_to_regs_impl(&mut self) { - // x0:x1 already hold the i128 return value per AAPCS64 -- noop - } - - pub(super) fn emit_return_f128_to_reg_impl(&mut self) { - self.state.emit(" fmov d0, x0"); - self.state.emit(" bl __extenddftf2"); - } - - pub(super) fn emit_return_f32_to_reg_impl(&mut self) { - self.state.emit(" fmov s0, w0"); - } - - pub(super) fn emit_return_f64_to_reg_impl(&mut self) { - self.state.emit(" fmov d0, x0"); - } - - pub(super) fn emit_return_int_to_reg_impl(&mut self) { - // x0 already holds the return value per AAPCS64 -- noop - } - - pub(super) fn current_return_type_impl(&self) -> IrType { - self.current_return_type - } - - pub(super) fn emit_get_return_f64_second_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("d1", slot.0, "str"); - } - } - - pub(super) fn emit_set_return_f64_second_impl(&mut self, 
src: &Operand) { - match src { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - self.emit_load_from_sp("d1", slot.0, "ldr"); - } - } - Operand::Const(IrConst::F64(f)) => { - let bits = f.to_bits(); - self.emit_load_imm64("x0", bits as i64); - self.state.emit(" fmov d1, x0"); - } - _ => { - self.operand_to_x0(src); - self.state.emit(" fmov d1, x0"); - } - } - } - - pub(super) fn emit_get_return_f32_second_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("s1", slot.0, "str"); - } - } - - pub(super) fn emit_set_return_f32_second_impl(&mut self, src: &Operand) { - match src { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - self.emit_load_from_sp("s1", slot.0, "ldr"); - } - } - Operand::Const(IrConst::F32(f)) => { - let bits = f.to_bits(); - self.emit_load_imm64("x0", bits as i64); - self.state.emit(" fmov s1, w0"); - } - _ => { - self.operand_to_x0(src); - self.state.emit(" fmov s1, w0"); - } - } - } - - pub(super) fn emit_get_return_f128_second_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("q1", slot.0, "str"); - self.state.track_f128_self(dest.0); - } - } - - pub(super) fn emit_set_return_f128_second_impl(&mut self, src: &Operand) { - self.emit_f128_operand_to_q0_full(src); - self.state.emit(" mov v1.16b, v0.16b"); - } -} diff --git a/src/backend/arm/codegen/variadic.rs b/src/backend/arm/codegen/variadic.rs deleted file mode 100644 index 6f8112be98..0000000000 --- a/src/backend/arm/codegen/variadic.rs +++ /dev/null @@ -1,359 +0,0 @@ -//! ArmCodegen: variadic function operations (va_arg, va_start, va_copy). 
- -use crate::ir::reexports::Value; -use crate::common::types::IrType; -use super::emit::{ArmCodegen, callee_saved_name}; - -impl ArmCodegen { - pub(super) fn emit_va_arg_impl(&mut self, dest: &Value, va_list_ptr: &Value, result_ty: IrType) { - let is_fp = result_ty.is_float(); - let is_f128 = result_ty.is_long_double(); - - if self.state.is_alloca(va_list_ptr.0) { - if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - self.emit_add_fp_offset("x1", slot.0); - } - } else if let Some(®) = self.reg_assignments.get(&va_list_ptr.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x1, {}", reg_name)); - } else if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - self.emit_load_from_sp("x1", slot.0, "ldr"); - } - - if is_f128 { - let label_id = self.state.next_label_id(); - let label_stack = format!(".Lva_stack_{}", label_id); - let label_done = format!(".Lva_done_{}", label_id); - - self.state.emit(" ldrsw x2, [x1, #28]"); - self.state.emit_fmt(format_args!(" tbz x2, #63, {}", label_stack)); - self.state.emit(" ldr x3, [x1, #16]"); - self.state.emit(" add x3, x3, x2"); - self.state.emit(" add w2, w2, #16"); - self.state.emit(" str w2, [x1, #28]"); - self.state.emit(" ldr q0, [x3]"); - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - self.state.emit_fmt(format_args!(" b {}", label_done)); - - self.state.emit_fmt(format_args!("{}:", label_stack)); - self.state.emit(" ldr x3, [x1]"); - self.state.emit(" add x3, x3, #15"); - self.state.emit(" and x3, x3, #-16"); - self.state.emit(" mov x4, x1"); - self.state.emit(" ldr q0, [x3]"); - self.state.emit(" add x3, x3, #16"); - self.state.emit(" str x3, [x4]"); - self.state.emit(" bl __trunctfdf2"); - self.state.emit(" fmov x0, d0"); - - self.state.emit_fmt(format_args!("{}:", label_done)); - self.state.reg_cache.invalidate_all(); - } else if is_fp { - let label_id = self.state.next_label_id(); - let label_stack = format!(".Lva_stack_{}", label_id); - let 
label_done = format!(".Lva_done_{}", label_id); - - self.state.emit(" ldrsw x2, [x1, #28]"); - self.state.emit_fmt(format_args!(" tbz x2, #63, {}", label_stack)); - self.state.emit(" ldr x3, [x1, #16]"); - self.state.emit(" add x3, x3, x2"); - self.state.emit(" add w2, w2, #16"); - self.state.emit(" str w2, [x1, #28]"); - if result_ty == IrType::F32 { - self.state.emit(" ldr w0, [x3]"); - } else { - self.state.emit(" ldr x0, [x3]"); - } - self.state.emit_fmt(format_args!(" b {}", label_done)); - - self.state.emit_fmt(format_args!("{}:", label_stack)); - self.state.emit(" ldr x3, [x1]"); - if result_ty == IrType::F32 { - self.state.emit(" ldr w0, [x3]"); - } else { - self.state.emit(" ldr x0, [x3]"); - } - self.state.emit(" add x3, x3, #8"); - self.state.emit(" str x3, [x1]"); - - self.state.emit_fmt(format_args!("{}:", label_done)); - } else { - let label_id = self.state.next_label_id(); - let label_stack = format!(".Lva_stack_{}", label_id); - let label_done = format!(".Lva_done_{}", label_id); - - self.state.emit(" ldrsw x2, [x1, #24]"); - self.state.emit_fmt(format_args!(" tbz x2, #63, {}", label_stack)); - self.state.emit(" ldr x3, [x1, #8]"); - self.state.emit(" add x3, x3, x2"); - self.state.emit(" add w2, w2, #8"); - self.state.emit(" str w2, [x1, #24]"); - self.state.emit(" ldr x0, [x3]"); - self.state.emit_fmt(format_args!(" b {}", label_done)); - - self.state.emit_fmt(format_args!("{}:", label_stack)); - self.state.emit(" ldr x3, [x1]"); - self.state.emit(" ldr x0, [x3]"); - self.state.emit(" add x3, x3, #8"); - self.state.emit(" str x3, [x1]"); - - self.state.emit_fmt(format_args!("{}:", label_done)); - } - - if let Some(slot) = self.state.get_slot(dest.0) { - self.emit_store_to_sp("x0", slot.0, "str"); - } - } - - pub(super) fn emit_va_start_impl(&mut self, va_list_ptr: &Value) { - if self.state.is_alloca(va_list_ptr.0) { - if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - self.emit_add_fp_offset("x0", slot.0); - } - } else if let Some(®) = 
self.reg_assignments.get(&va_list_ptr.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - self.emit_load_from_sp("x0", slot.0, "ldr"); - } - - let stack_offset = self.current_frame_size + self.va_named_stack_bytes as i64; - if stack_offset <= 4095 { - self.state.emit_fmt(format_args!(" add x1, x29, #{}", stack_offset)); - } else { - self.load_large_imm("x1", stack_offset); - self.state.emit(" add x1, x29, x1"); - } - self.state.emit(" str x1, [x0]"); - - let gr_top_offset = self.va_gp_save_offset + 64; - self.emit_add_sp_offset("x1", gr_top_offset); - self.state.emit(" str x1, [x0, #8]"); - - if self.general_regs_only { - self.state.emit(" str xzr, [x0, #16]"); - } else { - let vr_top_offset = self.va_fp_save_offset + 128; - self.emit_add_sp_offset("x1", vr_top_offset); - self.state.emit(" str x1, [x0, #16]"); - } - - let gr_offs: i32 = -((8 - self.va_named_gp_count as i32) * 8); - self.state.emit_fmt(format_args!(" mov w1, #{}", gr_offs)); - self.state.emit(" str w1, [x0, #24]"); - - let vr_offs: i32 = if self.general_regs_only { - 0 - } else { - -((8 - self.va_named_fp_count as i32) * 16) - }; - self.state.emit_fmt(format_args!(" mov w1, #{}", vr_offs)); - self.state.emit(" str w1, [x0, #28]"); - } - - pub(super) fn emit_va_copy_impl(&mut self, dest_ptr: &Value, src_ptr: &Value) { - if self.state.is_alloca(src_ptr.0) { - if let Some(src_slot) = self.state.get_slot(src_ptr.0) { - self.emit_add_fp_offset("x1", src_slot.0); - } - } else if let Some(®) = self.reg_assignments.get(&src_ptr.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x1, {}", reg_name)); - } else if let Some(src_slot) = self.state.get_slot(src_ptr.0) { - self.emit_load_from_sp("x1", src_slot.0, "ldr"); - } - if self.state.is_alloca(dest_ptr.0) { - if let Some(dest_slot) = self.state.get_slot(dest_ptr.0) { - self.emit_add_fp_offset("x0", 
dest_slot.0); - } - } else if let Some(®) = self.reg_assignments.get(&dest_ptr.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov x0, {}", reg_name)); - } else if let Some(dest_slot) = self.state.get_slot(dest_ptr.0) { - self.emit_load_from_sp("x0", dest_slot.0, "ldr"); - } - self.state.emit(" ldp x2, x3, [x1]"); - self.state.emit(" stp x2, x3, [x0]"); - self.state.emit(" ldp x2, x3, [x1, #16]"); - self.state.emit(" stp x2, x3, [x0, #16]"); - } - - /// Emit va_arg for struct types on AArch64 (AAPCS64). - /// - /// Per AAPCS64, composite types in variadic args are passed via GP registers. - /// A struct requiring N register slots (N = ceil(size/8)) must fit ENTIRELY - /// in the remaining GP register save area, or be read ENTIRELY from the stack - /// overflow area. It must never be split across the boundary. - /// - /// AAPCS64 va_list layout (32 bytes): - /// [+0] __stack : pointer to next stack (overflow) arg - /// [+8] __gr_top : pointer to top of GP register save area - /// [+16] __vr_top : pointer to top of FP register save area - /// [+24] __gr_offs : negative offset from __gr_top (i32) - /// [+28] __vr_offs : negative offset from __vr_top (i32) - /// - /// __gr_offs starts negative and advances toward 0. When it would become >= 0 - /// for the struct, we use the stack path instead. - pub(super) fn emit_va_arg_struct_impl(&mut self, dest_ptr: &Value, va_list_ptr: &Value, size: usize) { - let num_slots = size.div_ceil(8); - let total_reg_bytes = num_slots * 8; - - let label_id = self.state.next_label_id(); - let label_stack = format!(".Lva_struct_stack_{}", label_id); - let label_done = format!(".Lva_struct_done_{}", label_id); - - // Load va_list pointer into x1 - self.load_va_list_ptr(va_list_ptr, "x1"); - - // Load dest_ptr into x4 (scratch register) - self.load_dest_ptr(dest_ptr, "x4"); - - // Check if enough GP register slots remain for the entire struct. - // __gr_offs is a negative i32 at [va_list + 24]. 
- // We need: __gr_offs + total_reg_bytes <= 0 - // Equivalently: __gr_offs <= -total_reg_bytes - // Or: __gr_offs + total_reg_bytes is still negative (bit 63 set after sign-extend + add) - self.state.emit(" ldrsw x2, [x1, #24]"); // x2 = sign-extended __gr_offs - if total_reg_bytes <= 4095 { - self.state.emit_fmt(format_args!(" adds x3, x2, #{}", total_reg_bytes)); - } else { - self.load_large_imm("x3", total_reg_bytes as i64); - self.state.emit(" adds x3, x2, x3"); - } - // If x3 > 0 (not enough register slots for entire struct), use stack path. - self.state.emit_fmt(format_args!(" b.gt {}", label_stack)); - - // ==== Register path ==== - // Read all slots from the GP register save area. - // Base address: __gr_top + __gr_offs - self.state.emit(" ldr x5, [x1, #8]"); // x5 = __gr_top - self.state.emit(" add x5, x5, x2"); // x5 = __gr_top + __gr_offs (source addr) - - // Copy struct data from register save area to dest - for i in 0..num_slots { - let offset = (i * 8) as i64; - if offset + 8 <= size as i64 { - // Full 8-byte slot - if offset == 0 { - self.state.emit(" ldr x6, [x5]"); - self.state.emit(" str x6, [x4]"); - } else { - self.state.emit_fmt(format_args!(" ldr x6, [x5, #{}]", offset)); - self.state.emit_fmt(format_args!(" str x6, [x4, #{}]", offset)); - } - } else { - // Partial last slot: copy remaining bytes - let remaining = size - i * 8; - self.emit_partial_struct_copy(offset, remaining, "x5", "x4"); - } - } - - // Advance __gr_offs by total_reg_bytes - // x3 already holds __gr_offs + total_reg_bytes from the adds above - self.state.emit(" str w3, [x1, #24]"); - self.state.emit_fmt(format_args!(" b {}", label_done)); - - // ==== Stack path ==== - self.state.emit_fmt(format_args!("{}:", label_stack)); - // When falling through to stack, we must also set __gr_offs to 0 - // to indicate no more GP registers available (per AAPCS64). 
- self.state.emit(" str wzr, [x1, #24]"); - - // Read from __stack - self.state.emit(" ldr x5, [x1]"); // x5 = __stack (source addr) - - // Align __stack to 8 bytes (structs on stack are 8-byte aligned per AAPCS64) - self.state.emit(" add x5, x5, #7"); - self.state.emit(" and x5, x5, #-8"); - - // Copy struct data from stack to dest - for i in 0..num_slots { - let offset = (i * 8) as i64; - if offset + 8 <= size as i64 { - if offset == 0 { - self.state.emit(" ldr x6, [x5]"); - self.state.emit(" str x6, [x4]"); - } else { - self.state.emit_fmt(format_args!(" ldr x6, [x5, #{}]", offset)); - self.state.emit_fmt(format_args!(" str x6, [x4, #{}]", offset)); - } - } else { - let remaining = size - i * 8; - self.emit_partial_struct_copy(offset, remaining, "x5", "x4"); - } - } - - // Advance __stack past the struct (8-byte aligned) - let advance = num_slots * 8; - if advance <= 4095 { - self.state.emit_fmt(format_args!(" add x5, x5, #{}", advance)); - } else { - self.load_large_imm("x6", advance as i64); - self.state.emit(" add x5, x5, x6"); - } - self.state.emit(" str x5, [x1]"); - - // ==== Done ==== - self.state.emit_fmt(format_args!("{}:", label_done)); - self.state.reg_cache.invalidate_all(); - } - - /// Load the va_list pointer into the specified register. - fn load_va_list_ptr(&mut self, va_list_ptr: &Value, dest_reg: &str) { - if self.state.is_alloca(va_list_ptr.0) { - if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - self.emit_add_fp_offset(dest_reg, slot.0); - } - } else if let Some(®) = self.reg_assignments.get(&va_list_ptr.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov {}, {}", dest_reg, reg_name)); - } else if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - self.emit_load_from_sp(dest_reg, slot.0, "ldr"); - } - } - - /// Load the destination pointer into the specified register. 
- fn load_dest_ptr(&mut self, dest_ptr: &Value, dest_reg: &str) { - if self.state.is_alloca(dest_ptr.0) { - if let Some(slot) = self.state.get_slot(dest_ptr.0) { - self.emit_add_fp_offset(dest_reg, slot.0); - } - } else if let Some(®) = self.reg_assignments.get(&dest_ptr.0) { - let reg_name = callee_saved_name(reg); - self.state.emit_fmt(format_args!(" mov {}, {}", dest_reg, reg_name)); - } else if let Some(slot) = self.state.get_slot(dest_ptr.0) { - self.emit_load_from_sp(dest_reg, slot.0, "ldr"); - } - } - - /// Emit byte-by-byte copy for a partial struct slot (last slot with < 8 bytes). - fn emit_partial_struct_copy(&mut self, base_offset: i64, remaining: usize, src_reg: &str, dst_reg: &str) { - let mut copied = 0usize; - // Copy 4 bytes if possible - if remaining >= 4 { - let off = base_offset + copied as i64; - if off == 0 { - self.state.emit_fmt(format_args!(" ldr w6, [{}]", src_reg)); - self.state.emit_fmt(format_args!(" str w6, [{}]", dst_reg)); - } else { - self.state.emit_fmt(format_args!(" ldr w6, [{}, #{}]", src_reg, off)); - self.state.emit_fmt(format_args!(" str w6, [{}, #{}]", dst_reg, off)); - } - copied += 4; - } - // Copy 2 bytes if possible - if remaining - copied >= 2 { - let off = base_offset + copied as i64; - self.state.emit_fmt(format_args!(" ldrh w6, [{}, #{}]", src_reg, off)); - self.state.emit_fmt(format_args!(" strh w6, [{}, #{}]", dst_reg, off)); - copied += 2; - } - // Copy 1 byte if remaining - if remaining - copied >= 1 { - let off = base_offset + copied as i64; - self.state.emit_fmt(format_args!(" ldrb w6, [{}, #{}]", src_reg, off)); - self.state.emit_fmt(format_args!(" strb w6, [{}, #{}]", dst_reg, off)); - } - } -} diff --git a/src/backend/arm/ld_stub.sh b/src/backend/arm/ld_stub.sh deleted file mode 100755 index c6b3cd36f3..0000000000 --- a/src/backend/arm/ld_stub.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -# Placeholder linker for AArch64 backend. -# Set MY_LD to point to this script to test the custom linker integration. 
-# TODO: Replace this stub with a real linker implementation. -echo "ERROR: AArch64 custom linker stub called but not yet implemented." >&2 -echo "Arguments: $@" >&2 -exit 1 diff --git a/src/backend/arm/linker/README.md b/src/backend/arm/linker/README.md deleted file mode 100644 index 0d22789808..0000000000 --- a/src/backend/arm/linker/README.md +++ /dev/null @@ -1,633 +0,0 @@ -# AArch64 Built-in Linker -- Design Document - -## Overview - -The built-in AArch64 linker links ELF64 relocatable object files (`.o`) and -static archives (`.a`) into ELF64 executables for AArch64 Linux, supporting -both static and dynamic linking. It can also produce shared libraries -(`ET_DYN` / `.so` files) via `link_shared()`. It replaces the external `ld` -dependency when the `gcc_linker` Cargo feature is not enabled (the default), making the -compiler fully self-hosting. - -The linker implements the complete linking pipeline: ELF object parsing, -archive member extraction, symbol resolution, section merging, virtual address -layout, GOT/PLT construction, TLS handling, IFUNC support, relocation -application, dynamic section emission, and final ELF output. - -The implementation spans roughly 4,000 lines of Rust across ten modules, plus -shared infrastructure in `linker_common` (`GlobalSymbolOps` trait, -`OutputSection` / `InputSection` types, section merging, symbol registration, -common symbol allocation, archive loading, library resolution, section name -mapping, `.eh_frame` processing, and dynamic symbol resolution). 
- -``` - AArch64 Built-in Linker - ============================================================ - - .o files .a archives -l libraries .so shared libs - \ | / / - v v v v - +------------------------------------------+ - | elf.rs (~75 lines) | - | Type aliases + thin wrappers to | - | linker_common; AArch64 reloc consts | - +------------------------------------------+ - | - v - +------------------------------------------+ - | input.rs (~90 lines) | - | File loading: load_file, resolve_lib | - +------------------------------------------+ - | - v - +------------------------------------------+ - | types.rs (~93) | plt_got.rs (~130) | - | GlobalSymbol, | PLT/GOT list | - | arch constants | construction | - +------------------------------------------+ - | - v - +------------------------------------------+ - | link.rs (~411 lines) | - | Orchestrator: link_builtin, | - | link_shared entry points | - +------------------------------------------+ - / | \ - v v v - +-----------+ +-----------+ +-----------+ - | emit_ | | emit_ | | emit_ | - | dynamic | | static | | shared | - | (~869) | | (~645) | | (~1098) | - +-----------+ +-----------+ +-----------+ - | - v - +------------------------------------------+ - | reloc.rs (~540 lines) | - | Relocation Application: 40+ reloc | - | types, TLS relaxation, GOT refs | - +------------------------------------------+ - | - v - ELF64 executable on disk -``` - - ---- - -## Public Entry Points - -The linker has two public entry points: - -```rust -// mod.rs -- static and dynamic executable linking -pub fn link_builtin( - object_files: &[&str], // Paths to .o files from the compiler - output_path: &str, // Output executable path - user_args: &[String], // Additional flags: -L, -l, -Wl,... - lib_paths: &[&str], // Library search paths (from common.rs) - needed_libs: &[&str], // Default libraries to link (e.g., "gcc", "c") - crt_objects_before: &[&str], // CRT objects before user code (crt1.o, crti.o, ...) 
- crt_objects_after: &[&str], // CRT objects after user code (crtend.o, crtn.o) - is_static: bool, // Static vs dynamic linking -) -> Result<(), String> - -// mod.rs -- shared library (.so) output -pub fn link_shared( - object_files: &[&str], - output_path: &str, - user_args: &[String], - lib_paths: &[&str], -) -> Result<(), String> -``` - -CRT object discovery, library path resolution, and the `-nostdlib`/`-static` -flags are handled by `common.rs`'s `resolve_builtin_link_setup()` before -calling into the linker. The linker receives pre-resolved paths and loads -them in order. - - ---- - -## Stage 1: ELF Parsing (`elf.rs` / `linker_common`) - -### Purpose - -Read and decode ELF64 relocatable object files, static archives, and minimal -linker scripts. The actual parsing logic lives in the shared `linker_common` -module; `elf.rs` provides AArch64-specific relocation constants and re-exports -shared types under local names via type aliases. - -### Key Data Structures - -ELF64 types are defined in `linker_common` and re-exported via type aliases: - -| Type | Alias | Role | -|------|-------|------| -| `Elf64Object` | `ElfObject` | A fully parsed object file: sections, symbols, raw section data, relocations indexed by section. | -| `Elf64Section` | `SectionHeader` | Parsed `Elf64_Shdr`: name, type, flags, offset, size, link, info, alignment, entsize. | -| `Elf64Symbol` | `Symbol` | Parsed `Elf64_Sym`: name, info (binding + type), other (visibility), shndx, value, size. | -| `Elf64Rela` | `Rela` | Parsed `Elf64_Rela`: offset, sym_idx, rela_type, addend. | - -### Object Parsing (`parse_object`) - -Delegates to `linker_common::parse_elf64_object(data, source_name, EM_AARCH64)`. - -### Archive and Linker Script Parsing - -Archive parsing (`parse_archive_members`, `parse_thin_archive_members`) and -linker script parsing (`parse_linker_script_entries`) are provided by the -shared `crate::backend::elf` module. 
- - ---- - -## Stage 2: Orchestration (`link.rs` + `input.rs` + `plt_got.rs`) - -### Purpose - -These modules form the linker driver. `link.rs` coordinates the pipeline: -file loading (delegated to `input.rs`), symbol resolution, section merging, -address layout, PLT/GOT construction (delegated to `plt_got.rs`), and -dispatching to the appropriate emission module. - -### Key Data Structures - -| Type | Role | -|------|------| -| `OutputSection` | Shared type from `linker_common`: merged output section with name, type, flags, alignment, list of `InputSection` references, merged data buffer, assigned virtual address and file offset, memory size. | -| `InputSection` | Shared type from `linker_common`: reference to one input section with object index, section index, output offset within the merged section, size. | -| `GlobalSymbol` | ARM-specific resolved global symbol: implements `linker_common::GlobalSymbolOps` trait. Contains final value (address), size, info byte, defining object index, section index, plus dynamic linking fields (`from_lib`, `plt_idx`, `got_idx`, `is_dynamic`, `copy_reloc`, `lib_sym_value`). | - -### Constants - -``` -BASE_ADDR = 0x400000 -- Base virtual address for the executable -PAGE_SIZE = 0x10000 -- 64 KB (AArch64 linker page alignment) -INTERP = "/lib/ld-linux-aarch64.so.1" -- dynamic linker path -``` - -### Linking Algorithm -- Step by Step - -``` -link_builtin(object_files, output_path, user_args, lib_paths, - needed_libs, crt_before, crt_after, is_static): - - 1. ARGUMENT PARSING - Parse user_args for -L (extra library paths), -l (libraries), - -Wl,--defsym, -Wl,--export-dynamic, -rdynamic, etc. - - 2. FILE LOADING - a. Load CRT objects (before): pre-resolved by common.rs - b. Load user object files from object_files[] - c. Load objects/archives/libraries from user_args (-l flags) - d. Load CRT objects (after): pre-resolved by common.rs - e. 
Group-load default libraries from needed_libs[] - (iterate until no new symbols resolved -- handles circular deps) - f. For dynamic linking: resolve remaining undefs against - system .so files (libc.so.6, libm.so.6, libgcc_s.so.1) - - 3. SYMBOL RESOLUTION (linker_common::register_symbols_elf64, per object) - - Skip FILE, SECTION, and local symbols - - Defined symbols: insert or replace if existing is - undefined, dynamic, or weak-vs-global - - COMMON symbols: insert if not already defined - - Undefined symbols: insert placeholder if not present - - 4. DEFSYM APPLICATION - Apply --defsym=ALIAS=TARGET definitions (symbol aliasing). - - 5. GARBAGE COLLECTION (if --gc-sections) - BFS reachability from entry points (_start, main, __libc_csu_init, - __libc_csu_fini) and init/fini arrays; unreachable sections are - excluded from the link. - - 6. UNRESOLVED SYMBOL CHECK - Error on undefined non-weak symbols, excluding linker-defined - names recognized by linker_common::is_linker_defined_symbol(). - - 7. SECTION MERGING (linker_common::merge_sections_elf64) - Delegates to shared implementation that: - a. Maps input section names to output names via map_section_name() - b. For each allocatable input section, appends to matching output section - c. Calculates output offsets within each merged section - d. Sorts output sections: RO -> Exec -> RW(progbits) -> RW(nobits) - e. Builds section_map: (obj_idx, sec_idx) -> (out_idx, offset) - - 8. COMMON SYMBOL ALLOCATION (linker_common::allocate_common_symbols_elf64) - Allocate SHN_COMMON symbols into .bss with proper alignment. - - 9. EMIT - If dynamic symbols present and !is_static: - create_plt_got() then emit_dynamic_executable() - Otherwise: - emit_executable() (static linking) -``` - -### Memory Layout (Static Executable) - -The static linker produces a two-segment layout. Note: `emit_executable()` -places executable sections first, then read-only data, regardless of the -earlier section sort order. 
- -``` - Virtual Address Space - ==================================================================== - - 0x400000 +========================+ ----+ - | ELF Header (64 B) | | - | Program Headers | | - +------------------------+ | - | .text | | LOAD segment 1 - | (executable code) | | RX (Read + Execute) - +------------------------+ | - | .rodata | | - | (read-only data) | | - +------------------------+ | - | .gcc_except_table | | - | .eh_frame | | - +------------------------+ | - | .eh_frame_hdr | | - +------------------------+ | - | [IPLT stubs] | | (in RX padding gap) - +========================+ ----+ - | (page alignment gap) | <- 64 KB aligned - +========================+ ----+ - | .tdata | | - | (TLS initialized) | | - +------------------------+ | - | .init_array | | - | .fini_array | | - +------------------------+ | LOAD segment 2 - | .data.rel.ro | | RW (Read + Write) - | .data | | - +------------------------+ | - | .got | | (built by linker) - +------------------------+ | - | [IPLT GOT slots] | | - | [.rela.iplt entries] | | - +========================+ ----+ - | .bss | (no file space, only memsize) - | .tbss | - +========================+ - - Program Headers (up to 5): - LOAD RX: file offset 0, vaddr BASE_ADDR, filesz=rx_filesz - LOAD RW: file offset rw_page_offset, vaddr=rw_page_addr - TLS: .tdata + .tbss (if present) - GNU_STACK: RW, no exec - GNU_EH_FRAME: .eh_frame_hdr (if .eh_frame present) -``` - -### GOT (Global Offset Table) Construction - -The linker builds a GOT for two purposes: - -1. **Regular GOT entries** (`R_AARCH64_ADR_GOT_PAGE` / `R_AARCH64_LD64_GOT_LO12_NC`): - 8-byte slots containing the absolute address of the target symbol. - -2. **TLS IE GOT entries** (`R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21` / - `R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC`): 8-byte slots containing the - TP-relative offset of the TLS variable (computed as - `sym_addr - tls_base + 16` per AArch64 Variant 1 TLS). 
- -The `collect_got_symbols()` function in `reloc.rs` scans all relocations -to determine which symbols need GOT entries, and what kind (`Regular` or -`TlsIE`). GOT entries are allocated in the RW segment, 8-byte aligned. - -### IFUNC / IPLT Support - -The linker handles `STT_GNU_IFUNC` symbols (indirect functions whose runtime -address is determined by a resolver function): - -1. **Identify IFUNC symbols** in the global symbol table. -2. **Allocate IPLT GOT slots** (one 8-byte slot per IFUNC) in the RW segment. -3. **Generate `.rela.iplt` entries** with `R_AARCH64_IRELATIVE` relocations - pointing to the resolver function. -4. **Generate IPLT PLT stubs** in the RX gap between text and data segments. - Each stub is 16 bytes: - ``` - ADRP x16, page_of(got_slot) - LDR x17, [x16, #lo12(got_slot)] - BR x17 - NOP - ``` -5. **Redirect IFUNC symbol addresses** to point to the PLT stub instead of - the resolver. The symbol type is changed from `STT_GNU_IFUNC` to - `STT_FUNC`. - -### Linker-Defined Symbols - -The following symbols are automatically provided (via -`linker_common::get_standard_linker_symbols()`): - -| Symbol | Value | -|--------|-------| -| `__dso_handle` | `BASE_ADDR` | -| `_DYNAMIC` | 0 (no dynamic section in static executables) | -| `_GLOBAL_OFFSET_TABLE_` | GOT base address | -| `__init_array_start` / `__init_array_end` | `.init_array` bounds | -| `__fini_array_start` / `__fini_array_end` | `.fini_array` bounds | -| `__preinit_array_start` / `__preinit_array_end` | Same as init_array start | -| `__ehdr_start` | `BASE_ADDR` | -| `__executable_start` | `BASE_ADDR` | -| `_etext` / `etext` | End of text (RX) segment | -| `__data_start` / `data_start` | Start of RW data segment | -| `_init` / `_fini` | Address of `.init` / `.fini` sections | -| `__rela_iplt_start` / `__rela_iplt_end` | IRELATIVE relocation table bounds | -| `__bss_start` / `_edata` | BSS start address | -| `_end` / `__end` | BSS end address | - - ---- - -## Stage 3: Relocation Application 
(`reloc.rs`) - -### Purpose - -After all sections have been laid out and symbol addresses are known, apply -every relocation from every input object to the output buffer. This module -also handles TLS model relaxation and GOT-indirect references. - -### Key Data Structures - -| Type | Role | -|------|------| -| `TlsInfo` | TLS segment base address and total size. | -| `GotInfo` | GOT base address and a map of symbol keys to entry indices. | -| `GotEntryKind` | Whether a GOT entry is `Regular` (absolute address) or `TlsIE` (TP offset). | - -### Symbol Resolution (`resolve_sym`) - -``` -resolve_sym(obj_idx, sym, globals, section_map, output_sections): - if sym is STT_SECTION: - return output_sections[mapped_section].addr + section_offset - if sym is non-local and in globals and defined: - return global value // includes linker-defined symbols - if sym is non-local and weak: - return 0 - if sym is undefined: - return 0 - if sym is SHN_ABS: - return sym.value - otherwise: - return mapped section addr + section offset + sym.value -``` - -### Supported Relocation Types - -The linker handles 40+ AArch64 relocation types, organized by category: - -#### Absolute Relocations - -| Type | ELF # | Formula | Usage | -|------|-------|---------|-------| -| `R_AARCH64_ABS64` | 257 | S + A | 64-bit data pointer | -| `R_AARCH64_ABS32` | 258 | S + A | 32-bit data pointer | -| `R_AARCH64_ABS16` | 259 | S + A | 16-bit data value | - -#### PC-Relative Relocations - -| Type | ELF # | Formula | Usage | -|------|-------|---------|-------| -| `R_AARCH64_PREL64` | 260 | S + A - P | 64-bit PC-relative | -| `R_AARCH64_PREL32` | 261 | S + A - P | 32-bit PC-relative (jump tables) | -| `R_AARCH64_PREL16` | 262 | S + A - P | 16-bit PC-relative | - -#### Page-Relative and Immediate Relocations - -| Type | ELF # | Formula | Usage | -|------|-------|---------|-------| -| `R_AARCH64_ADR_PREL_PG_HI21` | 275 | Page(S+A) - Page(P) | ADRP instruction | -| `R_AARCH64_ADR_PREL_LO21` | 274 | S + A - P | ADR 
instruction | -| `R_AARCH64_ADD_ABS_LO12_NC` | 277 | (S+A) & 0xFFF | ADD :lo12: | -| `R_AARCH64_LDST8_ABS_LO12_NC` | 278 | (S+A) & 0xFFF | Byte load/store | -| `R_AARCH64_LDST16_ABS_LO12_NC` | 284 | (S+A) & 0xFFF >> 1 | Halfword load/store | -| `R_AARCH64_LDST32_ABS_LO12_NC` | 285 | (S+A) & 0xFFF >> 2 | Word load/store | -| `R_AARCH64_LDST64_ABS_LO12_NC` | 286 | (S+A) & 0xFFF >> 3 | Doubleword load/store | -| `R_AARCH64_LDST128_ABS_LO12_NC` | 299 | (S+A) & 0xFFF >> 4 | Quadword load/store | - -#### Branch Relocations - -| Type | ELF # | Formula | Usage | -|------|-------|---------|-------| -| `R_AARCH64_CALL26` | 283 | (S+A-P) >> 2 | BL instruction (26-bit) | -| `R_AARCH64_JUMP26` | 282 | (S+A-P) >> 2 | B instruction (26-bit) | -| `R_AARCH64_CONDBR19` | 280 | (S+A-P) >> 2 | Conditional branch (19-bit) | -| `R_AARCH64_TSTBR14` | 279 | (S+A-P) >> 2 | Test-and-branch (14-bit) | - -Special: when a `CALL26`/`JUMP26` target resolves to address 0 (undefined -weak symbol), the instruction is replaced with `NOP` (0xd503201f). - -#### MOVW Relocations - -| Type | ELF # | Formula | Usage | -|------|-------|---------|-------| -| `R_AARCH64_MOVW_UABS_G0[_NC]` | 263/264 | (S+A) & 0xFFFF | MOVZ/MOVK bits [15:0] | -| `R_AARCH64_MOVW_UABS_G1_NC` | 265 | (S+A) >> 16 & 0xFFFF | MOVK bits [31:16] | -| `R_AARCH64_MOVW_UABS_G2_NC` | 266 | (S+A) >> 32 & 0xFFFF | MOVK bits [47:32] | -| `R_AARCH64_MOVW_UABS_G3` | 267 | (S+A) >> 48 & 0xFFFF | MOVK bits [63:48] | - -#### GOT Relocations - -| Type | ELF # | Description | -|------|-------|-------------| -| `R_AARCH64_ADR_GOT_PAGE` | 311 | ADRP to page containing GOT entry | -| `R_AARCH64_LD64_GOT_LO12_NC` | 312 | LDR from GOT entry (low 12 bits) | - -In static linking, the GOT is a real data structure in the RW segment -populated at link time (not lazily at runtime). - -#### TLS Local Exec (LE) Relocations - -Used when the TLS variable is in the executable itself (most common in -static linking). 
The TP (Thread Pointer) offset is computed as: - -``` -tp_offset = (sym_addr - tls_start_addr) + 16 // AArch64 Variant 1 -``` - -| Type | ELF # | Description | -|------|-------|-------------| -| `R_AARCH64_TLSLE_ADD_TPREL_HI12` | 549 | ADD, high 12 bits of TP offset | -| `R_AARCH64_TLSLE_ADD_TPREL_LO12[_NC]` | 550/551 | ADD, low 12 bits of TP offset | -| `R_AARCH64_TLSLE_MOVW_TPREL_G0[_NC]` | 544/545 | MOVZ/MOVK, bits [15:0] | -| `R_AARCH64_TLSLE_MOVW_TPREL_G1[_NC]` | 546/547 | MOVK, bits [31:16] | -| `R_AARCH64_TLSLE_MOVW_TPREL_G2` | 548 | MOVK, bits [47:32] | - -#### TLS Initial Exec (IE) via GOT - -Instead of relaxing ADRP+LDR to MOVZ+MOVK (which can break if different -registers are used), the linker uses real GOT entries pre-populated with -TP offsets: - -| Type | ELF # | Description | -|------|-------|-------------| -| `R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21` | 541 | ADRP to GOT page holding TP offset | -| `R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC` | 542 | LDR from GOT entry | - -#### TLS Descriptor (TLSDESC) Relaxation to LE - -For static linking, TLSDESC sequences are relaxed to direct TP-offset -computation: - -| Type | ELF # | Relaxation | -|------|-------|------------| -| `R_AARCH64_TLSDESC_ADR_PAGE21` | 562 | ADRP -> MOVZ Xd, #tprel_g1, LSL #16 | -| `R_AARCH64_TLSDESC_LD64_LO12` | 563 | LDR -> MOVK Xd, #tprel_lo | -| `R_AARCH64_TLSDESC_ADD_LO12` | 564 | ADD -> NOP | -| `R_AARCH64_TLSDESC_CALL` | 569 | BLR -> NOP | - -#### TLS General Dynamic (GD) Relaxation to LE - -| Type | ELF # | Relaxation | -|------|-------|------------| -| `R_AARCH64_TLSGD_ADR_PAGE21` | 513 | ADRP -> MOVZ Xd, #tprel_g1, LSL #16 | -| `R_AARCH64_TLSGD_ADD_LO12_NC` | 514 | ADD -> MOVK Xd, #tprel_lo | - -### Instruction Patching Helpers - -The relocation module includes helpers that patch individual instruction -fields without disturbing other bits: - -| Helper | Field Modified | -|--------|---------------| -| `encode_adrp()` | immhi[23:5] and immlo[30:29] of ADRP | -| `encode_adr()` | 
immhi[23:5] and immlo[30:29] of ADR | -| `encode_add_imm12()` | imm12[21:10] of ADD immediate | -| `encode_ldst_imm12()` | imm12[21:10] of LDR/STR, scaled by access size | -| `encode_movw()` | imm16[20:5] of MOVZ/MOVK | - - ---- - -## Archive and Library Handling - -### File Loading Dispatch (`load_file`) - -The linker dispatches file loading based on format detection: - -1. **Archives** (`!\n` magic): parse members, selectively extract -2. **Thin archives** (`!\n` magic): members are external files -3. **Linker scripts** (non-ELF text): parse `GROUP`/`INPUT` directives, - recursively load referenced files and `-l` libraries -4. **Shared libraries** (`ET_DYN`): load dynamic symbols (skipped if static) -5. **Relocatable objects** (`ET_REL`): parse and register symbols - -### Archive Loading Strategy - -Archives use **selective extraction with iterative resolution**, matching -the behavior of traditional `ld --start-group`. Archive and thin archive -loading is delegated to `linker_common::load_archive_elf64()` and -`linker_common::load_thin_archive_elf64()`, which implement the shared -algorithm: parse members, filter by `e_machine`, iterate until stable -extracting members that resolve currently-undefined symbols. - -### Default Library Group Loading - -The caller (common.rs) provides `needed_libs` (e.g., `["gcc", "gcc_eh", "c"]`). -The linker resolves these to archive paths and loads them in a group-loading -loop. This handles circular dependencies between these libraries: - -``` -repeat: - prev_count = objects.len() - for each resolved library archive: - load_file(archive) // only extracts members that resolve undefs - if objects.len() == prev_count: - break // stable -- no new members pulled in -``` - -### Library Resolution (`resolve_lib`) - -Libraries specified with `-l` are searched via `linker_common::resolve_lib()` -across all library paths (user `-L` paths first, then system paths provided -by common.rs). 
In static mode, `.a` is preferred; in dynamic mode, `.so` is -preferred. The special `-l:filename` syntax searches for an exact filename. - - ---- - -## Design Decisions and Trade-offs - -### 1. Static and Dynamic Linking - -The linker supports both static executables (`ET_EXEC`, the default with -`-static`) and dynamically-linked executables with PLT/GOT, `.dynamic` -section, `DT_*` tags, `.interp`, `.gnu.hash`, and copy relocations. It -also produces shared libraries (`ET_DYN`) with `R_AARCH64_RELATIVE`, -`R_AARCH64_JUMP_SLOT`, and `R_AARCH64_GLOB_DAT` relocations, enabling -full `dlopen()` support (e.g., PostgreSQL extension modules). - -### 2. Two-Segment Layout - -The output uses exactly two `PT_LOAD` segments (RX and RW) plus optional -TLS, GNU_STACK, and GNU_EH_FRAME segments. This is the minimal viable -layout. The 64 KB page alignment (`PAGE_SIZE = 0x10000`) accommodates -AArch64 systems with either 4 KB or 64 KB page sizes. - -### 3. Real GOT for All GOT-Based Relocations - -Rather than relaxing `ADRP+LDR` GOT sequences to `ADRP+ADD` (which would -save memory but requires verifying instruction sequences), the linker -maintains a real GOT in the RW segment. GOT entries are populated at link -time with final addresses. This is conservative but correct -- the `LDR` -instruction genuinely loads from memory, and converting it to `ADD` would -require instruction replacement. - -### 4. TLS IE via GOT (Not MOVZ/MOVK Relaxation) - -TLS Initial Exec relocations use real GOT entries containing pre-computed -TP offsets, rather than relaxing to `MOVZ+MOVK` instruction sequences. The -relaxation approach was found to be fragile because the ADRP and LDR -instructions may use different registers, and the relaxed MOVZ+MOVK must -target the same register as the original LDR destination. - -### 5. TLSDESC and TLSGD Relaxation to LE - -For static linking, both TLSDESC and General Dynamic TLS access patterns are -relaxed to Local Exec. 
The TLSDESC 4-instruction sequence -(ADRP + LDR + ADD + BLR) is replaced with (MOVZ + MOVK + NOP + NOP). -This is correct because in a static executable, all TLS variables are in the -executable's own TLS block. - -### 6. IFUNC Handling via IPLT - -GNU IFUNC symbols (where the symbol resolves to a "resolver" function that -returns the actual implementation address at runtime) are handled by -generating IPLT stubs and IRELATIVE relocations. The glibc startup code -processes these relocations to fill the GOT slots with the actual function -addresses returned by the resolvers. - -### 7. No Section Headers in Output - -The output executable contains no section header table (`e_shnum = 0`). -This is valid per the ELF specification (section headers are optional for -executables) and reduces output size. Tools like `objdump -d` still work -by following program headers. - -### 8. Diagnostic Support - -Setting `LINKER_DEBUG=1` enables verbose tracing of object loading, symbol -resolution, section layout, GOT allocation, and final addresses. -`LINKER_DEBUG_LAYOUT=1` adds section-by-section layout details, and -`LINKER_DEBUG_TLS=1` traces TLS relocation processing. 
- - ---- - -## File Inventory - -| File | Lines | Purpose | -|------|-------|---------| -| `mod.rs` | ~40 | Module declarations and public re-exports (`link_builtin`, `link_shared`) | -| `types.rs` | ~93 | `GlobalSymbol` struct with `GlobalSymbolOps` impl, arch constants (`BASE_ADDR`, `PAGE_SIZE`, `INTERP`), `arm_should_replace_extra` | -| `elf.rs` | ~75 | AArch64 relocation constants (26 types); type aliases delegating to `linker_common` for ELF64 parsing | -| `input.rs` | ~91 | File loading dispatch: `load_file`, `resolve_lib`, `resolve_lib_prefer_shared` | -| `plt_got.rs` | ~130 | PLT/GOT entry list construction from relocation scanning | -| `link.rs` | ~411 | Orchestration: `link_builtin` and `link_shared` entry points, dynamic symbol resolution, library group loading | -| `emit_dynamic.rs` | ~869 | Dynamic executable emission: PLT/GOT/.dynamic section, address layout, copy relocations | -| `emit_shared.rs` | ~1,098 | Shared library (`.so`) emission: PIC layout, `R_AARCH64_RELATIVE`/`JUMP_SLOT`/`GLOB_DAT`, RELRO | -| `emit_static.rs` | ~645 | Static executable emission: IPLT/IRELATIVE, two-segment layout | -| `reloc.rs` | ~540 | Relocation application (40+ types), TLS relaxation, GOT/TLS-IE references, instruction field patching helpers | -| **Total** | **~4,000** | (plus shared infrastructure in `linker_common`) | diff --git a/src/backend/arm/linker/elf.rs b/src/backend/arm/linker/elf.rs deleted file mode 100644 index 41278f166a..0000000000 --- a/src/backend/arm/linker/elf.rs +++ /dev/null @@ -1,75 +0,0 @@ -//! ELF64 parsing for the AArch64 linker. -//! -//! This module re-exports the shared ELF64 types and parser from `linker_common`, -//! plus provides AArch64-specific relocation constants. The actual parsing logic -//! lives in the shared module to avoid duplication with x86 and RISC-V. - -// Re-export shared ELF constants so existing callers (mod.rs, reloc.rs) -// continue to work via `use super::elf::*`. 
-pub use crate::backend::elf::{ - ELF_MAGIC, ELFCLASS64, ELFDATA2LSB, ET_EXEC, ET_DYN, EM_AARCH64, - SHT_NOBITS, - SHF_WRITE, SHF_ALLOC, SHF_EXECINSTR, SHF_TLS, - STB_GLOBAL, STB_WEAK, - STT_OBJECT, STT_FUNC, STT_SECTION, STT_TLS, STT_GNU_IFUNC, - SHN_UNDEF, SHN_ABS, SHN_COMMON, - PT_LOAD, PT_TLS, PT_GNU_STACK, PT_GNU_EH_FRAME, PT_INTERP, PT_DYNAMIC, PT_PHDR, - PF_X, PF_W, PF_R, - read_u16, read_u32, - is_thin_archive, - parse_linker_script_entries, LinkerScriptEntry, - LinkerSymbolAddresses, get_standard_linker_symbols, - DT_NEEDED, DT_SONAME, DT_STRTAB, DT_SYMTAB, DT_STRSZ, DT_SYMENT, - DT_DEBUG, DT_PLTGOT, DT_PLTRELSZ, DT_PLTREL, DT_JMPREL, - DT_RELA, DT_RELASZ, DT_RELAENT, DT_GNU_HASH, - DT_INIT_ARRAY, DT_INIT_ARRAYSZ, DT_FINI_ARRAY, DT_FINI_ARRAYSZ, - DT_NULL, DT_RELACOUNT, - DT_FLAGS, DF_BIND_NOW, DT_FLAGS_1, DF_1_NOW, - w16, w32, w64, write_bytes, wphdr, -}; - -use crate::backend::linker_common; - -// ── AArch64 relocation types ─────────────────────────────────────────── - -pub const R_AARCH64_NONE: u32 = 0; -pub const R_AARCH64_ABS64: u32 = 257; // S + A -pub const R_AARCH64_ABS32: u32 = 258; // S + A (32-bit) -pub const R_AARCH64_ABS16: u32 = 259; // S + A (16-bit) -pub const R_AARCH64_PREL64: u32 = 260; // S + A - P -pub const R_AARCH64_PREL32: u32 = 261; // S + A - P -pub const R_AARCH64_PREL16: u32 = 262; // S + A - P -pub const R_AARCH64_ADR_PREL_PG_HI21: u32 = 275; // Page(S+A) - Page(P) -pub const R_AARCH64_ADR_PREL_LO21: u32 = 274; // S + A - P -pub const R_AARCH64_ADD_ABS_LO12_NC: u32 = 277; // (S + A) & 0xFFF -pub const R_AARCH64_LDST8_ABS_LO12_NC: u32 = 278; -pub const R_AARCH64_LDST16_ABS_LO12_NC: u32 = 284; -pub const R_AARCH64_LDST32_ABS_LO12_NC: u32 = 285; -pub const R_AARCH64_LDST64_ABS_LO12_NC: u32 = 286; -pub const R_AARCH64_LDST128_ABS_LO12_NC: u32 = 299; -pub const R_AARCH64_JUMP26: u32 = 282; // S + A - P (26-bit B) -pub const R_AARCH64_CALL26: u32 = 283; // S + A - P (26-bit BL) -pub const R_AARCH64_MOVW_UABS_G0_NC: u32 = 264; -pub 
const R_AARCH64_MOVW_UABS_G1_NC: u32 = 265; -pub const R_AARCH64_MOVW_UABS_G2_NC: u32 = 266; -pub const R_AARCH64_MOVW_UABS_G3: u32 = 267; -pub const R_AARCH64_MOVW_UABS_G0: u32 = 263; -pub const R_AARCH64_ADR_GOT_PAGE: u32 = 311; -pub const R_AARCH64_LD64_GOT_LO12_NC: u32 = 312; -pub const R_AARCH64_CONDBR19: u32 = 280; -pub const R_AARCH64_TSTBR14: u32 = 279; - -// ── Type aliases ───────────────────────────────────────────────────────── -// Re-export shared types under the names the ARM linker already uses. - -pub type SectionHeader = linker_common::Elf64Section; -pub type Symbol = linker_common::Elf64Symbol; -pub type Rela = linker_common::Elf64Rela; -pub type ElfObject = linker_common::Elf64Object; - -// ── Parsing functions ──────────────────────────────────────────────────── -// Delegate to shared implementations. - -pub fn parse_object(data: &[u8], source_name: &str) -> Result { - linker_common::parse_elf64_object(data, source_name, EM_AARCH64) -} diff --git a/src/backend/arm/linker/emit_dynamic.rs b/src/backend/arm/linker/emit_dynamic.rs deleted file mode 100644 index 17070468d9..0000000000 --- a/src/backend/arm/linker/emit_dynamic.rs +++ /dev/null @@ -1,868 +0,0 @@ -//! Dynamic executable emission for the AArch64 linker. -//! -//! Emits a dynamically-linked ELF64 executable with PLT/GOT, `.dynamic` section, -//! `.dynsym`/`.dynstr` tables, `.rela.dyn`/`.rela.plt`, and copy relocations. -//! This is the code path used when shared library symbols are present. - -use std::collections::HashMap; - -use super::elf::*; -use super::types::{GlobalSymbol, BASE_ADDR, PAGE_SIZE, INTERP}; -use super::reloc; -use crate::backend::linker_common; -use linker_common::{DynStrTab, OutputSection}; - -// ── Dynamic executable emission ───────────────────────────────────────── - -/// Emit a dynamically-linked AArch64 ELF executable with PLT/GOT/.dynamic support. 
-pub(super) fn emit_dynamic_executable( - objects: &[ElfObject], globals: &mut HashMap, - output_sections: &mut [OutputSection], - section_map: &HashMap<(usize, usize), (usize, u64)>, - plt_names: &[String], got_entries: &[(String, bool)], - needed_sonames: &[String], output_path: &str, - export_dynamic: bool, -) -> Result<(), String> { - let mut dynstr = DynStrTab::new(); - for lib in needed_sonames { dynstr.add(lib); } - - // Build dynamic symbol name list - let mut dyn_sym_names: Vec = Vec::new(); - for name in plt_names { - if !dyn_sym_names.contains(name) { dyn_sym_names.push(name.clone()); } - } - for (name, is_plt) in got_entries { - if !name.is_empty() && !*is_plt && !dyn_sym_names.contains(name) { - if let Some(gsym) = globals.get(name) { - if gsym.is_dynamic && !gsym.copy_reloc { - dyn_sym_names.push(name.clone()); - } - } - } - } - let gnu_hash_symoffset = 1 + dyn_sym_names.len(); - - // Collect copy relocation symbols - let copy_reloc_syms: Vec<(String, u64)> = globals.iter() - .filter(|(_, g)| g.copy_reloc) - .map(|(n, g)| (n.clone(), g.size)) - .collect(); - for (name, _) in ©_reloc_syms { - if !dyn_sym_names.contains(name) { - dyn_sym_names.push(name.clone()); - } - } - - if export_dynamic { - let mut exported: Vec = globals.iter() - .filter(|(_, g)| { - g.section_idx != SHN_UNDEF && !g.is_dynamic && !g.copy_reloc - && (g.info >> 4) != 0 - }) - .map(|(n, _)| n.clone()) - .collect(); - exported.sort(); - for name in exported { - if !dyn_sym_names.contains(&name) { - dyn_sym_names.push(name); - } - } - } - - for name in &dyn_sym_names { dynstr.add(name); } - - let dynsym_count = 1 + dyn_sym_names.len(); - let dynsym_size = dynsym_count as u64 * 24; - let dynstr_size = dynstr.as_bytes().len() as u64; - let rela_plt_size = plt_names.len() as u64 * 24; - let rela_dyn_glob_count = got_entries.iter().filter(|(n, p)| { - !n.is_empty() && !*p && globals.get(n).map(|g| g.is_dynamic && !g.copy_reloc).unwrap_or(false) - }).count(); - let rela_dyn_count = 
rela_dyn_glob_count + copy_reloc_syms.len(); - let rela_dyn_size = rela_dyn_count as u64 * 24; - - // Build .gnu.hash - let num_hashed = dyn_sym_names.len() - (gnu_hash_symoffset - 1); - let gnu_hash_nbuckets = if num_hashed == 0 { 1 } else { num_hashed.next_power_of_two().max(1) } as u32; - let gnu_hash_bloom_size: u32 = 1; - let gnu_hash_bloom_shift: u32 = 6; - - let hashed_sym_hashes: Vec = dyn_sym_names[gnu_hash_symoffset - 1..] - .iter().map(|name| linker_common::gnu_hash(name.as_bytes())).collect(); - - let mut bloom_word: u64 = 0; - for &h in &hashed_sym_hashes { - bloom_word |= 1u64 << (h as u64 % 64); - bloom_word |= 1u64 << ((h >> gnu_hash_bloom_shift) as u64 % 64); - } - - if num_hashed > 0 { - let hashed_start = gnu_hash_symoffset - 1; - let mut hashed_with_hash: Vec<(String, u32)> = dyn_sym_names[hashed_start..] - .iter().zip(hashed_sym_hashes.iter()) - .map(|(n, &h)| (n.clone(), h)).collect(); - hashed_with_hash.sort_by_key(|(_, h)| h % gnu_hash_nbuckets); - for (i, (name, _)) in hashed_with_hash.iter().enumerate() { - dyn_sym_names[hashed_start + i] = name.clone(); - } - } - - let hashed_sym_hashes: Vec = dyn_sym_names[gnu_hash_symoffset - 1..] 
- .iter().map(|name| linker_common::gnu_hash(name.as_bytes())).collect(); - - let mut gnu_hash_buckets = vec![0u32; gnu_hash_nbuckets as usize]; - let mut gnu_hash_chains = vec![0u32; num_hashed]; - for (i, &h) in hashed_sym_hashes.iter().enumerate() { - let bucket = (h % gnu_hash_nbuckets) as usize; - if gnu_hash_buckets[bucket] == 0 { - gnu_hash_buckets[bucket] = (gnu_hash_symoffset + i) as u32; - } - gnu_hash_chains[i] = h & !1; - } - for bucket_idx in 0..gnu_hash_nbuckets as usize { - if gnu_hash_buckets[bucket_idx] == 0 { continue; } - let mut last_in_bucket = 0; - for (i, &h) in hashed_sym_hashes.iter().enumerate() { - if (h % gnu_hash_nbuckets) as usize == bucket_idx { - last_in_bucket = i; - } - } - gnu_hash_chains[last_in_bucket] |= 1; - } - - let gnu_hash_size: u64 = 16 + (gnu_hash_bloom_size as u64 * 8) - + (gnu_hash_nbuckets as u64 * 4) + (num_hashed as u64 * 4); - // PLT: 32 bytes header + 16 bytes per entry - let plt_size = if plt_names.is_empty() { 0u64 } else { 32 + 16 * plt_names.len() as u64 }; - let got_plt_count = 3 + plt_names.len(); - let got_plt_size = got_plt_count as u64 * 8; - let got_globdat_count = got_entries.iter().filter(|(n, p)| !n.is_empty() && !*p).count(); - let got_size = got_globdat_count as u64 * 8; - - let has_init_array = output_sections.iter().any(|s| s.name == ".init_array" && s.mem_size > 0); - let has_fini_array = output_sections.iter().any(|s| s.name == ".fini_array" && s.mem_size > 0); - let mut dyn_count = needed_sonames.len() as u64 + 16; // fixed entries + DT_FLAGS + DT_FLAGS_1 + NULL - if has_init_array { dyn_count += 2; } - if has_fini_array { dyn_count += 2; } - let dynamic_size = dyn_count * 16; - - let has_tls_sections = output_sections.iter().any(|s| s.flags & SHF_TLS != 0 && s.flags & SHF_ALLOC != 0); - // phdrs: PHDR, INTERP, LOAD(ro), LOAD(text), LOAD(rodata), LOAD(rw), DYNAMIC, GNU_STACK, [TLS] - let phdr_count: u64 = if has_tls_sections { 9 } else { 8 }; - let phdr_total_size = phdr_count * 56; - - // === 
Layout === - let mut offset = 64 + phdr_total_size; - let interp_offset = offset; - let interp_addr = BASE_ADDR + offset; - offset += INTERP.len() as u64; - - offset = (offset + 7) & !7; - let gnu_hash_offset = offset; let gnu_hash_addr = BASE_ADDR + offset; offset += gnu_hash_size; - offset = (offset + 7) & !7; - let dynsym_offset = offset; let dynsym_addr = BASE_ADDR + offset; offset += dynsym_size; - let dynstr_offset = offset; let dynstr_addr = BASE_ADDR + offset; offset += dynstr_size; - offset = (offset + 7) & !7; - let rela_dyn_offset = offset; let rela_dyn_addr = BASE_ADDR + offset; offset += rela_dyn_size; - offset = (offset + 7) & !7; - let rela_plt_offset = offset; let rela_plt_addr = BASE_ADDR + offset; offset += rela_plt_size; - - // Text segment - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let text_page_offset = offset; - let text_page_addr = BASE_ADDR + offset; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_EXECINSTR != 0 && sec.flags & SHF_ALLOC != 0 { - let a = sec.alignment.max(4); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - offset += sec.mem_size; - } - } - // PLT in text segment - let (plt_addr, plt_offset) = if plt_size > 0 { - offset = (offset + 15) & !15; - let a = BASE_ADDR + offset; let o = offset; offset += plt_size; (a, o) - } else { (0u64, 0u64) }; - let text_total_size = offset - text_page_offset; - - // Rodata segment (separate LOAD R) - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let rodata_page_offset = offset; - let rodata_page_addr = BASE_ADDR + offset; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_EXECINSTR == 0 && - sec.flags & SHF_WRITE == 0 && sec.sh_type != SHT_NOBITS && - sec.flags & SHF_TLS == 0 { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - offset += sec.mem_size; - } - } - let 
rodata_total_size = offset - rodata_page_offset; - - // RW segment - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let rw_page_offset = offset; - let rw_page_addr = BASE_ADDR + offset; - - let mut init_array_addr = 0u64; let mut init_array_size = 0u64; - let mut fini_array_addr = 0u64; let mut fini_array_size = 0u64; - - for sec in output_sections.iter_mut() { - if sec.name == ".init_array" { - let a = sec.alignment.max(8); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; sec.file_offset = offset; - init_array_addr = sec.addr; init_array_size = sec.mem_size; - offset += sec.mem_size; break; - } - } - for sec in output_sections.iter_mut() { - if sec.name == ".fini_array" { - let a = sec.alignment.max(8); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; sec.file_offset = offset; - fini_array_addr = sec.addr; fini_array_size = sec.mem_size; - offset += sec.mem_size; break; - } - } - - offset = (offset + 7) & !7; - let dynamic_offset = offset; let dynamic_addr = BASE_ADDR + offset; offset += dynamic_size; - offset = (offset + 7) & !7; - let got_offset = offset; let got_addr = BASE_ADDR + offset; offset += got_size; - offset = (offset + 7) & !7; - let got_plt_offset = offset; let got_plt_addr = BASE_ADDR + offset; offset += got_plt_size; - - // Data.rel.ro - for sec in output_sections.iter_mut() { - if sec.name == ".data.rel.ro" { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; sec.file_offset = offset; - offset += sec.mem_size; - } - } - - // Remaining data sections - for sec in output_sections.iter_mut() { - if sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_WRITE != 0 && - sec.sh_type != SHT_NOBITS && sec.flags & SHF_TLS == 0 && - sec.name != ".init_array" && sec.name != ".fini_array" && - sec.name != ".data.rel.ro" { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; sec.file_offset = offset; - offset 
+= sec.mem_size; - } - } - - // TLS sections - let mut tls_addr = 0u64; - let mut tls_file_offset = 0u64; - let mut tls_file_size = 0u64; - let mut tls_mem_size = 0u64; - let mut tls_align = 1u64; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_TLS != 0 && sec.flags & SHF_ALLOC != 0 && sec.sh_type != SHT_NOBITS { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; sec.file_offset = offset; - if tls_addr == 0 { tls_addr = sec.addr; tls_file_offset = offset; tls_align = a; } - tls_file_size += sec.mem_size; - tls_mem_size += sec.mem_size; - offset += sec.mem_size; - } - } - if tls_addr == 0 && has_tls_sections { - tls_addr = BASE_ADDR + offset; - tls_file_offset = offset; - } - for sec in output_sections.iter_mut() { - if sec.flags & SHF_TLS != 0 && sec.sh_type == SHT_NOBITS { - let a = sec.alignment.max(1); - let aligned = (tls_mem_size + a - 1) & !(a - 1); - sec.addr = tls_addr + aligned; sec.file_offset = offset; - tls_mem_size = aligned + sec.mem_size; - if a > tls_align { tls_align = a; } - } - } - tls_mem_size = (tls_mem_size + tls_align - 1) & !(tls_align - 1); - let has_tls = tls_addr != 0; - - let bss_addr = BASE_ADDR + offset; - let mut bss_size = 0u64; - for sec in output_sections.iter_mut() { - if sec.sh_type == SHT_NOBITS && sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_TLS == 0 { - let a = sec.alignment.max(1); - let aligned = (bss_addr + bss_size + a - 1) & !(a - 1); - bss_size = aligned - bss_addr + sec.mem_size; - sec.addr = aligned; sec.file_offset = offset; - } - } - - // BSS space for copy relocations - let mut copy_reloc_addr_map: HashMap<(String, u64), u64> = HashMap::new(); - for (name, size) in ©_reloc_syms { - let gsym = globals.get(name).cloned(); - let key = gsym.as_ref().and_then(|g| { - g.from_lib.as_ref().map(|lib| (lib.clone(), g.lib_sym_value)) - }); - let addr = if let Some(ref k) = key { - if let Some(&existing_addr) = copy_reloc_addr_map.get(k) { - existing_addr - } 
else { - let aligned = (bss_addr + bss_size + 7) & !7; - bss_size = aligned - bss_addr + size; - copy_reloc_addr_map.insert(k.clone(), aligned); - aligned - } - } else { - let aligned = (bss_addr + bss_size + 7) & !7; - bss_size = aligned - bss_addr + size; - aligned - }; - if let Some(gsym) = globals.get_mut(name) { - gsym.value = addr; - gsym.defined_in = Some(usize::MAX); - } - } - - let rw_filesz = offset - rw_page_offset; - let rw_memsz = if bss_size > 0 { (bss_addr + bss_size) - rw_page_addr } else { rw_filesz }; - - // Merge section data - for sec in output_sections.iter_mut() { - if sec.sh_type == SHT_NOBITS { continue; } - let mut data = vec![0u8; sec.mem_size as usize]; - for input in &sec.inputs { - let sd = &objects[input.object_idx].section_data[input.section_idx]; - let s = input.output_offset as usize; - let e = s + sd.len(); - if e <= data.len() && !sd.is_empty() { data[s..e].copy_from_slice(sd); } - } - sec.data = data; - } - - // Update global symbol addresses - for (_, gsym) in globals.iter_mut() { - if let Some(obj_idx) = gsym.defined_in { - if obj_idx == usize::MAX { continue; } // linker-defined or copy-reloc - if gsym.section_idx == SHN_COMMON || gsym.section_idx == 0xffff { - if let Some(bss_sec) = output_sections.iter().find(|s| s.name == ".bss") { - gsym.value += bss_sec.addr; - } - } else if gsym.section_idx != SHN_UNDEF && gsym.section_idx != SHN_ABS { - let si = gsym.section_idx as usize; - if let Some(&(oi, so)) = section_map.get(&(obj_idx, si)) { - gsym.value += output_sections[oi].addr + so; - } - } - } - } - - // Define linker-provided symbols - let text_seg_end = text_page_addr + text_total_size; - let linker_addrs = LinkerSymbolAddresses { - base_addr: BASE_ADDR, - got_addr: got_plt_addr, - dynamic_addr, - bss_addr, - bss_size, - text_end: text_seg_end, - data_start: rw_page_addr, - init_array_start: init_array_addr, - init_array_size, - fini_array_start: fini_array_addr, - fini_array_size, - preinit_array_start: 0, - 
preinit_array_size: 0, - rela_iplt_start: 0, - rela_iplt_size: 0, - }; - for sym in &get_standard_linker_symbols(&linker_addrs) { - let entry = globals.entry(sym.name.to_string()).or_insert(GlobalSymbol { - value: 0, size: 0, info: (sym.binding << 4), - defined_in: None, from_lib: None, plt_idx: None, got_idx: None, - section_idx: SHN_ABS, is_dynamic: false, copy_reloc: false, lib_sym_value: 0, - }); - if entry.defined_in.is_none() && !entry.is_dynamic { - entry.value = sym.value; - entry.defined_in = Some(usize::MAX); - entry.section_idx = SHN_ABS; - } - } - - // Auto-generate __start_
/ __stop_
symbols (GNU ld feature) - for (name, addr) in linker_common::resolve_start_stop_symbols(output_sections) { - if let Some(entry) = globals.get_mut(&name) { - if entry.defined_in.is_none() && !entry.is_dynamic { - entry.value = addr; - entry.defined_in = Some(usize::MAX); - entry.section_idx = SHN_ABS; - } - } - } - - let entry_addr = globals.get("_start").map(|s| s.value).unwrap_or(text_page_addr); - - // === Build output buffer === - let file_size = offset as usize; - let mut out = vec![0u8; file_size]; - - // ELF header - out[0..4].copy_from_slice(&ELF_MAGIC); - out[4] = ELFCLASS64; out[5] = ELFDATA2LSB; out[6] = 1; - out[7] = 0; // ELFOSABI_NONE for dynamic executables - w16(&mut out, 16, ET_EXEC); w16(&mut out, 18, EM_AARCH64); w32(&mut out, 20, 1); - w64(&mut out, 24, entry_addr); w64(&mut out, 32, 64); w64(&mut out, 40, 0); - w32(&mut out, 48, 0); w16(&mut out, 52, 64); w16(&mut out, 54, 56); - w16(&mut out, 56, phdr_count as u16); w16(&mut out, 58, 64); w16(&mut out, 60, 0); w16(&mut out, 62, 0); - - // Program headers - let mut ph = 64usize; - wphdr(&mut out, ph, PT_PHDR, PF_R, 64, BASE_ADDR+64, phdr_total_size, phdr_total_size, 8); ph += 56; - wphdr(&mut out, ph, PT_INTERP, PF_R, interp_offset, interp_addr, INTERP.len() as u64, INTERP.len() as u64, 1); ph += 56; - let ro_seg_end = rela_plt_offset + rela_plt_size; - wphdr(&mut out, ph, PT_LOAD, PF_R, 0, BASE_ADDR, ro_seg_end, ro_seg_end, PAGE_SIZE); ph += 56; - wphdr(&mut out, ph, PT_LOAD, PF_R|PF_X, text_page_offset, text_page_addr, text_total_size, text_total_size, PAGE_SIZE); ph += 56; - if rodata_total_size > 0 { - wphdr(&mut out, ph, PT_LOAD, PF_R, rodata_page_offset, rodata_page_addr, rodata_total_size, rodata_total_size, PAGE_SIZE); ph += 56; - } else { - // Empty placeholder segment to keep phdr count consistent - wphdr(&mut out, ph, PT_LOAD, PF_R, rodata_page_offset, rodata_page_addr, 0, 0, PAGE_SIZE); ph += 56; - } - wphdr(&mut out, ph, PT_LOAD, PF_R|PF_W, rw_page_offset, rw_page_addr, rw_filesz, 
rw_memsz, PAGE_SIZE); ph += 56; - wphdr(&mut out, ph, PT_DYNAMIC, PF_R|PF_W, dynamic_offset, dynamic_addr, dynamic_size, dynamic_size, 8); ph += 56; - wphdr(&mut out, ph, PT_GNU_STACK, PF_R|PF_W, 0, 0, 0, 0, 0x10); ph += 56; - if has_tls { - wphdr(&mut out, ph, PT_TLS, PF_R, tls_file_offset, tls_addr, tls_file_size, tls_mem_size, tls_align); - } - - // .interp - write_bytes(&mut out, interp_offset as usize, INTERP); - - // .gnu.hash - let gh = gnu_hash_offset as usize; - w32(&mut out, gh, gnu_hash_nbuckets); - w32(&mut out, gh+4, gnu_hash_symoffset as u32); - w32(&mut out, gh+8, gnu_hash_bloom_size); - w32(&mut out, gh+12, gnu_hash_bloom_shift); - let bloom_off = gh + 16; - w64(&mut out, bloom_off, bloom_word); - let buckets_off = bloom_off + (gnu_hash_bloom_size as usize * 8); - for (i, &b) in gnu_hash_buckets.iter().enumerate() { - w32(&mut out, buckets_off + i * 4, b); - } - let chains_off = buckets_off + (gnu_hash_nbuckets as usize * 4); - for (i, &c) in gnu_hash_chains.iter().enumerate() { - w32(&mut out, chains_off + i * 4, c); - } - - // .dynsym - let mut ds = dynsym_offset as usize + 24; - for name in &dyn_sym_names { - let no = dynstr.get_offset(name) as u32; - w32(&mut out, ds, no); - if let Some(gsym) = globals.get(name) { - if gsym.copy_reloc { - if ds+5 < out.len() { out[ds+4] = (STB_GLOBAL << 4) | STT_OBJECT; out[ds+5] = 0; } - w16(&mut out, ds+6, 1); - w64(&mut out, ds+8, gsym.value); - w64(&mut out, ds+16, gsym.size); - } else if !gsym.is_dynamic && gsym.section_idx != SHN_UNDEF && gsym.value != 0 { - let stt = gsym.info & 0xf; - let stb = gsym.info >> 4; - if ds+5 < out.len() { out[ds+4] = (stb << 4) | stt; out[ds+5] = 0; } - w16(&mut out, ds+6, 1); - w64(&mut out, ds+8, gsym.value); - w64(&mut out, ds+16, gsym.size); - } else { - // Preserve original binding (STB_WEAK vs STB_GLOBAL) and type - let bind = gsym.info >> 4; - let stype = gsym.info & 0xf; - let st_info = (bind << 4) | if stype != 0 { stype } else { STT_FUNC }; - if ds+5 < out.len() { 
out[ds+4] = st_info; out[ds+5] = 0; } - w16(&mut out, ds+6, 0); w64(&mut out, ds+8, 0); w64(&mut out, ds+16, 0); - } - } else { - if ds+5 < out.len() { out[ds+4] = (STB_GLOBAL << 4) | STT_FUNC; out[ds+5] = 0; } - w16(&mut out, ds+6, 0); w64(&mut out, ds+8, 0); w64(&mut out, ds+16, 0); - } - ds += 24; - } - - // .dynstr - write_bytes(&mut out, dynstr_offset as usize, dynstr.as_bytes()); - - // .rela.dyn (GLOB_DAT + COPY) - // AArch64: R_AARCH64_GLOB_DAT = 1025, R_AARCH64_COPY = 1024 - const R_AARCH64_GLOB_DAT: u64 = 1025; - const R_AARCH64_COPY: u64 = 1024; - let mut rd = rela_dyn_offset as usize; - let mut gd_a = got_addr; - for (name, is_plt) in got_entries { - if name.is_empty() || *is_plt { continue; } - let is_dynamic = globals.get(name).map(|g| g.is_dynamic && !g.copy_reloc).unwrap_or(false); - if is_dynamic { - let si = dyn_sym_names.iter().position(|n| n == name).map(|i| i+1).unwrap_or(0) as u64; - w64(&mut out, rd, gd_a); w64(&mut out, rd+8, (si << 32) | R_AARCH64_GLOB_DAT); w64(&mut out, rd+16, 0); - rd += 24; - } - gd_a += 8; - } - for (name, _) in ©_reloc_syms { - if let Some(gsym) = globals.get(name) { - let si = dyn_sym_names.iter().position(|n| n == name).map(|i| i+1).unwrap_or(0) as u64; - let copy_addr = gsym.value; - w64(&mut out, rd, copy_addr); w64(&mut out, rd+8, (si << 32) | R_AARCH64_COPY); w64(&mut out, rd+16, 0); - rd += 24; - } - } - - // .rela.plt (R_AARCH64_JUMP_SLOT = 1026) - const R_AARCH64_JUMP_SLOT: u64 = 1026; - let mut rp = rela_plt_offset as usize; - let gpb = got_plt_addr + 24; - for (i, name) in plt_names.iter().enumerate() { - let gea = gpb + i as u64 * 8; - let si = dyn_sym_names.iter().position(|n| n == name).map(|j| j+1).unwrap_or(0) as u64; - w64(&mut out, rp, gea); w64(&mut out, rp+8, (si << 32) | R_AARCH64_JUMP_SLOT); w64(&mut out, rp+16, 0); - rp += 24; - } - - // Section data - for sec in output_sections.iter() { - if sec.sh_type == SHT_NOBITS || sec.data.is_empty() { continue; } - write_bytes(&mut out, sec.file_offset 
as usize, &sec.data); - } - - // .plt (AArch64 PLT stubs) - if plt_size > 0 { - let po = plt_offset as usize; - // PLT header (32 bytes): - // stp x16, x30, [sp, #-16]! ; save registers - // adrp x16, GOT+16 ; load page of GOT[2] - // ldr x17, [x16, #lo12(GOT+16)] ; load GOT[2] (resolver) - // add x16, x16, #lo12(GOT+16) ; compute address - // br x17 ; jump to resolver - // nop; nop; nop ; padding to 32 bytes - - let got2_addr = got_plt_addr + 16; - let page_g = got2_addr & !0xFFF; - let page_p = plt_addr & !0xFFF; - let page_diff = ((page_g as i64 - page_p as i64) >> 12) as i32; - let immlo = (page_diff & 3) as u32; - let immhi = ((page_diff >> 2) & 0x7ffff) as u32; - - // STP x16, x30, [sp, #-16]! - w32(&mut out, po, 0xa9bf7bf0); - // ADRP x16, page_of(GOT+16) - w32(&mut out, po + 4, 0x90000010 | (immlo << 29) | (immhi << 5)); - // LDR x17, [x16, #lo12(GOT+16)] - let lo12 = (got2_addr & 0xFFF) as u32; - w32(&mut out, po + 8, 0xf9400211 | ((lo12 / 8) << 10)); - // ADD x16, x16, #lo12(GOT+16) - w32(&mut out, po + 12, 0x91000210 | ((lo12 & 0xFFF) << 10)); - // BR x17 - w32(&mut out, po + 16, 0xd61f0220); - // NOP padding - w32(&mut out, po + 20, 0xd503201f); - w32(&mut out, po + 24, 0xd503201f); - w32(&mut out, po + 28, 0xd503201f); - - // Individual PLT entries (16 bytes each): - // adrp x16, GOT_entry_page - // ldr x17, [x16, #lo12(GOT_entry)] - // add x16, x16, #lo12(GOT_entry) - // br x17 - for (i, _) in plt_names.iter().enumerate() { - let ep = po + 32 + i * 16; - let pea = plt_addr + 32 + i as u64 * 16; - let gea = got_plt_addr + 24 + i as u64 * 8; - - let page_g = gea & !0xFFF; - let page_p = pea & !0xFFF; - let page_diff = ((page_g as i64 - page_p as i64) >> 12) as i32; - let immlo = (page_diff & 3) as u32; - let immhi = ((page_diff >> 2) & 0x7ffff) as u32; - - // ADRP x16, page_of(GOT entry) - w32(&mut out, ep, 0x90000010 | (immlo << 29) | (immhi << 5)); - // LDR x17, [x16, #lo12(GOT entry)] - let lo12 = (gea & 0xFFF) as u32; - w32(&mut out, ep + 4, 
0xf9400211 | ((lo12 / 8) << 10)); - // ADD x16, x16, #lo12(GOT entry) - w32(&mut out, ep + 8, 0x91000210 | ((lo12 & 0xFFF) << 10)); - // BR x17 - w32(&mut out, ep + 12, 0xd61f0220); - } - } - - // .dynamic - let mut dd = dynamic_offset as usize; - for lib in needed_sonames { - let so = dynstr.get_offset(lib); - w64(&mut out, dd, DT_NEEDED as u64); w64(&mut out, dd+8, so as u64); dd += 16; - } - for &(tag, val) in &[ - (DT_STRTAB, dynstr_addr), (DT_SYMTAB, dynsym_addr), (DT_STRSZ, dynstr_size), - (DT_SYMENT, 24), (DT_DEBUG, 0), (DT_PLTGOT, got_plt_addr), - (DT_PLTRELSZ, rela_plt_size), (DT_PLTREL, 7u64), (DT_JMPREL, rela_plt_addr), - (DT_RELA, rela_dyn_addr), (DT_RELASZ, rela_dyn_size), (DT_RELAENT, 24), - (DT_GNU_HASH, gnu_hash_addr), - (DT_FLAGS, DF_BIND_NOW as u64), (DT_FLAGS_1, DF_1_NOW as u64), - ] { - w64(&mut out, dd, tag as u64); w64(&mut out, dd+8, val); dd += 16; - } - if has_init_array { - w64(&mut out, dd, DT_INIT_ARRAY as u64); w64(&mut out, dd+8, init_array_addr); dd += 16; - w64(&mut out, dd, DT_INIT_ARRAYSZ as u64); w64(&mut out, dd+8, init_array_size); dd += 16; - } - if has_fini_array { - w64(&mut out, dd, DT_FINI_ARRAY as u64); w64(&mut out, dd+8, fini_array_addr); dd += 16; - w64(&mut out, dd, DT_FINI_ARRAYSZ as u64); w64(&mut out, dd+8, fini_array_size); dd += 16; - } - w64(&mut out, dd, DT_NULL as u64); w64(&mut out, dd+8, 0); - - // .got (GLOB_DAT entries) - let mut go = got_offset as usize; - for (name, is_plt) in got_entries { - if name.is_empty() || *is_plt { continue; } - if let Some(gsym) = globals.get(name) { - if gsym.defined_in.is_some() && !gsym.is_dynamic { - let sym_val = gsym.value; - if has_tls && (gsym.info & 0xf) == STT_TLS { - let tpoff = (sym_val as i64 - tls_addr as i64) + 16; - w64(&mut out, go, tpoff as u64); - } else { - w64(&mut out, go, sym_val); - } - } else if gsym.copy_reloc && gsym.value != 0 { - w64(&mut out, go, gsym.value); - } - } - go += 8; - } - - // .got.plt - let gp = got_plt_offset as usize; - w64(&mut out, 
gp, dynamic_addr); - w64(&mut out, gp+8, 0); w64(&mut out, gp+16, 0); - for (i, _) in plt_names.iter().enumerate() { - // Initialize GOT.plt entries to PLT[0] (resolved eagerly via DF_BIND_NOW) - w64(&mut out, gp+24+i*8, plt_addr); - } - - // Apply relocations - let globals_snap: HashMap = globals.clone(); - - // Build GotInfo for GOT-only entries (non-PLT symbols accessed via ADR_GOT_PAGE/LD64_GOT_LO12_NC) - let mut dyn_got_entries: HashMap = HashMap::new(); - { - let mut got_only_idx = 0usize; - for (name, is_plt) in got_entries.iter() { - if name.is_empty() || *is_plt { continue; } - // got_key for global symbols is just the name - dyn_got_entries.insert(name.clone(), got_only_idx); - got_only_idx += 1; - } - } - let dyn_got_info = reloc::GotInfo { got_addr, entries: dyn_got_entries }; - - for obj_idx in 0..objects.len() { - for sec_idx in 0..objects[obj_idx].sections.len() { - let relas = &objects[obj_idx].relocations[sec_idx]; - if relas.is_empty() { continue; } - let (out_idx, sec_off) = match section_map.get(&(obj_idx, sec_idx)) { - Some(&v) => v, None => continue, - }; - let sa = output_sections[out_idx].addr; - let sfo = output_sections[out_idx].file_offset; - - for rela in relas { - let si = rela.sym_idx as usize; - if si >= objects[obj_idx].symbols.len() { continue; } - let sym = &objects[obj_idx].symbols[si]; - let p = sa + sec_off + rela.offset; - let fp = (sfo + sec_off + rela.offset) as usize; - let a = rela.addend; - let s = resolve_sym_dynamic(obj_idx, sym, &globals_snap, section_map, output_sections, plt_addr); - - match rela.rela_type { - R_AARCH64_ABS64 => { - let t = if !sym.name.is_empty() { - if let Some(g) = globals_snap.get(&sym.name) { - if g.is_dynamic && !g.copy_reloc { - if let Some(pi) = g.plt_idx { plt_addr + 32 + pi as u64 * 16 } else { s } - } else { s } - } else { s } - } else { s }; - w64(&mut out, fp, (t as i64 + a) as u64); - } - R_AARCH64_ABS32 => { - w32(&mut out, fp, (s as i64 + a) as u32); - } - R_AARCH64_ABS16 => { - 
w16(&mut out, fp, (s as i64 + a) as u16); - } - R_AARCH64_PREL64 => { - w64(&mut out, fp, (s as i64 + a - p as i64) as u64); - } - R_AARCH64_PREL32 => { - w32(&mut out, fp, (s as i64 + a - p as i64) as u32); - } - R_AARCH64_PREL16 => { - w16(&mut out, fp, (s as i64 + a - p as i64) as u16); - } - R_AARCH64_ADR_PREL_PG_HI21 => { - let sa_val = (s as i64 + a) as u64; - let page_s = sa_val & !0xFFF; - let page_p = p & !0xFFF; - let imm = (page_s as i64 - page_p as i64) >> 12; - reloc::encode_adrp(&mut out, fp, imm); - } - R_AARCH64_ADR_PREL_LO21 => { - let offset_val = (s as i64 + a) - (p as i64); - reloc::encode_adr(&mut out, fp, offset_val); - } - R_AARCH64_ADD_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_add_imm12(&mut out, fp, (sa_val & 0xFFF) as u32); - } - R_AARCH64_LDST8_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 0); - } - R_AARCH64_LDST16_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 1); - } - R_AARCH64_LDST32_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 2); - } - R_AARCH64_LDST64_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 3); - } - R_AARCH64_LDST128_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 4); - } - R_AARCH64_CALL26 | R_AARCH64_JUMP26 => { - if fp + 4 > out.len() { continue; } - let t = if !sym.name.is_empty() { - if let Some(g) = globals_snap.get(&sym.name) { - if let Some(pi) = g.plt_idx { plt_addr + 32 + pi as u64 * 16 } else { s } - } else { s } - } else { s }; - let sa_val = (t as i64 + a) as u64; - if sa_val == 0 { - w32(&mut out, fp, 0xd503201f); // NOP for weak undef - } else { - let offset_val = (sa_val as i64) - (p as i64); - let mut insn = 
read_u32(&out, fp); - let imm26 = ((offset_val >> 2) as u32) & 0x3ffffff; - insn = (insn & 0xfc000000) | imm26; - w32(&mut out, fp, insn); - } - } - R_AARCH64_CONDBR19 => { - let offset_val = (s as i64 + a) - (p as i64); - if fp + 4 > out.len() { continue; } - let mut insn = read_u32(&out, fp); - let imm19 = ((offset_val >> 2) as u32) & 0x7ffff; - insn = (insn & 0xff00001f) | (imm19 << 5); - w32(&mut out, fp, insn); - } - R_AARCH64_TSTBR14 => { - let offset_val = (s as i64 + a) - (p as i64); - if fp + 4 > out.len() { continue; } - let mut insn = read_u32(&out, fp); - let imm14 = ((offset_val >> 2) as u32) & 0x3fff; - insn = (insn & 0xfff8001f) | (imm14 << 5); - w32(&mut out, fp, insn); - } - R_AARCH64_MOVW_UABS_G0 | R_AARCH64_MOVW_UABS_G0_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, (sa_val & 0xffff) as u32); - } - R_AARCH64_MOVW_UABS_G1_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, ((sa_val >> 16) & 0xffff) as u32); - } - R_AARCH64_MOVW_UABS_G2_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, ((sa_val >> 32) & 0xffff) as u32); - } - R_AARCH64_MOVW_UABS_G3 => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, ((sa_val >> 48) & 0xffff) as u32); - } - R_AARCH64_ADR_GOT_PAGE | R_AARCH64_LD64_GOT_LO12_NC => { - // Use GOT entry address for symbols with GOT-only entries - let gkey = sym.name.clone(); - let tls_info = reloc::TlsInfo { tls_addr, tls_size: tls_mem_size }; - reloc::apply_one_reloc(&mut out, fp, rela.rela_type, s, a, p, - &sym.name, &objects[obj_idx].source_name, - &tls_info, &dyn_got_info, &gkey)?; - } - _ => { - // Delegate to the standard reloc handler for TLS etc. 
- let gkey = reloc::got_key(obj_idx, sym); - let tls_info = reloc::TlsInfo { tls_addr, tls_size: tls_mem_size }; - reloc::apply_one_reloc(&mut out, fp, rela.rela_type, s, a, p, - &sym.name, &objects[obj_idx].source_name, - &tls_info, &dyn_got_info, &gkey)?; - } - } - } - } - } - - // Write output - std::fs::write(output_path, &out).map_err(|e| format!("failed to write '{}': {}", output_path, e))?; - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let _ = std::fs::set_permissions(output_path, std::fs::Permissions::from_mode(0o755)); - } - Ok(()) -} - -/// Resolve a symbol address for dynamic linking. Dynamic symbols go through PLT. -fn resolve_sym_dynamic( - obj_idx: usize, - sym: &Symbol, - globals: &HashMap, - section_map: &HashMap<(usize, usize), (usize, u64)>, - output_sections: &[OutputSection], - plt_addr: u64, -) -> u64 { - if sym.sym_type() == STT_SECTION { - let si = sym.shndx as usize; - return section_map.get(&(obj_idx, si)) - .map(|&(oi, so)| output_sections[oi].addr + so) - .unwrap_or(0); - } - if !sym.name.is_empty() && !sym.is_local() { - if let Some(g) = globals.get(&sym.name) { - if g.defined_in.is_some() { return g.value; } - if g.is_dynamic { - if let Some(pi) = g.plt_idx { return plt_addr + 32 + pi as u64 * 16; } - if g.copy_reloc { return g.value; } - } - } - if sym.is_weak() { return 0; } - } - if sym.is_undefined() { return 0; } - if sym.shndx == SHN_ABS { return sym.value; } - section_map.get(&(obj_idx, sym.shndx as usize)) - .map(|&(oi, so)| output_sections[oi].addr + so + sym.value) - .unwrap_or(sym.value) -} diff --git a/src/backend/arm/linker/emit_shared.rs b/src/backend/arm/linker/emit_shared.rs deleted file mode 100644 index e30f5391da..0000000000 --- a/src/backend/arm/linker/emit_shared.rs +++ /dev/null @@ -1,1100 +0,0 @@ -//! Shared library (.so) emission for the AArch64 linker. -//! -//! Emits an ELF64 shared library (ET_DYN) with PIC relocations, PLT stubs -//! 
for external function calls, and a `.dynamic` section for the dynamic linker. - -use std::collections::HashMap; - -use super::elf::*; -use super::types::{GlobalSymbol, PAGE_SIZE}; -use super::reloc; -use crate::backend::linker_common; -use linker_common::{DynStrTab, OutputSection}; - -/// Emit a shared library (.so) ELF file for AArch64. -pub(super) fn emit_shared_library( - objects: &[ElfObject], globals: &mut HashMap, - output_sections: &mut [OutputSection], - section_map: &HashMap<(usize, usize), (usize, u64)>, - needed_sonames: &[String], output_path: &str, - soname: Option, -) -> Result<(), String> { - let base_addr: u64 = 0; - - let mut dynstr = DynStrTab::new(); - for lib in needed_sonames { dynstr.add(lib); } - if let Some(ref sn) = soname { dynstr.add(sn); } - - // Export all defined global symbols - let mut dyn_sym_names: Vec = Vec::new(); - let mut exported: Vec = globals.iter() - .filter(|(_, g)| { - g.defined_in.is_some() && !g.is_dynamic - && (g.info >> 4) != 0 - && g.section_idx != SHN_UNDEF - }) - .map(|(n, _)| n.clone()) - .collect(); - exported.sort(); - for name in exported { - if !dyn_sym_names.contains(&name) { dyn_sym_names.push(name); } - } - for (name, gsym) in globals.iter() { - if gsym.is_dynamic && !dyn_sym_names.contains(name) { - dyn_sym_names.push(name.clone()); - } - } - - // Collect PLT symbols: external functions referenced by CALL26/JUMP26 - let mut so_plt_names: Vec = Vec::new(); - for obj in objects.iter() { - for sec_relas in &obj.relocations { - for rela in sec_relas { - let si = rela.sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.name.is_empty() { continue; } - match rela.rela_type { - R_AARCH64_CALL26 | R_AARCH64_JUMP26 => { - let needs_plt = if let Some(g) = globals.get(&sym.name) { - g.is_dynamic || (g.defined_in.is_none() && g.section_idx == SHN_UNDEF) - } else { - sym.is_undefined() && !sym.is_local() - }; - if needs_plt && !so_plt_names.contains(&sym.name) { - 
so_plt_names.push(sym.name.clone()); - } - } - _ => {} - } - } - } - } - // Add PLT symbols to dyn_sym_names if not already present - for name in &so_plt_names { - if !dyn_sym_names.contains(name) { - dyn_sym_names.push(name.clone()); - } - } - // Assign PLT indices - for (i, name) in so_plt_names.iter().enumerate() { - if let Some(g) = globals.get_mut(name) { - g.plt_idx = Some(i); - } - } - - // Add undefined/dynamic symbols referenced by ADRP/ADD/LDST to dyn_sym_names - // so they get GLOB_DAT relocations and the dynamic linker can resolve them. - for obj in objects.iter() { - for sec_relas in &obj.relocations { - for rela in sec_relas { - let si = rela.sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.name.is_empty() { continue; } - match rela.rela_type { - R_AARCH64_ADR_PREL_PG_HI21 | R_AARCH64_ADD_ABS_LO12_NC - | R_AARCH64_LDST64_ABS_LO12_NC | R_AARCH64_LDST32_ABS_LO12_NC - | R_AARCH64_LDST8_ABS_LO12_NC | R_AARCH64_LDST16_ABS_LO12_NC => { - let sym_needs_got = if let Some(g) = globals.get(&sym.name) { - g.is_dynamic || g.defined_in.is_none() - } else { - sym.is_undefined() && !sym.is_local() - }; - if sym_needs_got && !dyn_sym_names.contains(&sym.name) { - dyn_sym_names.push(sym.name.clone()); - } - } - _ => {} - } - } - } - } - - // Reorder dyn_sym_names: undefined/import symbols first, then defined/export symbols. - // The .gnu.hash table only covers defined symbols (those after symoffset). - // Undefined symbols must be placed before symoffset so the dynamic linker - // doesn't incorrectly find them during symbol lookup in this library. 
- let mut undef_names: Vec = Vec::new(); - let mut def_names: Vec = Vec::new(); - for name in &dyn_sym_names { - let is_undef = if let Some(g) = globals.get(name) { - g.is_dynamic || g.defined_in.is_none() || g.section_idx == SHN_UNDEF - } else { - true - }; - if is_undef { - undef_names.push(name.clone()); - } else { - def_names.push(name.clone()); - } - } - dyn_sym_names = Vec::new(); - dyn_sym_names.extend(undef_names.iter().cloned()); - let so_undef_count = dyn_sym_names.len(); - dyn_sym_names.extend(def_names.iter().cloned()); - - for name in &dyn_sym_names { dynstr.add(name); } - - let dynsym_count = 1 + dyn_sym_names.len(); - let dynsym_size = dynsym_count as u64 * 24; - let dynstr_size = dynstr.as_bytes().len() as u64; - - // .gnu.hash - only covers defined symbols (after the undefined ones) - let gnu_hash_symoffset: usize = 1 + so_undef_count; - let num_hashed = dyn_sym_names.len() - so_undef_count; - let gnu_hash_nbuckets = if num_hashed == 0 { 1 } else { num_hashed.next_power_of_two().max(1) } as u32; - let gnu_hash_bloom_size: u32 = 1; - let gnu_hash_bloom_shift: u32 = 6; - - let hashed_sym_hashes: Vec = dyn_sym_names[so_undef_count..].iter() - .map(|name| linker_common::gnu_hash(name.as_bytes())).collect(); - - let mut bloom_word: u64 = 0; - for &h in &hashed_sym_hashes { - bloom_word |= 1u64 << (h as u64 % 64); - bloom_word |= 1u64 << ((h >> gnu_hash_bloom_shift) as u64 % 64); - } - - if num_hashed > 0 { - let mut hashed_with_hash: Vec<(String, u32)> = dyn_sym_names[so_undef_count..].iter() - .zip(hashed_sym_hashes.iter()) - .map(|(n, &h)| (n.clone(), h)).collect(); - hashed_with_hash.sort_by_key(|(_, h)| h % gnu_hash_nbuckets); - for (i, (name, _)) in hashed_with_hash.iter().enumerate() { - dyn_sym_names[so_undef_count + i] = name.clone(); - } - } - - let hashed_sym_hashes: Vec = dyn_sym_names[so_undef_count..].iter() - .map(|name| linker_common::gnu_hash(name.as_bytes())).collect(); - - let mut gnu_hash_buckets = vec![0u32; gnu_hash_nbuckets as 
usize]; - let mut gnu_hash_chains = vec![0u32; num_hashed]; - for (i, &h) in hashed_sym_hashes.iter().enumerate() { - let bucket = (h % gnu_hash_nbuckets) as usize; - if gnu_hash_buckets[bucket] == 0 { - gnu_hash_buckets[bucket] = (gnu_hash_symoffset + i) as u32; - } - gnu_hash_chains[i] = h & !1; - } - for bucket_idx in 0..gnu_hash_nbuckets as usize { - if gnu_hash_buckets[bucket_idx] == 0 { continue; } - let mut last_in_bucket = 0; - for (i, &h) in hashed_sym_hashes.iter().enumerate() { - if (h % gnu_hash_nbuckets) as usize == bucket_idx { - last_in_bucket = i; - } - } - gnu_hash_chains[last_in_bucket] |= 1; - } - - let gnu_hash_size: u64 = 16 + (gnu_hash_bloom_size as u64 * 8) - + (gnu_hash_nbuckets as u64 * 4) + (num_hashed as u64 * 4); - - let has_init_array = output_sections.iter().any(|s| s.name == ".init_array" && s.mem_size > 0); - let has_fini_array = output_sections.iter().any(|s| s.name == ".fini_array" && s.mem_size > 0); - let so_plt_size: u64 = if so_plt_names.is_empty() { 0 } else { 32 + 16 * so_plt_names.len() as u64 }; - let so_got_plt_count: u64 = if so_plt_names.is_empty() { 0 } else { 3 + so_plt_names.len() as u64 }; - let so_got_plt_size: u64 = so_got_plt_count * 8; - let so_rela_plt_size: u64 = so_plt_names.len() as u64 * 24; - - let mut dyn_count = needed_sonames.len() as u64 + 12; // base 10 + 2 for FLAGS/FLAGS_1 - if soname.is_some() { dyn_count += 1; } - if has_init_array { dyn_count += 2; } - if has_fini_array { dyn_count += 2; } - if !so_plt_names.is_empty() { dyn_count += 4; } // PLTGOT, PLTRELSZ, PLTREL, JMPREL - let dynamic_size = dyn_count * 16; - - let has_tls_sections = output_sections.iter().any(|s| s.flags & SHF_TLS != 0 && s.flags & SHF_ALLOC != 0); - - // Promote read-only sections that have R_AARCH64_ABS64 relocations to writable. - // These sections contain embedded pointers that become R_AARCH64_RELATIVE dynamic - // relocations, so the dynamic linker must be able to write to them at load time. 
- { - // Build a set of output section indices that have ABS64 relocs targeting them - let mut needs_write: std::collections::HashSet = std::collections::HashSet::new(); - for obj_idx in 0..objects.len() { - for sec_idx in 0..objects[obj_idx].sections.len() { - let relas = &objects[obj_idx].relocations[sec_idx]; - let has_abs64 = relas.iter().any(|r| r.rela_type == R_AARCH64_ABS64); - if has_abs64 { - if let Some(&(out_idx, _)) = section_map.get(&(obj_idx, sec_idx)) { - needs_write.insert(out_idx); - } - } - } - } - for idx in needs_write { - if output_sections[idx].flags & SHF_WRITE == 0 - && output_sections[idx].flags & SHF_EXECINSTR == 0 - { - output_sections[idx].flags |= SHF_WRITE; - } - } - } - - // Check if there are any pure-rodata (non-writable, non-executable) sections remaining - let has_rodata = output_sections.iter().any(|s| - s.flags & SHF_ALLOC != 0 && s.flags & SHF_EXECINSTR == 0 && - s.flags & SHF_WRITE == 0 && s.sh_type != SHT_NOBITS - ); - // PHDR, LOAD R, LOAD RX, [LOAD R(rodata)], LOAD RW, DYNAMIC, GNU_STACK, [TLS] - let mut phdr_count: u64 = 6; // base: PHDR + LOAD R + LOAD RX + LOAD RW + DYNAMIC + GNU_STACK - if has_rodata { phdr_count += 1; } - if has_tls_sections { phdr_count += 1; } - let phdr_total_size = phdr_count * 56; - - // === Layout === - let mut offset = 64 + phdr_total_size; - offset = (offset + 7) & !7; - let gnu_hash_offset = offset; let gnu_hash_addr = base_addr + offset; offset += gnu_hash_size; - offset = (offset + 7) & !7; - let dynsym_offset = offset; let dynsym_addr = base_addr + offset; offset += dynsym_size; - let dynstr_offset = offset; let dynstr_addr = base_addr + offset; offset += dynstr_size; - - // Text segment - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let text_page_offset = offset; - let text_page_addr = base_addr + offset; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_EXECINSTR != 0 && sec.flags & SHF_ALLOC != 0 { - let a = sec.alignment.max(4); - offset = (offset + a - 1) & !(a 
- 1); - sec.addr = base_addr + offset; sec.file_offset = offset; - offset += sec.mem_size; - } - } - // PLT stubs (in text segment, after .text sections) - let so_plt_offset: u64; - let so_plt_addr: u64; - if so_plt_size > 0 { - offset = (offset + 15) & !15; // align to 16 bytes - so_plt_offset = offset; - so_plt_addr = base_addr + offset; - offset += so_plt_size; - } else { - so_plt_offset = 0; - so_plt_addr = 0; - } - let text_total_size = offset - text_page_offset; - - // Rodata - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let rodata_page_offset = offset; - let rodata_page_addr = base_addr + offset; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_EXECINSTR == 0 && - sec.flags & SHF_WRITE == 0 && sec.sh_type != SHT_NOBITS { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = base_addr + offset; sec.file_offset = offset; - offset += sec.mem_size; - } - } - let rodata_total_size = offset - rodata_page_offset; - - // RW segment - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let rw_page_offset = offset; - let rw_page_addr = base_addr + offset; - - let mut init_array_addr_so = 0u64; let mut init_array_size_so = 0u64; - let mut fini_array_addr_so = 0u64; let mut fini_array_size_so = 0u64; - - for sec in output_sections.iter_mut() { - if sec.name == ".init_array" { - let a = sec.alignment.max(8); - offset = (offset + a - 1) & !(a - 1); - sec.addr = base_addr + offset; sec.file_offset = offset; - init_array_addr_so = sec.addr; init_array_size_so = sec.mem_size; - offset += sec.mem_size; break; - } - } - for sec in output_sections.iter_mut() { - if sec.name == ".fini_array" { - let a = sec.alignment.max(8); - offset = (offset + a - 1) & !(a - 1); - sec.addr = base_addr + offset; sec.file_offset = offset; - fini_array_addr_so = sec.addr; fini_array_size_so = sec.mem_size; - offset += sec.mem_size; break; - } - } - - // Reserve space for .rela.dyn (R_AARCH64_RELATIVE + 
R_AARCH64_GLOB_DAT entries) - offset = (offset + 7) & !7; - let rela_dyn_offset = offset; - let rela_dyn_addr = base_addr + offset; - let mut max_rela_count: usize = 0; - // Count ABS64 relocations (become RELATIVE) - for obj in objects.iter() { - for sec_relas in &obj.relocations { - for rela in sec_relas { - if rela.rela_type == R_AARCH64_ABS64 { - max_rela_count += 1; - } - } - } - } - for sec in output_sections.iter() { - if sec.name == ".init_array" || sec.name == ".fini_array" { - max_rela_count += (sec.mem_size / 8) as usize; - } - } - // Pre-count GOT entries that will need dynamic relocations (RELATIVE or GLOB_DAT). - // This must be done before layout to correctly reserve .rela.dyn space. - { - let mut got_pre_count: usize = 0; - let mut got_pre_names: Vec = Vec::new(); - for obj in objects.iter() { - for sec_relas in &obj.relocations { - for rela in sec_relas { - let si = rela.sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym_name = &obj.symbols[si].name; - if sym_name.is_empty() { continue; } - let needs_got = match rela.rela_type { - R_AARCH64_ADR_GOT_PAGE | R_AARCH64_LD64_GOT_LO12_NC => true, - R_AARCH64_ADR_PREL_PG_HI21 | R_AARCH64_ADD_ABS_LO12_NC - | R_AARCH64_LDST64_ABS_LO12_NC | R_AARCH64_LDST32_ABS_LO12_NC - | R_AARCH64_LDST8_ABS_LO12_NC | R_AARCH64_LDST16_ABS_LO12_NC => { - if let Some(g) = globals.get(sym_name.as_str()) { - g.is_dynamic || g.defined_in.is_none() - } else { - obj.symbols[si].is_undefined() && !obj.symbols[si].is_local() - } - } - _ => false, - }; - if needs_got && !got_pre_names.contains(sym_name) { - got_pre_names.push(sym_name.clone()); - got_pre_count += 1; - } - } - } - } - max_rela_count += got_pre_count; - } - let rela_dyn_max_size = max_rela_count as u64 * 24; - offset += rela_dyn_max_size; - - offset = (offset + 7) & !7; - let dynamic_offset = offset; let dynamic_addr_so = base_addr + offset; offset += dynamic_size; - - // GOT.PLT for PLT symbols - let so_got_plt_offset: u64; - let so_got_plt_addr: 
u64; - if so_got_plt_size > 0 { - offset = (offset + 7) & !7; - so_got_plt_offset = offset; - so_got_plt_addr = base_addr + offset; - offset += so_got_plt_size; - } else { - so_got_plt_offset = 0; - so_got_plt_addr = 0; - } - - // RELA.PLT for JUMP_SLOT relocations - let so_rela_plt_offset: u64; - let so_rela_plt_addr: u64; - if so_rela_plt_size > 0 { - offset = (offset + 7) & !7; - so_rela_plt_offset = offset; - so_rela_plt_addr = base_addr + offset; - offset += so_rela_plt_size; - } else { - so_rela_plt_offset = 0; - so_rela_plt_addr = 0; - } - - // GOT for locally-resolved symbols AND undefined/dynamic symbols referenced - // by ADRP/ADD pairs that need GOT indirection in shared libraries. - let got_offset = offset; let got_addr = base_addr + offset; - let mut got_needed: Vec = Vec::new(); - for obj in objects.iter() { - for sec_relas in &obj.relocations { - for rela in sec_relas { - let si = rela.sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.name.is_empty() { continue; } - // Skip local symbols - they don't need GOT entries in dynsym - // (e.g. static _Thread_local variables) - if sym.is_local() { continue; } - match rela.rela_type { - R_AARCH64_ADR_GOT_PAGE | R_AARCH64_LD64_GOT_LO12_NC => { - if !got_needed.contains(&sym.name) { got_needed.push(sym.name.clone()); } - } - // In shared libraries, ADRP/ADD for undefined/dynamic symbols must - // go through the GOT since the symbol address is only known at runtime. 
- R_AARCH64_ADR_PREL_PG_HI21 | R_AARCH64_ADD_ABS_LO12_NC - | R_AARCH64_LDST64_ABS_LO12_NC | R_AARCH64_LDST32_ABS_LO12_NC - | R_AARCH64_LDST8_ABS_LO12_NC | R_AARCH64_LDST16_ABS_LO12_NC => { - let sym_needs_got = if let Some(g) = globals.get(&sym.name) { - g.is_dynamic || g.defined_in.is_none() - } else { - sym.is_undefined() && !sym.is_local() - }; - if sym_needs_got && !got_needed.contains(&sym.name) { - got_needed.push(sym.name.clone()); - } - } - _ => {} - } - } - } - } - let got_size = got_needed.len() as u64 * 8; - offset += got_size; - - for sec in output_sections.iter_mut() { - if sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_WRITE != 0 && - sec.sh_type != SHT_NOBITS && sec.name != ".init_array" && sec.name != ".fini_array" && - sec.flags & SHF_TLS == 0 { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = base_addr + offset; sec.file_offset = offset; - offset += sec.mem_size; - } - } - - // TLS - let mut tls_addr = 0u64; - let mut tls_file_offset_so = 0u64; - let mut tls_file_size = 0u64; - let mut tls_mem_size = 0u64; - let mut tls_align = 1u64; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_TLS != 0 && sec.flags & SHF_ALLOC != 0 && sec.sh_type != SHT_NOBITS { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = base_addr + offset; sec.file_offset = offset; - if tls_addr == 0 { tls_addr = sec.addr; tls_file_offset_so = offset; tls_align = a; } - tls_file_size += sec.mem_size; tls_mem_size += sec.mem_size; - offset += sec.mem_size; - } - } - if tls_addr == 0 && has_tls_sections { - tls_addr = base_addr + offset; tls_file_offset_so = offset; - } - for sec in output_sections.iter_mut() { - if sec.flags & SHF_TLS != 0 && sec.sh_type == SHT_NOBITS { - let a = sec.alignment.max(1); - let aligned = (tls_mem_size + a - 1) & !(a - 1); - sec.addr = tls_addr + aligned; sec.file_offset = offset; - tls_mem_size = aligned + sec.mem_size; - if a > tls_align { tls_align = a; } - } - } - 
tls_mem_size = (tls_mem_size + tls_align - 1) & !(tls_align - 1); - let has_tls = tls_addr != 0; - - let bss_addr = base_addr + offset; - let mut bss_size = 0u64; - for sec in output_sections.iter_mut() { - if sec.sh_type == SHT_NOBITS && sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_TLS == 0 { - let a = sec.alignment.max(1); - let aligned = (bss_addr + bss_size + a - 1) & !(a - 1); - bss_size = aligned - bss_addr + sec.mem_size; - sec.addr = aligned; sec.file_offset = offset; - } - } - - // Merge section data - for sec in output_sections.iter_mut() { - if sec.sh_type == SHT_NOBITS { continue; } - let mut data = vec![0u8; sec.mem_size as usize]; - for input in &sec.inputs { - let sd = &objects[input.object_idx].section_data[input.section_idx]; - let s = input.output_offset as usize; - let e = s + sd.len(); - if e <= data.len() && !sd.is_empty() { data[s..e].copy_from_slice(sd); } - } - sec.data = data; - } - - // Update globals - for (_, gsym) in globals.iter_mut() { - if let Some(obj_idx) = gsym.defined_in { - if gsym.section_idx == SHN_COMMON || gsym.section_idx == 0xffff { - if let Some(bss_sec) = output_sections.iter().find(|s| s.name == ".bss") { - gsym.value += bss_sec.addr; - } - } else if gsym.section_idx != SHN_UNDEF && gsym.section_idx != SHN_ABS { - let si = gsym.section_idx as usize; - if let Some(&(oi, so)) = section_map.get(&(obj_idx, si)) { - gsym.value += output_sections[oi].addr + so; - } - } - } - } - - // Linker-provided symbols for shared library - let linker_addrs = LinkerSymbolAddresses { - base_addr, got_addr, dynamic_addr: dynamic_addr_so, - bss_addr, bss_size, text_end: text_page_addr + text_total_size, - data_start: rw_page_addr, - init_array_start: init_array_addr_so, init_array_size: init_array_size_so, - fini_array_start: fini_array_addr_so, fini_array_size: fini_array_size_so, - preinit_array_start: 0, preinit_array_size: 0, - rela_iplt_start: 0, rela_iplt_size: 0, - }; - for sym in &get_standard_linker_symbols(&linker_addrs) { - let 
entry = globals.entry(sym.name.to_string()).or_insert(GlobalSymbol { - value: 0, size: 0, info: (sym.binding << 4), - defined_in: None, from_lib: None, plt_idx: None, got_idx: None, - section_idx: SHN_ABS, is_dynamic: false, copy_reloc: false, lib_sym_value: 0, - }); - if entry.defined_in.is_none() && !entry.is_dynamic { - entry.value = sym.value; - entry.defined_in = Some(usize::MAX); - entry.section_idx = SHN_ABS; - } - } - - // Auto-generate __start_
/ __stop_
symbols (GNU ld feature) - for (name, addr) in linker_common::resolve_start_stop_symbols(output_sections) { - if let Some(entry) = globals.get_mut(&name) { - if entry.defined_in.is_none() && !entry.is_dynamic { - entry.value = addr; - entry.defined_in = Some(usize::MAX); - entry.section_idx = SHN_ABS; - } - } - } - - // Save RW segment file size before appending section headers - let rw_end_offset = offset; - - // Section headers: null, .dynsym, .dynstr, .gnu.hash, .dynamic, .rela.dyn, .shstrtab - // Build .shstrtab - let mut shstrtab_data: Vec = vec![0]; // null byte at offset 0 - let shname_dynsym = shstrtab_data.len() as u32; - shstrtab_data.extend_from_slice(b".dynsym\0"); - let shname_dynstr = shstrtab_data.len() as u32; - shstrtab_data.extend_from_slice(b".dynstr\0"); - let shname_gnu_hash = shstrtab_data.len() as u32; - shstrtab_data.extend_from_slice(b".gnu.hash\0"); - let shname_dynamic = shstrtab_data.len() as u32; - shstrtab_data.extend_from_slice(b".dynamic\0"); - let shname_rela_dyn = shstrtab_data.len() as u32; - shstrtab_data.extend_from_slice(b".rela.dyn\0"); - let shname_shstrtab = shstrtab_data.len() as u32; - shstrtab_data.extend_from_slice(b".shstrtab\0"); - - // Append .shstrtab data and section headers after file content - offset = (offset + 7) & !7; - let shstrtab_offset = offset; - let shstrtab_size = shstrtab_data.len() as u64; - offset += shstrtab_size; - offset = (offset + 7) & !7; - let shdr_offset = offset; - let sh_count: u16 = 7; // null + .dynsym + .dynstr + .gnu.hash + .dynamic + .rela.dyn + .shstrtab - let shdr_total = sh_count as u64 * 64; - offset += shdr_total; - - // Build output buffer - let file_size = offset as usize; - let mut out = vec![0u8; file_size]; - - // ELF header - out[0..4].copy_from_slice(&ELF_MAGIC); - out[4] = ELFCLASS64; out[5] = ELFDATA2LSB; out[6] = 1; - w16(&mut out, 16, ET_DYN); - w16(&mut out, 18, EM_AARCH64); w32(&mut out, 20, 1); - w64(&mut out, 24, 0); // e_entry = 0 - w64(&mut out, 32, 64); // e_phoff 
- w64(&mut out, 40, shdr_offset); // e_shoff - w32(&mut out, 48, 0); w16(&mut out, 52, 64); w16(&mut out, 54, 56); - w16(&mut out, 56, phdr_count as u16); - w16(&mut out, 58, 64); // e_shentsize - w16(&mut out, 60, sh_count); // e_shnum - w16(&mut out, 62, sh_count - 1); // e_shstrndx (last section) - - // Program headers - let mut ph = 64usize; - wphdr(&mut out, ph, PT_PHDR, PF_R, 64, base_addr + 64, phdr_total_size, phdr_total_size, 8); ph += 56; - let ro_seg_end = dynstr_offset + dynstr_size; - wphdr(&mut out, ph, PT_LOAD, PF_R, 0, base_addr, ro_seg_end, ro_seg_end, PAGE_SIZE); ph += 56; - if text_total_size > 0 { - wphdr(&mut out, ph, PT_LOAD, PF_R|PF_X, text_page_offset, text_page_addr, text_total_size, text_total_size, PAGE_SIZE); ph += 56; - } else { - wphdr(&mut out, ph, PT_LOAD, PF_R|PF_X, text_page_offset, text_page_addr, 0, 0, PAGE_SIZE); ph += 56; - } - if has_rodata { - wphdr(&mut out, ph, PT_LOAD, PF_R, rodata_page_offset, rodata_page_addr, rodata_total_size, rodata_total_size, PAGE_SIZE); ph += 56; - } - let rw_filesz = rw_end_offset - rw_page_offset; - let rw_memsz = if bss_size > 0 { (bss_addr + bss_size) - rw_page_addr } else { rw_filesz }; - wphdr(&mut out, ph, PT_LOAD, PF_R|PF_W, rw_page_offset, rw_page_addr, rw_filesz, rw_memsz, PAGE_SIZE); ph += 56; - wphdr(&mut out, ph, PT_DYNAMIC, PF_R|PF_W, dynamic_offset, dynamic_addr_so, dynamic_size, dynamic_size, 8); ph += 56; - wphdr(&mut out, ph, PT_GNU_STACK, PF_R|PF_W, 0, 0, 0, 0, 0x10); ph += 56; - if has_tls { - wphdr(&mut out, ph, PT_TLS, PF_R, tls_file_offset_so, tls_addr, tls_file_size, tls_mem_size, tls_align); - } - - // .gnu.hash - let gh = gnu_hash_offset as usize; - w32(&mut out, gh, gnu_hash_nbuckets); - w32(&mut out, gh+4, gnu_hash_symoffset as u32); - w32(&mut out, gh+8, gnu_hash_bloom_size); - w32(&mut out, gh+12, gnu_hash_bloom_shift); - w64(&mut out, gh + 16, bloom_word); - let buckets_off = gh + 16 + (gnu_hash_bloom_size as usize * 8); - for (i, &b) in 
gnu_hash_buckets.iter().enumerate() { - w32(&mut out, buckets_off + i * 4, b); - } - let chains_off = buckets_off + (gnu_hash_nbuckets as usize * 4); - for (i, &c) in gnu_hash_chains.iter().enumerate() { - w32(&mut out, chains_off + i * 4, c); - } - - // .dynsym - let mut ds = dynsym_offset as usize + 24; - for name in &dyn_sym_names { - let no = dynstr.get_offset(name) as u32; - w32(&mut out, ds, no); - if let Some(gsym) = globals.get(name) { - if gsym.defined_in.is_some() && !gsym.is_dynamic && gsym.section_idx != SHN_UNDEF { - if ds+5 < out.len() { out[ds+4] = gsym.info; out[ds+5] = 0; } - w16(&mut out, ds+6, 1); - w64(&mut out, ds+8, gsym.value); - w64(&mut out, ds+16, gsym.size); - } else { - // Undefined/dynamic: preserve original binding (STB_WEAK vs STB_GLOBAL) - let bind = gsym.info >> 4; - let orig_type = gsym.info & 0xf; - let stype = if so_plt_names.contains(name) { STT_FUNC } else if orig_type != 0 { orig_type } else { 0u8 }; - if ds+5 < out.len() { out[ds+4] = (bind << 4) | stype; out[ds+5] = 0; } - w16(&mut out, ds+6, 0); w64(&mut out, ds+8, 0); w64(&mut out, ds+16, 0); - } - } else { - let stype = if so_plt_names.contains(name) { STT_FUNC } else { 0u8 /* STT_NOTYPE */ }; - if ds+5 < out.len() { out[ds+4] = (STB_GLOBAL << 4) | stype; out[ds+5] = 0; } - w16(&mut out, ds+6, 0); w64(&mut out, ds+8, 0); w64(&mut out, ds+16, 0); - } - ds += 24; - } - - // .dynstr - write_bytes(&mut out, dynstr_offset as usize, dynstr.as_bytes()); - - // Section data - for sec in output_sections.iter() { - if sec.sh_type == SHT_NOBITS || sec.data.is_empty() { continue; } - write_bytes(&mut out, sec.file_offset as usize, &sec.data); - } - - // GOT entries - let mut got_sym_addrs: HashMap = HashMap::new(); - for (i, name) in got_needed.iter().enumerate() { - let gea = got_addr + i as u64 * 8; - got_sym_addrs.insert(name.clone(), gea); - if let Some(gsym) = globals.get(name) { - if gsym.defined_in.is_some() && !gsym.is_dynamic { - w64(&mut out, (got_offset + i as u64 * 8) as 
usize, gsym.value); - } - } - } - // For PLT symbols referenced via GOT (ADR_GOT_PAGE/LD64_GOT_LO12_NC), - // point to the GOT.PLT entry so the dynamic linker resolves them - for (i, name) in so_plt_names.iter().enumerate() { - let gea = so_got_plt_addr + 24 + i as u64 * 8; - if !got_sym_addrs.contains_key(name) { - got_sym_addrs.insert(name.clone(), gea); - } - } - - // Apply relocations and collect dynamic relocation entries - let globals_snap: HashMap = globals.clone(); - // Each entry: (offset, r_info, addend) - let mut rela_dyn_entries: Vec<(u64, u64, u64)> = Vec::new(); - const R_AARCH64_RELATIVE_DYN: u64 = 1027; - const R_AARCH64_GLOB_DAT_DYN: u64 = 1025; - - // RELATIVE for locally-defined GOT entries, GLOB_DAT for undefined/dynamic GOT entries - for (i, name) in got_needed.iter().enumerate() { - let gea = got_addr + i as u64 * 8; - if let Some(gsym) = globals_snap.get(name) { - if gsym.defined_in.is_some() && !gsym.is_dynamic { - rela_dyn_entries.push((gea, R_AARCH64_RELATIVE_DYN, gsym.value)); - } else { - // Dynamic/undefined symbol: emit GLOB_DAT with the dynsym index - let si = dyn_sym_names.iter().position(|n| n == name).map(|p| p + 1).unwrap_or(0) as u64; - rela_dyn_entries.push((gea, (si << 32) | R_AARCH64_GLOB_DAT_DYN, 0)); - } - } else { - // Symbol not in globals - try to find in dynsym for GLOB_DAT - let si = dyn_sym_names.iter().position(|n| n == name).map(|p| p + 1).unwrap_or(0) as u64; - if si != 0 { - rela_dyn_entries.push((gea, (si << 32) | R_AARCH64_GLOB_DAT_DYN, 0)); - } - } - } - - for obj_idx in 0..objects.len() { - for sec_idx in 0..objects[obj_idx].sections.len() { - let relas = &objects[obj_idx].relocations[sec_idx]; - if relas.is_empty() { continue; } - let (out_idx, sec_off) = match section_map.get(&(obj_idx, sec_idx)) { - Some(&v) => v, None => continue, - }; - let sa = output_sections[out_idx].addr; - let sfo = output_sections[out_idx].file_offset; - - for rela in relas { - let si = rela.sym_idx as usize; - if si >= 
objects[obj_idx].symbols.len() { continue; } - let sym = &objects[obj_idx].symbols[si]; - let p = sa + sec_off + rela.offset; - let fp = (sfo + sec_off + rela.offset) as usize; - let a = rela.addend; - let s = reloc::resolve_sym(obj_idx, sym, &globals_snap, section_map, output_sections); - - match rela.rela_type { - R_AARCH64_ABS64 => { - let val = (s as i64 + a) as u64; - w64(&mut out, fp, val); - if s != 0 { rela_dyn_entries.push((p, R_AARCH64_RELATIVE_DYN, val)); } - } - R_AARCH64_ABS32 => { w32(&mut out, fp, (s as i64 + a) as u32); } - R_AARCH64_PREL64 => { w64(&mut out, fp, (s as i64 + a - p as i64) as u64); } - R_AARCH64_PREL32 | R_AARCH64_PREL16 => { - let val = (s as i64 + a - p as i64) as u32; - if rela.rela_type == R_AARCH64_PREL32 { w32(&mut out, fp, val); } - else { w16(&mut out, fp, val as u16); } - } - R_AARCH64_ADR_PREL_PG_HI21 => { - // For undefined/dynamic symbols in shared libs, redirect through GOT - if let Some(&gea) = got_sym_addrs.get(&sym.name) { - if s == 0 || globals_snap.get(&sym.name).is_some_and(|g| g.is_dynamic || g.defined_in.is_none()) { - let page_g = gea & !0xFFF; - let page_p = p & !0xFFF; - let imm = (page_g as i64 - page_p as i64) >> 12; - reloc::encode_adrp(&mut out, fp, imm); - } else { - let sa_val = (s as i64 + a) as u64; - let page_s = sa_val & !0xFFF; - let page_p = p & !0xFFF; - let imm = (page_s as i64 - page_p as i64) >> 12; - reloc::encode_adrp(&mut out, fp, imm); - } - } else { - let sa_val = (s as i64 + a) as u64; - let page_s = sa_val & !0xFFF; - let page_p = p & !0xFFF; - let imm = (page_s as i64 - page_p as i64) >> 12; - reloc::encode_adrp(&mut out, fp, imm); - } - } - R_AARCH64_ADD_ABS_LO12_NC => { - // For undefined/dynamic symbols in shared libs, convert ADD to LDR from GOT - if let Some(&gea) = got_sym_addrs.get(&sym.name) { - if s == 0 || globals_snap.get(&sym.name).is_some_and(|g| g.is_dynamic || g.defined_in.is_none()) { - // Convert: ADD Xd, Xn, #imm -> LDR Xd, [Xn, #imm] - // The ADD instruction loaded 
address = base + lo12 - // We need LDR to dereference the GOT entry instead - let lo12 = (gea & 0xFFF) as u32; - if fp + 4 <= out.len() { - let insn = read_u32(&out, fp); - let rd = insn & 0x1f; - let rn = (insn >> 5) & 0x1f; - // LDR Xd, [Xn, #imm] = 0xF9400000 | (imm/8 << 10) | (Rn << 5) | Rd - let ldr = 0xf9400000u32 | ((lo12 / 8) << 10) | (rn << 5) | rd; - w32(&mut out, fp, ldr); - } - } else { - let sa_val = (s as i64 + a) as u64; - reloc::encode_add_imm12(&mut out, fp, (sa_val & 0xFFF) as u32); - } - } else { - let sa_val = (s as i64 + a) as u64; - reloc::encode_add_imm12(&mut out, fp, (sa_val & 0xFFF) as u32); - } - } - R_AARCH64_CALL26 | R_AARCH64_JUMP26 => { - if fp + 4 > out.len() { continue; } - let mut target = (s as i64 + a) as u64; - // If the symbol has a PLT entry, redirect to it - if target == 0 && !sym.name.is_empty() { - if let Some(g) = globals_snap.get(&sym.name) { - if let Some(pi) = g.plt_idx { - target = so_plt_addr + 32 + pi as u64 * 16; - } - } - } - if target == 0 { - // Weak undefined with no PLT - NOP it - w32(&mut out, fp, 0xd503201f); - } else { - let offset_val = target as i64 - p as i64; - let mut insn = read_u32(&out, fp); - let imm26 = ((offset_val >> 2) as u32) & 0x3ffffff; - insn = (insn & 0xfc000000) | imm26; - w32(&mut out, fp, insn); - } - } - R_AARCH64_LDST8_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 0); - } - R_AARCH64_LDST16_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 1); - } - R_AARCH64_LDST32_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 2); - } - R_AARCH64_LDST64_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 3); - } - R_AARCH64_LDST128_ABS_LO12_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut 
out, fp, (sa_val & 0xFFF) as u32, 4); - } - R_AARCH64_ADR_GOT_PAGE => { - if let Some(&gea) = got_sym_addrs.get(&sym.name) { - let page_g = gea & !0xFFF; - let page_p = p & !0xFFF; - let imm = (page_g as i64 - page_p as i64) >> 12; - reloc::encode_adrp(&mut out, fp, imm); - } else { - let sa_val = (s as i64 + a) as u64; - let page_s = sa_val & !0xFFF; - let page_p = p & !0xFFF; - let imm = (page_s as i64 - page_p as i64) >> 12; - reloc::encode_adrp(&mut out, fp, imm); - } - } - R_AARCH64_LD64_GOT_LO12_NC => { - if let Some(&gea) = got_sym_addrs.get(&sym.name) { - reloc::encode_ldst_imm12(&mut out, fp, (gea & 0xFFF) as u32, 3); - } else { - let sa_val = (s as i64 + a) as u64; - reloc::encode_ldst_imm12(&mut out, fp, (sa_val & 0xFFF) as u32, 3); - } - } - R_AARCH64_MOVW_UABS_G0 | R_AARCH64_MOVW_UABS_G0_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, (sa_val & 0xffff) as u32); - } - R_AARCH64_MOVW_UABS_G1_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, ((sa_val >> 16) & 0xffff) as u32); - } - R_AARCH64_MOVW_UABS_G2_NC => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, ((sa_val >> 32) & 0xffff) as u32); - } - R_AARCH64_MOVW_UABS_G3 => { - let sa_val = (s as i64 + a) as u64; - reloc::encode_movw(&mut out, fp, ((sa_val >> 48) & 0xffff) as u32); - } - R_AARCH64_NONE => {} - other => { - eprintln!("warning: unsupported relocation type {} for '{}' in shared library", other, sym.name); - } - } - } - } - } - - // Write .rela.dyn (RELATIVE + GLOB_DAT entries) - // Sort: put RELATIVE entries first (for DT_RELACOUNT), then GLOB_DAT - let relative_count = rela_dyn_entries.iter().filter(|(_, info, _)| *info == R_AARCH64_RELATIVE_DYN).count(); - rela_dyn_entries.sort_by_key(|(_, info, _)| if *info == R_AARCH64_RELATIVE_DYN { 0u8 } else { 1u8 }); - let actual_rela_count = rela_dyn_entries.len(); - let rela_dyn_size = actual_rela_count as u64 * 24; - let mut rd = rela_dyn_offset as usize; - 
for (rel_offset, rel_info, rel_addend) in &rela_dyn_entries { - if rd + 24 <= out.len() { - w64(&mut out, rd, *rel_offset); - w64(&mut out, rd+8, *rel_info); - w64(&mut out, rd+16, *rel_addend); - rd += 24; - } - } - - // .plt stubs (AArch64 PLT stubs for shared library) - if so_plt_size > 0 { - let po = so_plt_offset as usize; - // PLT header (32 bytes) - let got2_addr = so_got_plt_addr + 16; - let page_g = got2_addr & !0xFFF; - let page_p = so_plt_addr & !0xFFF; - let page_diff = ((page_g as i64 - page_p as i64) >> 12) as i32; - let immlo = (page_diff & 3) as u32; - let immhi = ((page_diff >> 2) & 0x7ffff) as u32; - w32(&mut out, po, 0xa9bf7bf0u32); // stp x16, x30, [sp, #-16]! - w32(&mut out, po + 4, 0x90000010 | (immlo << 29) | (immhi << 5)); // adrp x16, GOT+16 - let lo12 = (got2_addr & 0xFFF) as u32; - w32(&mut out, po + 8, 0xf9400211 | ((lo12 / 8) << 10)); // ldr x17, [x16, #lo12] - w32(&mut out, po + 12, 0x91000210 | ((lo12 & 0xFFF) << 10)); // add x16, x16, #lo12 - w32(&mut out, po + 16, 0xd61f0220u32); // br x17 - w32(&mut out, po + 20, 0xd503201fu32); // nop - w32(&mut out, po + 24, 0xd503201fu32); // nop - w32(&mut out, po + 28, 0xd503201fu32); // nop - - // Individual PLT entries (16 bytes each) - for (i, _) in so_plt_names.iter().enumerate() { - let ep = po + 32 + i * 16; - let pea = so_plt_addr + 32 + i as u64 * 16; - let gea = so_got_plt_addr + 24 + i as u64 * 8; - let page_g = gea & !0xFFF; - let page_p = pea & !0xFFF; - let page_diff = ((page_g as i64 - page_p as i64) >> 12) as i32; - let immlo = (page_diff & 3) as u32; - let immhi = ((page_diff >> 2) & 0x7ffff) as u32; - w32(&mut out, ep, 0x90000010 | (immlo << 29) | (immhi << 5)); // adrp x16 - let lo12 = (gea & 0xFFF) as u32; - w32(&mut out, ep + 4, 0xf9400211 | ((lo12 / 8) << 10)); // ldr x17, [x16, #lo12] - w32(&mut out, ep + 8, 0x91000210 | ((lo12 & 0xFFF) << 10)); // add x16, x16, #lo12 - w32(&mut out, ep + 12, 0xd61f0220u32); // br x17 - } - } - - // GOT.PLT entries - if so_got_plt_size > 
0 { - let gp = so_got_plt_offset as usize; - w64(&mut out, gp, dynamic_addr_so); // GOT[0] = _DYNAMIC - // GOT[1] and GOT[2] are filled by the dynamic linker - // GOT[3..] are PLT GOT entries: initialized to PLT[0] (resolved eagerly via DF_BIND_NOW) - for i in 0..so_plt_names.len() { - w64(&mut out, gp + 24 + i * 8, so_plt_addr); - } - } - - // .rela.plt (R_AARCH64_JUMP_SLOT) - const R_AARCH64_JUMP_SLOT: u64 = 1026; - if so_rela_plt_size > 0 { - let mut rp = so_rela_plt_offset as usize; - for (i, name) in so_plt_names.iter().enumerate() { - let gea = so_got_plt_addr + 24 + i as u64 * 8; - // Find the dynsym index for this symbol - let si = dyn_sym_names.iter().position(|n| n == name).map(|p| p + 1).unwrap_or(0) as u64; - w64(&mut out, rp, gea); - w64(&mut out, rp + 8, (si << 32) | R_AARCH64_JUMP_SLOT); - w64(&mut out, rp + 16, 0); - rp += 24; - } - } - - // .dynamic - let mut dd = dynamic_offset as usize; - for lib in needed_sonames { - let so = dynstr.get_offset(lib); - w64(&mut out, dd, DT_NEEDED as u64); w64(&mut out, dd+8, so as u64); dd += 16; - } - if let Some(ref sn) = soname { - let so = dynstr.get_offset(sn); - w64(&mut out, dd, DT_SONAME as u64); w64(&mut out, dd+8, so as u64); dd += 16; - } - for &(tag, val) in &[ - (DT_STRTAB, dynstr_addr), (DT_SYMTAB, dynsym_addr), (DT_STRSZ, dynstr_size), - (DT_SYMENT, 24), - (DT_RELA, rela_dyn_addr), (DT_RELASZ, rela_dyn_size), (DT_RELAENT, 24), - (DT_RELACOUNT, relative_count as u64), - (DT_GNU_HASH, gnu_hash_addr), - ] { - w64(&mut out, dd, tag as u64); w64(&mut out, dd+8, val); dd += 16; - } - if !so_plt_names.is_empty() { - for &(tag, val) in &[ - (DT_PLTGOT, so_got_plt_addr), (DT_PLTRELSZ, so_rela_plt_size), - (DT_PLTREL, 7u64), (DT_JMPREL, so_rela_plt_addr), - ] { - w64(&mut out, dd, tag as u64); w64(&mut out, dd+8, val); dd += 16; - } - } - if has_init_array { - w64(&mut out, dd, DT_INIT_ARRAY as u64); w64(&mut out, dd+8, init_array_addr_so); dd += 16; - w64(&mut out, dd, DT_INIT_ARRAYSZ as u64); w64(&mut out, 
dd+8, init_array_size_so); dd += 16; - } - if has_fini_array { - w64(&mut out, dd, DT_FINI_ARRAY as u64); w64(&mut out, dd+8, fini_array_addr_so); dd += 16; - w64(&mut out, dd, DT_FINI_ARRAYSZ as u64); w64(&mut out, dd+8, fini_array_size_so); dd += 16; - } - // Force eager binding so GOT.PLT entries are resolved before execution - w64(&mut out, dd, DT_FLAGS as u64); w64(&mut out, dd+8, DF_BIND_NOW as u64); dd += 16; - w64(&mut out, dd, DT_FLAGS_1 as u64); w64(&mut out, dd+8, DF_1_NOW as u64); dd += 16; - w64(&mut out, dd, DT_NULL as u64); w64(&mut out, dd+8, 0); - - // Write .shstrtab - write_bytes(&mut out, shstrtab_offset as usize, &shstrtab_data); - - // Write section headers (64 bytes each for ELF64) - // Helper to write one section header - let mut sh = shdr_offset as usize; - // [0] SHT_NULL - // (already zeroed) - sh += 64; - // [1] .dynsym (SHT_DYNSYM = 11) - w32(&mut out, sh, shname_dynsym); - w32(&mut out, sh+4, 11); // SHT_DYNSYM - w64(&mut out, sh+8, 0x2); // SHF_ALLOC - w64(&mut out, sh+16, dynsym_addr); // sh_addr - w64(&mut out, sh+24, dynsym_offset); // sh_offset - w64(&mut out, sh+32, dynsym_size); // sh_size - w32(&mut out, sh+40, 2); // sh_link = .dynstr index - w32(&mut out, sh+44, 1); // sh_info = 1 (one local sym: null) - w64(&mut out, sh+48, 8); // sh_addralign - w64(&mut out, sh+56, 24); // sh_entsize - sh += 64; - // [2] .dynstr (SHT_STRTAB = 3) - w32(&mut out, sh, shname_dynstr); - w32(&mut out, sh+4, 3); // SHT_STRTAB - w64(&mut out, sh+8, 0x2); // SHF_ALLOC - w64(&mut out, sh+16, dynstr_addr); - w64(&mut out, sh+24, dynstr_offset); - w64(&mut out, sh+32, dynstr_size); - w64(&mut out, sh+48, 1); // sh_addralign - sh += 64; - // [3] .gnu.hash (SHT_GNU_HASH = 0x6ffffff6) - w32(&mut out, sh, shname_gnu_hash); - w32(&mut out, sh+4, 0x6ffffff6u32); // SHT_GNU_HASH - w64(&mut out, sh+8, 0x2); // SHF_ALLOC - w64(&mut out, sh+16, gnu_hash_addr); - w64(&mut out, sh+24, gnu_hash_offset); - w64(&mut out, sh+32, gnu_hash_size); - w32(&mut out, sh+40, 
1); // sh_link = .dynsym index - w64(&mut out, sh+48, 8); - sh += 64; - // [4] .dynamic (SHT_DYNAMIC = 6) - w32(&mut out, sh, shname_dynamic); - w32(&mut out, sh+4, 6); // SHT_DYNAMIC - w64(&mut out, sh+8, 0x3); // SHF_WRITE | SHF_ALLOC - w64(&mut out, sh+16, dynamic_addr_so); - w64(&mut out, sh+24, dynamic_offset); - w64(&mut out, sh+32, dynamic_size); - w32(&mut out, sh+40, 2); // sh_link = .dynstr index - w64(&mut out, sh+48, 8); - w64(&mut out, sh+56, 16); // sh_entsize - sh += 64; - // [5] .rela.dyn (SHT_RELA = 4) - w32(&mut out, sh, shname_rela_dyn); - w32(&mut out, sh+4, 4); // SHT_RELA - w64(&mut out, sh+8, 0x2); // SHF_ALLOC - w64(&mut out, sh+16, rela_dyn_addr); - w64(&mut out, sh+24, rela_dyn_offset); - w64(&mut out, sh+32, rela_dyn_size); - w32(&mut out, sh+40, 1); // sh_link = .dynsym index - w64(&mut out, sh+48, 8); - w64(&mut out, sh+56, 24); // sh_entsize - sh += 64; - // [6] .shstrtab (SHT_STRTAB = 3) - w32(&mut out, sh, shname_shstrtab); - w32(&mut out, sh+4, 3); // SHT_STRTAB - w64(&mut out, sh+24, shstrtab_offset); - w64(&mut out, sh+32, shstrtab_size); - w64(&mut out, sh+48, 1); - - std::fs::write(output_path, &out).map_err(|e| format!("failed to write '{}': {}", output_path, e))?; - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let _ = std::fs::set_permissions(output_path, std::fs::Permissions::from_mode(0o755)); - } - Ok(()) -} diff --git a/src/backend/arm/linker/emit_static.rs b/src/backend/arm/linker/emit_static.rs deleted file mode 100644 index b80c2fe0b6..0000000000 --- a/src/backend/arm/linker/emit_static.rs +++ /dev/null @@ -1,645 +0,0 @@ -//! Static executable emission for the AArch64 linker. -//! -//! Emits a statically-linked ELF64 executable with two LOAD segments (RX+RW), -//! GOT for position-dependent code, IPLT/IRELATIVE for ifuncs, and TLS support. 
- -use std::collections::HashMap; - -use super::elf::*; -use super::types::{GlobalSymbol, BASE_ADDR, PAGE_SIZE}; -use super::reloc; -use crate::backend::linker_common; -use linker_common::OutputSection; - -// ── Static ELF emission ─────────────────────────────────────────────── - -pub(super) fn emit_executable( - objects: &[ElfObject], - globals: &mut HashMap, - output_sections: &mut [OutputSection], - section_map: &HashMap<(usize, usize), (usize, u64)>, - output_path: &str, -) -> Result<(), String> { - if std::env::var("LINKER_DEBUG").is_ok() { - eprintln!("output sections:"); - for (i, sec) in output_sections.iter().enumerate() { - eprintln!(" [{}]: {} type={} flags=0x{:x} size={} align={}", i, sec.name, sec.sh_type, sec.flags, sec.mem_size, sec.alignment); - } - } - - // Layout: Single RX LOAD segment from file offset 0 (ELF hdr + phdrs + text + rodata), - // followed by a RW LOAD segment for data + bss, plus TLS and GNU_STACK phdrs. - let has_tls = output_sections.iter().any(|s| s.flags & SHF_TLS != 0 && s.mem_size > 0); - let phdr_count: u64 = 2 + if has_tls { 1 } else { 0 } + 1 + 1; // 2 LOAD + optional TLS + GNU_STACK + GNU_EH_FRAME - let phdr_total_size = phdr_count * 56; - let debug_layout = std::env::var("LINKER_DEBUG_LAYOUT").is_ok(); - - // === Layout: RX segment (starts at file offset 0, vaddr BASE_ADDR) === - let mut offset = 64 + phdr_total_size; // After ELF header + phdrs - - // Text sections (executable) - for sec in output_sections.iter_mut() { - if sec.flags & SHF_EXECINSTR != 0 && sec.flags & SHF_ALLOC != 0 { - let a = sec.alignment.max(4); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - offset += sec.mem_size; - } - } - - // Rodata sections (read-only, in same RX segment) - for sec in output_sections.iter_mut() { - if sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_EXECINSTR == 0 && - sec.flags & SHF_WRITE == 0 && sec.sh_type != SHT_NOBITS && - sec.flags & SHF_TLS == 0 { - let a = 
sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - if debug_layout { - eprintln!(" LAYOUT RO: {} addr=0x{:x} foff=0x{:x} sz=0x{:x} flags=0x{:x}", - sec.name, sec.addr, sec.file_offset, sec.mem_size, sec.flags); - } - offset += sec.mem_size; - } - } - - // Build .eh_frame_hdr: find .eh_frame, count FDEs, reserve space - let mut eh_frame_hdr_vaddr = 0u64; - let mut eh_frame_hdr_offset = 0u64; - let mut eh_frame_hdr_size = 0u64; - let mut eh_frame_vaddr = 0u64; - let mut eh_frame_file_offset = 0u64; - let mut eh_frame_size = 0u64; - for sec in output_sections.iter() { - if sec.name == ".eh_frame" && sec.mem_size > 0 { - eh_frame_vaddr = sec.addr; - eh_frame_file_offset = sec.file_offset; - eh_frame_size = sec.mem_size; - break; - } - } - if eh_frame_size > 0 { - // Count FDEs from individual input .eh_frame sections (data not merged yet) - let mut fde_count = 0usize; - if let Some(ef_sec) = output_sections.iter().find(|s| s.name == ".eh_frame" && s.mem_size > 0) { - for input in &ef_sec.inputs { - let sd = &objects[input.object_idx].section_data[input.section_idx]; - fde_count += crate::backend::linker_common::count_eh_frame_fdes(sd); - } - } - eh_frame_hdr_size = (12 + 8 * fde_count) as u64; - // Align to 4 bytes - offset = (offset + 3) & !3; - eh_frame_hdr_offset = offset; - eh_frame_hdr_vaddr = BASE_ADDR + offset; - offset += eh_frame_hdr_size; - if debug_layout { - eprintln!(" LAYOUT EH_FRAME_HDR: addr=0x{:x} foff=0x{:x} sz=0x{:x} fde_count={}", - eh_frame_hdr_vaddr, eh_frame_hdr_offset, eh_frame_hdr_size, fde_count); - } - } - - let rx_filesz = offset; // RX segment: [0, rx_filesz) - let _rx_memsz = rx_filesz; - - // Pre-count IFUNC symbols so we can reserve space for IPLT stubs in the RX gap. - // Each IPLT stub is 16 bytes (ADRP + LDR + BR + NOP), placed in the gap between - // the RX segment end and the page-aligned RW segment start. 
- let pre_iplt_count = globals.iter() - .filter(|(_, gsym)| gsym.info & 0xf == STT_GNU_IFUNC && gsym.defined_in.is_some()) - .count() as u64; - let iplt_stubs_needed = pre_iplt_count * 16; - if iplt_stubs_needed > 0 { - // Ensure the gap after rx_filesz is large enough for IPLT stubs. - // The stubs will be placed at 16-byte aligned offset after rx_filesz. - let stub_start = (offset + 15) & !15; - let stub_end = stub_start + iplt_stubs_needed; - // Make sure offset is at least stub_end so page-alignment leaves enough room - if offset < stub_end { - offset = stub_end; - } - } - - // === Layout: RW segment (page-aligned) === - offset = (offset + PAGE_SIZE - 1) & !(PAGE_SIZE - 1); - let rw_page_offset = offset; - let rw_page_addr = BASE_ADDR + offset; - - // TLS data (.tdata) first in RW - let mut tls_addr = 0u64; - let mut tls_file_offset = 0u64; - let mut tls_file_size = 0u64; - let mut tls_mem_size = 0u64; - let mut tls_align = 1u64; - for sec in output_sections.iter_mut() { - if sec.flags & SHF_TLS != 0 && sec.flags & SHF_ALLOC != 0 && sec.sh_type != SHT_NOBITS { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - if tls_addr == 0 { tls_addr = sec.addr; tls_file_offset = offset; tls_align = a; } - tls_file_size += sec.mem_size; - tls_mem_size += sec.mem_size; - if debug_layout { - eprintln!(" LAYOUT TLS: {} addr=0x{:x} foff=0x{:x} sz=0x{:x} align={} flags=0x{:x}", - sec.name, sec.addr, sec.file_offset, sec.mem_size, a, sec.flags); - } - offset += sec.mem_size; - } - } - // If only .tbss (NOBITS TLS) exists with no .tdata, we still need a TLS segment. - // Set tls_addr/tls_file_offset to the current position so TPOFF calculations work. 
- if tls_addr == 0 && has_tls { - tls_addr = BASE_ADDR + offset; - tls_file_offset = offset; - } - // TLS BSS (.tbss) - doesn't consume file space - for sec in output_sections.iter_mut() { - if sec.flags & SHF_TLS != 0 && sec.sh_type == SHT_NOBITS { - let a = sec.alignment.max(1); - let aligned = (tls_mem_size + a - 1) & !(a - 1); - sec.addr = tls_addr + aligned; - sec.file_offset = offset; - if debug_layout { - eprintln!(" LAYOUT TBSS: {} addr=0x{:x} aligned_off=0x{:x} sz=0x{:x} align={} tls_mem_size=0x{:x}", - sec.name, sec.addr, aligned, sec.mem_size, a, tls_mem_size); - } - tls_mem_size = aligned + sec.mem_size; - if a > tls_align { tls_align = a; } - } - } - if tls_mem_size > 0 { - tls_mem_size = (tls_mem_size + tls_align - 1) & !(tls_align - 1); - } - - // init_array - for sec in output_sections.iter_mut() { - if sec.name == ".init_array" { - let a = sec.alignment.max(8); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - offset += sec.mem_size; - break; - } - } - // fini_array - for sec in output_sections.iter_mut() { - if sec.name == ".fini_array" { - let a = sec.alignment.max(8); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - offset += sec.mem_size; - break; - } - } - - // .data.rel.ro (relocated read-only data) - must come before .data - for sec in output_sections.iter_mut() { - if sec.name == ".data.rel.ro" { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - if debug_layout { - eprintln!(" LAYOUT RW: {} addr=0x{:x} foff=0x{:x} sz=0x{:x} flags=0x{:x}", - sec.name, sec.addr, sec.file_offset, sec.mem_size, sec.flags); - } - offset += sec.mem_size; - } - } - // Remaining data sections (writable, non-BSS, non-TLS) - for sec in output_sections.iter_mut() { - if sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_WRITE != 0 && - sec.sh_type != SHT_NOBITS && sec.flags & SHF_TLS 
== 0 && - sec.name != ".init_array" && sec.name != ".fini_array" && - sec.name != ".data.rel.ro" { - let a = sec.alignment.max(1); - offset = (offset + a - 1) & !(a - 1); - sec.addr = BASE_ADDR + offset; - sec.file_offset = offset; - if debug_layout { - eprintln!(" LAYOUT RW: {} addr=0x{:x} foff=0x{:x} sz=0x{:x} flags=0x{:x}", - sec.name, sec.addr, sec.file_offset, sec.mem_size, sec.flags); - } - offset += sec.mem_size; - } - } - - // GOT (Global Offset Table) - needed for R_AARCH64_ADR_GOT_PAGE / LD64_GOT_LO12_NC - // and TLS IE relocations (which store TP offsets in GOT entries) - let got_syms = reloc::collect_got_symbols(objects); - let got_size = got_syms.len() as u64 * 8; - offset = (offset + 7) & !7; // 8-byte align - let got_offset = offset; - let got_addr = BASE_ADDR + offset; - let mut got_entries = HashMap::new(); - for (idx, (key, _kind)) in got_syms.iter().enumerate() { - got_entries.insert(key.clone(), idx); - } - offset += got_size; - - // Collect IFUNC symbols before address resolution - we need them for layout. - // Identify them by STT_GNU_IFUNC type in the symbol table. 
- let mut ifunc_names: Vec = Vec::new(); - for (name, gsym) in globals.iter() { - if gsym.info & 0xf == STT_GNU_IFUNC && gsym.defined_in.is_some() { - ifunc_names.push(name.clone()); - } - } - ifunc_names.sort(); // deterministic order - - // IPLT GOT slots for IFUNC symbols (one 8-byte slot per IFUNC) - let iplt_got_count = ifunc_names.len(); - let iplt_got_size = iplt_got_count as u64 * 8; - offset = (offset + 7) & !7; - let iplt_got_offset = offset; - let iplt_got_addr = BASE_ADDR + offset; - offset += iplt_got_size; - - // IRELATIVE relocation entries (.rela.iplt) in the RW segment - // Format: Elf64_Rela { r_offset: u64, r_info: u64, r_addend: i64 } = 24 bytes each - let rela_iplt_size = iplt_got_count as u64 * 24; - offset = (offset + 7) & !7; - let rela_iplt_offset = offset; - let rela_iplt_addr = BASE_ADDR + offset; - let rela_iplt_end_addr = rela_iplt_addr + rela_iplt_size; - offset += rela_iplt_size; - - let rw_filesz = offset - rw_page_offset; - - // BSS (nobits, non-TLS) - let bss_addr = BASE_ADDR + offset; - let mut bss_size = 0u64; - for sec in output_sections.iter_mut() { - if sec.sh_type == SHT_NOBITS && sec.flags & SHF_ALLOC != 0 && sec.flags & SHF_TLS == 0 { - let a = sec.alignment.max(1); - let aligned = (bss_addr + bss_size + a - 1) & !(a - 1); - bss_size = aligned - bss_addr + sec.mem_size; - sec.addr = aligned; - sec.file_offset = offset; - } - } - let rw_memsz = if bss_size > 0 { (bss_addr + bss_size) - rw_page_addr } else { rw_filesz }; - - // IPLT stubs go in the RX padding between rx_filesz and rw_page_offset. 
- // Each stub: 16 bytes (ADRP + LDR + BR + NOP) - let iplt_stub_size = iplt_got_count as u64 * 16; - let iplt_stub_file_off = (rx_filesz + 15) & !15; // 16-byte aligned - let iplt_stub_addr = BASE_ADDR + iplt_stub_file_off; - if iplt_stub_size > 0 && iplt_stub_file_off + iplt_stub_size > rw_page_offset { - return Err(format!("IPLT stubs ({} bytes) don't fit in RX padding (gap={})", - iplt_stub_size, rw_page_offset - iplt_stub_file_off)); - } - - if std::env::var("LINKER_DEBUG").is_ok() { - eprintln!("section layout:"); - for sec in output_sections.iter() { - eprintln!(" {} addr=0x{:x} foff=0x{:x} size=0x{:x}", sec.name, sec.addr, sec.file_offset, sec.mem_size); - } - eprintln!(" GOT addr=0x{:x} foff=0x{:x} size=0x{:x} entries={}", got_addr, got_offset, got_size, got_entries.len()); - if iplt_got_count > 0 { - eprintln!(" IPLT GOT addr=0x{:x} entries={}", iplt_got_addr, iplt_got_count); - eprintln!(" RELA.IPLT addr=0x{:x}..0x{:x}", rela_iplt_addr, rela_iplt_end_addr); - eprintln!(" IPLT stubs addr=0x{:x}", iplt_stub_addr); - } - eprintln!(" BSS addr=0x{:x} size=0x{:x}", bss_addr, bss_size); - } - - // Merge section data - for sec in output_sections.iter_mut() { - if sec.sh_type == SHT_NOBITS { continue; } - let mut data = vec![0u8; sec.mem_size as usize]; - for input in &sec.inputs { - let sd = &objects[input.object_idx].section_data[input.section_idx]; - let s = input.output_offset as usize; - let e = s + sd.len(); - if e <= data.len() && !sd.is_empty() { - data[s..e].copy_from_slice(sd); - } - } - sec.data = data; - } - - // Update global symbol addresses - for (name, gsym) in globals.iter_mut() { - if let Some(obj_idx) = gsym.defined_in { - if obj_idx == usize::MAX { continue; } // linker-defined - if gsym.section_idx == SHN_COMMON || gsym.section_idx == 0xffff { - if let Some(bss_sec) = output_sections.iter().find(|s| s.name == ".bss") { - gsym.value += bss_sec.addr; - } - } else if gsym.section_idx != SHN_UNDEF && gsym.section_idx != SHN_ABS { - let si = 
gsym.section_idx as usize; - if let Some(&(oi, so)) = section_map.get(&(obj_idx, si)) { - let old_val = gsym.value; - gsym.value += output_sections[oi].addr + so; - if std::env::var("LINKER_DEBUG").is_ok() && gsym.info & 0xf == STT_TLS { - eprintln!(" TLS sym '{}': old=0x{:x} -> new=0x{:x} (sec={} addr=0x{:x} off=0x{:x})", - name, old_val, gsym.value, output_sections[oi].name, output_sections[oi].addr, so); - } - } else if std::env::var("LINKER_DEBUG").is_ok() && gsym.info & 0xf == STT_TLS { - eprintln!(" TLS sym '{}': NO MAPPING for ({}, {})", name, obj_idx, si); - } - } - } - } - - // Build IFUNC resolver address map (now that addresses are resolved) - let ifunc_syms: Vec<(String, u64)> = ifunc_names.iter() - .map(|name| { - let resolver_addr = globals.get(name).map(|g| g.value).unwrap_or(0); - (name.clone(), resolver_addr) - }) - .collect(); - - // Redirect IFUNC symbols to their PLT stubs - for (i, (name, _resolver_addr)) in ifunc_syms.iter().enumerate() { - let plt_addr = iplt_stub_addr + i as u64 * 16; - if let Some(gsym) = globals.get_mut(name) { - gsym.value = plt_addr; - // Change type from IFUNC to FUNC so relocations treat it normally - gsym.info = (gsym.info & 0xf0) | STT_FUNC; - } - } - - if std::env::var("LINKER_DEBUG").is_ok() && !ifunc_syms.is_empty() { - for (i, (name, resolver)) in ifunc_syms.iter().enumerate() { - eprintln!(" IFUNC[{}]: {} resolver=0x{:x} plt=0x{:x} got=0x{:x}", - i, name, resolver, iplt_stub_addr + i as u64 * 16, - iplt_got_addr + i as u64 * 8); - } - } - - // Compute init/fini array boundaries - let init_array_start = output_sections.iter().find(|s| s.name == ".init_array").map(|s| s.addr).unwrap_or(0); - let init_array_end = output_sections.iter().find(|s| s.name == ".init_array").map(|s| s.addr + s.mem_size).unwrap_or(0); - let fini_array_start = output_sections.iter().find(|s| s.name == ".fini_array").map(|s| s.addr).unwrap_or(0); - let fini_array_end = output_sections.iter().find(|s| s.name == ".fini_array").map(|s| s.addr 
+ s.mem_size).unwrap_or(0); - let init_addr = output_sections.iter().find(|s| s.name == ".init").map(|s| s.addr).unwrap_or(0); - let fini_addr = output_sections.iter().find(|s| s.name == ".fini").map(|s| s.addr).unwrap_or(0); - - // Define linker-provided symbols using shared infrastructure (consistent - // with x86-64/i686/RISC-V via get_standard_linker_symbols) - let text_seg_end = BASE_ADDR + rx_filesz; - let linker_addrs = LinkerSymbolAddresses { - base_addr: BASE_ADDR, - got_addr, - dynamic_addr: 0, // No .dynamic in static mode (dynamic executables use emit_dynamic_executable) - bss_addr, - bss_size, - text_end: text_seg_end, - data_start: rw_page_addr, - init_array_start, - init_array_size: init_array_end - init_array_start, - fini_array_start, - fini_array_size: fini_array_end - fini_array_start, - preinit_array_start: 0, - preinit_array_size: 0, - rela_iplt_start: rela_iplt_addr, - rela_iplt_size, - }; - for sym in &get_standard_linker_symbols(&linker_addrs) { - if globals.get(sym.name).map(|g| g.defined_in.is_none()).unwrap_or(true) { - globals.insert(sym.name.to_string(), GlobalSymbol { - value: sym.value, size: 0, info: (sym.binding << 4) | STT_OBJECT, - defined_in: Some(usize::MAX), section_idx: SHN_ABS, - from_lib: None, plt_idx: None, got_idx: None, - is_dynamic: false, copy_reloc: false, lib_sym_value: 0, - }); - } - } - - // Auto-generate __start_
/ __stop_
symbols (GNU ld feature) - for (name, addr) in linker_common::resolve_start_stop_symbols(output_sections) { - if globals.get(&name).map(|g| g.defined_in.is_none()).unwrap_or(false) { - globals.insert(name, GlobalSymbol { - value: addr, size: 0, info: (STB_GLOBAL << 4), - defined_in: Some(usize::MAX), section_idx: SHN_ABS, - from_lib: None, plt_idx: None, got_idx: None, - is_dynamic: false, copy_reloc: false, lib_sym_value: 0, - }); - } - } - // ARM-specific linker symbols not in the shared list - let arm_extra_syms: [(&str, u64); 3] = [ - ("__GNU_EH_FRAME_HDR", eh_frame_hdr_vaddr), - ("_init", init_addr), - ("_fini", fini_addr), - ]; - for (name, val) in &arm_extra_syms { - if globals.get(*name).map(|g| g.defined_in.is_none()).unwrap_or(true) { - globals.insert(name.to_string(), GlobalSymbol { - value: *val, size: 0, info: (STB_GLOBAL << 4) | STT_OBJECT, - defined_in: Some(usize::MAX), section_idx: SHN_ABS, - from_lib: None, plt_idx: None, got_idx: None, - is_dynamic: false, copy_reloc: false, lib_sym_value: 0, - }); - } - } - - let entry_addr = globals.get("_start").map(|s| s.value).unwrap_or(BASE_ADDR); - - if std::env::var("LINKER_DEBUG").is_ok() { - if let Some(g) = globals.get("main") { eprintln!(" main resolved to 0x{:x}", g.value); } - if let Some(g) = globals.get("_start") { eprintln!(" _start resolved to 0x{:x}", g.value); } - if let Some(g) = globals.get("__libc_start_main") { eprintln!(" __libc_start_main resolved to 0x{:x}", g.value); } - eprintln!(" entry_addr = 0x{:x}", entry_addr); - } - - // Build output buffer - let file_size = offset as usize; - let mut out = vec![0u8; file_size]; - - // Write section data - for sec in output_sections.iter() { - if sec.sh_type == SHT_NOBITS || sec.data.is_empty() { continue; } - write_bytes(&mut out, sec.file_offset as usize, &sec.data); - } - - // Populate GOT entries with resolved symbol addresses or TP offsets. 
- // We re-walk the relocations to resolve each symbol (including locals) - // rather than only looking up global symbol names. - let globals_snap = globals.clone(); - let got_info = reloc::GotInfo { got_addr, entries: got_entries }; - let got_kind_map: HashMap = got_syms.iter() - .map(|(k, kind)| (k.clone(), *kind)) - .collect(); - // Build a resolved address map for GOT entries by walking relocations - let mut got_resolved: HashMap = HashMap::new(); - for obj_idx in 0..objects.len() { - for sec_idx in 0..objects[obj_idx].sections.len() { - for rela in &objects[obj_idx].relocations[sec_idx] { - match rela.rela_type { - R_AARCH64_ADR_GOT_PAGE | R_AARCH64_LD64_GOT_LO12_NC | - reloc::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 | reloc::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC => { - let si = rela.sym_idx as usize; - if si < objects[obj_idx].symbols.len() { - let sym = &objects[obj_idx].symbols[si]; - let key = reloc::got_key(obj_idx, sym); - got_resolved.entry(key).or_insert_with(|| { - - reloc::resolve_sym(obj_idx, sym, &globals_snap, - section_map, output_sections) - }); - } - } - _ => {} - } - } - } - } - for (key, &idx) in &got_info.entries { - let sym_addr = got_resolved.get(key).copied().unwrap_or(0); - let val = match got_kind_map.get(key) { - Some(&reloc::GotEntryKind::TlsIE) => { - // GOT entry holds the TP-relative offset for this TLS variable - // AArch64 variant 1: tp_offset = (sym_addr - tls_base) + 16 - if tls_addr != 0 { - let offset = (sym_addr as i64) - (tls_addr as i64) + 16; - if std::env::var("LINKER_DEBUG_TLS").is_ok() { - eprintln!(" GOT TLS IE: key='{}' sym_addr=0x{:x} tls_addr=0x{:x} -> got_val=0x{:x}", - key, sym_addr, tls_addr, offset as u64); - } - offset as u64 - } else { - sym_addr - } - } - _ => sym_addr, - }; - let entry_off = got_offset as usize + idx * 8; - w64(&mut out, entry_off, val); - if std::env::var("LINKER_DEBUG").is_ok() && val == 0 { - eprintln!(" GOT[{}] = 0 for symbol '{}'", idx, key); - } - } - - // Populate IPLT GOT slots 
(initially with resolver addresses), RELA entries, and PLT stubs - for (i, (_name, resolver_addr)) in ifunc_syms.iter().enumerate() { - // IPLT GOT slot: initially contains resolver address (will be overwritten at startup) - let got_slot_off = iplt_got_offset as usize + i * 8; - if got_slot_off + 8 <= out.len() { - w64(&mut out, got_slot_off, *resolver_addr); - } - - // RELA.IPLT entry: { r_offset, r_info, r_addend } - let rela_off = rela_iplt_offset as usize + i * 24; - let got_slot_addr = iplt_got_addr + i as u64 * 8; - if rela_off + 24 <= out.len() { - w64(&mut out, rela_off, got_slot_addr); // r_offset: GOT slot VA - w64(&mut out, rela_off + 8, 0x408); // r_info: R_AARCH64_IRELATIVE - w64(&mut out, rela_off + 16, *resolver_addr); // r_addend: resolver VA - } - - // PLT stub: ADRP x16, got_page; LDR x17, [x16, #got_lo]; BR x17; NOP - let plt_off = iplt_stub_file_off as usize + i * 16; - let plt_addr = iplt_stub_addr + i as u64 * 16; - if plt_off + 16 <= out.len() { - // ADRP x16, page_of(got_slot) - let page_g = got_slot_addr & !0xFFF; - let page_p = plt_addr & !0xFFF; - let page_diff = ((page_g as i64 - page_p as i64) >> 12) as i32; - let immlo = (page_diff & 3) as u32; - let immhi = ((page_diff >> 2) & 0x7ffff) as u32; - let adrp = 0x90000010u32 | (immlo << 29) | (immhi << 5); // ADRP x16 - w32(&mut out, plt_off, adrp); - - // LDR x17, [x16, #lo12(got_slot)] - let lo12 = (got_slot_addr & 0xFFF) as u32; - let ldr = 0xf9400211u32 | ((lo12 / 8) << 10); // LDR x17, [x16, #imm] - w32(&mut out, plt_off + 4, ldr); - - // BR x17 - w32(&mut out, plt_off + 8, 0xd61f0220u32); - - // NOP - w32(&mut out, plt_off + 12, 0xd503201fu32); - } - } - - // Apply relocations - let tls_info = reloc::TlsInfo { tls_addr, tls_size: tls_mem_size }; - reloc::apply_relocations(objects, &globals_snap, output_sections, section_map, - &mut out, &tls_info, &got_info)?; - - // Build .eh_frame_hdr from relocated .eh_frame data and write it - if eh_frame_hdr_size > 0 && eh_frame_size > 0 { - let 
ef_start = eh_frame_file_offset as usize; - let ef_end = ef_start + eh_frame_size as usize; - if ef_end <= out.len() { - let eh_frame_relocated = out[ef_start..ef_end].to_vec(); - let hdr_data = crate::backend::linker_common::build_eh_frame_hdr( - &eh_frame_relocated, - eh_frame_vaddr, - eh_frame_hdr_vaddr, - true, // 64-bit - ); - let hdr_off = eh_frame_hdr_offset as usize; - if !hdr_data.is_empty() && hdr_off + hdr_data.len() <= out.len() { - write_bytes(&mut out, hdr_off, &hdr_data); - } - } - } - - // === ELF header === - out[0..4].copy_from_slice(&ELF_MAGIC); - out[4] = ELFCLASS64; out[5] = ELFDATA2LSB; out[6] = 1; - out[7] = 3; // ELFOSABI_GNU (matches ld output for static exes) - w16(&mut out, 16, ET_EXEC); - w16(&mut out, 18, EM_AARCH64); - w32(&mut out, 20, 1); - w64(&mut out, 24, entry_addr); - w64(&mut out, 32, 64); // e_phoff - w64(&mut out, 40, 0); // e_shoff - w32(&mut out, 48, 0); // e_flags - w16(&mut out, 52, 64); // e_ehsize - w16(&mut out, 54, 56); // e_phentsize - w16(&mut out, 56, phdr_count as u16); - w16(&mut out, 58, 64); // e_shentsize - w16(&mut out, 60, 0); // e_shnum - w16(&mut out, 62, 0); // e_shstrndx - - // === Program headers === - let mut ph = 64usize; - - // LOAD: RX segment starting from file offset 0 (includes ELF header + PLT stubs) - let rx_actual_filesz = if iplt_stub_size > 0 { iplt_stub_file_off + iplt_stub_size } else { rx_filesz }; - let rx_actual_memsz = rx_actual_filesz; - wphdr(&mut out, ph, PT_LOAD, PF_R | PF_X, - 0, BASE_ADDR, rx_actual_filesz, rx_actual_memsz, PAGE_SIZE); - ph += 56; - - // LOAD: RW segment - wphdr(&mut out, ph, PT_LOAD, PF_R | PF_W, - rw_page_offset, rw_page_addr, rw_filesz, rw_memsz, PAGE_SIZE); - ph += 56; - - // TLS segment - if has_tls && tls_addr != 0 { - wphdr(&mut out, ph, PT_TLS, PF_R, - tls_file_offset, tls_addr, tls_file_size, tls_mem_size, tls_align); - ph += 56; - } - - // GNU_STACK - wphdr(&mut out, ph, PT_GNU_STACK, PF_R | PF_W, 0, 0, 0, 0, 0x10); - ph += 56; - - // PT_GNU_EH_FRAME: 
points to .eh_frame_hdr for stack unwinding - wphdr(&mut out, ph, PT_GNU_EH_FRAME, PF_R, - eh_frame_hdr_offset, eh_frame_hdr_vaddr, eh_frame_hdr_size, eh_frame_hdr_size, 4); - - // Write output - std::fs::write(output_path, &out).map_err(|e| format!("failed to write '{}': {}", output_path, e))?; - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let _ = std::fs::set_permissions(output_path, std::fs::Permissions::from_mode(0o755)); - } - Ok(()) -} - diff --git a/src/backend/arm/linker/input.rs b/src/backend/arm/linker/input.rs deleted file mode 100644 index 33fbd39678..0000000000 --- a/src/backend/arm/linker/input.rs +++ /dev/null @@ -1,91 +0,0 @@ -//! Input file loading for the AArch64 linker. -//! -//! Handles loading of object files (.o), archives (.a), shared libraries (.so), -//! and linker scripts. Delegates to `linker_common` for ELF parsing. - -use std::collections::HashMap; -use std::path::Path; - -use super::elf::*; -use crate::backend::linker_common; -use super::types::{GlobalSymbol, arm_should_replace_extra}; - -pub fn load_file( - path: &str, - objects: &mut Vec, - globals: &mut HashMap, - needed_sonames: &mut Vec, - lib_paths: &[String], - is_static: bool, -) -> Result<(), String> { - if std::env::var("LINKER_DEBUG").is_ok() { - eprintln!("load_file: {}", path); - } - let data = std::fs::read(path).map_err(|e| format!("failed to read '{}': {}", path, e))?; - - // Regular archive - if data.len() >= 8 && &data[0..8] == b"!\n" { - return linker_common::load_archive_elf64(&data, path, objects, globals, EM_AARCH64, arm_should_replace_extra, false); - } - - // Thin archive - if is_thin_archive(&data) { - return linker_common::load_thin_archive_elf64(&data, path, objects, globals, EM_AARCH64, arm_should_replace_extra, false); - } - - // Not ELF? 
Try linker script (handles both GROUP and INPUT directives) - if data.len() >= 4 && data[0..4] != ELF_MAGIC { - if let Ok(text) = std::str::from_utf8(&data) { - if let Some(entries) = parse_linker_script_entries(text) { - let script_dir = Path::new(path).parent().map(|p| p.to_string_lossy().to_string()); - for entry in &entries { - match entry { - LinkerScriptEntry::Path(lib_path) => { - if Path::new(lib_path).exists() { - load_file(lib_path, objects, globals, needed_sonames, lib_paths, is_static)?; - } else if let Some(ref dir) = script_dir { - let resolved = format!("{}/{}", dir, lib_path); - if Path::new(&resolved).exists() { - load_file(&resolved, objects, globals, needed_sonames, lib_paths, is_static)?; - } - } - } - LinkerScriptEntry::Lib(lib_name) => { - if let Some(resolved) = resolve_lib(lib_name, lib_paths) { - load_file(&resolved, objects, globals, needed_sonames, lib_paths, is_static)?; - } - } - } - } - return Ok(()); - } - } - return Err(format!("{}: not a valid ELF object or archive", path)); - } - - // Shared library? 
- if data.len() >= 18 { - let e_type = read_u16(&data, 16); - if e_type == ET_DYN { - if is_static { - return Ok(()); // Skip .so in static linking - } - return linker_common::load_shared_library_elf64(path, globals, needed_sonames, lib_paths); - } - } - - let obj = parse_object(&data, path)?; - let obj_idx = objects.len(); - linker_common::register_symbols_elf64(obj_idx, &obj, globals, arm_should_replace_extra); - objects.push(obj); - Ok(()) -} - -pub fn resolve_lib(name: &str, paths: &[String]) -> Option { - crate::backend::linker_common::resolve_lib(name, paths, true) -} - -pub fn resolve_lib_prefer_shared(name: &str, paths: &[String]) -> Option { - // For dynamic linking, prefer .so over .a - crate::backend::linker_common::resolve_lib(name, paths, false) -} diff --git a/src/backend/arm/linker/link.rs b/src/backend/arm/linker/link.rs deleted file mode 100644 index 9c8c81d97e..0000000000 --- a/src/backend/arm/linker/link.rs +++ /dev/null @@ -1,410 +0,0 @@ -//! AArch64 linker orchestration. -//! -//! Contains the two public entry points (`link_builtin` and `link_shared`) that -//! orchestrate the linking pipeline: load inputs, resolve symbols, merge sections, -//! build PLT/GOT, and dispatch to the appropriate ELF emission path. - -use std::collections::{HashMap, HashSet}; -use std::path::Path; - -use super::elf::*; -use super::types::GlobalSymbol; -use super::input::{load_file, resolve_lib, resolve_lib_prefer_shared}; -use super::plt_got::create_plt_got; -use super::emit_dynamic::emit_dynamic_executable; -use super::emit_shared::emit_shared_library; -use super::emit_static::emit_executable; -use crate::backend::linker_common; -use linker_common::OutputSection; - -// ── Public entry point ───────────────────────────────────────────────── - -/// Link AArch64 object files into an ELF executable (pre-resolved CRT/library variant). -/// -/// Supports both static and dynamic linking. 
When `is_static` is false, shared -/// libraries are loaded and PLT/GOT/.dynamic sections are generated for dynamic -/// symbol references. -pub fn link_builtin( - object_files: &[&str], - output_path: &str, - user_args: &[String], - lib_paths: &[&str], - needed_libs: &[&str], - crt_objects_before: &[&str], - crt_objects_after: &[&str], - is_static: bool, -) -> Result<(), String> { - if std::env::var("LINKER_DEBUG").is_ok() { - eprintln!("arm linker: object_files={:?} output={} user_args={:?} static={}", object_files, output_path, user_args, is_static); - } - let mut objects: Vec = Vec::new(); - let mut globals: HashMap = HashMap::new(); - let mut needed_sonames: Vec = Vec::new(); - - let all_lib_paths: Vec = lib_paths.iter().map(|s| s.to_string()).collect(); - - // Parse user args for export-dynamic flag - let mut export_dynamic = false; - for arg in user_args { - if arg == "-rdynamic" { export_dynamic = true; } - if let Some(wl_arg) = arg.strip_prefix("-Wl,") { - for part in wl_arg.split(',') { - if part == "--export-dynamic" || part == "-export-dynamic" || part == "-E" { export_dynamic = true; } - } - } - } - - // Load CRT objects before user objects - for path in crt_objects_before { - if Path::new(path).exists() { - load_file(path, &mut objects, &mut globals, &mut needed_sonames, &all_lib_paths, is_static)?; - } - } - - // Load user object files - for path in object_files { - load_file(path, &mut objects, &mut globals, &mut needed_sonames, &all_lib_paths, is_static)?; - } - - // Parse user_args for -l, -L, bare files, etc. 
- let args: Vec<&str> = user_args.iter().map(|s| s.as_str()).collect(); - let mut defsym_defs: Vec<(String, String)> = Vec::new(); - let mut extra_lib_paths: Vec = Vec::new(); - let mut gc_sections = false; - let mut arg_i = 0; - while arg_i < args.len() { - let arg = args[arg_i]; - if let Some(path) = arg.strip_prefix("-L") { - let p = if path.is_empty() && arg_i + 1 < args.len() { arg_i += 1; args[arg_i] } else { path }; - extra_lib_paths.push(p.to_string()); - } else if let Some(lib) = arg.strip_prefix("-l") { - let l = if lib.is_empty() && arg_i + 1 < args.len() { arg_i += 1; args[arg_i] } else { lib }; - let resolver = if is_static { resolve_lib } else { resolve_lib_prefer_shared }; - let mut combined = extra_lib_paths.clone(); - combined.extend(all_lib_paths.iter().cloned()); - if let Some(lib_path) = resolver(l, &combined) { - load_file(&lib_path, &mut objects, &mut globals, &mut needed_sonames, &combined, is_static)?; - } - } else if let Some(wl_arg) = arg.strip_prefix("-Wl,") { - let parts: Vec<&str> = wl_arg.split(',').collect(); - let mut j = 0; - while j < parts.len() { - let part = parts[j]; - if let Some(lpath) = part.strip_prefix("-L") { - extra_lib_paths.push(lpath.to_string()); - } else if let Some(lib) = part.strip_prefix("-l") { - let resolver = if is_static { resolve_lib } else { resolve_lib_prefer_shared }; - let mut combined = extra_lib_paths.clone(); - combined.extend(all_lib_paths.iter().cloned()); - if let Some(lib_path) = resolver(lib, &combined) { - load_file(&lib_path, &mut objects, &mut globals, &mut needed_sonames, &combined, is_static)?; - } - } else if let Some(defsym_arg) = part.strip_prefix("--defsym=") { - // --defsym=SYMBOL=EXPR: define a symbol alias - // TODO: only supports symbol-to-symbol aliasing, not arbitrary expressions - if let Some(eq_pos) = defsym_arg.find('=') { - defsym_defs.push((defsym_arg[..eq_pos].to_string(), defsym_arg[eq_pos + 1..].to_string())); - } - } else if part == "--defsym" && j + 1 < parts.len() { - // 
Two-argument form: --defsym SYM=VAL - j += 1; - let defsym_arg = parts[j]; - if let Some(eq_pos) = defsym_arg.find('=') { - defsym_defs.push((defsym_arg[..eq_pos].to_string(), defsym_arg[eq_pos + 1..].to_string())); - } - } else if part == "--gc-sections" { - gc_sections = true; - } else if part == "--no-gc-sections" { - gc_sections = false; - } - j += 1; - } - } else if !arg.starts_with('-') && Path::new(arg).exists() { - load_file(arg, &mut objects, &mut globals, &mut needed_sonames, &all_lib_paths, is_static)?; - } - arg_i += 1; - } - - // Load CRT objects after user objects - for path in crt_objects_after { - if Path::new(path).exists() { - load_file(path, &mut objects, &mut globals, &mut needed_sonames, &all_lib_paths, is_static)?; - } - } - - // Build combined library search paths: user -L first, then system paths - let mut combined_lib_paths: Vec = extra_lib_paths; - combined_lib_paths.extend(all_lib_paths.iter().cloned()); - - // Load default libraries in a group (like ld's --start-group) - if !needed_libs.is_empty() { - let resolver = if is_static { resolve_lib } else { resolve_lib_prefer_shared }; - let mut lib_paths_resolved: Vec = Vec::new(); - for lib_name in needed_libs { - if let Some(lib_path) = resolver(lib_name, &combined_lib_paths) { - if !lib_paths_resolved.contains(&lib_path) { - lib_paths_resolved.push(lib_path); - } - } - } - let mut changed = true; - while changed { - changed = false; - let prev_count = objects.len(); - for lib_path in &lib_paths_resolved { - load_file(lib_path, &mut objects, &mut globals, &mut needed_sonames, &combined_lib_paths, is_static)?; - } - if objects.len() != prev_count { - changed = true; - } - } - } - - // For dynamic linking, resolve remaining undefined symbols against system libs - if !is_static { - let default_libs = ["libc.so.6", "libm.so.6", "libgcc_s.so.1", "ld-linux-aarch64.so.1"]; - linker_common::resolve_dynamic_symbols_elf64( - &mut globals, &mut needed_sonames, &combined_lib_paths, &default_libs, - )?; 
- } - - // Apply --defsym definitions: alias one symbol to another - for (alias, target) in &defsym_defs { - if let Some(target_sym) = globals.get(target).cloned() { - globals.insert(alias.clone(), target_sym); - } - } - - // Garbage-collect unreferenced sections when --gc-sections is active. - // This removes sections not reachable from entry points, which may also - // eliminate undefined symbol references from dead code. - let dead_sections: HashSet<(usize, usize)> = if gc_sections { - linker_common::gc_collect_sections_elf64(&objects) - } else { - HashSet::new() - }; - - // When gc-sections is active, remove globals that only exist in dead sections - if gc_sections { - let mut referenced_from_live: HashSet = HashSet::new(); - for (obj_idx, obj) in objects.iter().enumerate() { - for (sec_idx, relas) in obj.relocations.iter().enumerate() { - if dead_sections.contains(&(obj_idx, sec_idx)) { continue; } - for rela in relas { - if (rela.sym_idx as usize) < obj.symbols.len() { - let sym = &obj.symbols[rela.sym_idx as usize]; - if !sym.name.is_empty() { - referenced_from_live.insert(sym.name.clone()); - } - } - } - } - } - globals.retain(|name, sym| { - sym.defined_in.is_some() || sym.is_dynamic - || (sym.info >> 4) == STB_WEAK - || referenced_from_live.contains(name) - }); - } - - // Reject truly undefined symbols (weak undefined are allowed) - let mut unresolved = Vec::new(); - for (name, sym) in &globals { - if sym.defined_in.is_none() && !sym.is_dynamic && sym.section_idx == SHN_UNDEF { - let binding = sym.info >> 4; - if binding != STB_WEAK && !linker_common::is_linker_defined_symbol(name) { - unresolved.push(name.clone()); - } - } - } - if !unresolved.is_empty() { - unresolved.sort(); - unresolved.truncate(20); - return Err(format!("undefined symbols: {}", - unresolved.iter().map(|s| s.as_str()).collect::>().join(", "))); - } - - // Merge sections (skip dead sections when gc-sections is active) - let mut output_sections: Vec = Vec::new(); - let mut section_map: 
HashMap<(usize, usize), (usize, u64)> = HashMap::new(); - linker_common::merge_sections_elf64_gc(&objects, &mut output_sections, &mut section_map, &dead_sections); - - // Allocate COMMON symbols (using shared implementation) - linker_common::allocate_common_symbols_elf64(&mut globals, &mut output_sections); - - // Check if we have any dynamic symbols - let has_dynamic_syms = globals.values().any(|g| g.is_dynamic); - - if has_dynamic_syms && !is_static { - // Create PLT/GOT for dynamic symbols - let (plt_names, got_entries) = create_plt_got(&objects, &mut globals); - - // Emit dynamically-linked executable - emit_dynamic_executable( - &objects, &mut globals, &mut output_sections, §ion_map, - &plt_names, &got_entries, &needed_sonames, output_path, - export_dynamic, - ) - } else { - // Fall back to static emit - emit_executable(&objects, &mut globals, &mut output_sections, §ion_map, output_path) - } -} - -// ── Shared library output ──────────────────────────────────────────── - -/// Create a shared library (.so) from object files. 
-pub fn link_shared( - object_files: &[&str], - output_path: &str, - user_args: &[String], - lib_paths: &[&str], -) -> Result<(), String> { - let mut objects: Vec = Vec::new(); - let mut globals: HashMap = HashMap::new(); - let mut needed_sonames: Vec = Vec::new(); - let lib_path_strings: Vec = lib_paths.iter().map(|s| s.to_string()).collect(); - - // Parse user args - let mut extra_lib_paths: Vec = Vec::new(); - let mut libs_to_load: Vec = Vec::new(); - let mut extra_object_files: Vec = Vec::new(); - let mut soname: Option = None; - let mut i = 0; - let args: Vec<&str> = user_args.iter().map(|s| s.as_str()).collect(); - while i < args.len() { - let arg = args[i]; - if let Some(path) = arg.strip_prefix("-L") { - let p = if path.is_empty() && i + 1 < args.len() { i += 1; args[i] } else { path }; - extra_lib_paths.push(p.to_string()); - } else if let Some(lib) = arg.strip_prefix("-l") { - let l = if lib.is_empty() && i + 1 < args.len() { i += 1; args[i] } else { lib }; - libs_to_load.push(l.to_string()); - } else if let Some(wl_arg) = arg.strip_prefix("-Wl,") { - let parts: Vec<&str> = wl_arg.split(',').collect(); - for j in 0..parts.len() { - let part = parts[j]; - if let Some(sn) = part.strip_prefix("-soname=") { - soname = Some(sn.to_string()); - } else if part == "-soname" && j + 1 < parts.len() { - soname = Some(parts[j + 1].to_string()); - } else if let Some(lpath) = part.strip_prefix("-L") { - extra_lib_paths.push(lpath.to_string()); - } else if let Some(lib) = part.strip_prefix("-l") { - libs_to_load.push(lib.to_string()); - } - } - } else if arg == "-shared" || arg == "-nostdlib" || arg == "-o" { - if arg == "-o" { i += 1; } - } else if !arg.starts_with('-') && Path::new(arg).exists() { - extra_object_files.push(arg.to_string()); - } - i += 1; - } - - for path in object_files { - load_file(path, &mut objects, &mut globals, &mut needed_sonames, &lib_path_strings, false)?; - } - for path in &extra_object_files { - load_file(path, &mut objects, &mut globals, 
&mut needed_sonames, &lib_path_strings, false)?; - } - - let mut all_lib_paths: Vec = extra_lib_paths; - all_lib_paths.extend(lib_path_strings.iter().cloned()); - - if !libs_to_load.is_empty() { - let mut lib_paths_resolved: Vec = Vec::new(); - for lib_name in &libs_to_load { - if let Some(lib_path) = resolve_lib_prefer_shared(lib_name, &all_lib_paths) { - if !lib_paths_resolved.contains(&lib_path) { - lib_paths_resolved.push(lib_path); - } - } - } - let mut changed = true; - while changed { - changed = false; - let prev_count = objects.len(); - for lib_path in &lib_paths_resolved { - load_file(lib_path, &mut objects, &mut globals, &mut needed_sonames, &all_lib_paths, false)?; - } - if objects.len() != prev_count { changed = true; } - } - } - - // Merge sections (no gc-sections for shared libraries) - let mut output_sections: Vec = Vec::new(); - let mut section_map: HashMap<(usize, usize), (usize, u64)> = HashMap::new(); - linker_common::merge_sections_elf64(&objects, &mut output_sections, &mut section_map); - linker_common::allocate_common_symbols_elf64(&mut globals, &mut output_sections); - - // Resolve undefined symbols against system shared libraries to discover - // NEEDED dependencies. Without this, the shared library would be missing - // DT_NEEDED entries for libc.so.6 etc., causing the dynamic linker to - // fail to resolve PLT symbols at runtime. - resolve_dynamic_symbols_for_shared(&objects, &globals, &mut needed_sonames, &all_lib_paths); - - // Emit shared library - emit_shared_library( - &objects, &mut globals, &mut output_sections, §ion_map, - &needed_sonames, output_path, soname, - ) -} - -/// Discover NEEDED shared library dependencies for a shared library build. -/// Scans object file relocations for CALL26/JUMP26 references to undefined symbols -/// and searches system libraries to find which .so files provide them. 
-fn resolve_dynamic_symbols_for_shared( - objects: &[ElfObject], - globals: &HashMap, - needed_sonames: &mut Vec, - lib_paths: &[String], -) { - // Collect undefined symbol names referenced by function calls - let mut undefined: Vec = Vec::new(); - for obj in objects.iter() { - for sec_relas in &obj.relocations { - for rela in sec_relas { - let si = rela.sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.name.is_empty() || sym.is_local() { continue; } - let is_undef = if let Some(g) = globals.get(&sym.name) { - g.is_dynamic || (g.defined_in.is_none() && g.section_idx == SHN_UNDEF) - } else { - sym.is_undefined() - }; - if is_undef && !undefined.contains(&sym.name) { - undefined.push(sym.name.clone()); - } - } - } - } - if undefined.is_empty() { return; } - - let lib_names = ["libc.so.6", "libm.so.6", "libpthread.so.0", "libdl.so.2", "librt.so.1", "ld-linux-aarch64.so.1"]; - let mut libs: Vec = Vec::new(); - for lib_name in &lib_names { - for dir in lib_paths { - let candidate = format!("{}/{}", dir, lib_name); - if Path::new(&candidate).exists() { - libs.push(candidate); - break; - } - } - } - for lib_path in &libs { - let data = match std::fs::read(lib_path) { Ok(d) => d, Err(_) => continue }; - let soname = linker_common::parse_soname(&data).unwrap_or_else(|| { - Path::new(lib_path).file_name().map(|n| n.to_string_lossy().to_string()).unwrap_or_default() - }); - if needed_sonames.contains(&soname) { continue; } - let dyn_syms = match linker_common::parse_shared_library_symbols(&data, lib_path) { - Ok(s) => s, Err(_) => continue, - }; - let provides_any = undefined.iter().any(|name| dyn_syms.iter().any(|ds| ds.name == *name)); - if provides_any { - needed_sonames.push(soname); - } - } -} diff --git a/src/backend/arm/linker/mod.rs b/src/backend/arm/linker/mod.rs deleted file mode 100644 index cc415c91e3..0000000000 --- a/src/backend/arm/linker/mod.rs +++ /dev/null @@ -1,42 +0,0 @@ -//! Native AArch64 ELF64 linker. 
-//! -//! Links ELF relocatable object files (.o) and static archives (.a) into -//! a dynamically-linked ELF64 executable for AArch64 (ARM 64-bit). Also supports -//! producing shared libraries (ET_DYN) via `link_shared()`. -//! -//! Shared linker infrastructure (ELF parsing, section merging, symbol registration, -//! common symbol allocation, archive loading) is provided by `linker_common`. -//! This module provides AArch64-specific logic: PLT/GOT construction, relocation -//! application, address layout, and ELF emission. -//! -//! This is the default linker (used when the `gcc_linker` feature is disabled). -//! CRT object discovery and library path resolution are handled by -//! common.rs's `resolve_builtin_link_setup`. -//! -//! ## Module structure -//! -//! - `elf`: ELF64 constants, type aliases, parsing (delegates to shared linker_common) -//! - `reloc`: AArch64-specific relocation application and encoding helpers -//! - `types`: `GlobalSymbol` struct, `GlobalSymbolOps` impl, arch constants -//! - `input`: Input file loading (objects, archives, shared libs, linker scripts) -//! - `plt_got`: PLT/GOT entry list construction from relocation scanning -//! - `link`: Orchestration - `link_builtin` and `link_shared` entry points -//! - `emit_dynamic`: Dynamic executable emission (PLT/GOT/.dynamic) -//! - `emit_shared`: Shared library (.so) emission -//! 
//! - `emit_static`: Static executable emission

#[allow(dead_code)] // Re-exports ELF constants/types; not all constants used by every linker path
pub mod elf;
pub mod reloc;
pub mod types;
mod input;
mod plt_got;
mod link;
mod emit_dynamic;
mod emit_shared;
mod emit_static;

#[cfg(not(feature = "gcc_linker"))]
pub use link::link_builtin;
#[cfg(not(feature = "gcc_linker"))]
pub use link::link_shared;

// ── plt_got.rs ────────────────────────────────────────────────────────────
// PLT/GOT construction for the AArch64 linker.
//
// Scans object file relocations to determine which symbols need PLT stubs
// (for function calls via BL/B) or GOT entries (for data references via ADRP),
// and builds the PLT/GOT entry lists.

use std::collections::HashMap;

use super::elf::*;
use super::types::GlobalSymbol;


/// Classify every relocation target and build the PLT/GOT layout.
///
/// Returns `(plt_names, got_entries)` where `got_entries` is a list of
/// `(symbol_name, is_plt_slot)` pairs; the first three entries are the
/// reserved GOT[0..2] slots. Side effect: sets `plt_idx`/`got_idx` on the
/// matching `GlobalSymbol`s and marks `copy_reloc` on dynamic data symbols
/// (and their same-address aliases) that must be copied into the executable.
pub(super) fn create_plt_got(
    objects: &[ElfObject], globals: &mut HashMap<String, GlobalSymbol>,
) -> (Vec<String>, Vec<(String, bool)>) {
    let mut plt_names: Vec<String> = Vec::new();
    let mut got_only_names: Vec<String> = Vec::new();
    let mut copy_reloc_names: Vec<String> = Vec::new();

    // Pass 1: scan every relocation and bucket dynamic symbols into
    // PLT (functions), copy-reloc (data objects), or GOT-only sets.
    for obj in objects {
        for sec_idx in 0..obj.sections.len() {
            for rela in &obj.relocations[sec_idx] {
                let si = rela.sym_idx as usize;
                if si >= obj.symbols.len() { continue; }
                let sym = &obj.symbols[si];
                if sym.name.is_empty() { continue; }
                // (is_dynamic, STT type nibble) for the global, if registered.
                let gsym_info = globals.get(&sym.name).map(|g| (g.is_dynamic, g.info & 0xf));

                match rela.rela_type {
                    // Direct call to a dynamic symbol: PLT stub (or copy reloc
                    // if it is unexpectedly a data object).
                    R_AARCH64_CALL26 | R_AARCH64_JUMP26 if gsym_info.map(|g| g.0).unwrap_or(false) => {
                        let sym_type = gsym_info.map(|g| g.1).unwrap_or(0);
                        if sym_type == STT_OBJECT {
                            if !copy_reloc_names.contains(&sym.name) {
                                copy_reloc_names.push(sym.name.clone());
                            }
                        } else if !plt_names.contains(&sym.name) { plt_names.push(sym.name.clone()); }
                    }
                    // Address-taking / load-store references to a dynamic symbol.
                    R_AARCH64_ADR_PREL_PG_HI21 | R_AARCH64_ADD_ABS_LO12_NC
                    | R_AARCH64_LDST64_ABS_LO12_NC | R_AARCH64_LDST32_ABS_LO12_NC
                    | R_AARCH64_LDST8_ABS_LO12_NC | R_AARCH64_LDST16_ABS_LO12_NC
                    | R_AARCH64_LDST128_ABS_LO12_NC
                        if gsym_info.map(|g| g.0).unwrap_or(false) => {
                        let sym_type = gsym_info.map(|g| g.1).unwrap_or(0);
                        if sym_type == STT_OBJECT {
                            if !copy_reloc_names.contains(&sym.name) {
                                copy_reloc_names.push(sym.name.clone());
                            }
                        } else {
                            // Function referenced via ADRP+ADD (e.g., taking address)
                            if !plt_names.contains(&sym.name) { plt_names.push(sym.name.clone()); }
                        }
                    }
                    // Absolute 64-bit reference to a dynamic symbol.
                    R_AARCH64_ABS64 if gsym_info.map(|g| g.0).unwrap_or(false) => {
                        let sym_type = gsym_info.map(|g| g.1).unwrap_or(0);
                        if sym_type != STT_OBJECT {
                            if !plt_names.contains(&sym.name) { plt_names.push(sym.name.clone()); }
                        } else if !copy_reloc_names.contains(&sym.name) {
                            copy_reloc_names.push(sym.name.clone());
                        }
                    }
                    // Explicit GOT-indirect references always get a GOT slot
                    // (unless a PLT entry, which carries its own slot, exists).
                    R_AARCH64_ADR_GOT_PAGE | R_AARCH64_LD64_GOT_LO12_NC => {
                        if !got_only_names.contains(&sym.name) && !plt_names.contains(&sym.name) {
                            got_only_names.push(sym.name.clone());
                        }
                    }
                    _ => {}
                }
            }
        }
    }

    // Mark copy relocation symbols, remembering each (library, address) pair
    // so that aliases sharing the same storage can be detected below.
    let mut copy_reloc_lib_addrs: Vec<(String, u64)> = Vec::new();
    for name in &copy_reloc_names {
        if let Some(gsym) = globals.get_mut(name) {
            gsym.copy_reloc = true;
            if let Some(ref lib) = gsym.from_lib {
                if (gsym.info & 0xf) == STT_OBJECT && gsym.lib_sym_value != 0 {
                    let key = (lib.clone(), gsym.lib_sym_value);
                    if !copy_reloc_lib_addrs.contains(&key) {
                        copy_reloc_lib_addrs.push(key);
                    }
                }
            }
        }
    }
    // Mark aliases: other dynamic data symbols from the same library at the
    // same address must also resolve to the copied storage.
    if !copy_reloc_lib_addrs.is_empty() {
        let alias_names: Vec<String> = globals.iter()
            .filter(|(name, g)| {
                g.is_dynamic && !g.copy_reloc && (g.info & 0xf) == STT_OBJECT
                    && !copy_reloc_names.contains(name)
                    && g.from_lib.is_some() && g.lib_sym_value != 0
                    && copy_reloc_lib_addrs.contains(
                        &(g.from_lib.as_ref().unwrap().clone(), g.lib_sym_value))
            })
            .map(|(n, _)| n.clone())
            .collect();
        for name in alias_names {
            if let Some(gsym) = globals.get_mut(&name) {
                gsym.copy_reloc = true;
            }
        }
    }

    // Build GOT entries: [0]=.dynamic, [1]=reserved, [2]=reserved, then PLT entries, then GOT-only
    let mut got_entries: Vec<(String, bool)> = Vec::new();
    got_entries.push((String::new(), false)); // GOT[0]
    got_entries.push((String::new(), false)); // GOT[1]
    got_entries.push((String::new(), false)); // GOT[2]

    for (plt_idx, name) in plt_names.iter().enumerate() {
        let got_idx = got_entries.len();
        got_entries.push((name.clone(), true));
        if let Some(gsym) = globals.get_mut(name) {
            gsym.plt_idx = Some(plt_idx);
            gsym.got_idx = Some(got_idx);
        }
    }

    for name in &got_only_names {
        let got_idx = got_entries.len();
        got_entries.push((name.clone(), false));
        if let Some(gsym) = globals.get_mut(name) {
            gsym.got_idx = Some(got_idx);
        }
    }

    (plt_names, got_entries)
}
use std::collections::HashMap;
use super::elf::*;
use super::types::GlobalSymbol;
use crate::backend::linker_common::OutputSection;

// TLS relocation types (AArch64 Local Exec model for static linking)
const R_AARCH64_TLSLE_ADD_TPREL_HI12: u32 = 549;
const R_AARCH64_TLSLE_ADD_TPREL_LO12: u32 = 550;
const R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: u32 = 551;
const R_AARCH64_TLSLE_MOVW_TPREL_G0: u32 = 544;
const R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: u32 = 545;
const R_AARCH64_TLSLE_MOVW_TPREL_G1: u32 = 546;
const R_AARCH64_TLSLE_MOVW_TPREL_G1_NC: u32 = 547;
const R_AARCH64_TLSLE_MOVW_TPREL_G2: u32 = 548;

// TLS descriptor / Initial exec (convert to LE for static linking)
const R_AARCH64_TLSDESC_ADR_PAGE21: u32 = 562;
const R_AARCH64_TLSDESC_LD64_LO12: u32 = 563;
const R_AARCH64_TLSDESC_ADD_LO12: u32 = 564;
const R_AARCH64_TLSDESC_CALL: u32 = 569;

// GD (General Dynamic) -> LE relaxation for static linking
const R_AARCH64_TLSGD_ADR_PAGE21: u32 = 513;
const R_AARCH64_TLSGD_ADD_LO12_NC: u32 = 514;

// IE (Initial Exec) TLS
pub const R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: u32 = 541;
pub const R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: u32 = 542;

/// TLS layout info needed for relocation processing.
pub struct TlsInfo {
    /// Virtual address where the TLS initialization image starts (0 if no TLS).
    pub tls_addr: u64,
    /// Total size of the TLS segment in bytes.
    pub tls_size: u64,
}

/// GOT (Global Offset Table) for static linking.
/// Maps symbol names to GOT entry indices for R_AARCH64_ADR_GOT_PAGE
/// and R_AARCH64_LD64_GOT_LO12_NC relocations.
pub struct GotInfo {
    /// Virtual address of the first GOT slot.
    pub got_addr: u64,
    /// symbol_key -> GOT entry index; key is "name" or an object-scoped
    /// local key (see `got_key`). Each slot is 8 bytes.
    pub entries: HashMap<String, usize>,
}

impl GotInfo {
    /// Get the address of a GOT entry for a given symbol key
    /// (base address plus 8 bytes per preceding slot).
    pub fn entry_addr(&self, key: &str) -> Option<u64> {
        self.entries.get(key).map(|&idx| self.got_addr + (idx as u64) * 8)
    }
}
/// Resolve a symbol's final address given the global symbol table and section map.
///
/// Resolution order: STT_SECTION symbols map through `section_map`; named
/// non-local symbols go through `globals` (which also holds linker-defined
/// symbols like `__bss_start`, `_edata`, `_end`, `__end`, registered with
/// `defined_in = Some(usize::MAX)`); unresolved weak or undefined symbols
/// yield 0; SHN_ABS symbols keep their value; everything else is
/// section-relative.
pub fn resolve_sym(
    obj_idx: usize,
    sym: &Symbol,
    globals: &HashMap<String, GlobalSymbol>,
    section_map: &HashMap<(usize, usize), (usize, u64)>,
    output_sections: &[OutputSection],
) -> u64 {
    if sym.sym_type() == STT_SECTION {
        let si = sym.shndx as usize;
        return section_map.get(&(obj_idx, si))
            .map(|&(oi, so)| output_sections[oi].addr + so)
            .unwrap_or(0);
    }
    if !sym.name.is_empty() && !sym.is_local() {
        // All linker-defined symbols (including __bss_start, _edata, _end, __end)
        // are registered in the globals table with defined_in = Some(usize::MAX),
        // so they are resolved through the standard globals lookup below.
        // Local (STB_LOCAL) symbols must NOT be resolved via globals, since a
        // local symbol named e.g. "write" must not be confused with libc's write().
        if let Some(g) = globals.get(&sym.name) {
            if g.defined_in.is_some() { return g.value; }
        }
        // Unresolved weak symbols legitimately resolve to 0.
        if sym.is_weak() { return 0; }
    }
    if sym.is_undefined() { return 0; }
    if sym.shndx == SHN_ABS { return sym.value; }
    // Section-relative: output section base + merged offset + symbol offset.
    section_map.get(&(obj_idx, sym.shndx as usize))
        .map(|&(oi, so)| output_sections[oi].addr + so + sym.value)
        .unwrap_or(sym.value)
}

/// Build a GOT key for a symbol reference in a relocation.
/// Local symbols must be scoped to their object to avoid collisions
/// (e.g., `.LANCHOR3` in different objects referring to different TLS vars).
pub fn got_key(obj_idx: usize, sym: &Symbol) -> String {
    if !sym.name.is_empty() && !sym.is_local() {
        // Global symbol: the bare name is unique process-wide.
        sym.name.clone()
    } else if !sym.name.is_empty() {
        // Named local symbol: scope by object index.
        format!("{}@{}", sym.name, obj_idx)
    } else if sym.sym_type() == STT_SECTION {
        format!("__sec_{}_{}", obj_idx, sym.shndx)
    } else {
        format!("__anon_{}_{}", obj_idx, sym.shndx)
    }
}
/// Scan all relocations to find symbols that need GOT entries.
/// This includes both regular GOT references and TLS IE references
/// (which use GOT entries containing the TP offset).
///
/// Returns the unique symbol keys (see `got_key`) paired with the kind of
/// value their GOT slot must hold, in first-seen order.
pub fn collect_got_symbols(objects: &[ElfObject]) -> Vec<(String, GotEntryKind)> {
    let mut got_syms: Vec<(String, GotEntryKind)> = Vec::new();
    for obj_idx in 0..objects.len() {
        for sec_idx in 0..objects[obj_idx].sections.len() {
            for rela in &objects[obj_idx].relocations[sec_idx] {
                // Only GOT-indirect and TLS-IE relocation types need slots.
                let kind = match rela.rela_type {
                    R_AARCH64_ADR_GOT_PAGE | R_AARCH64_LD64_GOT_LO12_NC => GotEntryKind::Regular,
                    R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 | R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC => GotEntryKind::TlsIE,
                    _ => continue,
                };
                let si = rela.sym_idx as usize;
                if si < objects[obj_idx].symbols.len() {
                    let key = got_key(obj_idx, &objects[obj_idx].symbols[si]);
                    // Deduplicate: one slot per symbol key.
                    if !got_syms.iter().any(|(k, _)| k == &key) {
                        got_syms.push((key, kind));
                    }
                }
            }
        }
    }
    got_syms
}

/// What kind of value a GOT entry holds.
#[derive(Clone, Copy, PartialEq)]
pub enum GotEntryKind {
    /// Regular GOT entry: holds the absolute address of the symbol.
    Regular,
    /// TLS IE GOT entry: holds the TP-relative offset of the TLS variable.
    TlsIE,
}
/// Apply all relocations to the output buffer.
///
/// Walks every relocation section of every object, computes the place
/// address `p` (virtual) and file position `fp` from the merged section
/// layout, resolves the target symbol, and dispatches to `apply_one_reloc`.
/// Sections absent from `section_map` (e.g. discarded) are skipped.
pub fn apply_relocations(
    objects: &[ElfObject],
    globals: &HashMap<String, GlobalSymbol>,
    output_sections: &[OutputSection],
    section_map: &HashMap<(usize, usize), (usize, u64)>,
    out: &mut [u8],
    tls_info: &TlsInfo,
    got_info: &GotInfo,
) -> Result<(), String> {
    for obj_idx in 0..objects.len() {
        for sec_idx in 0..objects[obj_idx].sections.len() {
            let relas = &objects[obj_idx].relocations[sec_idx];
            if relas.is_empty() { continue; }
            let (out_idx, sec_off) = match section_map.get(&(obj_idx, sec_idx)) {
                Some(&v) => v,
                None => continue,
            };
            // Base virtual address and file offset of the merged output section.
            let sa = output_sections[out_idx].addr;
            let sfo = output_sections[out_idx].file_offset;

            for rela in relas {
                let si = rela.sym_idx as usize;
                if si >= objects[obj_idx].symbols.len() { continue; }
                let sym = &objects[obj_idx].symbols[si];
                let p = sa + sec_off + rela.offset;
                let fp = (sfo + sec_off + rela.offset) as usize;
                let a = rela.addend;
                let s = resolve_sym(obj_idx, sym, globals, section_map, output_sections);
                let gkey = got_key(obj_idx, sym);

                apply_one_reloc(out, fp, rela.rela_type, s, a, p, &sym.name,
                    &objects[obj_idx].source_name, tls_info, got_info, &gkey)?;
            }
        }
    }
    Ok(())
}

/// Compute TP offset for AArch64. On AArch64, the TLS block starts at TP + 16
/// (TP points to the DTV, TLS block is right after).
/// For Local Exec: tp_offset = sym_addr - tls_start_addr + 16
fn tprel(s: u64, a: i64, tls_info: &TlsInfo) -> i64 {
    // AArch64 uses variant 1 TLS: TP points to start of TCB (16 bytes),
    // followed by TLS block. So offset from TP = (S+A - tls_base) + 16
    if tls_info.tls_addr == 0 {
        // No TLS segment: pass S+A through unchanged.
        return (s as i64).wrapping_add(a);
    }
    let sa = (s as i64).wrapping_add(a) as u64;
    (sa as i64) - (tls_info.tls_addr as i64) + 16
}
/// Apply a single AArch64 relocation.
///
/// `fp` is the byte position in `out`, `s` the resolved symbol address,
/// `a` the addend, `p` the place (virtual address being patched).
/// `got_key` selects the symbol's GOT slot (if any) in `got_info`.
/// Out-of-range `fp` is silently skipped for instruction patches; unknown
/// relocation types are reported as an error naming `sym_name` and `source`.
pub fn apply_one_reloc(
    out: &mut [u8],
    fp: usize,
    rtype: u32,
    s: u64,
    a: i64,
    p: u64,
    sym_name: &str,
    source: &str,
    tls_info: &TlsInfo,
    got_info: &GotInfo,
    got_key: &str,
) -> Result<(), String> {
    match rtype {
        R_AARCH64_NONE => {}

        // ── Absolute relocations ──
        R_AARCH64_ABS64 => {
            let val = (s as i64).wrapping_add(a) as u64;
            w64(out, fp, val);
        }
        R_AARCH64_ABS32 => {
            let val = (s as i64).wrapping_add(a) as u64;
            w32(out, fp, val as u32);
        }
        R_AARCH64_ABS16 => {
            let val = (s as i64).wrapping_add(a) as u64;
            w16(out, fp, val as u16);
        }

        // ── PC-relative relocations ──
        R_AARCH64_PREL64 => {
            let val = (s as i64).wrapping_add(a).wrapping_sub(p as i64);
            w64(out, fp, val as u64);
        }
        R_AARCH64_PREL32 => {
            let val = (s as i64).wrapping_add(a).wrapping_sub(p as i64);
            w32(out, fp, val as u32);
        }
        R_AARCH64_PREL16 => {
            let val = (s as i64).wrapping_add(a).wrapping_sub(p as i64);
            w16(out, fp, val as u16);
        }

        // ── ADRP: Page-relative high 21 bits ──
        R_AARCH64_ADR_PREL_PG_HI21 => {
            let sa = (s as i64).wrapping_add(a) as u64;
            let page_s = sa & !0xFFF;
            let page_p = p & !0xFFF;
            let offset = page_s as i64 - page_p as i64;
            let imm = offset >> 12;
            encode_adrp(out, fp, imm);
        }

        // ── ADR: PC-relative low 21 bits ──
        R_AARCH64_ADR_PREL_LO21 => {
            let offset = (s as i64).wrapping_add(a).wrapping_sub(p as i64);
            encode_adr(out, fp, offset);
        }

        // ── ADD: absolute low 12 bits (no carry check) ──
        R_AARCH64_ADD_ABS_LO12_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_add_imm12(out, fp, (sa & 0xFFF) as u32);
        }

        // ── Load/store low 12 bits for different sizes ──
        // The final argument is the access-size shift applied to the
        // immediate (0 = byte ... 4 = 128-bit).
        R_AARCH64_LDST8_ABS_LO12_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_ldst_imm12(out, fp, (sa & 0xFFF) as u32, 0);
        }
        R_AARCH64_LDST16_ABS_LO12_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_ldst_imm12(out, fp, (sa & 0xFFF) as u32, 1);
        }
        R_AARCH64_LDST32_ABS_LO12_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_ldst_imm12(out, fp, (sa & 0xFFF) as u32, 2);
        }
        R_AARCH64_LDST64_ABS_LO12_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_ldst_imm12(out, fp, (sa & 0xFFF) as u32, 3);
        }
        R_AARCH64_LDST128_ABS_LO12_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_ldst_imm12(out, fp, (sa & 0xFFF) as u32, 4);
        }

        // ── Branch instructions ──
        R_AARCH64_CALL26 | R_AARCH64_JUMP26 => {
            if fp + 4 > out.len() { return Ok(()); }
            let sa = (s as i64).wrapping_add(a) as u64;
            if sa == 0 {
                // Undefined/weak symbol resolved to 0: replace with NOP
                w32(out, fp, 0xd503201f);
            } else {
                let offset = (sa as i64).wrapping_sub(p as i64);
                let mut insn = read_u32(out, fp);
                let imm26 = ((offset >> 2) as u32) & 0x3ffffff;
                insn = (insn & 0xfc000000) | imm26;
                w32(out, fp, insn);
            }
        }

        // ── Conditional branch (19-bit offset) ──
        R_AARCH64_CONDBR19 => {
            let offset = (s as i64).wrapping_add(a).wrapping_sub(p as i64);
            if fp + 4 > out.len() { return Ok(()); }
            let mut insn = read_u32(out, fp);
            let imm19 = ((offset >> 2) as u32) & 0x7ffff;
            insn = (insn & 0xff00001f) | (imm19 << 5);
            w32(out, fp, insn);
        }

        // ── Test and branch (14-bit offset) ──
        R_AARCH64_TSTBR14 => {
            let offset = (s as i64).wrapping_add(a).wrapping_sub(p as i64);
            if fp + 4 > out.len() { return Ok(()); }
            let mut insn = read_u32(out, fp);
            let imm14 = ((offset >> 2) as u32) & 0x3fff;
            insn = (insn & 0xfff8001f) | (imm14 << 5);
            w32(out, fp, insn);
        }

        // ── MOVW relocations ── (each patches one 16-bit halfword of S+A)
        R_AARCH64_MOVW_UABS_G0 | R_AARCH64_MOVW_UABS_G0_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_movw(out, fp, (sa & 0xffff) as u32);
        }
        R_AARCH64_MOVW_UABS_G1_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_movw(out, fp, ((sa >> 16) & 0xffff) as u32);
        }
        R_AARCH64_MOVW_UABS_G2_NC => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_movw(out, fp, ((sa >> 32) & 0xffff) as u32);
        }
        R_AARCH64_MOVW_UABS_G3 => {
            let sa = (s as i64).wrapping_add(a) as u64;
            encode_movw(out, fp, ((sa >> 48) & 0xffff) as u32);
        }

        // ── GOT relocations ──
        // Even in static linking, we use a real GOT since the instruction
        // is an LDR (load from memory), not an ADD.
        R_AARCH64_ADR_GOT_PAGE => {
            if let Some(got_entry_addr) = got_info.entry_addr(got_key) {
                let page_g = got_entry_addr & !0xFFF;
                let page_p = p & !0xFFF;
                let imm = (page_g as i64 - page_p as i64) >> 12;
                encode_adrp(out, fp, imm);
            } else {
                // Fallback: treat like ADR_PREL_PG_HI21
                let sa = (s as i64).wrapping_add(a) as u64;
                let page_s = sa & !0xFFF;
                let page_p = p & !0xFFF;
                let imm = (page_s as i64 - page_p as i64) >> 12;
                encode_adrp(out, fp, imm);
            }
        }
        R_AARCH64_LD64_GOT_LO12_NC => {
            if let Some(got_entry_addr) = got_info.entry_addr(got_key) {
                encode_ldst_imm12(out, fp, (got_entry_addr & 0xFFF) as u32, 3);
            } else {
                let sa = (s as i64).wrapping_add(a) as u64;
                encode_ldst_imm12(out, fp, (sa & 0xFFF) as u32, 3);
            }
        }

        // ── TLS Local Exec (LE) relocations ──
        // These are used when the TLS variable is in the executable itself.
        // On AArch64 variant 1, tp offset = sym_offset_in_tls + 16 (TCB size)
        R_AARCH64_TLSLE_ADD_TPREL_HI12 => {
            let tp = tprel(s, a, tls_info);
            if std::env::var("LINKER_DEBUG_TLS").is_ok() {
                eprintln!("  TLSLE_HI12: sym='{}' s=0x{:x} a={} tls_addr=0x{:x} tls_size=0x{:x} -> tp=0x{:x}",
                    sym_name, s, a, tls_info.tls_addr, tls_info.tls_size, tp as u64);
            }
            let imm12 = ((tp as u64 >> 12) & 0xFFF) as u32;
            encode_add_imm12(out, fp, imm12);
        }
        R_AARCH64_TLSLE_ADD_TPREL_LO12 | R_AARCH64_TLSLE_ADD_TPREL_LO12_NC => {
            let tp = tprel(s, a, tls_info);
            if std::env::var("LINKER_DEBUG_TLS").is_ok() {
                eprintln!("  TLSLE_LO12: sym='{}' s=0x{:x} a={} tls_addr=0x{:x} -> tp=0x{:x}",
                    sym_name, s, a, tls_info.tls_addr, tp as u64);
            }
            let imm12 = (tp as u64 & 0xFFF) as u32;
            encode_add_imm12(out, fp, imm12);
        }
        R_AARCH64_TLSLE_MOVW_TPREL_G0 | R_AARCH64_TLSLE_MOVW_TPREL_G0_NC => {
            let tp = tprel(s, a, tls_info);
            encode_movw(out, fp, (tp as u64 & 0xffff) as u32);
        }
        R_AARCH64_TLSLE_MOVW_TPREL_G1 | R_AARCH64_TLSLE_MOVW_TPREL_G1_NC => {
            let tp = tprel(s, a, tls_info);
            encode_movw(out, fp, ((tp as u64 >> 16) & 0xffff) as u32);
        }
        R_AARCH64_TLSLE_MOVW_TPREL_G2 => {
            let tp = tprel(s, a, tls_info);
            encode_movw(out, fp, ((tp as u64 >> 32) & 0xffff) as u32);
        }

        // ── TLS IE (Initial Exec) via GOT for static linking ──
        // Instead of relaxing to MOVZ/MOVK (which can break if registers
        // differ between ADRP and LDR), we use real GOT entries that hold
        // the pre-computed TP offset. The instructions remain ADRP+LDR
        // pointing at our GOT entry.
        R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 => {
            if let Some(got_entry_addr) = got_info.entry_addr(got_key) {
                let page_g = got_entry_addr & !0xFFF;
                let page_p = p & !0xFFF;
                let imm = (page_g as i64 - page_p as i64) >> 12;
                encode_adrp(out, fp, imm);
            } else {
                // Fallback: relax to MOVZ
                let tp = tprel(s, a, tls_info);
                if fp + 4 > out.len() { return Ok(()); }
                let insn = read_u32(out, fp);
                let rd = insn & 0x1f;
                let imm16 = ((tp as u64 >> 16) & 0xffff) as u32;
                let new_insn = 0xd2a00000 | (imm16 << 5) | rd;
                w32(out, fp, new_insn);
            }
        }
        R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC => {
            if let Some(got_entry_addr) = got_info.entry_addr(got_key) {
                encode_ldst_imm12(out, fp, (got_entry_addr & 0xFFF) as u32, 3);
            } else {
                // Fallback: relax to MOVK
                let tp = tprel(s, a, tls_info);
                if fp + 4 > out.len() { return Ok(()); }
                let insn = read_u32(out, fp);
                let rd = insn & 0x1f;
                let imm16 = (tp as u64 & 0xffff) as u32;
                let new_insn = 0xf2800000 | (imm16 << 5) | rd;
                w32(out, fp, new_insn);
            }
        }

        // ── TLSDESC relaxation to LE for static linking ──
        R_AARCH64_TLSDESC_ADR_PAGE21 => {
            // Replace ADRP with MOVZ Xd, #tprel_g1, LSL #16
            let tp = tprel(s, a, tls_info);
            if fp + 4 > out.len() { return Ok(()); }
            let insn = read_u32(out, fp);
            let rd = insn & 0x1f;
            let imm16 = ((tp as u64 >> 16) & 0xffff) as u32;
            let new_insn = 0xd2a00000 | (imm16 << 5) | rd;
            w32(out, fp, new_insn);
        }
        R_AARCH64_TLSDESC_LD64_LO12 => {
            // Replace LDR with MOVK Xd, #tprel_lo
            let tp = tprel(s, a, tls_info);
            if fp + 4 > out.len() { return Ok(()); }
            let insn = read_u32(out, fp);
            let rd = insn & 0x1f;
            let imm16 = (tp as u64 & 0xffff) as u32;
            let new_insn = 0xf2800000 | (imm16 << 5) | rd;
            w32(out, fp, new_insn);
        }
        R_AARCH64_TLSDESC_ADD_LO12 => {
            // NOP (the value is already in the register from MOVZ+MOVK)
            if fp + 4 > out.len() { return Ok(()); }
            w32(out, fp, 0xd503201f); // NOP
        }
        R_AARCH64_TLSDESC_CALL => {
            // NOP (no runtime call needed for static linking)
            if fp + 4 > out.len() { return Ok(()); }
            w32(out, fp, 0xd503201f); // NOP
        }

        // ── TLS GD -> LE relaxation ──
        // TODO: full GD relaxation also needs to NOP the BL __tls_get_addr that follows
        R_AARCH64_TLSGD_ADR_PAGE21 => {
            let tp = tprel(s, a, tls_info);
            if fp + 4 > out.len() { return Ok(()); }
            let insn = read_u32(out, fp);
            let rd = insn & 0x1f;
            let imm16 = ((tp as u64 >> 16) & 0xffff) as u32;
            let new_insn = 0xd2a00000 | (imm16 << 5) | rd;
            w32(out, fp, new_insn);
        }
        R_AARCH64_TLSGD_ADD_LO12_NC => {
            let tp = tprel(s, a, tls_info);
            if fp + 4 > out.len() { return Ok(()); }
            let insn = read_u32(out, fp);
            let rd = insn & 0x1f;
            let imm16 = (tp as u64 & 0xffff) as u32;
            let new_insn = 0xf2800000 | (imm16 << 5) | rd;
            w32(out, fp, new_insn);
        }

        other => {
            return Err(format!(
                "unsupported AArch64 relocation type {} for '{}' in {}",
                other, sym_name, source
            ));
        }
    }
    Ok(())
}

// ── Instruction encoding helpers ───────────────────────────────────────
// Each helper read-modify-writes the 4-byte instruction at `fp`, patching
// only the relevant immediate field; out-of-range `fp` is a no-op.

/// Patch the 21-bit page immediate of an ADRP instruction (immlo:immhi split).
pub(super) fn encode_adrp(out: &mut [u8], fp: usize, imm: i64) {
    if fp + 4 > out.len() { return; }
    let mut insn = read_u32(out, fp);
    let immlo = (imm as u32) & 0x3;
    let immhi = ((imm as u32) >> 2) & 0x7ffff;
    insn = (insn & 0x9f00001f) | (immlo << 29) | (immhi << 5);
    w32(out, fp, insn);
}

/// Patch the 21-bit byte offset of an ADR instruction.
pub(super) fn encode_adr(out: &mut [u8], fp: usize, offset: i64) {
    if fp + 4 > out.len() { return; }
    let mut insn = read_u32(out, fp);
    let imm = offset as u32;
    let immlo = imm & 0x3;
    let immhi = (imm >> 2) & 0x7ffff;
    insn = (insn & 0x9f00001f) | (immlo << 29) | (immhi << 5);
    w32(out, fp, insn);
}

/// Patch the 12-bit immediate of an ADD (immediate) instruction.
pub(super) fn encode_add_imm12(out: &mut [u8], fp: usize, imm12: u32) {
    if fp + 4 > out.len() { return; }
    let mut insn = read_u32(out, fp);
    insn = (insn & 0xffc003ff) | ((imm12 & 0xfff) << 10);
    w32(out, fp, insn);
}

/// Patch the scaled 12-bit offset of a load/store; `shift` is the
/// log2 access size used to scale the byte offset.
pub(super) fn encode_ldst_imm12(out: &mut [u8], fp: usize, lo12: u32, shift: u32) {
    if fp + 4 > out.len() { return; }
    let mut insn = read_u32(out, fp);
    let imm12 = (lo12 >> shift) & 0xfff;
    insn = (insn & 0xffc003ff) | (imm12 << 10);
    w32(out, fp, insn);
}
/// Patch the 16-bit immediate of a MOVZ/MOVK/MOVN instruction.
pub(super) fn encode_movw(out: &mut [u8], fp: usize, imm16: u32) {
    if fp + 4 > out.len() { return; }
    let mut insn = read_u32(out, fp);
    insn = (insn & 0xffe0001f) | ((imm16 & 0xffff) << 5);
    w32(out, fp, insn);
}

// ── types.rs ──────────────────────────────────────────────────────────
// AArch64 linker types and constants.
//
// Defines the `GlobalSymbol` type used by all linker phases, plus
// architecture-specific constants (base address, page size, interpreter path).

use crate::backend::linker_common;
use linker_common::{GlobalSymbolOps, Elf64Symbol};
use super::elf::SHN_UNDEF;
use super::elf::SHN_COMMON;

/// Dynamic linker path for AArch64
pub const INTERP: &[u8] = b"/lib/ld-linux-aarch64.so.1\0";

/// Base virtual address for the executable
pub const BASE_ADDR: u64 = 0x400000;
/// Page size for alignment
pub const PAGE_SIZE: u64 = 0x10000; // AArch64 uses 64KB pages for linker alignment

/// A resolved global symbol
#[derive(Clone)]
pub struct GlobalSymbol {
    // Resolved virtual address (or offset until layout is final).
    pub value: u64,
    // Symbol size in bytes.
    pub size: u64,
    // ELF st_info byte (binding in high nibble, type in low nibble).
    pub info: u8,
    // Index of the defining object, if defined by an input object
    // (linker-defined symbols use a sentinel index).
    pub defined_in: Option<usize>,
    // Section header index within the defining object.
    pub section_idx: u16,
    /// SONAME of the shared library this symbol was resolved from
    pub from_lib: Option<String>,
    /// PLT entry index (for dynamic function symbols)
    pub plt_idx: Option<usize>,
    /// GOT entry index (for dynamic symbols needing GOT slots)
    pub got_idx: Option<usize>,
    /// Whether this symbol is resolved from a shared library
    pub is_dynamic: bool,
    /// Whether this symbol needs a copy relocation
    pub copy_reloc: bool,
    /// Symbol's value in the source shared library (for alias detection)
    pub lib_sym_value: u64,
}

impl GlobalSymbolOps for GlobalSymbol {
    fn is_defined(&self) -> bool { self.defined_in.is_some() }
    fn is_dynamic(&self) -> bool { self.is_dynamic }
    fn info(&self) -> u8 { self.info }
    fn section_idx(&self) -> u16 { self.section_idx }
    fn value(&self) -> u64 { self.value }
    fn size(&self) -> u64 { self.size }
    // Symbol defined by a regular input object.
    fn new_defined(obj_idx: usize, sym: &Elf64Symbol) -> Self {
        GlobalSymbol {
            value: sym.value, size: sym.size, info: sym.info,
            defined_in: Some(obj_idx), from_lib: None,
            plt_idx: None, got_idx: None,
            section_idx: sym.shndx, is_dynamic: false, copy_reloc: false,
            lib_sym_value: 0,
        }
    }
    // COMMON symbol: tentative definition, allocated to .bss later.
    fn new_common(obj_idx: usize, sym: &Elf64Symbol) -> Self {
        GlobalSymbol {
            value: sym.value, size: sym.size, info: sym.info,
            defined_in: Some(obj_idx), from_lib: None,
            plt_idx: None, got_idx: None,
            section_idx: SHN_COMMON, is_dynamic: false, copy_reloc: false,
            lib_sym_value: 0,
        }
    }
    // Referenced but not yet defined anywhere.
    fn new_undefined(sym: &Elf64Symbol) -> Self {
        GlobalSymbol {
            value: 0, size: 0, info: sym.info,
            defined_in: None, from_lib: None,
            plt_idx: None, got_idx: None,
            section_idx: SHN_UNDEF, is_dynamic: false, copy_reloc: false,
            lib_sym_value: 0,
        }
    }
    // Assign a COMMON symbol its final .bss slot; 0xffff is the internal
    // "allocated to bss" marker used by the layout code.
    fn set_common_bss(&mut self, bss_offset: u64) {
        self.value = bss_offset;
        self.section_idx = 0xffff;
    }
    // Symbol imported from a shared library; keeps the library-side value
    // for copy-relocation alias detection.
    fn new_dynamic(dsym: &linker_common::DynSymbol, soname: &str) -> Self {
        GlobalSymbol {
            value: 0, size: dsym.size, info: dsym.info,
            defined_in: None, from_lib: Some(soname.to_string()),
            plt_idx: None, got_idx: None,
            section_idx: SHN_UNDEF, is_dynamic: true, copy_reloc: false,
            lib_sym_value: dsym.value,
        }
    }
}
/// ARM-specific replacement policy: also replace dynamic symbols with local definitions.
pub fn arm_should_replace_extra(existing: &GlobalSymbol) -> bool {
    existing.is_dynamic
}

// ── arm/mod.rs ────────────────────────────────────────────────────────
pub(crate) mod codegen;
#[cfg_attr(feature = "gcc_assembler", allow(dead_code))] // Built-in assembler unused when gcc handles assembly
pub(crate) mod assembler;
#[cfg_attr(feature = "gcc_linker", allow(dead_code))] // Built-in linker unused when gcc handles linking
pub(crate) mod linker;

pub(crate) use codegen::emit::ArmCodegen;

// ── asm_expr.rs ───────────────────────────────────────────────────────
/// Shared assembly expression evaluator for all assembler backends (x86, i686, ARM, RISC-V).
///
/// Supports arithmetic expressions with proper operator precedence:
/// - Parentheses: `(expr)`
/// - Bitwise OR: `|`
/// - Bitwise XOR: `^`
/// - Bitwise AND: `&`
/// - Shifts: `<<`, `>>`
/// - Addition/Subtraction: `+`, `-`
/// - Multiplication/Division/Modulo: `*`, `/`, `%`
/// - Unary: `-`, `+`, `~`, `!` (logical NOT)
///
/// Integer literals: decimal, hex (0x), binary (0b), octal (leading 0),
/// character literals ('c', '\n').
/// Used by all four assembler backends (x86, i686, ARM, RISC-V).

/// Map a C escape character to its ASCII value (e.g., b'n' -> 10 for '\n').
/// Unknown escapes pass the character through unchanged.
fn char_escape_value(esc: u8) -> i64 {
    match esc {
        b'n' => 10,
        b't' => 9,
        b'r' => 13,
        b'0' => 0,
        b'\\' => b'\\' as i64,
        b'\'' => b'\'' as i64,
        b'"' => b'"' as i64,
        b'a' => 7,
        b'b' => 8,
        b'f' => 12,
        b'v' => 11,
        _ => esc as i64,
    }
}

/// Token type for the expression evaluator.
#[derive(Debug)]
enum ExprToken {
    /// An integer literal (or character-literal) value.
    Num(i64),
    /// A single-character operator or parenthesis.
    Op(char),
    /// A two-character operator (`<<` or `>>`).
    Op2(&'static str),
}
/// Tokenize an expression string into `ExprToken`s.
///
/// Recognizes single-char operators/parens, `<<`/`>>`, character literals
/// (with C-style escapes), and integer literals (decimal, 0x hex, 0b binary).
/// Whitespace is skipped; any other character is an error.
fn tokenize_expr(s: &str) -> Result<Vec<ExprToken>, String> {
    let mut tokens = Vec::new();
    let bytes = s.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        let c = bytes[i];
        if c.is_ascii_whitespace() {
            i += 1;
            continue;
        }
        if c == b'(' || c == b')' || c == b'+' || c == b'-' || c == b'*'
            || c == b'/' || c == b'%' || c == b'&' || c == b'|' || c == b'^' || c == b'~'
            || c == b'!'
        {
            tokens.push(ExprToken::Op(c as char));
            i += 1;
        } else if c == b'<' && i + 1 < bytes.len() && bytes[i + 1] == b'<' {
            tokens.push(ExprToken::Op2("<<"));
            i += 2;
        } else if c == b'>' && i + 1 < bytes.len() && bytes[i + 1] == b'>' {
            tokens.push(ExprToken::Op2(">>"));
            i += 2;
        } else if c == b'\'' {
            // Character literal: 'c' or '\n', '\t', '\\', etc.
            i += 1; // skip opening quote
            if i >= bytes.len() {
                return Err("unterminated character literal".to_string());
            }
            let ch_val = if bytes[i] == b'\\' {
                i += 1;
                if i >= bytes.len() {
                    return Err("unterminated character escape".to_string());
                }
                let esc = bytes[i];
                i += 1;
                char_escape_value(esc)
            } else {
                let val = bytes[i] as i64;
                i += 1;
                val
            };
            // Skip closing quote if present
            if i < bytes.len() && bytes[i] == b'\'' {
                i += 1;
            }
            tokens.push(ExprToken::Num(ch_val));
        } else if c.is_ascii_digit() {
            // Integer literal: scan the full extent of the literal first,
            // then delegate value parsing to parse_single_integer.
            let start = i;
            if c == b'0' && i + 1 < bytes.len() && (bytes[i + 1] == b'x' || bytes[i + 1] == b'X') {
                i += 2;
                while i < bytes.len() && bytes[i].is_ascii_hexdigit() { i += 1; }
            } else if c == b'0' && i + 1 < bytes.len() && (bytes[i + 1] == b'b' || bytes[i + 1] == b'B') {
                i += 2;
                while i < bytes.len() && (bytes[i] == b'0' || bytes[i] == b'1') { i += 1; }
            } else {
                while i < bytes.len() && bytes[i].is_ascii_digit() { i += 1; }
            }
            let num_str = &s[start..i];
            let val = parse_single_integer(num_str)?;
            tokens.push(ExprToken::Num(val));
        } else {
            return Err(format!("unexpected char '{}' in expression", c as char));
        }
    }
    Ok(tokens)
}
&[ExprToken], pos: &mut usize) -> Result { - eval_or(tokens, pos) -} - -fn eval_or(tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = eval_xor(tokens, pos)?; - while *pos < tokens.len() { - if matches!(&tokens[*pos], ExprToken::Op('|')) { - *pos += 1; - val |= eval_xor(tokens, pos)?; - } else { - break; - } - } - Ok(val) -} - -fn eval_xor(tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = eval_and(tokens, pos)?; - while *pos < tokens.len() { - if matches!(&tokens[*pos], ExprToken::Op('^')) { - *pos += 1; - val ^= eval_and(tokens, pos)?; - } else { - break; - } - } - Ok(val) -} - -fn eval_and(tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = eval_shift(tokens, pos)?; - while *pos < tokens.len() { - if matches!(&tokens[*pos], ExprToken::Op('&')) { - *pos += 1; - val &= eval_shift(tokens, pos)?; - } else { - break; - } - } - Ok(val) -} - -fn eval_shift(tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = eval_add(tokens, pos)?; - while *pos < tokens.len() { - match &tokens[*pos] { - ExprToken::Op2("<<") => { *pos += 1; val <<= eval_add(tokens, pos)?; } - ExprToken::Op2(">>") => { *pos += 1; val = ((val as u64) >> eval_add(tokens, pos)?) 
as i64; } - _ => break, - } - } - Ok(val) -} - -fn eval_add(tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = eval_mul(tokens, pos)?; - while *pos < tokens.len() { - match &tokens[*pos] { - ExprToken::Op('+') => { *pos += 1; val += eval_mul(tokens, pos)?; } - ExprToken::Op('-') => { *pos += 1; val -= eval_mul(tokens, pos)?; } - _ => break, - } - } - Ok(val) -} - -fn eval_mul(tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = eval_unary(tokens, pos)?; - while *pos < tokens.len() { - match &tokens[*pos] { - ExprToken::Op('*') => { *pos += 1; val *= eval_unary(tokens, pos)?; } - ExprToken::Op('/') => { - *pos += 1; - let rhs = eval_unary(tokens, pos)?; - if rhs == 0 { return Err("division by zero".to_string()); } - val /= rhs; - } - ExprToken::Op('%') => { - *pos += 1; - let rhs = eval_unary(tokens, pos)?; - if rhs == 0 { return Err("modulo by zero".to_string()); } - val %= rhs; - } - _ => break, - } - } - Ok(val) -} - -fn eval_unary(tokens: &[ExprToken], pos: &mut usize) -> Result { - if *pos >= tokens.len() { - return Err("unexpected end of expression".to_string()); - } - match &tokens[*pos] { - ExprToken::Op('-') => { *pos += 1; Ok(-eval_unary(tokens, pos)?) } - ExprToken::Op('+') => { *pos += 1; eval_unary(tokens, pos) } - ExprToken::Op('~') => { *pos += 1; Ok(!eval_unary(tokens, pos)?) } - ExprToken::Op('!') => { *pos += 1; Ok(if eval_unary(tokens, pos)? == 0 { 1 } else { 0 }) } - ExprToken::Op('(') => { - *pos += 1; - let val = eval_tokens(tokens, pos)?; - if *pos < tokens.len() && matches!(&tokens[*pos], ExprToken::Op(')')) { - *pos += 1; - } else { - return Err("missing closing parenthesis".to_string()); - } - Ok(val) - } - ExprToken::Num(v) => { let v = *v; *pos += 1; Ok(v) } - other => Err(format!("unexpected token in expression: {:?}", other)), - } -} - -/// Parse a single integer value (no arithmetic expressions). 
-/// Supports decimal, hex (0x/0X), binary (0b/0B), octal (leading 0), -/// and character literals ('c', '\n', etc.). -fn parse_single_integer(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Err("empty integer".to_string()); - } - - // Character literal: 'c' or '\n' etc. Must be the entire string - // (e.g., "'!'" or "'\n'"), not part of a larger expression. - if s.starts_with('\'') { - let bytes = s.as_bytes(); - let mut i = 1; - if i >= bytes.len() { - return Err("unterminated character literal".to_string()); - } - let val; - if bytes[i] == b'\\' { - i += 1; - if i >= bytes.len() { - return Err("unterminated character escape".to_string()); - } - val = char_escape_value(bytes[i]); - i += 1; - } else { - val = bytes[i] as i64; - i += 1; - } - // Skip closing quote - if i < bytes.len() && bytes[i] == b'\'' { - i += 1; - } - // Only accept if we consumed the whole string - if i == bytes.len() { - return Ok(val); - } - // Otherwise, fall through to let tokenize_expr handle it as an expression - return Err("character literal has trailing content".to_string()); - } - - let (negative, s) = if let Some(rest) = s.strip_prefix('-') { - (true, rest) - } else { - (false, s) - }; - - let val = if let Some(hex) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) { - let uval = u64::from_str_radix(hex, 16) - .map_err(|_| format!("bad hex: {}", s))?; - if negative { - return Ok((uval as i64).wrapping_neg()); - } - return Ok(uval as i64); - } else if let Some(bin) = s.strip_prefix("0b").or_else(|| s.strip_prefix("0B")) { - i64::from_str_radix(bin, 2) - .map_err(|_| format!("bad binary: {}", s))? - } else if s.starts_with('0') && s.len() > 1 && s.chars().all(|c| c.is_ascii_digit()) { - // Octal (must be checked before decimal to handle leading-zero literals) - i64::from_str_radix(s, 8) - .map_err(|_| format!("bad octal: {}", s))? 
- } else { - // Try decimal, including u64 range for large unsigned values - if let Ok(val) = s.parse::() { - if negative { - return Ok(-val); - } - return Ok(val); - } - if let Ok(uval) = s.parse::() { - if negative { - return Ok((uval as i64).wrapping_neg()); - } - return Ok(uval as i64); - } - return Err(format!("bad integer: {}", s)); - }; - - Ok(if negative { val.wrapping_neg() } else { val }) -} - -/// Parse an integer expression with full operator precedence. -/// Supports: |, ^, &, <<, >>, +, -, *, /, %, ~, parentheses. -/// Falls back to simple integer parsing for plain numbers. -pub fn parse_integer_expr(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Err("empty integer".to_string()); - } - // Fast path: try simple integer first - if let Ok(val) = parse_single_integer(s) { - return Ok(val); - } - let tokens = tokenize_expr(s)?; - if tokens.is_empty() { - return Err("empty expression".to_string()); - } - let mut pos = 0; - let val = eval_tokens(&tokens, &mut pos)?; - if pos < tokens.len() { - return Err(format!("unexpected trailing token in expression: {:?}", tokens[pos])); - } - Ok(val) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_simple_integers() { - assert_eq!(parse_integer_expr("42").unwrap(), 42); - assert_eq!(parse_integer_expr("-1").unwrap(), -1); - assert_eq!(parse_integer_expr("0xFF").unwrap(), 255); - assert_eq!(parse_integer_expr("0b1010").unwrap(), 10); - assert_eq!(parse_integer_expr("0644").unwrap(), 420); - } - - #[test] - fn test_arithmetic() { - assert_eq!(parse_integer_expr("8 * 16 + 8 * 8").unwrap(), 192); - assert_eq!(parse_integer_expr("(8 * 16 + 8 * 8)").unwrap(), 192); - assert_eq!(parse_integer_expr("8*2 + (8 * 16 + 8 * 8) + 64").unwrap(), 272); - assert_eq!(parse_integer_expr("-(8 * 8 + 8 * 8 + 16)").unwrap(), -144); - assert_eq!(parse_integer_expr("-(8 * 8 + 8 * 8 + 16)+0*8").unwrap(), -144); - } - - #[test] - fn test_bitwise() { - assert_eq!(parse_integer_expr("0x100 | 0x4000 | 
17").unwrap(), 0x4111); - assert_eq!(parse_integer_expr("0xFF & 0x0F").unwrap(), 0x0F); - assert_eq!(parse_integer_expr("1 << 5").unwrap(), 32); - assert_eq!(parse_integer_expr("~0").unwrap(), -1); - assert_eq!(parse_integer_expr("0xFF ^ 0x0F").unwrap(), 0xF0); - } - - #[test] - fn test_logical_not() { - // GAS logical NOT: !0 = 1, !1 = 0, !nonzero = 0 - assert_eq!(parse_integer_expr("!0").unwrap(), 1); - assert_eq!(parse_integer_expr("!1").unwrap(), 0); - assert_eq!(parse_integer_expr("!42").unwrap(), 0); - // Used in FFmpeg: 1+!0 = 2 - assert_eq!(parse_integer_expr("1+!0").unwrap(), 2); - assert_eq!(parse_integer_expr("1+!1").unwrap(), 1); - } - - #[test] - fn test_complex() { - // libffi CALL_CONTEXT_SIZE = (N_V_ARG_REG * 16 + N_X_ARG_REG * 8) where N_V=8, N_X=8 - assert_eq!(parse_integer_expr("(8 * 16 + 8 * 8)").unwrap(), 192); - // ffi_closure_SYSV_FS = (8*2 + CALL_CONTEXT_SIZE + 64) - assert_eq!(parse_integer_expr("(8*2 + (8 * 16 + 8 * 8) + 64)").unwrap(), 272); - // musl vfork: CLONE_VM | CLONE_VFORK | SIGCHLD - assert_eq!(parse_integer_expr("0x100 | 0x4000 | 17").unwrap(), 0x4111); - } - - #[test] - fn test_operator_precedence() { - // * binds tighter than + - assert_eq!(parse_integer_expr("2 + 3 * 4").unwrap(), 14); - assert_eq!(parse_integer_expr("3 * 4 + 2").unwrap(), 14); - // | binds looser than + - assert_eq!(parse_integer_expr("1 | 2 + 4").unwrap(), 7); // 1 | (2+4) = 1 | 6 = 7 - } - - #[test] - fn test_character_literals() { - // Simple ASCII characters - assert_eq!(parse_integer_expr("'!'").unwrap(), 33); - assert_eq!(parse_integer_expr("'A'").unwrap(), 65); - assert_eq!(parse_integer_expr("'0'").unwrap(), 48); - assert_eq!(parse_integer_expr("' '").unwrap(), 32); - // Escape sequences - assert_eq!(parse_integer_expr("'\\n'").unwrap(), 10); - assert_eq!(parse_integer_expr("'\\t'").unwrap(), 9); - assert_eq!(parse_integer_expr("'\\r'").unwrap(), 13); - assert_eq!(parse_integer_expr("'\\0'").unwrap(), 0); - 
assert_eq!(parse_integer_expr("'\\\\").unwrap(), 92); - assert_eq!(parse_integer_expr("'\\''").unwrap(), 39); - // Character literals in expressions - assert_eq!(parse_integer_expr("'A' + 1").unwrap(), 66); - assert_eq!(parse_integer_expr("'!' | 0x80").unwrap(), 0xA1); - } - -} diff --git a/src/backend/asm_preprocess.rs b/src/backend/asm_preprocess.rs deleted file mode 100644 index 54296e10e5..0000000000 --- a/src/backend/asm_preprocess.rs +++ /dev/null @@ -1,1434 +0,0 @@ -//! Shared GAS assembly preprocessing utilities. -//! -//! Functions used by multiple assembler parsers (ARM, RISC-V, x86) for -//! text-level preprocessing before architecture-specific parsing: -//! -//! - C-style comment stripping (`/* ... */`) -//! - Line comment stripping (`#`, `//`, `@`) -//! - Semicolon splitting (GAS statement separator) -//! - `.rept`/`.irp`/`.endr` block expansion -//! - `.macro`/`.endm`/`.purgem` definition, expansion, and removal -//! - `.if`/`.elseif`/`.else`/`.endif` conditional assembly evaluation - -use crate::backend::asm_expr; - -// ── Comment handling ─────────────────────────────────────────────────── - -/// Characters that start a line comment for each architecture. -/// -/// Used by `strip_comment` to determine where comments begin. -/// Each architecture may have multiple comment styles. -pub enum CommentStyle { - /// `#` only (x86/x86-64 AT&T syntax) - Hash, - /// `#` and `//` (RISC-V GAS) - HashAndSlashSlash, - /// `//` and `@` (ARM GAS — but `@` is not a comment before type specifiers - /// like `@function`, `@object`, `@progbits`, `@nobits`, `@tls_object`, `@note`) - /// Currently the ARM assembler uses its own strip_comment; this variant will - /// be used when ARM migrates to the shared preprocessor. - #[allow(dead_code)] - SlashSlashAndAt, -} - -/// Strip C-style `/* ... */` comments from assembly text, handling multi-line spans. -/// Preserves newlines inside comments so line numbers remain correct for error messages. 
-/// String-aware: does not strip `/* */` inside quoted string literals (e.g. `.asciz` data). -pub fn strip_c_comments(text: &str) -> String { - let mut result = String::with_capacity(text.len()); - let bytes = text.as_bytes(); - let mut i = 0; - let mut in_string = false; - let mut escape = false; - while i < bytes.len() { - if in_string { - if escape { - escape = false; - result.push(bytes[i] as char); - i += 1; - continue; - } - if bytes[i] == b'\\' { - escape = true; - result.push(bytes[i] as char); - i += 1; - continue; - } - if bytes[i] == b'"' { - in_string = false; - } - // Newlines end the string context (unterminated string on this line) - if bytes[i] == b'\n' { - in_string = false; - } - result.push(bytes[i] as char); - i += 1; - } else if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' { - i += 2; - while i + 1 < bytes.len() { - if bytes[i] == b'*' && bytes[i + 1] == b'/' { - i += 2; - break; - } - if bytes[i] == b'\n' { - result.push('\n'); - } - i += 1; - } - } else { - if bytes[i] == b'"' { - in_string = true; - } - result.push(bytes[i] as char); - i += 1; - } - } - result -} - -/// Strip trailing line comment from a single line, respecting string literals. -/// -/// Scans character by character tracking quote state so that comment characters -/// inside `"..."` strings are not treated as comment starts. Handles escaped -/// quotes (`\"`) correctly. 
-pub fn strip_comment<'a>(line: &'a str, style: &CommentStyle) -> &'a str { - let bytes = line.as_bytes(); - let mut in_string = false; - let mut i = 0; - while i < bytes.len() { - if in_string { - if bytes[i] == b'\\' { - i += 2; // skip escaped character - continue; - } - if bytes[i] == b'"' { - in_string = false; - } - i += 1; - continue; - } - // Not in string - if bytes[i] == b'"' { - in_string = true; - i += 1; - continue; - } - match style { - CommentStyle::Hash => { - if bytes[i] == b'#' { - return &line[..i]; - } - } - CommentStyle::HashAndSlashSlash => { - if bytes[i] == b'#' { - return &line[..i]; - } - if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' { - return &line[..i]; - } - } - CommentStyle::SlashSlashAndAt => { - if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' { - return &line[..i]; - } - if bytes[i] == b'@' { - // `@` is NOT a comment before type specifiers used in GAS directives - let after = &line[i + 1..]; - if !after.starts_with("object") - && !after.starts_with("function") - && !after.starts_with("progbits") - && !after.starts_with("nobits") - && !after.starts_with("tls_object") - && !after.starts_with("note") - { - return &line[..i]; - } - } - } - } - i += 1; - } - line -} - -/// Split a line on `;` characters, respecting string literals and comments. -/// In GAS syntax, `;` separates multiple statements on the same line. -/// Stops splitting once a line comment (`#` or `//`) is encountered outside strings, -/// so semicolons inside comments are not treated as statement separators. 
-pub fn split_on_semicolons(line: &str) -> Vec<&str> { - let mut parts = Vec::new(); - let mut in_string = false; - let mut escape = false; - let mut start = 0; - let bytes = line.as_bytes(); - for (i, c) in line.char_indices() { - if escape { - escape = false; - continue; - } - if c == '\\' && in_string { - escape = true; - continue; - } - if c == '"' { - in_string = !in_string; - continue; - } - if !in_string { - // Stop splitting at # comment start - if c == '#' { - break; - } - // Stop splitting at // comment start - if c == '/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' { - break; - } - if c == ';' { - parts.push(&line[start..i]); - start = i + 1; - } - } - } - parts.push(&line[start..]); - parts -} - -// ── .rept / .irp block expansion ─────────────────────────────────────── - -fn is_rept_start(trimmed: &str) -> bool { - trimmed.starts_with(".rept ") || trimmed.starts_with(".rept\t") -} - -fn is_irp_start(trimmed: &str) -> bool { - trimmed.starts_with(".irp ") || trimmed.starts_with(".irp\t") -} - -fn is_block_start(trimmed: &str) -> bool { - is_rept_start(trimmed) || is_irp_start(trimmed) -} - -/// Collect the body lines of a `.rept`/`.irp` block, handling nesting. -/// Returns the body lines and advances `i` past the closing `.endr`. -fn collect_block_body<'a>( - lines: &[&'a str], - i: &mut usize, - comment_style: &CommentStyle, -) -> Result, String> { - let mut depth = 1; - let mut body = Vec::new(); - *i += 1; - while *i < lines.len() { - let inner = strip_comment(lines[*i], comment_style).trim().to_string(); - if is_block_start(&inner) { - depth += 1; - } else if inner == ".endr" { - depth -= 1; - if depth == 0 { - break; - } - } - body.push(lines[*i]); - *i += 1; - } - if depth != 0 { - return Err(".rept/.irp without matching .endr".to_string()); - } - Ok(body) -} - -/// Estimate the byte size of a single assembly line for label position tracking. -/// Used to resolve backward label references in .rept count expressions. 
-/// `default_insn_size` is the typical instruction size for the target (4 for ARM/RISC-V). -fn estimate_line_bytes_generic(trimmed: &str, comment_style: &CommentStyle, default_insn_size: u64) -> u64 { - if trimmed.is_empty() { - return 0; - } - // Check for comment-only lines - match comment_style { - CommentStyle::Hash => { - if trimmed.starts_with('#') { return 0; } - } - CommentStyle::HashAndSlashSlash => { - if trimmed.starts_with('#') || trimmed.starts_with("//") { return 0; } - } - CommentStyle::SlashSlashAndAt => { - if trimmed.starts_with("//") || trimmed.starts_with('@') { return 0; } - } - } - // Label definitions don't add bytes - if trimmed.ends_with(':') && !trimmed.contains(' ') { - return 0; - } - // Strip leading labels like "661:" from lines like "661: bl foo" - let content = if let Some(pos) = trimmed.find(':') { - let before = &trimmed[..pos]; - if before.chars().all(|c| c.is_alphanumeric() || c == '_' || c == '.') { - trimmed[pos + 1..].trim() - } else { - trimmed - } - } else { - trimmed - }; - if content.is_empty() { - return 0; - } - // Directives - if content.starts_with('.') { - let lower = content.to_lowercase(); - if lower.starts_with(".byte ") { return 1; } - if lower.starts_with(".hword ") || lower.starts_with(".short ") || lower.starts_with(".2byte ") { return 2; } - if lower.starts_with(".word ") || lower.starts_with(".long ") || lower.starts_with(".4byte ") || lower.starts_with(".inst ") { return 4; } - if lower.starts_with(".quad ") || lower.starts_with(".xword ") || lower.starts_with(".8byte ") { return 8; } - // .zero N, .space N, .skip N - if lower.starts_with(".zero ") || lower.starts_with(".space ") || lower.starts_with(".skip ") { - let arg = content.split_whitespace().nth(1).unwrap_or("0"); - if let Ok(n) = arg.trim_end_matches(',').parse::() { return n; } - } - // Other directives (.align, .section, .globl, .type, .ascii, etc.) 
— 0 bytes - return 0; - } - // Everything else is an instruction - default_insn_size -} - -/// Resolve backward numeric label references (like 662b, 661b) in a .rept count expression. -/// Substitutes each backward reference with its byte position, then evaluates the expression. -fn resolve_rept_label_expr( - count_str: &str, - label_positions: &std::collections::HashMap>, - parse_int: fn(&str) -> Result, -) -> Result { - // First try direct evaluation (handles simple integer expressions) - if let Ok(val) = parse_int(count_str) { - return Ok(val); - } - - // Check if expression contains backward label references (e.g., 662b) - let mut resolved = count_str.to_string(); - let mut found_label_ref = false; - - // Find and replace backward label references: digits followed by 'b' or 'B' - loop { - let mut replaced = false; - let bytes = resolved.as_bytes(); - let len = bytes.len(); - let mut i = 0; - while i < len { - if bytes[i].is_ascii_digit() { - let start = i; - while i < len && bytes[i].is_ascii_digit() { - i += 1; - } - if i < len && (bytes[i] == b'b' || bytes[i] == b'B') { - let after_ok = i + 1 >= len || !bytes[i + 1].is_ascii_alphanumeric(); - // Avoid matching binary literals like 0b1010 - let is_binary = start + 1 == i && bytes[start] == b'0'; - if after_ok && !is_binary { - let label_num = &resolved[start..i]; - let ref_end = i + 1; - if let Some(positions) = label_positions.get(label_num) { - if let Some(&pos) = positions.last() { - let before = &resolved[..start]; - let after = &resolved[ref_end..]; - resolved = format!("{}{}{}", before, pos, after); - found_label_ref = true; - replaced = true; - break; - } - } - } - } - } - i += 1; - } - if !replaced { - break; - } - } - - if found_label_ref { - parse_int(&resolved) - } else { - Err(format!("cannot evaluate .rept count: {}", count_str)) - } -} - -/// Expand `.rept`/`.endr` and `.irp`/`.endr` blocks by repeating or -/// substituting contained lines. -/// -/// Handles nested blocks and recursive expansion. 
Uses `parse_int` to -/// evaluate the `.rept` count expression. -/// Tracks numeric label byte positions to resolve backward label references -/// in `.rept` count expressions (e.g., `.rept (662b-661b)/4`). -pub fn expand_rept_blocks( - lines: &[&str], - comment_style: &CommentStyle, - parse_int: fn(&str) -> Result, -) -> Result, String> { - expand_rept_blocks_with_insn_size(lines, comment_style, parse_int, 4) -} - -/// Same as `expand_rept_blocks` but with configurable instruction size for byte estimation. -pub(crate) fn expand_rept_blocks_with_insn_size( - lines: &[&str], - comment_style: &CommentStyle, - parse_int: fn(&str) -> Result, - default_insn_size: u64, -) -> Result, String> { - let mut result = Vec::new(); - let mut i = 0; - let mut label_positions: std::collections::HashMap> = std::collections::HashMap::new(); - let mut current_byte_pos: u64 = 0; - while i < lines.len() { - let trimmed = strip_comment(lines[i], comment_style).trim().to_string(); - if is_rept_start(&trimmed) { - let count_str = trimmed[".rept".len()..].trim(); - let count_val = resolve_rept_label_expr(count_str, &label_positions, parse_int) - .map_err(|e| format!(".rept: bad count '{}': {}", count_str, e))?; - // Treat negative counts as 0 (matches GNU as behavior) - let count = if count_val < 0 { 0usize } else { count_val as usize }; - let body = collect_block_body(lines, &mut i, comment_style)?; - let expanded_body = expand_rept_blocks_with_insn_size(&body, comment_style, parse_int, default_insn_size)?; - for _ in 0..count { - result.extend(expanded_body.iter().cloned()); - } - } else if is_irp_start(&trimmed) { - // .irp var, val1, val2, ... 
- let args_str = trimmed[".irp".len()..].trim(); - let (var, values_str) = match args_str.find(',') { - Some(pos) => (args_str[..pos].trim(), args_str[pos + 1..].trim()), - None => (args_str, ""), - }; - let values: Vec<&str> = values_str.split(',').map(|s| s.trim()).collect(); - let body = collect_block_body(lines, &mut i, comment_style)?; - for val in &values { - let subst_body: Vec = body.iter().map(|line| { - let pattern = format!("\\{}", var); - let substituted = replace_macro_param(line, &pattern, val); - // Strip GAS macro argument delimiters: \() resolves to empty string - substituted.replace("\\()", "") - }).collect(); - let subst_refs: Vec<&str> = subst_body.iter().map(|s| s.as_str()).collect(); - let expanded = expand_rept_blocks_with_insn_size(&subst_refs, comment_style, parse_int, default_insn_size)?; - result.extend(expanded); - } - } else if trimmed == ".endr" { - // stray .endr without .rept — skip - } else { - // Track numeric label definitions and byte positions - if let Some(colon_pos) = trimmed.find(':') { - let before = &trimmed[..colon_pos]; - if !before.is_empty() && before.chars().all(|c| c.is_ascii_digit()) { - label_positions - .entry(before.to_string()) - .or_default() - .push(current_byte_pos); - } - } - current_byte_pos += estimate_line_bytes_generic(&trimmed, comment_style, default_insn_size); - result.push(lines[i].to_string()); - } - i += 1; - } - Ok(result) -} - -// ── .macro / .endm expansion ─────────────────────────────────────────── - -/// Macro definition: name, parameter list (with optional defaults), and body lines. -pub(crate) struct MacroDef { - params: Vec, - defaults: Vec>, - body: Vec, -} - -/// Parse macro parameter list, handling `param = default_value` syntax. -/// -/// GAS allows parameters like `enable = 1` where `1` is the default value -/// used when the caller omits that argument. Parameters are separated by -/// commas or whitespace. 
-fn parse_macro_params(params_str: &str) -> (Vec, Vec>) { - if params_str.is_empty() { - return (Vec::new(), Vec::new()); - } - let mut params = Vec::new(); - let mut defaults = Vec::new(); - - // Split on commas first (primary separator), then handle whitespace within each part - for part in params_str.split(',') { - let part = part.trim(); - if part.is_empty() { - continue; - } - // Check for `param = default_value` syntax - if let Some(eq_pos) = part.find('=') { - let param_name = part[..eq_pos].trim(); - let default_val = part[eq_pos + 1..].trim(); - // The param name might contain spaces (e.g., "enable = 1") - // Take the last whitespace-delimited token as the param name - // in case there are multiple tokens before the = - let tokens: Vec<&str> = param_name.split_whitespace().collect(); - if tokens.len() > 1 { - // Everything before the last token are separate params with no default - for t in &tokens[..tokens.len() - 1] { - params.push(t.to_string()); - defaults.push(None); - } - } - if let Some(last) = tokens.last() { - if !last.is_empty() { - params.push(last.to_string()); - defaults.push(Some(default_val.to_string())); - } - } - } else { - // No default value - may contain space-separated params - for token in part.split_whitespace() { - if !token.is_empty() { - params.push(token.to_string()); - defaults.push(None); - } - } - } - } - (params, defaults) -} - -/// Split macro invocation arguments, matching GNU as behavior. -/// -/// GAS treats both commas and whitespace as argument separators. Specifically: -/// 1. Split on commas first (respecting parentheses and quotes) -/// 2. Within each comma-separated field, further split on whitespace -/// 3. However, tokens connected by arithmetic/bitwise operators (`+`, `-`, `*`, -/// `/`, `%`, `|`, `&`, `^`, `<<`, `>>`, `~`) are kept as a single expression -/// argument with internal spaces stripped. 
-/// -/// Examples: -/// - `lb a5, 0(a1), 10f` → [`lb`, `a5`, `0(a1)`, `10f`] -/// - `886b, 888f, 0x1234, 0, 889f - 888f` → [`886b`, `888f`, `0x1234`, `0`, `889f-888f`] -/// - `a b c` → [`a`, `b`, `c`] -/// -/// Quoted strings are kept as a single argument with outer quotes stripped. -/// Parenthesized groups like `0(a1)` are kept together. -pub fn split_macro_args(s: &str) -> Vec { - if s.is_empty() { - return Vec::new(); - } - - // Step 1: Split on commas (respecting parens and quotes) to get comma fields. - let comma_fields = split_on_commas_raw(s); - - // Step 2: Within each comma field, split on whitespace (respecting parens), - // then merge expression tokens connected by operators. - let mut args = Vec::new(); - for field in &comma_fields { - let trimmed = field.trim(); - if trimmed.is_empty() { - continue; - } - let sub_tokens = split_field_on_whitespace(trimmed); - let merged = merge_expression_tokens(&sub_tokens); - args.extend(merged); - } - args -} - -/// Split a string on top-level commas (outside parens and quotes). 
-fn split_on_commas_raw(s: &str) -> Vec { - let mut fields = Vec::new(); - let mut current = String::new(); - let bytes = s.as_bytes(); - let mut i = 0; - let mut paren_depth = 0i32; - let mut in_quote = false; - - while i < bytes.len() { - if in_quote { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - current.push(bytes[i] as char); - current.push(bytes[i + 1] as char); - i += 2; - continue; - } - if bytes[i] == b'"' { - in_quote = false; - } - current.push(bytes[i] as char); - i += 1; - continue; - } - match bytes[i] { - b'"' => { - in_quote = true; - current.push('"'); - } - b'(' => { - paren_depth += 1; - current.push('('); - } - b')' => { - paren_depth -= 1; - current.push(')'); - } - b',' if paren_depth == 0 => { - fields.push(current.clone()); - current.clear(); - } - _ => { - current.push(bytes[i] as char); - } - } - i += 1; - } - fields.push(current); - fields -} - -/// Split a single comma field on whitespace, respecting parenthesized groups -/// and quoted strings. -fn split_field_on_whitespace(s: &str) -> Vec { - let mut tokens = Vec::new(); - let mut current = String::new(); - let bytes = s.as_bytes(); - let mut i = 0; - let mut paren_depth = 0i32; - - while i < bytes.len() { - match bytes[i] { - b'(' => { - paren_depth += 1; - current.push('('); - } - b')' => { - paren_depth -= 1; - current.push(')'); - } - b'"' => { - // Consume quoted string, stripping outer quotes - i += 1; - while i < bytes.len() && bytes[i] != b'"' { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - current.push(bytes[i + 1] as char); - i += 2; - continue; - } - current.push(bytes[i] as char); - i += 1; - } - // Skip closing quote - } - b' ' | b'\t' if paren_depth == 0 => { - let trimmed = current.trim().to_string(); - if !trimmed.is_empty() { - tokens.push(trimmed); - current.clear(); - } - // Skip remaining whitespace - while i + 1 < bytes.len() && (bytes[i + 1] == b' ' || bytes[i + 1] == b'\t') { - i += 1; - } - } - _ => { - current.push(bytes[i] as char); - } - } - i += 1; - 
} - let trimmed = current.trim().to_string(); - if !trimmed.is_empty() { - tokens.push(trimmed); - } - tokens -} - -/// Check if a token looks like an arithmetic/bitwise operator that connects -/// expression parts in GAS macro arguments. -fn is_expression_operator(token: &str) -> bool { - matches!( - token, - "+" | "-" | "*" | "/" | "%" | "|" | "&" | "^" | "~" | "<<" | ">>" | "!" | "||" | "&&" - ) -} - -/// Check if a token ends with an operand character (digit, letter, `_`, `.`, `)`), -/// indicating it could be the left-hand side of a binary operator expression. -fn ends_with_operand(token: &str) -> bool { - let bytes = token.as_bytes(); - if bytes.is_empty() { - return false; - } - let last = bytes[bytes.len() - 1]; - last.is_ascii_alphanumeric() || last == b'_' || last == b'.' || last == b')' -} - -/// Check if a token ends with an operator character, indicating the expression -/// continues into the next token. -fn ends_with_operator(token: &str) -> bool { - let bytes = token.as_bytes(); - if bytes.is_empty() { - return false; - } - let last = bytes[bytes.len() - 1]; - matches!(last, b'+' | b'-' | b'*' | b'/' | b'%' | b'|' | b'&' | b'^' | b'~') -} - -/// Check if a token starts with an operator character that could be a binary -/// operator connecting it to the preceding token (e.g., `-888f` after `889f`). -/// Only treats leading `-`/`+` as binary operators when preceded by an operand. -fn starts_with_binary_operator(token: &str) -> bool { - let bytes = token.as_bytes(); - if bytes.is_empty() { - return false; - } - let first = bytes[0]; - matches!(first, b'+' | b'-' | b'*' | b'/' | b'%' | b'|' | b'&' | b'^' | b'~') -} - -/// Merge tokens that form arithmetic/bitwise expressions. -/// -/// When tokens are: `[889f, -, 888f]`, merge to `[889f-888f]`. -/// When tokens are: `[889f, +, 888f]`, merge to `[889f+888f]`. -/// When tokens are: `[lb, a5]`, keep as `[lb, a5]` (no operator). 
-/// When tokens are: `[a, -4, b]`, keep as `[a, -4, b]` (unary minus, not binary). -/// -/// Context-awareness: a leading `-`/`+` on the next token is only treated as -/// a binary operator if the current merged token ends with an operand character -/// (digit, letter, `_`, `.`, `)`). This prevents false merges like `a-4`. -fn merge_expression_tokens(tokens: &[String]) -> Vec { - if tokens.is_empty() { - return Vec::new(); - } - if tokens.len() == 1 { - return tokens.to_vec(); - } - - let mut result = Vec::new(); - let mut i = 0; - - while i < tokens.len() { - let mut merged = tokens[i].clone(); - // Look ahead: if next token is an operator or starts with one, merge - while i + 1 < tokens.len() { - let next = &tokens[i + 1]; - if is_expression_operator(next) && ends_with_operand(&merged) { - // Standalone operator token (e.g., `-`, `+`): merge it and the - // following operand, but only if current token looks like an operand - merged.push_str(next); - i += 1; - if i + 1 < tokens.len() { - merged.push_str(&tokens[i + 1]); - i += 1; - } - } else if starts_with_binary_operator(next) && ends_with_operand(&merged) { - // Next token starts with operator (e.g., `-888f`) and current - // ends with operand — treat as binary expression continuation - merged.push_str(next); - i += 1; - } else if ends_with_operator(&merged) { - // Current ends with operator (e.g., `889f+`), merge with next - merged.push_str(next); - i += 1; - } else { - break; - } - } - result.push(merged); - i += 1; - } - result -} - -/// Expand `.macro`/`.endm` definitions and macro invocations. -/// -/// Two-pass approach: -/// 1. Collect macro definitions (`.macro name [params]` ... `.endm`) -/// 2. Expand macro invocations: lines where the first word matches a defined macro -/// -/// Handles nested macro definitions and recursive expansion. 
-pub fn expand_macros( - lines: &[&str], - comment_style: &CommentStyle, -) -> Result, String> { - use std::collections::HashMap; - let mut macros: HashMap = HashMap::new(); - let mut result = Vec::new(); - let mut i = 0; - - while i < lines.len() { - let trimmed = strip_comment(lines[i], comment_style).trim().to_string(); - if trimmed.starts_with(".macro ") || trimmed.starts_with(".macro\t") { - // Parse: .macro name [param1[, param2, ...]] - let rest = trimmed[".macro".len()..].trim(); - let (name, params_str) = match rest.find([' ', '\t', ',']) { - Some(pos) => (rest[..pos].trim(), rest[pos..].trim().trim_start_matches(',')), - None => (rest, ""), - }; - let (params, defaults) = parse_macro_params(params_str); - let mut body = Vec::new(); - let mut depth = 1; - i += 1; - while i < lines.len() { - let inner = strip_comment(lines[i], comment_style).trim().to_string(); - if inner.starts_with(".macro ") || inner.starts_with(".macro\t") { - depth += 1; - } else if inner == ".endm" || inner.starts_with(".endm ") || inner.starts_with(".endm\t") { - depth -= 1; - if depth == 0 { - break; - } - } - body.push(lines[i].to_string()); - i += 1; - } - macros.insert(name.to_string(), MacroDef { params, defaults, body }); - } else if trimmed == ".endm" || trimmed.starts_with(".endm ") || trimmed.starts_with(".endm\t") { - // stray .endm — skip - } else if trimmed.starts_with(".purgem ") || trimmed.starts_with(".purgem\t") { - // Remove a macro definition (GAS .purgem directive) - let name = trimmed[".purgem".len()..].trim(); - macros.remove(name); - } else if !trimmed.is_empty() && !trimmed.starts_with('.') && !trimmed.starts_with('#') { - let first_word = trimmed.split([' ', '\t']).next().unwrap_or(""); - let potential_name = first_word.trim_end_matches(':'); - if potential_name != first_word { - // It's a label, not a macro invocation - result.push(lines[i].to_string()); - } else if let Some(mac) = macros.get(potential_name) { - let args_str = 
trimmed[first_word.len()..].trim(); - let args = split_macro_args(args_str); - // Sort parameter indices by name length (longest first) to avoid - // partial substitution: e.g., \orig must not match before \orig_len. - let mut sorted_indices: Vec = (0..mac.params.len()).collect(); - sorted_indices.sort_by(|&a, &b| mac.params[b].len().cmp(&mac.params[a].len())); - let mut expanded_lines = Vec::new(); - for body_line in &mac.body { - let mut expanded = body_line.clone(); - for &pi in &sorted_indices { - let param = &mac.params[pi]; - let pattern = format!("\\{}", param); - let replacement = args.get(pi).map(|s| s.as_str()).unwrap_or_else(|| { - mac.defaults.get(pi) - .and_then(|d| d.as_deref()) - .unwrap_or("0") - }); - expanded = replace_macro_param(&expanded, &pattern, replacement); - } - // Strip GAS macro argument delimiters: \() resolves to empty string. - // Used to separate parameter names from adjacent text, - // e.g., \op\()_safe_regs -> rdmsr_safe_regs - expanded = expanded.replace("\\()", ""); - expanded_lines.push(expanded); - } - let refs: Vec<&str> = expanded_lines.iter().map(|s| s.as_str()).collect(); - let re_expanded = expand_macros_with(&refs, ¯os, comment_style)?; - result.extend(re_expanded); - } else { - result.push(lines[i].to_string()); - } - } else { - result.push(lines[i].to_string()); - } - i += 1; - } - Ok(result) -} - -/// Replace `\param` in a macro body line with the argument value, but only -/// when `\param` is followed by a non-identifier character (or end of string). -/// -/// GAS macro parameter references like `\orig` should NOT match as a prefix -/// of `\orig_len`. In GAS, `\()` is used to explicitly delimit parameter names -/// from adjacent identifier characters (e.g., `\op\()_safe_regs`). 
-pub fn replace_macro_param(text: &str, pattern: &str, replacement: &str) -> String { - let pat_bytes = pattern.as_bytes(); - let pat_len = pat_bytes.len(); - let text_bytes = text.as_bytes(); - let text_len = text_bytes.len(); - let mut result = String::with_capacity(text_len); - let mut i = 0; - while i < text_len { - if i + pat_len <= text_len && &text_bytes[i..i + pat_len] == pat_bytes { - // Check that the character after the match is not an identifier continuation - let after = if i + pat_len < text_len { - text_bytes[i + pat_len] - } else { - b' ' // end of string counts as delimiter - }; - if after.is_ascii_alphanumeric() || after == b'_' { - // Not a full match -- the parameter name continues - result.push(text_bytes[i] as char); - i += 1; - } else { - result.push_str(replacement); - i += pat_len; - } - } else { - result.push(text_bytes[i] as char); - i += 1; - } - } - result -} - -/// Re-expand macro invocations using already-collected macro definitions. -fn expand_macros_with( - lines: &[&str], - macros: &std::collections::HashMap, - comment_style: &CommentStyle, -) -> Result, String> { - let mut result = Vec::new(); - for line in lines { - let trimmed = strip_comment(line, comment_style).trim().to_string(); - if trimmed.is_empty() || trimmed.starts_with('.') || trimmed.starts_with('#') { - result.push(line.to_string()); - continue; - } - let first_word = trimmed.split([' ', '\t']).next().unwrap_or(""); - let potential_name = first_word.trim_end_matches(':'); - if potential_name != first_word { - result.push(line.to_string()); - } else if let Some(mac) = macros.get(potential_name) { - let args_str = trimmed[first_word.len()..].trim(); - let args = split_macro_args(args_str); - // Sort parameter indices by name length (longest first) to avoid - // partial substitution: e.g., \orig must not match before \orig_len. 
- let mut sorted_indices: Vec = (0..mac.params.len()).collect(); - sorted_indices.sort_by(|&a, &b| mac.params[b].len().cmp(&mac.params[a].len())); - let mut expanded_lines = Vec::new(); - for body_line in &mac.body { - let mut expanded = body_line.clone(); - for &pi in &sorted_indices { - let param = &mac.params[pi]; - let pattern = format!("\\{}", param); - let replacement = args.get(pi).map(|s| s.as_str()).unwrap_or_else(|| { - mac.defaults.get(pi) - .and_then(|d| d.as_deref()) - .unwrap_or("0") - }); - expanded = replace_macro_param(&expanded, &pattern, replacement); - } - // Strip GAS macro argument delimiters: \() resolves to empty string - expanded = expanded.replace("\\()", ""); - expanded_lines.push(expanded); - } - let refs: Vec<&str> = expanded_lines.iter().map(|s| s.as_str()).collect(); - let re_expanded = expand_macros_with(&refs, macros, comment_style)?; - result.extend(re_expanded); - } else { - result.push(line.to_string()); - } - } - Ok(result) -} - -// ── .if / .else / .endif conditional assembly ────────────────────────── - -/// Map x86-64 register names to unique integer values for use in `.if` expressions. -/// -/// GAS assigns internal register encoding numbers to register names, allowing -/// comparisons like `.if %rsp == %rbp` in conditional assembly. The exact values -/// match GAS's internal encoding (AT&T register numbers starting at 104 for %rax). -/// Only the 64-bit GPRs are mapped since those are the only ones used in kernel -/// `.if` comparisons (UNWIND_HINT_REGS). -pub fn resolve_x86_registers(expr: &str) -> String { - // Replace register names with numeric values, longest first to avoid partial matches. - // Values match GAS internal encoding for x86-64 registers. 
- const REGS: &[(&str, &str)] = &[ - ("%r10", "114"), ("%r11", "115"), ("%r12", "116"), ("%r13", "117"), - ("%r14", "118"), ("%r15", "119"), ("%r8", "112"), ("%r9", "113"), - ("%rax", "104"), ("%rcx", "105"), ("%rdx", "106"), ("%rbx", "107"), - ("%rsp", "108"), ("%rbp", "109"), ("%rsi", "110"), ("%rdi", "111"), - // 32-bit registers (used in some kernel macros) - ("%eax", "40"), ("%ecx", "41"), ("%edx", "42"), ("%ebx", "43"), - ("%esp", "44"), ("%ebp", "45"), ("%esi", "46"), ("%edi", "47"), - ]; - let mut result = expr.to_string(); - for &(name, val) in REGS { - result = result.replace(name, val); - } - result -} - -/// Evaluate a simple `.if` condition expression. -/// -/// Supports: integer literals, `==`, `!=`, `>=`, `<=`, `>`, `<`, and simple -/// arithmetic via `asm_expr::parse_integer_expr`. Non-zero result is true. -pub fn eval_if_condition(cond: &str) -> bool { - eval_if_condition_inner(cond, |s| s.to_string()) -} - -/// Evaluate a `.if` condition with a pre-processing step for resolving names. -/// -/// The `resolve` function is called on each side of a comparison operator -/// before integer expression parsing. Use this to resolve register names -/// or symbol values. -pub fn eval_if_condition_with_resolver String>(cond: &str, resolve: F) -> bool { - eval_if_condition_inner(cond, resolve) -} - -/// Find position of an isolated `>` or `<` that is not part of `>>`, `<<`, `>=`, or `<=`, -/// and not inside parentheses. 
-fn find_isolated_cmp(cond: &str, ch: char) -> Option { - let bytes = cond.as_bytes(); - let target = ch as u8; - let mut depth = 0i32; - for i in 0..bytes.len() { - match bytes[i] { - b'(' => depth += 1, - b')' => depth -= 1, - _ => {} - } - if depth == 0 && bytes[i] == target { - let next = bytes.get(i + 1).copied(); - let prev = if i > 0 { Some(bytes[i - 1]) } else { None }; - if next != Some(b'>') && next != Some(b'<') && next != Some(b'=') - && prev != Some(b'>') && prev != Some(b'<') - { - return Some(i); - } - } - } - None -} - -fn eval_if_condition_inner String>(cond: &str, resolve: F) -> bool { - eval_if_condition_dyn(cond, &resolve) -} - -fn eval_if_condition_dyn(cond: &str, resolve: &dyn Fn(&str) -> String) -> bool { - let cond = cond.trim(); - // Strip outer parentheses: (.Lfound != 1) -> .Lfound != 1 - let cond = strip_outer_parens(cond); - - // Handle || (logical OR) at top level — lowest precedence - if let Some(pos) = find_top_level_op(cond, "||") { - let lhs = &cond[..pos]; - let rhs = &cond[pos + 2..]; - return eval_if_condition_dyn(lhs, resolve) || eval_if_condition_dyn(rhs, resolve); - } - // Handle && (logical AND) at top level - if let Some(pos) = find_top_level_op(cond, "&&") { - let lhs = &cond[..pos]; - let rhs = &cond[pos + 2..]; - return eval_if_condition_dyn(lhs, resolve) && eval_if_condition_dyn(rhs, resolve); - } - - // Find comparison operators at the top level (not inside parentheses). - // Check "!=" before "==" and ">="/"<=" before ">"/"<". 
- if let Some(pos) = find_top_level_op(cond, "!=") { - let lhs = resolve(cond[..pos].trim()); - let rhs = resolve(cond[pos + 2..].trim()); - let l = asm_expr::parse_integer_expr(&lhs).unwrap_or(i64::MIN); - let r = asm_expr::parse_integer_expr(&rhs).unwrap_or(i64::MAX); - return l != r; - } - if let Some(pos) = find_top_level_op(cond, "==") { - let lhs = resolve(cond[..pos].trim()); - let rhs = resolve(cond[pos + 2..].trim()); - let l = asm_expr::parse_integer_expr(&lhs).unwrap_or(i64::MIN); - let r = asm_expr::parse_integer_expr(&rhs).unwrap_or(i64::MAX); - return l == r; - } - if let Some(pos) = find_top_level_op(cond, ">=") { - let lhs = resolve(cond[..pos].trim()); - let rhs = resolve(cond[pos + 2..].trim()); - let l = asm_expr::parse_integer_expr(&lhs).unwrap_or(i64::MIN); - let r = asm_expr::parse_integer_expr(&rhs).unwrap_or(i64::MAX); - return l >= r; - } - if let Some(pos) = find_top_level_op(cond, "<=") { - let lhs = resolve(cond[..pos].trim()); - let rhs = resolve(cond[pos + 2..].trim()); - let l = asm_expr::parse_integer_expr(&lhs).unwrap_or(i64::MIN); - let r = asm_expr::parse_integer_expr(&rhs).unwrap_or(i64::MAX); - return l <= r; - } - // Try isolated ">" not part of ">>" or ">=" - if let Some(pos) = find_isolated_cmp(cond, '>') { - let lhs = resolve(cond[..pos].trim()); - let rhs = resolve(cond[pos + 1..].trim()); - let l = asm_expr::parse_integer_expr(&lhs).unwrap_or(i64::MIN); - let r = asm_expr::parse_integer_expr(&rhs).unwrap_or(i64::MAX); - return l > r; - } - // Try isolated "<" not part of "<<" or "<=" - if let Some(pos) = find_isolated_cmp(cond, '<') { - let lhs = resolve(cond[..pos].trim()); - let rhs = resolve(cond[pos + 1..].trim()); - let l = asm_expr::parse_integer_expr(&lhs).unwrap_or(i64::MIN); - let r = asm_expr::parse_integer_expr(&rhs).unwrap_or(i64::MAX); - return l < r; - } - // Simple integer expression: non-zero is true - let resolved = resolve(cond); - asm_expr::parse_integer_expr(&resolved).unwrap_or(0) != 0 -} - -/// Strip 
balanced outer parentheses from an expression. -/// `(expr)` -> `expr`, `((expr))` -> `expr`, `(a) + (b)` -> unchanged. -fn strip_outer_parens(s: &str) -> &str { - let s = s.trim(); - if !s.starts_with('(') || !s.ends_with(')') { - return s; - } - // Check if the outer parens are truly balanced as a pair - let inner = &s[1..s.len() - 1]; - let mut depth = 0i32; - for ch in inner.bytes() { - match ch { - b'(' => depth += 1, - b')' => { - depth -= 1; - if depth < 0 { - // Unbalanced: the closing paren in the middle means - // the outer parens are not a matching pair - return s; - } - } - _ => {} - } - } - if depth == 0 { - inner.trim() - } else { - s - } -} - -/// Find a comparison operator at the top level (not inside parentheses). -fn find_top_level_op(s: &str, op: &str) -> Option { - let bytes = s.as_bytes(); - let op_bytes = op.as_bytes(); - let op_len = op_bytes.len(); - if bytes.len() < op_len { - return None; - } - let mut depth = 0i32; - for i in 0..=bytes.len() - op_len { - match bytes[i] { - b'(' => depth += 1, - b')' => depth -= 1, - _ => {} - } - if depth == 0 && &bytes[i..i + op_len] == op_bytes { - return Some(i); - } - } - None -} - -// ── Shared data-value helpers ────────────────────────────────────────── - -/// Check if a string looks like a GNU numeric label reference (e.g. "2f", "1b", "42f"). -pub fn is_numeric_label_ref(s: &str) -> bool { - if s.len() < 2 { - return false; - } - let last = s.as_bytes()[s.len() - 1]; - if last != b'f' && last != b'F' && last != b'b' && last != b'B' { - return false; - } - s[..s.len() - 1].bytes().all(|b| b.is_ascii_digit()) -} - -/// Find the position of the `-` operator in a symbol difference expression -/// like `sym_a - sym_b`. Skips position 0 to avoid matching a leading negation. -/// -/// Returns `None` if no valid symbol difference operator is found. 
-pub fn find_symbol_diff_minus(expr: &str) -> Option { - let bytes = expr.as_bytes(); - let len = bytes.len(); - let mut i = 1; - while i < len { - if bytes[i] == b'-' { - let left_char = bytes[i - 1]; - let left_ok = left_char.is_ascii_alphanumeric() - || left_char == b'_' - || left_char == b'.' - || left_char == b' ' - || left_char == b')'; - let right_start = expr[i + 1..].trim_start(); - if !right_start.is_empty() { - let right_char = right_start.as_bytes()[0]; - let right_ok = right_char.is_ascii_alphabetic() - || right_char == b'_' - || right_char == b'.' - || right_char.is_ascii_digit() - || right_char == b'('; - if left_ok && right_ok { - return Some(i); - } - } - } - i += 1; - } - None -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_strip_c_comments() { - assert_eq!(strip_c_comments("a /* b */ c"), "a c"); - assert_eq!(strip_c_comments("a /* b\nc */ d"), "a \n d"); - } - - #[test] - fn test_strip_c_comments_preserves_strings() { - // /* inside a quoted string must not be treated as a comment - assert_eq!( - strip_c_comments(r#".asciz "hello/*world*/end""#), - r#".asciz "hello/*world*/end""# - ); - // Escaped quotes inside strings should not end the string - assert_eq!( - strip_c_comments(r#".asciz "a\"/*b*/c""#), - r#".asciz "a\"/*b*/c""# - ); - // /* outside strings should still be stripped - assert_eq!( - strip_c_comments(r#".asciz "hello" /* comment */ .byte 1"#), - r#".asciz "hello" .byte 1"# - ); - } - - #[test] - fn test_strip_comment_hash() { - let style = CommentStyle::Hash; - assert_eq!(strip_comment("movq %rax, %rbx # comment", &style), "movq %rax, %rbx "); - assert_eq!(strip_comment(".asciz \"a#b\"", &style), ".asciz \"a#b\""); - } - - #[test] - fn test_strip_comment_slash_slash_and_at() { - let style = CommentStyle::SlashSlashAndAt; - assert_eq!(strip_comment("mov x0, x1 // comment", &style), "mov x0, x1 "); - assert_eq!(strip_comment("mov x0, x1 @ comment", &style), "mov x0, x1 "); - assert_eq!(strip_comment(".type foo, 
@function", &style), ".type foo, @function"); - } - - #[test] - fn test_split_on_semicolons() { - let parts = split_on_semicolons("a; b; c"); - assert_eq!(parts, vec!["a", " b", " c"]); - let parts = split_on_semicolons(".asciz \"a;b\"; nop"); - assert_eq!(parts, vec![".asciz \"a;b\"", " nop"]); - // Semicolons inside # comments should not cause splitting - let parts = split_on_semicolons("nop # comment; with semicolons"); - assert_eq!(parts, vec!["nop # comment; with semicolons"]); - // Semicolons inside // comments should not cause splitting - let parts = split_on_semicolons("nop // comment; with semicolons"); - assert_eq!(parts, vec!["nop // comment; with semicolons"]); - // Full-line // comments with semicolons - let parts = split_on_semicolons("// struct {size_t a,b;} *p = (void*)x;"); - assert_eq!(parts, vec!["// struct {size_t a,b;} *p = (void*)x;"]); - // Semicolons before comment still split normally - let parts = split_on_semicolons("a; b # comment; c"); - assert_eq!(parts, vec!["a", " b # comment; c"]); - } - - #[test] - fn test_eval_if_condition() { - assert!(eval_if_condition("1")); - assert!(!eval_if_condition("0")); - assert!(eval_if_condition("1 == 1")); - assert!(!eval_if_condition("1 == 2")); - assert!(eval_if_condition("1 != 2")); - assert!(eval_if_condition("3 >= 2")); - assert!(eval_if_condition("2 >= 2")); - assert!(!eval_if_condition("1 >= 2")); - assert!(eval_if_condition("1 <= 2")); - assert!(eval_if_condition("3 > 2")); - assert!(!eval_if_condition("2 > 2")); - assert!(eval_if_condition("1 < 2")); - } - - #[test] - fn test_eval_if_condition_with_x86_registers() { - // Test register equality (like kernel UNWIND_HINT_REGS) - assert!(eval_if_condition_with_resolver("%rsp == %rsp", resolve_x86_registers)); - assert!(!eval_if_condition_with_resolver("%rsp == %rbp", resolve_x86_registers)); - assert!(eval_if_condition_with_resolver("%rbp == %rbp", resolve_x86_registers)); - assert!(eval_if_condition_with_resolver("%rdi == %rdi", 
resolve_x86_registers)); - assert!(eval_if_condition_with_resolver("%rdx == %rdx", resolve_x86_registers)); - assert!(eval_if_condition_with_resolver("%r10 == %r10", resolve_x86_registers)); - assert!(eval_if_condition_with_resolver("%rsp != %rbp", resolve_x86_registers)); - } - - #[test] - fn test_resolve_x86_registers() { - assert_eq!(resolve_x86_registers("%rsp"), "108"); - assert_eq!(resolve_x86_registers("%rbp"), "109"); - assert_eq!(resolve_x86_registers("%rdi"), "111"); - assert_eq!(resolve_x86_registers("%r10"), "114"); - // Ordering matters: %r10 must be replaced before %r8/%r9 to avoid partial matches - assert_eq!(resolve_x86_registers("%r13"), "117"); - } - - #[test] - fn test_is_numeric_label_ref() { - assert!(is_numeric_label_ref("1f")); - assert!(is_numeric_label_ref("42b")); - assert!(!is_numeric_label_ref("f")); - assert!(!is_numeric_label_ref("abc")); - } - - #[test] - fn test_find_symbol_diff_minus() { - assert_eq!(find_symbol_diff_minus("a - b"), Some(2)); - assert_eq!(find_symbol_diff_minus("-5"), None); - assert_eq!(find_symbol_diff_minus(".Lfoo-.Lbar"), Some(5)); - } - - #[test] - fn test_split_macro_args() { - assert_eq!(split_macro_args("a, b, c"), vec!["a", "b", "c"]); - assert_eq!(split_macro_args("0(a1), x, y"), vec!["0(a1)", "x", "y"]); - assert_eq!(split_macro_args(""), Vec::::new()); - // Expression with operator: `889f - 888f` stays as one arg (operator merging) - assert_eq!(split_macro_args("886b, 888f, 0x1234, 0, 889f - 888f"), - vec!["886b", "888f", "0x1234", "0", "889f-888f"]); - // Without commas, spaces are separators - assert_eq!(split_macro_args("a b c"), vec!["a", "b", "c"]); - // Mixed comma and space: `fixup lb a5, 0(a1), 10f` → 4 args - assert_eq!(split_macro_args("lb a5, 0(a1), 10f"), - vec!["lb", "a5", "0(a1)", "10f"]); - // Expression operators keep tokens together - assert_eq!(split_macro_args("foo + bar"), vec!["foo+bar"]); - assert_eq!(split_macro_args("foo + bar, baz"), vec!["foo+bar", "baz"]); - // GNU as treats `a 
-4` as expression `a-4` (binary minus, not unary) - assert_eq!(split_macro_args("a -4 b"), vec!["a-4", "b"]); - // Operand followed by operator-prefixed token: binary subtraction - assert_eq!(split_macro_args("889f -888f"), vec!["889f-888f"]); - } - - #[test] - fn test_parse_macro_params_simple() { - let (params, defaults) = parse_macro_params("a, b, c"); - assert_eq!(params, vec!["a", "b", "c"]); - assert_eq!(defaults, vec![None, None, None]); - } - - #[test] - fn test_parse_macro_params_with_defaults() { - let (params, defaults) = parse_macro_params("a, b = 5, c"); - assert_eq!(params, vec!["a", "b", "c"]); - assert_eq!(defaults, vec![None, Some("5".to_string()), None]); - } - - #[test] - fn test_parse_macro_params_space_separated() { - let (params, defaults) = parse_macro_params("a b c"); - assert_eq!(params, vec!["a", "b", "c"]); - assert_eq!(defaults, vec![None, None, None]); - } - - #[test] - fn test_parse_macro_params_mixed() { - // GAS-style: `.macro ALT_NEW_CONTENT vendor_id, patch_id, enable = 1, new_c` - let (params, defaults) = parse_macro_params("vendor_id, patch_id, enable = 1, new_c"); - assert_eq!(params, vec!["vendor_id", "patch_id", "enable", "new_c"]); - assert_eq!(defaults, vec![None, None, Some("1".to_string()), None]); - } - - #[test] - fn test_parse_macro_params_empty() { - let (params, defaults) = parse_macro_params(""); - assert!(params.is_empty()); - assert!(defaults.is_empty()); - } - - #[test] - fn test_purgem_removes_macro() { - // Simulates the kernel's insn-def.h pattern: define macro, use it, .purgem it - let lines = vec![ - ".macro insn_r, opcode, func3", - ".4byte (\\opcode | \\func3)", - ".endm", - "insn_r 0x33, 0x0", - ".purgem insn_r", - ]; - let result = expand_macros(&lines, &CommentStyle::HashAndSlashSlash).unwrap(); - // The macro invocation should have been expanded - assert!(result.iter().any(|l| l.contains(".4byte"))); - // The .purgem line should have been consumed (not passed through) - assert!(!result.iter().any(|l| 
l.contains(".purgem"))); - } - - #[test] - fn test_purgem_prevents_further_expansion() { - // After .purgem, the macro name should no longer be recognized - let lines = vec![ - ".macro mymacro", - "nop", - ".endm", - "mymacro", - ".purgem mymacro", - "mymacro", - ]; - let result = expand_macros(&lines, &CommentStyle::HashAndSlashSlash).unwrap(); - // First invocation expands to "nop" - // After .purgem, second "mymacro" is passed through as-is (not a known macro) - let nop_count = result.iter().filter(|l| l.trim() == "nop").count(); - assert_eq!(nop_count, 1, "macro should only expand once before .purgem"); - let mymacro_count = result.iter().filter(|l| l.trim() == "mymacro").count(); - assert_eq!(mymacro_count, 1, "after .purgem, 'mymacro' should be passed through literally"); - } - - #[test] - fn test_replace_macro_param_basic() { - // Basic replacement - assert_eq!(replace_macro_param(".byte \\orig", "\\orig", "140b"), ".byte 140b"); - } - - #[test] - fn test_replace_macro_param_boundary_rejection() { - // \orig should NOT match as prefix of \orig_len - assert_eq!(replace_macro_param(".byte \\orig_len", "\\orig", "140b"), ".byte \\orig_len"); - } - - #[test] - fn test_replace_macro_param_end_of_string() { - // \orig at end of text should be replaced - assert_eq!(replace_macro_param("\\orig", "\\orig", "140b"), "140b"); - } - - #[test] - fn test_replace_macro_param_followed_by_operator() { - // \orig followed by '-' (not an identifier char) should be replaced - assert_eq!(replace_macro_param("\\orig-\\alt", "\\orig", "142b"), "142b-\\alt"); - } - - #[test] - fn test_replace_macro_param_multiple() { - // Multiple occurrences - assert_eq!( - replace_macro_param(".long \\sym - . ; .byte \\sym", "\\sym", "foo"), - ".long foo - . 
; .byte foo" - ); - } - - #[test] - fn test_replace_macro_param_no_match() { - // Pattern not present - assert_eq!(replace_macro_param(".byte 42", "\\orig", "140b"), ".byte 42"); - } - - #[test] - fn test_replace_macro_param_adjacent_digit() { - // \orig followed by digit should NOT be replaced (digit is identifier continuation) - assert_eq!(replace_macro_param("\\orig2", "\\orig", "foo"), "\\orig2"); - } - - #[test] - fn test_replace_macro_param_before_delimiter() { - // \op followed by \() delimiter — \op IS replaced, \() stripped later - assert_eq!( - replace_macro_param("\\op\\()_safe_regs", "\\op", "rdmsr"), - "rdmsr\\()_safe_regs" - ); - } -} diff --git a/src/backend/call_abi.rs b/src/backend/call_abi.rs deleted file mode 100644 index 4cd8a531a4..0000000000 --- a/src/backend/call_abi.rs +++ /dev/null @@ -1,870 +0,0 @@ -//! Unified ABI classification for both call-site arguments and callee-side parameters. -//! -//! The core classification algorithm (struct layout, register assignment, stack overflow) -//! is the same for callers and callees — they must agree on where each argument lives. -//! Previously this logic was duplicated in two separate files with parallel enum -//! hierarchies (`CallArgClass` and `ParamClass`). This module unifies them: -//! -//! - `CallArgClass`: caller-side classification (no stack offsets needed) -//! - `ParamClass`: callee-side classification (tracks stack offsets for loading params) -//! - `classify_args_core`: single implementation of the classification algorithm -//! 
- `classify_call_args` / `classify_params_full`: thin wrappers over the core - -use crate::ir::reexports::{IrConst, IrFunction, Operand}; -use crate::common::types::IrType; -use super::generation::is_i128_type; - -// --------------------------------------------------------------------------- -// CallArgClass — caller-side classification (used by emit_call_*) -// --------------------------------------------------------------------------- - -/// Classification of a function call argument for register/stack assignment. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum CallArgClass { - /// Integer/pointer argument in a GP register. `reg_idx` is the GP register index. - IntReg { reg_idx: usize }, - /// Float argument in an FP register. `reg_idx` is the FP register index. - FloatReg { reg_idx: usize }, - /// 128-bit integer in a GP register pair. `base_reg_idx` is the first register. - I128RegPair { base_reg_idx: usize }, - /// F128 (long double) — handling is arch-specific (x87 on x86, Q-reg on ARM, GP pair on RISC-V). - F128Reg { reg_idx: usize }, - /// Small struct (<=16 bytes) passed by value in 1-2 GP registers. - StructByValReg { base_reg_idx: usize, size: usize }, - /// Small struct (<=16 bytes) where all fields are float/double (SSE class per SysV ABI). - /// Passed in 1-2 XMM registers instead of GP registers. - /// `lo_fp_idx` is the FP register for the first eightbyte, `hi_fp_idx` for the second (if size > 8). - StructSseReg { lo_fp_idx: usize, hi_fp_idx: Option, size: usize }, - /// Small struct where first eightbyte is INTEGER and second is SSE (mixed). - StructMixedIntSseReg { int_reg_idx: usize, fp_reg_idx: usize, size: usize }, - /// Small struct where first eightbyte is SSE and second is INTEGER (mixed). - StructMixedSseIntReg { fp_reg_idx: usize, int_reg_idx: usize, size: usize }, - /// Small struct (<=16 bytes) that overflows to the stack. - StructByValStack { size: usize }, - /// Small struct split across the last GP register and the stack. 
- /// RISC-V psABI: first XLEN bytes in `reg_idx`, remaining bytes on the stack. - StructSplitRegStack { reg_idx: usize, size: usize }, - /// Large struct (>16 bytes) passed on the stack (MEMORY class). - LargeStructStack { size: usize }, - /// Argument overflows to the stack (normal 8-byte). - Stack, - /// F128 argument overflows to the stack (16-byte aligned). - F128Stack, - /// I128 argument overflows to the stack (16-byte aligned). - I128Stack, - /// Zero-size struct argument (e.g., `struct { char x[0]; }`). - /// Per GCC behavior, zero-size struct arguments consume no register or stack space. - ZeroSizeSkip, -} - -impl CallArgClass { - /// Returns true if this argument is passed on the stack (any kind). - pub fn is_stack(&self) -> bool { - matches!(self, CallArgClass::Stack | CallArgClass::F128Stack | - CallArgClass::I128Stack | CallArgClass::StructByValStack { .. } | - CallArgClass::LargeStructStack { .. } | - CallArgClass::StructSplitRegStack { .. }) - } - - /// Returns the stack space consumed by this argument (0 if register). - pub fn stack_bytes(&self) -> usize { - let slot_size = crate::common::types::target_ptr_size(); // 8 for LP64, 4 for ILP32 - let align_mask = slot_size - 1; - match self { - CallArgClass::F128Stack => if slot_size == 4 { 12 } else { 16 }, // i686: x87 long double = 12 bytes - CallArgClass::I128Stack => 16, - CallArgClass::StructByValStack { size } | CallArgClass::LargeStructStack { size } => { - (*size + align_mask) & !align_mask - } - CallArgClass::StructSplitRegStack { size, .. } => { - // Only the portion beyond the first register goes on the stack. 
- let stack_part = size - slot_size; - (stack_part + align_mask) & !align_mask - } - CallArgClass::Stack => slot_size, - _ => 0, - } - } -} - -// --------------------------------------------------------------------------- -// ParamClass — callee-side classification (used by emit_store_params) -// --------------------------------------------------------------------------- - -/// Classification of a function parameter for `emit_store_params`. -/// -/// Each variant tells the backend exactly where the parameter arrives and what -/// kind of store logic is needed, without the backend reimplementing the ABI -/// classification algorithm. -#[derive(Debug, Clone, Copy)] -pub enum ParamClass { - /// Integer/pointer in GP register at `reg_idx`. - IntReg { reg_idx: usize }, - /// Float/double in FP register at `reg_idx`. - FloatReg { reg_idx: usize }, - /// i128 in aligned GP register pair starting at `base_reg_idx`. - I128RegPair { base_reg_idx: usize }, - /// Small struct (<=16 bytes) by value in 1-2 GP registers. - StructByValReg { base_reg_idx: usize, size: usize }, - /// Small struct where all eightbytes are SSE class -> 1-2 XMM registers. - StructSseReg { lo_fp_idx: usize, hi_fp_idx: Option, size: usize }, - /// Small struct: first eightbyte INTEGER, second SSE. - /// (`size` mirrors `CallArgClass`/`CoreArgClass` for structural consistency.) - #[allow(dead_code)] // size field not yet read by any backend - StructMixedIntSseReg { int_reg_idx: usize, fp_reg_idx: usize, size: usize }, - /// Small struct: first eightbyte SSE, second INTEGER. - /// (`size` mirrors `CallArgClass`/`CoreArgClass` for structural consistency.) - #[allow(dead_code)] // size field not yet read by any backend - StructMixedSseIntReg { fp_reg_idx: usize, int_reg_idx: usize, size: usize }, - /// F128 (long double) in FP register (ARM: Q-reg). - F128FpReg { reg_idx: usize }, - /// F128 in GP register pair (RISC-V). 
- F128GpPair { lo_reg_idx: usize, hi_reg_idx: usize }, - /// F128 always on stack (x86: x87 convention). - F128AlwaysStack { offset: i64 }, - /// Regular scalar on the stack. - StackScalar { offset: i64 }, - /// i128 on the stack (16-byte aligned). - I128Stack { offset: i64 }, - /// F128 on the stack (overflow from registers). - F128Stack { offset: i64 }, - /// Small struct that overflowed to the stack. - StructStack { offset: i64, size: usize }, - /// Small struct split across the last GP register and the stack. - /// RISC-V psABI: first XLEN bytes in `reg_idx`, remaining bytes at `stack_offset`. - StructSplitRegStack { reg_idx: usize, stack_offset: i64, size: usize }, - /// Large struct (>16 bytes) passed on the stack. - LargeStructStack { offset: i64, size: usize }, - /// Large struct (>16 bytes) passed by reference in a GP register (AAPCS64). - /// The register holds a pointer to the struct data; callee must copy from it. - LargeStructByRefReg { reg_idx: usize, size: usize }, - /// Large struct (>16 bytes) passed by reference on the stack (AAPCS64, overflow case). - /// The stack slot holds a pointer to the struct data; callee must copy from it. - LargeStructByRefStack { offset: i64, size: usize }, - /// Zero-size struct parameter (e.g., `struct { char x[0]; }`). - /// Per GCC behavior, zero-size struct parameters consume no register or stack space. - ZeroSizeSkip, -} - -impl ParamClass { - /// Returns true if this parameter is passed on the stack (fully or partially). - pub fn is_stack(&self) -> bool { - matches!(self, - ParamClass::StackScalar { .. } | ParamClass::I128Stack { .. } | - ParamClass::F128Stack { .. } | ParamClass::F128AlwaysStack { .. } | - ParamClass::StructStack { .. } | ParamClass::LargeStructStack { .. } | - ParamClass::LargeStructByRefStack { .. } | - ParamClass::StructSplitRegStack { .. } - ) - } - - /// Returns true if this parameter arrives in a GP register (int, i128 pair, struct, or F128 GP pair). 
- pub fn uses_gp_reg(&self) -> bool { - matches!(self, - ParamClass::IntReg { .. } | ParamClass::I128RegPair { .. } | - ParamClass::StructByValReg { .. } | ParamClass::F128GpPair { .. } | - ParamClass::LargeStructByRefReg { .. } | - ParamClass::StructMixedIntSseReg { .. } | ParamClass::StructMixedSseIntReg { .. } | - ParamClass::StructSplitRegStack { .. } - ) - } - - /// Returns the number of stack bytes consumed by this parameter classification. - /// Used by variadic function handling to compute how many stack bytes named - /// parameters occupy, so va_start can skip past them. - pub fn stack_bytes(&self) -> usize { - let slot_size = crate::common::types::target_ptr_size(); // 8 for LP64, 4 for ILP32 - let align_mask = slot_size - 1; - match self { - ParamClass::StackScalar { .. } => slot_size, - ParamClass::I128Stack { .. } => 16, - ParamClass::F128Stack { .. } => 16, - ParamClass::F128AlwaysStack { .. } => if slot_size == 4 { 12 } else { 16 }, - ParamClass::StructStack { size, .. } => (*size + align_mask) & !align_mask, - ParamClass::StructSplitRegStack { size, .. } => { - // Only the portion beyond the first register goes on the stack. - let stack_part = size - slot_size; - (stack_part + align_mask) & !align_mask - } - ParamClass::LargeStructStack { size, .. } => (*size + align_mask) & !align_mask, - ParamClass::LargeStructByRefStack { .. } => slot_size, // pointer on stack - _ => 0, // register-passed params don't consume stack space - } - } - - /// Returns the number of GP registers consumed by this parameter classification. - /// Used by variadic function handling to compute the correct va_start offset. - pub fn gp_reg_count(&self) -> usize { - match self { - ParamClass::IntReg { .. } => 1, - ParamClass::LargeStructByRefReg { .. } => 1, // pointer in one GP reg - ParamClass::I128RegPair { .. } => 2, - ParamClass::StructByValReg { size, .. 
} => { - // 1 reg for <=8 bytes, 2 regs for >8 bytes (up to 16) - if *size <= 8 { 1 } else { 2 } - } - ParamClass::F128GpPair { .. } => 2, - ParamClass::StructMixedIntSseReg { .. } | ParamClass::StructMixedSseIntReg { .. } => 1, - ParamClass::StructSplitRegStack { .. } => 1, // only 1 GP reg used (the rest on stack) - ParamClass::StructSseReg { .. } => 0, // all SSE, no GP regs - _ => 0, // FP regs and stack don't consume GP regs - } - } -} - -// --------------------------------------------------------------------------- -// ABI configuration and SysV struct classification (unchanged) -// --------------------------------------------------------------------------- - -/// ABI configuration for call argument classification. -pub struct CallAbiConfig { - /// Maximum GP registers for arguments (x86: 6, ARM/RISC-V: 8). - pub max_int_regs: usize, - /// Maximum FP registers for arguments (all: 8). - pub max_float_regs: usize, - /// Whether i128 register pairs must be even-aligned (ARM/RISC-V: true, x86: false). - pub align_i128_pairs: bool, - /// Whether F128 uses FP registers (ARM: true) or always goes to stack/x87 (x86: true = stack). - /// On RISC-V, F128 goes in GP register pairs like i128. - pub f128_in_fp_regs: bool, - /// Whether F128 uses GP register pairs (RISC-V: true). - pub f128_in_gp_pairs: bool, - /// Whether variadic float args must go in GP registers instead of FP regs (RISC-V: true, x86: false, ARM: false). - pub variadic_floats_in_gp: bool, - /// Whether large structs (>16 bytes) are passed by reference (pointer in GP reg). - /// ARM/RISC-V: true (pointer in GP reg or on stack), x86: false (copy to stack). - pub large_struct_by_ref: bool, - /// Whether to use SysV per-eightbyte struct classification (x86-64 only). - /// When true, struct eightbytes classified as SSE are passed in xmm registers. - pub use_sysv_struct_classification: bool, - /// Whether to use RISC-V LP64D hardware floating-point struct classification. 
- /// When true, small structs with float/double fields are passed in FP registers - /// per the RISC-V psABI. - pub use_riscv_float_struct_classification: bool, - /// Whether 2-register structs can be split across the last GP register and the stack. - /// RISC-V psABI: if a 2×XLEN struct has only 1 GP register left, the first XLEN bytes - /// go in that register and the rest go on the stack. ARM AAPCS64 does NOT split. - pub allow_struct_split_reg_stack: bool, - /// Whether 2-register structs with >XLEN alignment must start at an even register. - /// RISC-V psABI: true (2×XLEN-aligned composites require even-aligned register pairs). - /// ARM AAPCS64: false (composite types never require even-aligned pairs; only - /// fundamental types like __int128 do, which is handled by align_i128_pairs). - pub align_struct_pairs: bool, - /// Whether sret (struct return) uses a dedicated register (x8 on AArch64) instead of - /// consuming a regular GP argument register slot. When true, the classification must - /// promote the first stack-overflow GP argument to the freed GP register slot so that - /// caller and callee agree on where each argument lives. - /// ARM AAPCS64: true (sret pointer in x8), x86/RISC-V: false (sret in x0/a0). - pub sret_uses_dedicated_reg: bool, -} - -/// Result of SysV per-eightbyte struct classification. -/// Describes how a small struct (<=16 bytes) should be passed in registers. -#[derive(Debug, Clone, Copy)] -pub enum SysvStructRegClass { - /// All eightbytes are INTEGER class -> GP registers only. - AllInt, - /// All eightbytes are SSE class -> XMM registers only. - AllSse { fp_count: usize }, - /// First eightbyte INTEGER, second SSE (mixed). - IntSse, - /// First eightbyte SSE, second INTEGER (mixed). - SseInt, - /// Not enough registers available -> spill to stack. - Stack, -} - -/// Classify a small struct (<=16 bytes) using SysV AMD64 per-eightbyte rules. 
-/// -/// Given the eightbyte classes and current register allocation state, determines -/// whether the struct fits in registers and which class combination to use. -/// Returns the classification and the number of GP/FP registers consumed. -pub fn classify_sysv_struct( - eb_classes: &[crate::common::types::EightbyteClass], - int_idx: usize, - float_idx: usize, - config: &CallAbiConfig, -) -> (SysvStructRegClass, usize, usize) { - use crate::common::types::EightbyteClass; - let n_eightbytes = eb_classes.len(); - let eb0_is_sse = eb_classes.first() == Some(&EightbyteClass::Sse); - let eb1_is_sse = if n_eightbytes > 1 { eb_classes.get(1) == Some(&EightbyteClass::Sse) } else { false }; - - let gp_needed = (if !eb0_is_sse { 1 } else { 0 }) - + (if n_eightbytes > 1 && !eb1_is_sse { 1 } else { 0 }); - let fp_needed = (if eb0_is_sse { 1 } else { 0 }) - + (if n_eightbytes > 1 && eb1_is_sse { 1 } else { 0 }); - - if int_idx + gp_needed > config.max_int_regs || float_idx + fp_needed > config.max_float_regs { - return (SysvStructRegClass::Stack, 0, 0); - } - - if n_eightbytes == 1 { - if eb0_is_sse { - (SysvStructRegClass::AllSse { fp_count: 1 }, 0, 1) - } else { - (SysvStructRegClass::AllInt, 1, 0) - } - } else if eb0_is_sse && eb1_is_sse { - (SysvStructRegClass::AllSse { fp_count: 2 }, 0, 2) - } else if !eb0_is_sse && eb1_is_sse { - (SysvStructRegClass::IntSse, 1, 1) - } else if eb0_is_sse && !eb1_is_sse { - (SysvStructRegClass::SseInt, 1, 1) - } else { - (SysvStructRegClass::AllInt, 2, 0) - } -} - -// --------------------------------------------------------------------------- -// ArgInfo — abstracts per-argument metadata for the unified classification core -// --------------------------------------------------------------------------- - -/// Per-argument metadata needed by the classification algorithm. 
-/// -/// Both the caller (`classify_call_args`) and callee (`classify_params_full`) paths -/// construct one `ArgInfo` per argument/parameter, then pass the slice to the shared -/// `classify_args_core` function. -pub(crate) struct ArgInfo<'a> { - pub(crate) is_float: bool, - pub(crate) is_i128: bool, - pub(crate) is_long_double: bool, - /// If this is a struct/union by value: Some(byte_size). None otherwise. - pub(crate) struct_size: Option, - /// Struct alignment in bytes (for RISC-V even-register alignment). None for non-struct. - pub(crate) struct_align: Option, - /// SysV per-eightbyte classification (x86-64 only). Empty if not applicable. - pub(crate) eightbyte_classes: &'a [crate::common::types::EightbyteClass], - /// RISC-V LP64D float struct classification. None if not applicable. - pub(crate) riscv_float_class: Option, -} - -// --------------------------------------------------------------------------- -// Core classification algorithm — single implementation used by both sides -// --------------------------------------------------------------------------- - -/// Internal classification result for one argument, used by the core algorithm. -/// Represents the ABI decision *without* stack offsets — offsets are layered on -/// by the callee-side wrapper (`classify_params_full`). 
-#[derive(Debug, Clone, Copy)] -enum CoreArgClass { - IntReg { reg_idx: usize }, - FloatReg { reg_idx: usize }, - I128RegPair { base_reg_idx: usize }, - F128FpReg { reg_idx: usize }, - F128GpPair { base_reg_idx: usize }, - F128Stack, - StructByValReg { base_reg_idx: usize, size: usize }, - StructSseReg { lo_fp_idx: usize, hi_fp_idx: Option, size: usize }, - StructMixedIntSseReg { int_reg_idx: usize, fp_reg_idx: usize, size: usize }, - StructMixedSseIntReg { fp_reg_idx: usize, int_reg_idx: usize, size: usize }, - StructByValStack { size: usize }, - StructSplitRegStack { reg_idx: usize, size: usize }, - LargeStructStack { size: usize }, - LargeStructByRefReg { reg_idx: usize, size: usize }, - LargeStructByRefStack { size: usize }, - Stack, - I128Stack, - ZeroSizeSkip, -} - -/// Result of the core classification algorithm. -struct CoreClassification { - classes: Vec, - /// Final GP register index after classifying all arguments (capped at max_int_regs). - int_reg_idx: usize, -} - -/// Core ABI classification algorithm shared by both caller and callee paths. -/// -/// Walks the argument list and assigns each to a register or stack class based on -/// the ABI configuration. The `is_variadic` flag gates the `force_gp` behavior for -/// float args in variadic functions (needed on the caller side; the callee side -/// encodes this in `config.variadic_floats_in_gp` directly). -fn classify_args_core( - args: &[ArgInfo<'_>], - is_variadic: bool, - config: &CallAbiConfig, -) -> CoreClassification { - let mut result = Vec::with_capacity(args.len()); - let mut int_idx = 0usize; - let mut float_idx = 0usize; - let slot_size = crate::common::types::target_ptr_size(); - - for info in args { - let force_gp = is_variadic && config.variadic_floats_in_gp && info.is_float && !info.is_long_double; - - if let Some(size) = info.struct_size { - // Zero-size structs consume no register or stack space per GCC behavior. 
- if size == 0 { - result.push(CoreArgClass::ZeroSizeSkip); - continue; - } - - let eb_classes = info.eightbyte_classes; - - if size <= 16 && config.use_sysv_struct_classification && !eb_classes.is_empty() { - // SysV AMD64 ABI: classify per-eightbyte and assign to GP or SSE registers - let (cls, gp_used, fp_used) = classify_sysv_struct(eb_classes, int_idx, float_idx, config); - match cls { - SysvStructRegClass::AllSse { fp_count } => { - let hi = if fp_count > 1 { Some(float_idx + 1) } else { None }; - result.push(CoreArgClass::StructSseReg { lo_fp_idx: float_idx, hi_fp_idx: hi, size }); - } - SysvStructRegClass::AllInt => { - result.push(CoreArgClass::StructByValReg { base_reg_idx: int_idx, size }); - } - SysvStructRegClass::IntSse => { - result.push(CoreArgClass::StructMixedIntSseReg { int_reg_idx: int_idx, fp_reg_idx: float_idx, size }); - } - SysvStructRegClass::SseInt => { - result.push(CoreArgClass::StructMixedSseIntReg { fp_reg_idx: float_idx, int_reg_idx: int_idx, size }); - } - SysvStructRegClass::Stack => { - result.push(CoreArgClass::StructByValStack { size }); - int_idx = config.max_int_regs; - } - } - int_idx += gp_used; - float_idx += fp_used; - } else if size <= 16 { - // Non-SysV path (ARM, RISC-V) - let rv_class = if config.use_riscv_float_struct_classification { - info.riscv_float_class - } else { - None - }; - let mut classified = false; - if let Some(rv_fc) = rv_class { - use crate::common::types::RiscvFloatClass; - match rv_fc { - RiscvFloatClass::OneFloat { .. } => { - if float_idx < config.max_float_regs { - result.push(CoreArgClass::StructSseReg { lo_fp_idx: float_idx, hi_fp_idx: None, size }); - float_idx += 1; - classified = true; - } - } - RiscvFloatClass::TwoFloats { .. } => { - if float_idx + 1 < config.max_float_regs { - result.push(CoreArgClass::StructSseReg { lo_fp_idx: float_idx, hi_fp_idx: Some(float_idx + 1), size }); - float_idx += 2; - classified = true; - } - } - RiscvFloatClass::FloatAndInt { .. 
} => { - if float_idx < config.max_float_regs && int_idx < config.max_int_regs { - result.push(CoreArgClass::StructMixedSseIntReg { fp_reg_idx: float_idx, int_reg_idx: int_idx, size }); - float_idx += 1; - int_idx += 1; - classified = true; - } - } - RiscvFloatClass::IntAndFloat { .. } => { - if int_idx < config.max_int_regs && float_idx < config.max_float_regs { - result.push(CoreArgClass::StructMixedIntSseReg { int_reg_idx: int_idx, fp_reg_idx: float_idx, size }); - int_idx += 1; - float_idx += 1; - classified = true; - } - } - } - } - if !classified { - let regs_needed = if size <= slot_size { 1 } else { size.div_ceil(slot_size) }; - // RISC-V psABI: 2×XLEN-aligned structs must start at even register. - // Note: ARM AAPCS64 does NOT require even-aligned pairs for composites. - if regs_needed == 2 && config.align_struct_pairs { - let struct_align = info.struct_align.unwrap_or(slot_size); - if struct_align > slot_size && !int_idx.is_multiple_of(2) { - int_idx += 1; // skip to even register - } - } - if int_idx + regs_needed <= config.max_int_regs { - result.push(CoreArgClass::StructByValReg { base_reg_idx: int_idx, size }); - int_idx += regs_needed; - } else if regs_needed == 2 && int_idx < config.max_int_regs && config.allow_struct_split_reg_stack { - result.push(CoreArgClass::StructSplitRegStack { reg_idx: int_idx, size }); - int_idx = config.max_int_regs; - } else { - result.push(CoreArgClass::StructByValStack { size }); - int_idx = config.max_int_regs; - } - } - } else if config.large_struct_by_ref { - // AAPCS64 / RISC-V: large composites passed by reference. 
- if int_idx < config.max_int_regs { - result.push(CoreArgClass::LargeStructByRefReg { reg_idx: int_idx, size }); - int_idx += 1; - } else { - result.push(CoreArgClass::LargeStructByRefStack { size }); - } - } else { - result.push(CoreArgClass::LargeStructStack { size }); - } - } else if info.is_i128 { - if config.align_i128_pairs && !int_idx.is_multiple_of(2) { - int_idx += 1; - } - if int_idx + 1 < config.max_int_regs { - result.push(CoreArgClass::I128RegPair { base_reg_idx: int_idx }); - int_idx += 2; - } else { - result.push(CoreArgClass::I128Stack); - int_idx = config.max_int_regs; - } - } else if info.is_long_double { - if config.f128_in_fp_regs { - if float_idx < config.max_float_regs { - result.push(CoreArgClass::F128FpReg { reg_idx: float_idx }); - float_idx += 1; - } else { - result.push(CoreArgClass::F128Stack); - } - } else if config.f128_in_gp_pairs { - if config.align_i128_pairs && !int_idx.is_multiple_of(2) { - int_idx += 1; - } - if int_idx + 1 < config.max_int_regs { - result.push(CoreArgClass::F128GpPair { base_reg_idx: int_idx }); - int_idx += 2; - } else { - result.push(CoreArgClass::F128Stack); - int_idx = config.max_int_regs; - } - } else { - result.push(CoreArgClass::F128Stack); - } - } else if info.is_float && !force_gp && float_idx < config.max_float_regs { - result.push(CoreArgClass::FloatReg { reg_idx: float_idx }); - float_idx += 1; - } else if info.is_float && !force_gp { - result.push(CoreArgClass::Stack); - } else if int_idx < config.max_int_regs { - result.push(CoreArgClass::IntReg { reg_idx: int_idx }); - int_idx += 1; - } else { - result.push(CoreArgClass::Stack); - } - } - - CoreClassification { classes: result, int_reg_idx: int_idx } -} - -// --------------------------------------------------------------------------- -// Caller-side wrapper: classify_call_args -// --------------------------------------------------------------------------- - -/// Classify all arguments for a function call, returning a `CallArgClass` per argument. 
-/// -/// This is the caller-side entry point. It extracts `ArgInfo` from the call-site -/// arrays and delegates to the shared classification core. -/// -/// `struct_arg_sizes`: Some(size) for struct/union by-value args, None otherwise. -/// `struct_arg_aligns`: struct alignment (for RISC-V even-register alignment). -/// `struct_arg_classes`: per-eightbyte SysV ABI classification (x86-64 only). -/// `struct_arg_riscv_float_classes`: RISC-V LP64D float field classification. -pub fn classify_call_args( - args: &[Operand], - arg_types: &[IrType], - struct_arg_sizes: &[Option], - struct_arg_aligns: &[Option], - struct_arg_classes: &[Vec], - struct_arg_riscv_float_classes: &[Option], - is_variadic: bool, - config: &CallAbiConfig, -) -> Vec { - // Build ArgInfo slice from call-site arrays. - let arg_infos: Vec> = args.iter().enumerate().map(|(i, arg)| { - let arg_ty = if i < arg_types.len() { Some(arg_types[i]) } else { None }; - ArgInfo { - is_float: if let Some(ty) = arg_ty { - ty.is_float() - } else { - matches!(arg, Operand::Const(IrConst::F32(_) | IrConst::F64(_))) - }, - is_i128: arg_ty.map(is_i128_type).unwrap_or(false), - is_long_double: arg_ty.map(|t| t.is_long_double()).unwrap_or(false), - struct_size: struct_arg_sizes.get(i).copied().flatten(), - struct_align: struct_arg_aligns.get(i).copied().flatten(), - eightbyte_classes: struct_arg_classes.get(i).map(|v| v.as_slice()).unwrap_or(&[]), - riscv_float_class: struct_arg_riscv_float_classes.get(i).copied().flatten(), - } - }).collect(); - - let core = classify_args_core(&arg_infos, is_variadic, config); - - // Convert CoreArgClass -> CallArgClass (drop by-ref variants that only appear - // when large_struct_by_ref is true, which maps to IntReg/Stack on the caller side). 
- core.classes.into_iter().map(|c| match c { - CoreArgClass::IntReg { reg_idx } => CallArgClass::IntReg { reg_idx }, - CoreArgClass::FloatReg { reg_idx } => CallArgClass::FloatReg { reg_idx }, - CoreArgClass::I128RegPair { base_reg_idx } => CallArgClass::I128RegPair { base_reg_idx }, - CoreArgClass::F128FpReg { reg_idx } | CoreArgClass::F128GpPair { base_reg_idx: reg_idx } => CallArgClass::F128Reg { reg_idx }, - CoreArgClass::F128Stack => CallArgClass::F128Stack, - CoreArgClass::StructByValReg { base_reg_idx, size } => CallArgClass::StructByValReg { base_reg_idx, size }, - CoreArgClass::StructSseReg { lo_fp_idx, hi_fp_idx, size } => CallArgClass::StructSseReg { lo_fp_idx, hi_fp_idx, size }, - CoreArgClass::StructMixedIntSseReg { int_reg_idx, fp_reg_idx, size } => CallArgClass::StructMixedIntSseReg { int_reg_idx, fp_reg_idx, size }, - CoreArgClass::StructMixedSseIntReg { fp_reg_idx, int_reg_idx, size } => CallArgClass::StructMixedSseIntReg { fp_reg_idx, int_reg_idx, size }, - CoreArgClass::StructByValStack { size } => CallArgClass::StructByValStack { size }, - CoreArgClass::StructSplitRegStack { reg_idx, size } => CallArgClass::StructSplitRegStack { reg_idx, size }, - CoreArgClass::LargeStructStack { size } => CallArgClass::LargeStructStack { size }, - // On caller side, large-struct-by-ref uses IntReg (pointer) or Stack (pointer overflow). - CoreArgClass::LargeStructByRefReg { reg_idx, .. } => CallArgClass::IntReg { reg_idx }, - CoreArgClass::LargeStructByRefStack { .. 
} => CallArgClass::Stack, - CoreArgClass::Stack => CallArgClass::Stack, - CoreArgClass::I128Stack => CallArgClass::I128Stack, - CoreArgClass::ZeroSizeSkip => CallArgClass::ZeroSizeSkip, - }).collect() -} - -// --------------------------------------------------------------------------- -// Callee-side wrapper: classify_params_full / classify_params -// --------------------------------------------------------------------------- - -/// Result of parameter classification, including the final register allocation state. -/// The `int_reg_idx` field captures the effective GP register index after all named -/// params are classified, which is needed by RISC-V va_start to correctly skip -/// alignment padding gaps (e.g., when an F128 pair couldn't fit and bumped the index). -pub struct ParamClassification { - pub classes: Vec, - /// Final GP register index after classifying all named params. - /// Includes alignment bumps for I128/F128 pairs. Capped at max_int_regs. - pub int_reg_idx: usize, - /// Total stack bytes consumed by all named parameters. - /// This is the final stack_offset after classification, accounting for - /// type-specific sizes (e.g., F64/I64 take 8 bytes on ILP32). - pub total_stack_bytes: usize, -} - -/// Classify all parameters of a function for callee-side store emission. -/// -/// Uses the same `CallAbiConfig` as `classify_call_args` to ensure caller and callee -/// agree on parameter locations. Returns one `ParamClass` per parameter plus the final -/// register allocation state. -pub fn classify_params_full(func: &IrFunction, config: &CallAbiConfig) -> ParamClassification { - // Build ArgInfo slice from function parameters. 
- let arg_infos: Vec> = func.params.iter().map(|param| { - ArgInfo { - is_float: param.ty.is_float(), - is_i128: is_i128_type(param.ty), - is_long_double: param.ty.is_long_double(), - struct_size: param.struct_size, - struct_align: param.struct_align, - eightbyte_classes: ¶m.struct_eightbyte_classes, - riscv_float_class: param.riscv_float_class, - } - }).collect(); - - // Pass is_variadic=true here because the callee side encodes variadic behavior - // directly in config.variadic_floats_in_gp (set by the caller based on func.is_variadic). - // The force_gp check is: is_variadic && config.variadic_floats_in_gp && is_float. - // For non-variadic functions, config.variadic_floats_in_gp is false, so force_gp - // is false regardless of the is_variadic flag — making this safe. - let mut core = classify_args_core(&arg_infos, true, config); - - // AArch64 ABI: when sret uses a dedicated register (x8), the classification initially - // assigns the sret pointer to IntReg(0), consuming one GP register slot. On the caller - // side (emit_call in traits.rs), the sret arg is reclassified to ZeroSizeSkip, all - // other GP reg indices are shifted down by 1, and the first stack-overflow GP arg is - // promoted to the freed register slot (max_int_regs-1, i.e. x7). - // - // We must apply the same promotion on the callee side so that caller and callee agree - // on where each argument lives. The emit_store_gp_params method already handles - // the register index shift (sret_shift=1), but it does NOT promote stack args. - // Apply the promotion here to match the caller-side logic. - if func.uses_sret && config.sret_uses_dedicated_reg && core.classes.len() > 1 { - // Use max_int_regs (not max_int_regs-1) because emit_store_gp_params applies - // sret_shift=1, computing actual_idx = reg_idx - 1. So reg_idx=8 maps to x7. - let freed_reg = config.max_int_regs; - // Promote the first GP stack-overflow arg to the freed register slot. 
- for i in 1..core.classes.len() { - match core.classes[i] { - CoreArgClass::Stack => { - let is_float = func.params.get(i).map(|p| p.ty.is_float()).unwrap_or(false); - if !is_float { - core.classes[i] = CoreArgClass::IntReg { reg_idx: freed_reg }; - break; - } - } - CoreArgClass::StructByValStack { size } if size <= 8 => { - core.classes[i] = CoreArgClass::StructByValReg { base_reg_idx: freed_reg, size }; - break; - } - _ => {} - } - } - } - - // Convert CoreArgClass -> ParamClass, assigning stack offsets. - let slot_size = crate::common::types::target_ptr_size(); - let slot_align_mask = (slot_size - 1) as i64; - let mut stack_offset: i64 = 0; - let mut classes = Vec::with_capacity(core.classes.len()); - - for (i, c) in core.classes.iter().enumerate() { - let param_ty = func.params.get(i).map(|p| p.ty); - let pc = match *c { - CoreArgClass::IntReg { reg_idx } => ParamClass::IntReg { reg_idx }, - CoreArgClass::FloatReg { reg_idx } => ParamClass::FloatReg { reg_idx }, - CoreArgClass::I128RegPair { base_reg_idx } => ParamClass::I128RegPair { base_reg_idx }, - CoreArgClass::StructByValReg { base_reg_idx, size } => ParamClass::StructByValReg { base_reg_idx, size }, - CoreArgClass::StructSseReg { lo_fp_idx, hi_fp_idx, size } => ParamClass::StructSseReg { lo_fp_idx, hi_fp_idx, size }, - CoreArgClass::StructMixedIntSseReg { int_reg_idx, fp_reg_idx, size } => ParamClass::StructMixedIntSseReg { int_reg_idx, fp_reg_idx, size }, - CoreArgClass::StructMixedSseIntReg { fp_reg_idx, int_reg_idx, size } => ParamClass::StructMixedSseIntReg { fp_reg_idx, int_reg_idx, size }, - CoreArgClass::F128FpReg { reg_idx } => ParamClass::F128FpReg { reg_idx }, - CoreArgClass::F128GpPair { base_reg_idx } => ParamClass::F128GpPair { lo_reg_idx: base_reg_idx, hi_reg_idx: base_reg_idx + 1 }, - CoreArgClass::LargeStructByRefReg { reg_idx, size } => ParamClass::LargeStructByRefReg { reg_idx, size }, - CoreArgClass::ZeroSizeSkip => ParamClass::ZeroSizeSkip, - - // Stack-overflow cases: assign offsets 
- CoreArgClass::F128Stack => { - if config.f128_in_fp_regs || config.f128_in_gp_pairs { - // ARM/RISC-V: F128 overflowed from registers - stack_offset = (stack_offset + 15) & !15; - let off = stack_offset; - stack_offset += 16; - ParamClass::F128Stack { offset: off } - } else if slot_size == 4 { - // i686: long double is 12 bytes, 4-byte aligned - let off = stack_offset; - stack_offset += 12; - ParamClass::F128AlwaysStack { offset: off } - } else { - // x86-64: 16 bytes, 16-byte aligned - stack_offset = (stack_offset + 15) & !15; - let off = stack_offset; - stack_offset += 16; - ParamClass::F128AlwaysStack { offset: off } - } - } - CoreArgClass::I128Stack => { - stack_offset = (stack_offset + 15) & !15; - let off = stack_offset; - stack_offset += 16; - ParamClass::I128Stack { offset: off } - } - CoreArgClass::StructByValStack { size } | CoreArgClass::LargeStructStack { size } => { - let off = stack_offset; - stack_offset += (size as i64 + slot_align_mask) & !slot_align_mask; - if matches!(*c, CoreArgClass::LargeStructStack { .. 
}) { - ParamClass::LargeStructStack { offset: off, size } - } else { - ParamClass::StructStack { offset: off, size } - } - } - CoreArgClass::StructSplitRegStack { reg_idx, size } => { - let off = stack_offset; - let stack_part = size - slot_size; - stack_offset += (stack_part as i64 + slot_align_mask) & !slot_align_mask; - ParamClass::StructSplitRegStack { reg_idx, stack_offset: off, size } - } - CoreArgClass::LargeStructByRefStack { size } => { - let off = stack_offset; - stack_offset += slot_size as i64; // pointer on stack - ParamClass::LargeStructByRefStack { offset: off, size } - } - CoreArgClass::Stack => { - let off = stack_offset; - let is_float = param_ty.map(|t| t.is_float()).unwrap_or(false); - if is_float { - // Float that overflowed FP registers - if slot_size == 4 { - let float_stack_size = if param_ty == Some(IrType::F64) { 8 } else { 4 }; - stack_offset += float_stack_size; - } else { - stack_offset += 8; - } - } else { - // GP register overflow - let param_size = param_ty.map(|t| t.size() as i64).unwrap_or(slot_size as i64); - stack_offset += (param_size + slot_align_mask) & !slot_align_mask; - } - ParamClass::StackScalar { offset: off } - } - }; - classes.push(pc); - } - - ParamClassification { - classes, - int_reg_idx: core.int_reg_idx, - total_stack_bytes: stack_offset as usize, - } -} - -/// Classify all parameters of a function for callee-side store emission. -/// -/// Uses the same `CallAbiConfig` as `classify_call_args` to ensure caller and callee -/// agree on parameter locations. Returns one `ParamClass` per parameter. -pub fn classify_params(func: &IrFunction, config: &CallAbiConfig) -> Vec { - classify_params_full(func, config).classes -} - -/// Compute the total stack space (in bytes) consumed by named parameters that are -/// passed on the stack. This is needed for variadic functions: va_start must set its -/// stack pointer past all named stack-passed args to point at the first variadic arg. 
-/// -/// This correctly accounts for alignment padding (e.g., 16-byte alignment for F128/I128). -pub fn named_params_stack_bytes(param_classes: &[ParamClass]) -> usize { - let mut total: usize = 0; - for class in param_classes { - // Align for 16-byte types before adding their size - if matches!(class, ParamClass::F128Stack { .. } | ParamClass::I128Stack { .. } | ParamClass::F128AlwaysStack { .. }) { - total = (total + 15) & !15; - } - total += class.stack_bytes(); - } - total -} - -// --------------------------------------------------------------------------- -// Call-site stack space helpers (used by emit_call) -// --------------------------------------------------------------------------- - -/// Compute the total stack space needed for stack-overflow arguments. -/// Returns the total bytes needed, 16-byte aligned. -/// Use this for ARM and RISC-V which pre-allocate stack space with a single SP adjustment. -pub fn compute_stack_arg_space(arg_classes: &[CallArgClass]) -> usize { - let mut total: usize = 0; - for cls in arg_classes { - if !cls.is_stack() { continue; } - if matches!(cls, CallArgClass::F128Stack | CallArgClass::I128Stack) { - total = (total + 15) & !15; - } - total += cls.stack_bytes(); - } - (total + 15) & !15 -} - -/// Compute per-stack-arg alignment padding needed in the forward layout. -/// Returns a Vec with one entry per `arg_classes` element. Non-stack args get 0. -/// F128Stack and I128Stack args get padding to align to 16 bytes in the overflow area. 
-pub fn compute_stack_arg_padding(arg_classes: &[CallArgClass]) -> Vec { - let mut padding = vec![0usize; arg_classes.len()]; - let mut offset: usize = 0; - for (i, cls) in arg_classes.iter().enumerate() { - if !cls.is_stack() { continue; } - if matches!(cls, CallArgClass::F128Stack | CallArgClass::I128Stack) { - let align_pad = (16 - (offset % 16)) % 16; - padding[i] = align_pad; - offset += align_pad; - } - offset += cls.stack_bytes(); - } - padding -} - -/// Compute the raw bytes that will be pushed onto the stack for stack arguments. -/// Unlike `compute_stack_arg_space`, this does NOT apply final 16-byte alignment, -/// because x86 uses individual `pushq` instructions and handles alignment separately. -/// This includes alignment padding for F128/I128 args. -pub fn compute_stack_push_bytes(arg_classes: &[CallArgClass]) -> usize { - let padding = compute_stack_arg_padding(arg_classes); - let mut total: usize = 0; - for (i, cls) in arg_classes.iter().enumerate() { - if !cls.is_stack() { continue; } - total += padding[i] + cls.stack_bytes(); - } - total -} diff --git a/src/backend/cast.rs b/src/backend/cast.rs deleted file mode 100644 index eea9988e71..0000000000 --- a/src/backend/cast.rs +++ /dev/null @@ -1,261 +0,0 @@ -//! Shared cast and float operation classification, plus F128 soft-float libcall mapping. -//! -//! All four backends use the same decision logic to determine what kind of cast -//! to emit — only the actual machine instructions differ. By classifying the cast -//! once in shared code, we eliminate duplicated Ptr-normalization and F128-reduction -//! logic from each backend. This module also provides the shared mnemonic-to-libcall -//! mapping for F128 soft-float arithmetic and comparisons (ARM, RISC-V). - -use crate::common::types::IrType; -use crate::ir::reexports::{IrBinOp, IrCmpOp, IrConst, Operand}; - -/// Classification of type casts. 
All four backends use the same control flow -/// to decide which kind of cast to emit; only the actual instructions differ. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum CastKind { - /// No conversion needed (same type, or Ptr <-> I64/U64, or F128 <-> F64). - Noop, - /// Float to signed integer (from_ty is F32 or F64). - FloatToSigned { from_f64: bool }, - /// Float to unsigned integer (from_ty is F32 or F64). - FloatToUnsigned { from_f64: bool, to_u64: bool }, - /// Signed integer to float (to_ty is F32 or F64). - /// `from_ty` is the source integer type, needed to sign-extend sub-64-bit values - /// before conversion (e.g., I32 in rax must be sign-extended to 64 bits). - SignedToFloat { to_f64: bool, from_ty: IrType }, - /// Unsigned integer to float. `from_ty` is the source unsigned integer type, - /// needed for proper zero-extension on RISC-V (where W-suffix instructions - /// sign-extend) and for U64 overflow handling on x86. - UnsignedToFloat { to_f64: bool, from_ty: IrType }, - /// Float-to-float conversion (F32 <-> F64). - FloatToFloat { widen: bool }, - /// Integer widening: sign- or zero-extend a smaller type to a larger one. - IntWiden { from_ty: IrType, to_ty: IrType }, - /// Integer narrowing: truncate a larger type to a smaller one. - IntNarrow { to_ty: IrType }, - /// Same-size signed-to-unsigned (need to mask/clear upper bits). - SignedToUnsignedSameSize { to_ty: IrType }, - /// Same-size unsigned-to-signed. On most architectures this is a noop, - /// but on RISC-V 64-bit, U32->I32 needs sign-extension because the ABI - /// requires all 32-bit values to be sign-extended in 64-bit registers. - UnsignedToSignedSameSize { to_ty: IrType }, - /// Signed integer -> F128 via softfloat (__floatsitf / __floatditf). - /// Used on ARM/RISC-V where long double is IEEE binary128. - SignedToF128 { from_ty: IrType }, - /// Unsigned integer -> F128 via softfloat (__floatunsitf / __floatunditf). 
- UnsignedToF128 { from_ty: IrType }, - /// F128 -> signed integer via softfloat (__fixtfsi / __fixtfdi). - F128ToSigned { to_ty: IrType }, - /// F128 -> unsigned integer via softfloat (__fixunstfsi / __fixunstfdi). - F128ToUnsigned { to_ty: IrType }, - /// F32/F64 -> F128 widening via softfloat (__extendsftf2 / __extenddftf2). - FloatToF128 { from_f32: bool }, - /// F128 -> F32/F64 narrowing via softfloat (__trunctfsf2 / __trunctfdf2). - F128ToFloat { to_f32: bool }, -} - -/// Classify a cast between two IR types. This captures the shared decision logic -/// that all four backends use identically. Backends then match on the returned -/// `CastKind` to emit architecture-specific instructions. -/// -/// Handles Ptr normalization (Ptr treated as U64) and F128 reduction (F128 treated -/// as F64 for computation purposes on x86) before classification. -/// -/// `f128_is_native`: true on ARM/RISC-V where F128 is IEEE binary128 and requires -/// softfloat library calls for conversions. false on x86 where F128 is x87 80-bit -/// and is approximated as F64. -pub fn classify_cast_with_f128(from_ty: IrType, to_ty: IrType, f128_is_native: bool) -> CastKind { - if from_ty == to_ty { - return CastKind::Noop; - } - - // F128 (long double) handling depends on architecture. - if from_ty == IrType::F128 || to_ty == IrType::F128 { - if f128_is_native { - // ARM/RISC-V: F128 is true IEEE binary128. Use softfloat library calls. - return classify_f128_cast_native(from_ty, to_ty); - } - // x86: F128 (x87 80-bit) is computed as F64. Treat F128 <-> F64 as noop, - // and F128 <-> other as F64 <-> other. - let effective_from = if from_ty == IrType::F128 { IrType::F64 } else { from_ty }; - let effective_to = if to_ty == IrType::F128 { IrType::F64 } else { to_ty }; - if effective_from == effective_to { - return CastKind::Noop; - } - return classify_cast(effective_from, effective_to); - } - - // Ptr is equivalent to U64 on LP64 targets, U32 on ILP32 targets. 
- if (from_ty == IrType::Ptr || to_ty == IrType::Ptr) && !from_ty.is_float() && !to_ty.is_float() { - let ptr_int_ty = if crate::common::types::target_is_32bit() { IrType::U32 } else { IrType::U64 }; - let effective_from = if from_ty == IrType::Ptr { ptr_int_ty } else { from_ty }; - let effective_to = if to_ty == IrType::Ptr { ptr_int_ty } else { to_ty }; - let ptr_sz = crate::common::types::target_ptr_size(); - if effective_from == effective_to || (effective_from.size() == ptr_sz && effective_to.size() == ptr_sz) { - return CastKind::Noop; - } - return classify_cast(effective_from, effective_to); - } - - // Float-to-int - if from_ty.is_float() && !to_ty.is_float() { - let is_unsigned_dest = to_ty.is_unsigned() || to_ty == IrType::Ptr; - let from_f64 = from_ty == IrType::F64; - if is_unsigned_dest { - let to_u64 = to_ty == IrType::U64 || to_ty == IrType::Ptr; - return CastKind::FloatToUnsigned { from_f64, to_u64 }; - } else { - return CastKind::FloatToSigned { from_f64 }; - } - } - - // Int-to-float - if !from_ty.is_float() && to_ty.is_float() { - let is_unsigned_src = from_ty.is_unsigned(); - let to_f64 = to_ty == IrType::F64; - if is_unsigned_src { - return CastKind::UnsignedToFloat { to_f64, from_ty }; - } else { - return CastKind::SignedToFloat { to_f64, from_ty }; - } - } - - // Float-to-float - if from_ty.is_float() && to_ty.is_float() { - let widen = from_ty == IrType::F32 && to_ty == IrType::F64; - return CastKind::FloatToFloat { widen }; - } - - // Integer-to-integer - let from_size = from_ty.size(); - let to_size = to_ty.size(); - - if from_size == to_size { - if from_ty.is_signed() && to_ty.is_unsigned() { - return CastKind::SignedToUnsignedSameSize { to_ty }; - } - if from_ty.is_unsigned() && to_ty.is_signed() { - return CastKind::UnsignedToSignedSameSize { to_ty }; - } - return CastKind::Noop; - } - - if to_size > from_size { - return CastKind::IntWiden { from_ty, to_ty }; - } - - CastKind::IntNarrow { to_ty } -} - -/// Backward-compatible wrapper: 
classifies casts with x86 F128 semantics -/// (F128 treated as F64 approximation). -pub fn classify_cast(from_ty: IrType, to_ty: IrType) -> CastKind { - classify_cast_with_f128(from_ty, to_ty, false) -} - -/// Classify F128 casts on targets where F128 is true IEEE binary128 (ARM/RISC-V). -/// These require softfloat library calls for full precision. -fn classify_f128_cast_native(from_ty: IrType, to_ty: IrType) -> CastKind { - debug_assert!(from_ty == IrType::F128 || to_ty == IrType::F128); - - if to_ty == IrType::F128 { - // Something -> F128 - if from_ty == IrType::F64 { - return CastKind::FloatToF128 { from_f32: false }; - } - if from_ty == IrType::F32 { - return CastKind::FloatToF128 { from_f32: true }; - } - // Integer -> F128 - if from_ty.is_float() { - // F128 -> F128: should not happen (handled by from_ty == to_ty check) - return CastKind::Noop; - } - if from_ty.is_unsigned() { - return CastKind::UnsignedToF128 { from_ty }; - } - return CastKind::SignedToF128 { from_ty }; - } - - // F128 -> something - if to_ty == IrType::F64 { - return CastKind::F128ToFloat { to_f32: false }; - } - if to_ty == IrType::F32 { - return CastKind::F128ToFloat { to_f32: true }; - } - // F128 -> integer - if to_ty.is_float() { - return CastKind::Noop; - } - if to_ty.is_unsigned() || to_ty == IrType::Ptr { - return CastKind::F128ToUnsigned { to_ty }; - } - CastKind::F128ToSigned { to_ty } -} - -/// Float arithmetic operations that all four backends support. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum FloatOp { - Add, - Sub, - Mul, - Div, -} - -/// Classify a binary operation on floats. Returns None if the operation is not -/// meaningful on floats (e.g., bitwise And, Or, Xor, shifts, integer remainder). 
-pub fn classify_float_binop(op: IrBinOp) -> Option { - match op { - IrBinOp::Add => Some(FloatOp::Add), - IrBinOp::Sub => Some(FloatOp::Sub), - IrBinOp::Mul => Some(FloatOp::Mul), - IrBinOp::SDiv | IrBinOp::UDiv => Some(FloatOp::Div), - _ => None, - } -} - -/// Map a float binop mnemonic (fadd/fsub/fmul/fdiv) to the corresponding F128 -/// soft-float libcall. Used by ARM and RISC-V backends (x86 uses x87 for F128). -/// Returns None for unrecognized mnemonics (caller should fall back to f64 hardware). -/// How to interpret an F128 comparison libcall result. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum F128CmpKind { - /// Result == 0 means true (equality) - Eq, - /// Result != 0 means true (inequality) - Ne, - /// Result < 0 means true (less than) - Lt, - /// Result <= 0 means true (less or equal) - Le, - /// Result > 0 means true (greater than) - Gt, - /// Result >= 0 means true (greater or equal) - Ge, -} - -/// Map a comparison operation to the F128 soft-float libcall and result interpretation. -pub fn f128_cmp_libcall(op: IrCmpOp) -> (&'static str, F128CmpKind) { - match op { - IrCmpOp::Eq => ("__eqtf2", F128CmpKind::Eq), - IrCmpOp::Ne => ("__eqtf2", F128CmpKind::Ne), - IrCmpOp::Slt | IrCmpOp::Ult => ("__lttf2", F128CmpKind::Lt), - IrCmpOp::Sle | IrCmpOp::Ule => ("__letf2", F128CmpKind::Le), - IrCmpOp::Sgt | IrCmpOp::Ugt => ("__gttf2", F128CmpKind::Gt), - IrCmpOp::Sge | IrCmpOp::Uge => ("__getf2", F128CmpKind::Ge), - } -} - -/// Extract the IEEE f128 low/high u64 halves from an F128 constant operand. -/// The f128 bytes are already in IEEE binary128 format. -/// Returns None for non-constant operands (caller must use runtime conversion). 
-pub fn f128_const_halves(op: &Operand) -> Option<(u64, u64)> { - if let Operand::Const(IrConst::LongDouble(_, f128_bytes)) = op { - let lo = u64::from_le_bytes(f128_bytes[0..8].try_into().unwrap()); - let hi = u64::from_le_bytes(f128_bytes[8..16].try_into().unwrap()); - Some((lo, hi)) - } else { - None - } -} diff --git a/src/backend/common.rs b/src/backend/common.rs deleted file mode 100644 index 1588102988..0000000000 --- a/src/backend/common.rs +++ /dev/null @@ -1,1742 +0,0 @@ -//! Shared backend utilities for assembler, linker, and data emission. -//! -//! All four backends (x86-64, i686, AArch64, RISC-V 64) share identical logic for: -//! - Assembling via an external toolchain (gcc/cross-gcc) -//! - Linking via an external toolchain -//! - Emitting assembly data directives (.data, .bss, .rodata, string literals, constants) -//! -//! This module extracts that shared logic, parameterized only by: -//! - The toolchain command name (e.g., "gcc" vs "aarch64-linux-gnu-gcc") -//! - The 64-bit data directive (`.quad` vs `.xword` vs `.dword`) -//! - Extra assembler/linker flags - -#[cfg(any(feature = "gcc_assembler", feature = "gcc_linker"))] -use std::process::Command; -#[cfg(any(feature = "gcc_assembler", feature = "gcc_linker"))] -use std::sync::Once; -use crate::ir::reexports::{ - GlobalInit, - IrConst, - IrGlobal, - IrModule, -}; -use crate::common::types::IrType; -use crate::backend::elf::{EM_386, EM_X86_64, EM_AARCH64, EM_RISCV}; - -/// Print a one-time warning when using a GCC-backed assembler. -/// -/// This fires when the `gcc_assembler` feature is enabled and GCC is -/// being used as the assembler. The warning is printed at most once per -/// process to avoid flooding stderr on large builds. 
-#[cfg(feature = "gcc_assembler")] -fn warn_gcc_assembler(command: &str) { - static WARN_ONCE: Once = Once::new(); - WARN_ONCE.call_once(|| { - eprintln!("WARNING: Using GCC-backed assembler ({}) [gcc_assembler feature enabled]", command); - }); -} - -/// Print a one-time warning when using GCC as the linker driver. -/// -/// This fires when the `gcc_linker` feature is enabled and GCC is -/// being used as the linker. The warning is printed at most once per process. -#[cfg(feature = "gcc_linker")] -fn warn_gcc_linker(command: &str) { - static WARN_ONCE: Once = Once::new(); - WARN_ONCE.call_once(|| { - eprintln!("WARNING: Using GCC-backed linker ({}) [gcc_linker feature enabled]", command); - }); -} - -/// Configuration for an external assembler. -#[cfg_attr(not(feature = "gcc_assembler"), allow(dead_code))] // Only constructed/used when gcc_assembler enabled -pub struct AssemblerConfig { - /// The assembler command (e.g., "gcc", "aarch64-linux-gnu-gcc") - pub command: &'static str, - /// Extra flags to pass (e.g., ["-march=rv64gc", "-mabi=lp64d"] for RISC-V) - pub extra_args: &'static [&'static str], -} - -/// Configuration for an external linker. -/// -/// The `command` and `extra_args` fields are only used when linking via GCC -/// (`gcc_linker` feature). The built-in linker dispatches by `expected_elf_machine`. -#[allow(dead_code)] // `command`/`extra_args` fields only read under gcc_linker feature -pub struct LinkerConfig { - /// The linker command (e.g., "gcc", "aarch64-linux-gnu-gcc") - pub command: &'static str, - /// Extra flags (e.g., ["-static"] for cross-compiled targets, ["-no-pie"] for x86) - pub extra_args: &'static [&'static str], - /// Expected ELF e_machine value for this target (e.g., EM_X86_64=62, EM_RISCV=243). - /// Used to validate input .o files before linking and produce clear error messages - /// when stale/wrong-arch objects are accidentally passed to the linker. 
- pub expected_elf_machine: u16, - /// Human-readable architecture name for error messages (e.g., "RISC-V", "x86-64"). - pub arch_name: &'static str, -} - -/// Assemble text to an object file using GCC as the assembler. -/// -/// Only available when the `gcc_assembler` Cargo feature is enabled. -/// The `extra_dynamic_args` are appended after the config's static extra_args, -/// allowing runtime overrides (e.g., -mabi=lp64 from CLI flags). -#[cfg(feature = "gcc_assembler")] -pub fn assemble_with_extra(config: &AssemblerConfig, asm_text: &str, output_path: &str, extra_dynamic_args: &[String]) -> Result<(), String> { - use crate::common::temp_files::TempFile; - - warn_gcc_assembler(config.command); - - let keep_asm = std::env::var("CCC_KEEP_ASM").is_ok(); - - let asm_file = if keep_asm { - let mut f = TempFile::with_path(format!("{}.s", output_path).into()); - f.set_keep(true); - f - } else { - let stem = std::path::Path::new(output_path) - .file_stem() - .and_then(|s| s.to_str()) - .unwrap_or("asm"); - TempFile::new("ccc_asm", stem, "s") - }; - std::fs::write(asm_file.path(), asm_text) - .map_err(|e| format!("Failed to write assembly: {}", e))?; - - let mut cmd = Command::new(config.command); - cmd.args(config.extra_args); - cmd.args(extra_dynamic_args); - cmd.args(["-c", "-o", output_path, asm_file.to_str()]); - - let result = cmd.output() - .map_err(|e| format!("Failed to run assembler ({}): {}", config.command, e))?; - - if !result.status.success() { - let stderr = String::from_utf8_lossy(&result.stderr); - return Err(format!("Assembly failed ({}): {}", config.command, stderr)); - } - - Ok(()) -} - -/// Map an ELF e_machine value to a human-readable architecture name. -fn elf_machine_name(em: u16) -> &'static str { - match em { - EM_386 => "i386", - 40 => "ARM", - EM_X86_64 => "x86-64", - EM_AARCH64 => "aarch64", - EM_RISCV => "RISC-V", - _ => "unknown", - } -} - -/// Validate that all .o files in a list match the expected ELF e_machine. 
-/// Returns Ok(()) if all files match or are not ELF objects (archives, shared libs, etc.). -/// Returns Err with a diagnostic listing the mismatched files. -fn validate_object_architectures( - files: impl Iterator>, - expected_machine: u16, - arch_name: &str, -) -> Result<(), String> { - use std::io::Read; - let mut mismatched = Vec::new(); - - for path_ref in files { - let path = path_ref.as_ref(); - // Only check .o files (not .a, .so, -l flags, -Wl, flags, etc.) - if !path.ends_with(".o") { - continue; - } - // Read the ELF header: first 20 bytes contain e_ident (16) + e_type (2) + e_machine (2) - let mut buf = [0u8; 20]; - let Ok(mut f) = std::fs::File::open(path) else { continue }; - let Ok(n) = f.read(&mut buf) else { continue }; - if n < 20 { - continue; - } - // Verify ELF magic - if &buf[0..4] != b"\x7fELF" { - continue; - } - // e_machine is at offset 18, always 2 bytes. - // Determine endianness from EI_DATA (byte 5): 1=LE, 2=BE - let is_le = buf[5] == 1; - let em = if is_le { - u16::from_le_bytes([buf[18], buf[19]]) - } else { - u16::from_be_bytes([buf[18], buf[19]]) - }; - if em != expected_machine { - mismatched.push((path.to_string(), em)); - } - } - - if mismatched.is_empty() { - return Ok(()); - } - - let mut msg = format!( - "Object file architecture mismatch: target is {} (ELF e_machine={}) but these files are for a different architecture:\n", - arch_name, expected_machine - ); - for (path, em) in &mismatched { - msg.push_str(&format!(" {} ({}; e_machine={})\n", path, elf_machine_name(*em), em)); - } - msg.push_str("Hint: these look like stale objects from a previous build. Try running 'make clean' before rebuilding."); - Err(msg) -} - -/// Link object files into an executable (or shared library), with additional user-provided linker args. -/// -/// When the `gcc_linker` Cargo feature is enabled, uses GCC as the linker -/// driver (with a warning). When disabled (default), uses the built-in native -/// linker for all supported architectures. 
-pub fn link_with_args(config: &LinkerConfig, object_files: &[&str], output_path: &str, user_args: &[String]) -> Result<(), String> { - // Validate that all input .o files match the target architecture. - validate_object_architectures( - object_files.iter().copied().chain(user_args.iter().map(|s| s.as_str())), - config.expected_elf_machine, - config.arch_name, - )?; - - let is_shared = user_args.iter().any(|a| a == "-shared"); - let is_nostdlib = user_args.iter().any(|a| a == "-nostdlib"); - let is_relocatable = user_args.iter().any(|a| a == "-r"); - #[allow(unused_variables)] // Only used in the non-gcc_linker path below - let is_static = user_args.iter().any(|a| a == "-static"); - - // When gcc_linker feature is enabled, use GCC for ALL linking - #[cfg(feature = "gcc_linker")] - { - link_with_gcc(config, object_files, output_path, user_args, - is_shared, is_nostdlib, is_relocatable) - } - - // Default (gcc_linker disabled): use the built-in native linker - #[cfg(not(feature = "gcc_linker"))] - { - if is_relocatable { - return Err("Relocatable linking (-r) requires the gcc_linker feature. \ - Rebuild with: cargo build --features gcc_linker".to_string()); - } - - // Look up the architecture config by ELF machine number - let arch = match config.expected_elf_machine { - EM_X86_64 => &DIRECT_LD_X86_64, - EM_AARCH64 => &DIRECT_LD_AARCH64, - EM_RISCV => &DIRECT_LD_RISCV64, - EM_386 => &DIRECT_LD_I686, - _ => { - return Err(format!( - "No built-in linker for ELF machine {} ({}). \ - Rebuild with: cargo build --features gcc_linker", - config.expected_elf_machine, config.arch_name - )); - } - }; - - link_builtin_native(arch, object_files, output_path, user_args, is_nostdlib, is_static, is_shared) - } -} - -/// Link using GCC as the linker driver (fallback path). -/// -/// Only compiled when the `gcc_linker` Cargo feature is enabled. 
-#[cfg(feature = "gcc_linker")] -fn link_with_gcc( - config: &LinkerConfig, - object_files: &[&str], - output_path: &str, - user_args: &[String], - is_shared: bool, - is_nostdlib: bool, - is_relocatable: bool, -) -> Result<(), String> { - warn_gcc_linker(config.command); - let ld_command = config.command; - - let mut cmd = Command::new(ld_command); - let skip_extra = is_shared || is_relocatable; - for arg in config.extra_args { - if skip_extra && (*arg == "-no-pie" || *arg == "-pie" || *arg == "-static") { - continue; - } - cmd.arg(arg); - } - cmd.arg("-o").arg(output_path); - if !is_relocatable { - cmd.arg("-Wl,-z,noexecstack"); - } - - for obj in object_files { - cmd.arg(obj); - } - - for arg in user_args { - cmd.arg(arg); - } - - if !is_nostdlib && !is_shared { - cmd.arg("-lc"); - cmd.arg("-lm"); - } - - let result = cmd.output() - .map_err(|e| format!("Failed to run linker ({}): {}", ld_command, e))?; - - if !result.stdout.is_empty() { - use std::io::Write; - let _ = std::io::stdout().write_all(&result.stdout); - } - - if !result.status.success() { - let stderr = String::from_utf8_lossy(&result.stderr); - return Err(format!("Linking failed ({}): {}", ld_command, stderr)); - } - - Ok(()) -} - -/// Per-architecture configuration for direct ld invocation and built-in linker -/// CRT/library discovery. -/// -/// Each architecture has different CRT/GCC library paths, emulation mode, -/// dynamic linker path, etc. This struct captures all those differences -/// so a single generic function can handle all backends. -#[cfg(not(feature = "gcc_linker"))] -#[allow(dead_code)] // Some fields (emulation, dynamic_linker, etc.) are stored for documentation/future use -struct DirectLdArchConfig { - /// Human-readable architecture name for error messages (e.g., "x86-64", "RISC-V") - arch_name: &'static str, - /// ELF e_machine value (e.g., EM_X86_64=62, EM_RISCV=243). - /// Used to dispatch to the correct backend linker. 
- elf_machine: u16, - /// ld emulation mode (e.g., "elf_x86_64", "elf64lriscv", "elf_i386", "aarch64linux") - emulation: &'static str, - /// Dynamic linker path (e.g., "/lib64/ld-linux-x86-64.so.2") - dynamic_linker: &'static str, - /// Base paths to search for GCC lib dir (containing crtbegin.o) - gcc_lib_base_paths: &'static [&'static str], - /// GCC versions to probe (newest first) - gcc_versions: &'static [&'static str], - /// Candidate directories for system CRT objects (crt1.o) - crt_dir_candidates: &'static [&'static str], - /// Standard system library directories for -L paths - system_lib_dirs: &'static [&'static str], - /// Extra ld flags specific to this architecture (e.g., AArch64 erratum workarounds) - extra_ld_flags: &'static [&'static str], - /// Extra GCC flags to skip when converting user args (e.g., "-m32" for i686) - extra_skip_flags: &'static [&'static str], - /// If true, crti.o and crtn.o are found in the GCC lib dir rather than the CRT dir. - /// This is the case for RISC-V cross-compilation where the CRT dir only has crt1.o. - crti_from_gcc_dir: bool, - /// Package hint for CRT not-found error messages - crt_package_hint: &'static str, - /// Package hint for GCC lib not-found error messages - gcc_package_hint: &'static str, -} - -/// Standard GCC versions to probe (newest to oldest), shared across most architectures. -#[cfg(not(feature = "gcc_linker"))] -const GCC_VERSIONS_FULL: &[&str] = &["14", "13", "12", "11", "10", "9", "8", "7", "6", "5", "4.9"]; -/// Shorter version list for architectures that don't have very old GCC support. 
-#[cfg(not(feature = "gcc_linker"))] -const GCC_VERSIONS_SHORT: &[&str] = &["14", "13", "12", "11", "10", "9", "8", "7"]; - -#[cfg(not(feature = "gcc_linker"))] -const DIRECT_LD_X86_64: DirectLdArchConfig = DirectLdArchConfig { - arch_name: "x86-64", - elf_machine: EM_X86_64, - emulation: "elf_x86_64", - dynamic_linker: "/lib64/ld-linux-x86-64.so.2", - gcc_lib_base_paths: &[ - "/usr/lib/gcc/x86_64-linux-gnu", - "/usr/lib/gcc/x86_64-redhat-linux", - "/usr/lib/gcc/x86_64-pc-linux-gnu", - "/usr/lib64/gcc/x86_64-linux-gnu", - "/usr/lib64/gcc/x86_64-redhat-linux", - ], - gcc_versions: GCC_VERSIONS_FULL, - crt_dir_candidates: &[ - "/usr/lib/x86_64-linux-gnu", - "/usr/lib64", - "/lib/x86_64-linux-gnu", - "/lib64", - ], - system_lib_dirs: &[ - "/lib/x86_64-linux-gnu", - "/lib/../lib", - "/usr/lib/x86_64-linux-gnu", - "/usr/lib/../lib", - ], - extra_ld_flags: &[], - extra_skip_flags: &[], - crti_from_gcc_dir: false, - crt_package_hint: "Is the libc development package installed?", - gcc_package_hint: "Is the GCC development package installed?", -}; - -#[cfg(not(feature = "gcc_linker"))] -const DIRECT_LD_RISCV64: DirectLdArchConfig = DirectLdArchConfig { - arch_name: "RISC-V", - elf_machine: EM_RISCV, - emulation: "elf64lriscv", - dynamic_linker: "/lib/ld-linux-riscv64-lp64d.so.1", - gcc_lib_base_paths: &[ - "/usr/lib/gcc-cross/riscv64-linux-gnu", - "/usr/lib/gcc/riscv64-linux-gnu", - "/usr/lib/gcc/riscv64-redhat-linux", - "/usr/lib64/gcc/riscv64-linux-gnu", - ], - gcc_versions: GCC_VERSIONS_SHORT, - crt_dir_candidates: &[ - "/usr/riscv64-linux-gnu/lib", - "/usr/lib/riscv64-linux-gnu", - "/lib/riscv64-linux-gnu", - ], - system_lib_dirs: &[ - "/lib/riscv64-linux-gnu", - "/usr/lib/riscv64-linux-gnu", - ], - extra_ld_flags: &[], - extra_skip_flags: &[], - crti_from_gcc_dir: true, - crt_package_hint: "Is the riscv64-linux-gnu libc development package installed? \ - (e.g., libc6-dev-riscv64-cross)", - gcc_package_hint: "Is the riscv64-linux-gnu GCC cross-compiler installed? 
\ - (e.g., gcc-riscv64-linux-gnu)", -}; - -#[cfg(not(feature = "gcc_linker"))] -const DIRECT_LD_I686: DirectLdArchConfig = DirectLdArchConfig { - arch_name: "i686", - elf_machine: EM_386, - emulation: "elf_i386", - dynamic_linker: "/lib/ld-linux.so.2", - gcc_lib_base_paths: &[ - "/usr/lib/gcc-cross/i686-linux-gnu", - "/usr/lib/gcc/i686-linux-gnu", - "/usr/lib/gcc/i686-redhat-linux", - "/usr/lib/gcc/i686-pc-linux-gnu", - "/usr/lib/gcc/i386-linux-gnu", - "/usr/lib/gcc/i386-redhat-linux", - ], - gcc_versions: GCC_VERSIONS_FULL, - crt_dir_candidates: &[ - "/usr/lib/i386-linux-gnu", - "/usr/i686-linux-gnu/lib", - "/usr/lib32", - "/lib/i386-linux-gnu", - "/lib32", - ], - system_lib_dirs: &[ - "/lib/i386-linux-gnu", - "/lib/../lib", - "/usr/lib/i386-linux-gnu", - "/usr/lib/../lib", - "/usr/i686-linux-gnu/lib", - ], - extra_ld_flags: &[], - extra_skip_flags: &["-m32"], - crti_from_gcc_dir: false, - crt_package_hint: "Is the libc-dev-i386-cross or libc6-dev-i386 package installed?", - gcc_package_hint: "Is the gcc-i686-linux-gnu package installed?", -}; - -#[cfg(not(feature = "gcc_linker"))] -const DIRECT_LD_AARCH64: DirectLdArchConfig = DirectLdArchConfig { - arch_name: "AArch64", - elf_machine: EM_AARCH64, - emulation: "aarch64linux", - dynamic_linker: "/lib/ld-linux-aarch64.so.1", - gcc_lib_base_paths: &[ - "/usr/lib/gcc-cross/aarch64-linux-gnu", - "/usr/lib/gcc/aarch64-linux-gnu", - "/usr/lib/gcc/aarch64-redhat-linux", - "/usr/lib/gcc/aarch64-unknown-linux-gnu", - "/usr/lib64/gcc/aarch64-linux-gnu", - "/usr/lib64/gcc/aarch64-redhat-linux", - ], - gcc_versions: GCC_VERSIONS_FULL, - crt_dir_candidates: &[ - "/usr/aarch64-linux-gnu/lib", - "/usr/lib/aarch64-linux-gnu", - "/usr/lib64", - "/lib/aarch64-linux-gnu", - "/lib64", - ], - system_lib_dirs: &[ - "/lib/aarch64-linux-gnu", - "/lib/../lib", - "/usr/lib/aarch64-linux-gnu", - "/usr/lib/../lib", - "/usr/aarch64-linux-gnu/lib", - ], - extra_ld_flags: &["-EL", "-X", "--fix-cortex-a53-843419"], - extra_skip_flags: &[], - 
crti_from_gcc_dir: false, - crt_package_hint: "Is the libc-dev-arm64-cross package installed?", - gcc_package_hint: "Is the gcc-aarch64-linux-gnu package installed?", -}; - -/// Discover GCC's library directory by probing well-known paths. -/// Returns the path containing crtbegin.o (e.g., "/usr/lib/gcc/x86_64-linux-gnu/13"). -#[cfg(not(feature = "gcc_linker"))] -fn find_gcc_lib_dir(arch: &DirectLdArchConfig) -> Option { - for base in arch.gcc_lib_base_paths { - for ver in arch.gcc_versions { - let dir = format!("{}/{}", base, ver); - let crtbegin = format!("{}/crtbegin.o", dir); - if std::path::Path::new(&crtbegin).exists() { - return Some(dir); - } - } - } - None -} - -/// Discover the system CRT directory containing crt1.o. -/// Returns the path (e.g., "/usr/lib/x86_64-linux-gnu"). -#[cfg(not(feature = "gcc_linker"))] -fn find_crt_dir(arch: &DirectLdArchConfig) -> Option { - for dir in arch.crt_dir_candidates { - let crt1 = format!("{}/crt1.o", dir); - if std::path::Path::new(&crt1).exists() { - return Some(dir.to_string()); - } - } - None -} - -/// Resolve CRT objects and library paths for a built-in linker using DirectLdArchConfig. -/// -/// This shared helper is used by all four built-in linker wrappers -/// (x86-64, i686, AArch64, RISC-V) to avoid duplicating CRT/library -/// discovery logic. 
Returns: -/// - `crt_before`: CRT objects to link before user objects -/// - `crt_after`: CRT objects to link after user objects -/// - `lib_paths`: Combined library search paths (user -L first, then system paths) -/// - `needed_libs`: Default libraries to link -#[cfg(not(feature = "gcc_linker"))] -struct BuiltinLinkSetup { - crt_before: Vec, - crt_after: Vec, - lib_paths: Vec, - needed_libs: Vec, -} - -#[cfg(not(feature = "gcc_linker"))] -fn resolve_builtin_link_setup( - arch: &DirectLdArchConfig, - user_args: &[String], - is_nostdlib: bool, - is_static: bool, -) -> BuiltinLinkSetup { - let gcc_lib_dir = find_gcc_lib_dir(arch); - let crt_dir = find_crt_dir(arch); - - // System library paths - let mut system_lib_paths: Vec = Vec::new(); - if let Some(ref gcc) = gcc_lib_dir { - system_lib_paths.push(gcc.clone()); - } - if let Some(ref crt) = crt_dir { - system_lib_paths.push(crt.clone()); - } - for dir in arch.system_lib_dirs { - if std::path::Path::new(dir).exists() { - system_lib_paths.push(dir.to_string()); - } - } - - // User-provided -L paths from args - let mut user_lib_paths: Vec = Vec::new(); - let mut i = 0; - while i < user_args.len() { - let arg = &user_args[i]; - if let Some(path) = arg.strip_prefix("-L") { - if path.is_empty() { - if i + 1 < user_args.len() { - i += 1; - user_lib_paths.push(user_args[i].clone()); - } - } else { - user_lib_paths.push(path.to_string()); - } - } else if let Some(wl_arg) = arg.strip_prefix("-Wl,") { - for part in wl_arg.split(',') { - if let Some(lpath) = part.strip_prefix("-L") { - user_lib_paths.push(lpath.to_string()); - } - } - } - i += 1; - } - - // CRT objects - let mut crt_before: Vec = Vec::new(); - let mut crt_after: Vec = Vec::new(); - - if !is_nostdlib { - // crt1.o comes from the CRT dir - if let Some(ref crt) = crt_dir { - crt_before.push(format!("{}/crt1.o", crt)); - } - // crti.o: from GCC dir for cross-compilation (e.g., RISC-V), otherwise from CRT dir - if arch.crti_from_gcc_dir { - if let Some(ref gcc) = 
gcc_lib_dir { - crt_before.push(format!("{}/crti.o", gcc)); - } - } else if let Some(ref crt) = crt_dir { - crt_before.push(format!("{}/crti.o", crt)); - } - // crtbegin: use crtbeginT.o for static linking, crtbegin.o for dynamic - if let Some(ref gcc) = gcc_lib_dir { - if is_static { - let crtbegin_t = format!("{}/crtbeginT.o", gcc); - if std::path::Path::new(&crtbegin_t).exists() { - crt_before.push(crtbegin_t); - } else { - crt_before.push(format!("{}/crtbegin.o", gcc)); - } - } else { - crt_before.push(format!("{}/crtbegin.o", gcc)); - } - } - if let Some(ref gcc) = gcc_lib_dir { - crt_after.push(format!("{}/crtend.o", gcc)); - } - // crtn.o: from GCC dir for cross-compilation, otherwise from CRT dir - if arch.crti_from_gcc_dir { - if let Some(ref gcc) = gcc_lib_dir { - crt_after.push(format!("{}/crtn.o", gcc)); - } - } else if let Some(ref crt) = crt_dir { - crt_after.push(format!("{}/crtn.o", crt)); - } - } - - // Default libraries - let needed_libs: Vec = if !is_nostdlib { - vec!["gcc".to_string(), "c".to_string(), "m".to_string()] - } else { - vec![] - }; - - // Combined paths: user first, then system - let mut lib_paths: Vec = user_lib_paths; - lib_paths.extend(system_lib_paths); - - BuiltinLinkSetup { crt_before, crt_after, lib_paths, needed_libs } -} - -/// Add architecture-specific extra libraries after "gcc" in the needed libs list. -/// -/// Most architectures need libgcc_eh.a (static) or libgcc_s.so (dynamic) for -/// exception handling / stack unwinding, but the exact policy varies: -/// - x86-64: no extra libs needed (libgcc alone suffices) -/// - i686: always adds gcc_eh (needed for __divmoddi4, etc.) 
-/// - AArch64/RISC-V: gcc_eh for static, gcc_s for dynamic -#[cfg(not(feature = "gcc_linker"))] -fn add_arch_extra_libs(setup: &mut BuiltinLinkSetup, elf_machine: u16, is_static: bool) { - // x86-64 doesn't need extra gcc libs - if elf_machine == EM_X86_64 { - return; - } - // Find the "gcc" entry and insert the extra lib after it - if let Some(pos) = setup.needed_libs.iter().position(|l| l == "gcc") { - // i686 always needs gcc_eh; others use gcc_eh for static, gcc_s for dynamic - let extra = if elf_machine == EM_386 || is_static { "gcc_eh" } else { "gcc_s" }; - setup.needed_libs.insert(pos + 1, extra.to_string()); - } -} - -/// Convert a `BuiltinLinkSetup` into borrowed slices for passing to backend linkers. -/// -/// Avoids repeating the same 4-line `.iter().map(|s| s.as_str()).collect()` pattern. -#[cfg(not(feature = "gcc_linker"))] -struct LinkSetupRefs<'a> { - lib_paths: Vec<&'a str>, - needed_libs: Vec<&'a str>, - crt_before: Vec<&'a str>, - crt_after: Vec<&'a str>, -} - -#[cfg(not(feature = "gcc_linker"))] -impl BuiltinLinkSetup { - fn as_refs(&self) -> LinkSetupRefs<'_> { - LinkSetupRefs { - lib_paths: self.lib_paths.iter().map(|s| s.as_str()).collect(), - needed_libs: self.needed_libs.iter().map(|s| s.as_str()).collect(), - crt_before: self.crt_before.iter().map(|s| s.as_str()).collect(), - crt_after: self.crt_after.iter().map(|s| s.as_str()).collect(), - } - } -} - -/// Link using the built-in native ELF linker for any supported architecture. -/// -/// This is the fully native path: no external ld binary is needed. The linker -/// reads ELF .o files and .a archives, resolves symbols against system shared -/// libraries (libc.so.6), handles relocations, and produces a dynamically-linked -/// ELF executable. Dispatches to the correct per-architecture backend based on -/// the `arch.elf_machine` value. -/// -/// For shared library output (-shared), delegates to the per-arch `link_shared` -/// entry point with library paths only (no CRT objects). 
-#[cfg(not(feature = "gcc_linker"))] -fn link_builtin_native( - arch: &DirectLdArchConfig, - object_files: &[&str], - output_path: &str, - user_args: &[String], - is_nostdlib: bool, - is_static: bool, - is_shared: bool, -) -> Result<(), String> { - use crate::backend::{x86, i686, arm, riscv}; - - if is_shared { - // Shared libraries: no CRT objects, lib paths only - let setup = resolve_builtin_link_setup(arch, user_args, true, false); - let refs = setup.as_refs(); - return match arch.elf_machine { - EM_X86_64 => { - // x86-64 shared linker also takes implicit libs (gcc for runtime helpers) - let implicit_libs: Vec<&str> = if is_nostdlib { vec![] } else { vec!["gcc"] }; - x86::linker::link_shared(object_files, output_path, user_args, &refs.lib_paths, &implicit_libs) - } - EM_AARCH64 => arm::linker::link_shared(object_files, output_path, user_args, &refs.lib_paths), - EM_RISCV => riscv::linker::link_shared(object_files, output_path, user_args, &refs.lib_paths), - EM_386 => i686::linker::link_shared(object_files, output_path, user_args, &refs.lib_paths), - _ => Err(format!("No shared library linker for {} (elf_machine={})", arch.arch_name, arch.elf_machine)), - }; - } - - let mut setup = resolve_builtin_link_setup(arch, user_args, is_nostdlib, is_static); - add_arch_extra_libs(&mut setup, arch.elf_machine, is_static); - let refs = setup.as_refs(); - - match arch.elf_machine { - EM_X86_64 => x86::linker::link_builtin( - object_files, output_path, user_args, - &refs.lib_paths, &refs.needed_libs, &refs.crt_before, &refs.crt_after, - ), - EM_386 => i686::linker::link_builtin( - object_files, output_path, user_args, - &refs.lib_paths, &refs.needed_libs, &refs.crt_before, &refs.crt_after, - ), - EM_AARCH64 => arm::linker::link_builtin( - object_files, output_path, user_args, - &refs.lib_paths, &refs.needed_libs, &refs.crt_before, &refs.crt_after, - is_static, - ), - EM_RISCV => riscv::linker::link_builtin( - object_files, output_path, user_args, - &refs.lib_paths, 
&refs.needed_libs, &refs.crt_before, &refs.crt_after, - ), - _ => Err(format!("No built-in linker for {} (elf_machine={})", arch.arch_name, arch.elf_machine)), - } -} - -/// Assembly output buffer with helpers for emitting text. -/// -/// Besides the generic `emit` and `emit_fmt` methods, this provides specialized -/// fast-path emitters for common patterns that avoid `core::fmt` overhead. -/// The fast integer writer (`write_i64`) uses direct digit extraction instead -/// of going through `Display`/`write_fmt` machinery. -pub struct AsmOutput { - pub buf: String, -} - -/// Write an i64 directly into a String buffer using manual digit extraction. -/// This is ~3-4x faster than `write!(buf, "{}", val)` for the common case -/// because it avoids the `core::fmt` vtable dispatch and `pad_integral` overhead. -#[inline] -fn write_i64_fast(buf: &mut String, val: i64) { - if val == 0 { - buf.push('0'); - return; - } - let mut tmp = [0u8; 20]; // i64 max is 19 digits + sign - let negative = val < 0; - // Work with absolute value using wrapping to handle i64::MIN correctly - let mut v = if negative { (val as u64).wrapping_neg() } else { val as u64 }; - let mut pos = 20; - while v > 0 { - pos -= 1; - tmp[pos] = b'0' + (v % 10) as u8; - v /= 10; - } - if negative { - pos -= 1; - tmp[pos] = b'-'; - } - // All bytes are ASCII digits and optionally '-', which is always valid UTF-8. - let s = std::str::from_utf8(&tmp[pos..20]).expect("integer formatting produced non-UTF8"); - buf.push_str(s); -} - -/// Write a u64 directly into a String buffer. 
-#[inline] -fn write_u64_fast(buf: &mut String, val: u64) { - if val == 0 { - buf.push('0'); - return; - } - let mut tmp = [0u8; 20]; // u64 max is 20 digits - let mut v = val; - let mut pos = 20; - while v > 0 { - pos -= 1; - tmp[pos] = b'0' + (v % 10) as u8; - v /= 10; - } - let s = std::str::from_utf8(&tmp[pos..20]).expect("integer formatting produced non-UTF8"); - buf.push_str(s); -} - -impl AsmOutput { - pub fn new() -> Self { - // Pre-allocate 256KB to avoid repeated reallocations during codegen. - Self { buf: String::with_capacity(256 * 1024) } - } - - /// Emit a line of assembly. - #[inline] - pub fn emit(&mut self, s: &str) { - self.buf.push_str(s); - self.buf.push('\n'); - } - - /// Emit formatted assembly directly into the buffer (no temporary String). - #[inline] - pub fn emit_fmt(&mut self, args: std::fmt::Arguments<'_>) { - std::fmt::Write::write_fmt(&mut self.buf, args).unwrap(); - self.buf.push('\n'); - } - - // ── Fast-path emitters ────────────────────────────────────────────── - // - // These avoid the overhead of `format_args!` + `core::fmt::write` for - // the most common codegen patterns. Each one directly pushes bytes into - // the buffer using `push_str` and our fast integer writer. - - /// Emit: ` {mnemonic} ${imm}, %{reg}` - /// Used for movq/movl/movabsq with immediate to register. - #[inline] - pub fn emit_instr_imm_reg(&mut self, mnemonic: &str, imm: i64, reg: &str) { - self.buf.push_str(mnemonic); - self.buf.push_str(" $"); - write_i64_fast(&mut self.buf, imm); - self.buf.push_str(", %"); - self.buf.push_str(reg); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} %{src}, %{dst}` - /// Used for movq/movl/xorq register-to-register. 
- #[inline] - pub fn emit_instr_reg_reg(&mut self, mnemonic: &str, src: &str, dst: &str) { - self.buf.push_str(mnemonic); - self.buf.push_str(" %"); - self.buf.push_str(src); - self.buf.push_str(", %"); - self.buf.push_str(dst); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} {offset}(%rbp), %{reg}` - /// Used for loads from stack slots. - #[inline] - pub fn emit_instr_rbp_reg(&mut self, mnemonic: &str, offset: i64, reg: &str) { - self.buf.push_str(mnemonic); - self.buf.push(' '); - write_i64_fast(&mut self.buf, offset); - self.buf.push_str("(%rbp), %"); - self.buf.push_str(reg); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} %{reg}, {offset}(%rbp)` - /// Used for stores to stack slots. - #[inline] - pub fn emit_instr_reg_rbp(&mut self, mnemonic: &str, reg: &str, offset: i64) { - self.buf.push_str(mnemonic); - self.buf.push_str(" %"); - self.buf.push_str(reg); - self.buf.push_str(", "); - write_i64_fast(&mut self.buf, offset); - self.buf.push_str("(%rbp)"); - self.buf.push('\n'); - } - - /// Emit a block label line: `.LBB{id}:` - #[inline] - pub fn emit_block_label(&mut self, block_id: u32) { - self.buf.push_str(".LBB"); - write_u64_fast(&mut self.buf, block_id as u64); - self.buf.push(':'); - self.buf.push('\n'); - } - - /// Emit: ` jmp .LBB{block_id}` - #[inline] - pub fn emit_jmp_block(&mut self, block_id: u32) { - self.buf.push_str(" jmp .LBB"); - write_u64_fast(&mut self.buf, block_id as u64); - self.buf.push('\n'); - } - - /// Emit: ` {jcc} .LBB{block_id}` (conditional jump to block label) - #[inline] - pub fn emit_jcc_block(&mut self, jcc: &str, block_id: u32) { - self.buf.push_str(jcc); - self.buf.push_str(" .LBB"); - write_u64_fast(&mut self.buf, block_id as u64); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} {reg}` (single-register instruction like push/pop) - #[inline] - pub fn emit_instr_reg(&mut self, mnemonic: &str, reg: &str) { - self.buf.push_str(mnemonic); - self.buf.push_str(" %"); - self.buf.push_str(reg); - 
self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} ${imm}` (single-immediate instruction like push) - #[inline] - pub fn emit_instr_imm(&mut self, mnemonic: &str, imm: i64) { - self.buf.push_str(mnemonic); - self.buf.push_str(" $"); - write_i64_fast(&mut self.buf, imm); - self.buf.push('\n'); - } - - /// Write an i64 into the buffer without newline. Useful for building - /// custom format patterns that include integers. - #[inline] - pub fn write_i64(&mut self, val: i64) { - write_i64_fast(&mut self.buf, val); - } - - /// Write a u64 into the buffer without newline. - #[inline] - pub fn write_u64(&mut self, val: u64) { - write_u64_fast(&mut self.buf, val); - } - - /// Emit: ` {mnemonic} {offset}(%rbp)` (single rbp-offset operand, e.g. fldt/fstpt) - #[inline] - pub fn emit_instr_rbp(&mut self, mnemonic: &str, offset: i64) { - self.buf.push_str(mnemonic); - self.buf.push(' '); - write_i64_fast(&mut self.buf, offset); - self.buf.push_str("(%rbp)"); - self.buf.push('\n'); - } - - /// Emit a named label definition: `{label}:` - #[inline] - pub fn emit_named_label(&mut self, label: &str) { - self.buf.push_str(label); - self.buf.push(':'); - self.buf.push('\n'); - } - - /// Emit: ` jmp {label}` (jump to named label) - #[inline] - pub fn emit_jmp_label(&mut self, label: &str) { - self.buf.push_str(" jmp "); - self.buf.push_str(label); - self.buf.push('\n'); - } - - /// Emit: ` {jcc} {label}` (conditional jump to named label) - #[inline] - pub fn emit_jcc_label(&mut self, jcc: &str, label: &str) { - self.buf.push_str(jcc); - self.buf.push(' '); - self.buf.push_str(label); - self.buf.push('\n'); - } - - /// Emit: ` call {target}` (direct call to named function/label) - #[inline] - pub fn emit_call(&mut self, target: &str) { - self.buf.push_str(" call "); - self.buf.push_str(target); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} {offset}(%{base}), %{reg}` (memory to register with arbitrary base) - #[inline] - pub fn emit_instr_mem_reg(&mut self, mnemonic: &str, 
offset: i64, base: &str, reg: &str) { - self.buf.push_str(mnemonic); - self.buf.push(' '); - if offset != 0 { - write_i64_fast(&mut self.buf, offset); - } - self.buf.push_str("(%"); - self.buf.push_str(base); - self.buf.push_str("), %"); - self.buf.push_str(reg); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} %{reg}, {offset}(%{base})` (register to memory with arbitrary base) - #[inline] - pub fn emit_instr_reg_mem(&mut self, mnemonic: &str, reg: &str, offset: i64, base: &str) { - self.buf.push_str(mnemonic); - self.buf.push_str(" %"); - self.buf.push_str(reg); - self.buf.push_str(", "); - if offset != 0 { - write_i64_fast(&mut self.buf, offset); - } - self.buf.push_str("(%"); - self.buf.push_str(base); - self.buf.push(')'); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} ${imm}, {offset}(%{base})` (immediate to memory with arbitrary base) - #[inline] - pub fn emit_instr_imm_mem(&mut self, mnemonic: &str, imm: i64, offset: i64, base: &str) { - self.buf.push_str(mnemonic); - self.buf.push_str(" $"); - write_i64_fast(&mut self.buf, imm); - self.buf.push_str(", "); - if offset != 0 { - write_i64_fast(&mut self.buf, offset); - } - self.buf.push_str("(%"); - self.buf.push_str(base); - self.buf.push(')'); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} {symbol}(%{base}), %{reg}` (symbol-relative addressing) - /// Used for RIP-relative loads like `leaq table_label(%rip), %rcx`. - #[inline] - pub fn emit_instr_sym_base_reg(&mut self, mnemonic: &str, symbol: &str, base: &str, reg: &str) { - self.buf.push_str(mnemonic); - self.buf.push(' '); - self.buf.push_str(symbol); - self.buf.push_str("(%"); - self.buf.push_str(base); - self.buf.push_str("), %"); - self.buf.push_str(reg); - self.buf.push('\n'); - } - - /// Emit: ` {mnemonic} ${symbol}, %{reg}` (symbol as immediate) - /// Used for absolute symbol addressing like `movq $name, %rax`. 
- #[inline] - pub fn emit_instr_sym_imm_reg(&mut self, mnemonic: &str, symbol: &str, reg: &str) { - self.buf.push_str(mnemonic); - self.buf.push_str(" $"); - self.buf.push_str(symbol); - self.buf.push_str(", %"); - self.buf.push_str(reg); - self.buf.push('\n'); - } - - /// Push a string slice without newline. - #[inline] - pub fn write_str(&mut self, s: &str) { - self.buf.push_str(s); - } - - /// Push a newline to end the current line. - #[inline] - pub fn newline(&mut self) { - self.buf.push('\n'); - } -} - -/// Emit formatted assembly directly into the output buffer, avoiding temporary -/// String allocations from `format!()`. Usage: `emit!(state, " mov {}, {}", src, dst)` -#[macro_export] -macro_rules! emit { - ($state:expr, $($arg:tt)*) => { - $state.out.emit_fmt(format_args!($($arg)*)) - }; -} - -/// The only arch-specific difference in data emission: the name of the 64-bit pointer directive. -/// x86 uses `.quad`, AArch64 uses `.xword`, RISC-V uses `.dword`. -#[derive(Clone, Copy)] -pub enum PtrDirective { - Quad, // x86-64 - Long, // i686 (32-bit) - Xword, // AArch64 - Dword, // RISC-V 64 -} - -impl PtrDirective { - pub fn as_str(self) -> &'static str { - match self { - PtrDirective::Quad => ".quad", - PtrDirective::Long => ".long", - PtrDirective::Xword => ".xword", - PtrDirective::Dword => ".dword", - } - } - - /// Returns true if this is an x86 target directive (x86-64 or i686). - /// Used to select x87 80-bit extended precision format for long double constants. - pub fn is_x86(self) -> bool { - matches!(self, PtrDirective::Quad | PtrDirective::Long) - } - - /// Returns true if this is a 32-bit pointer directive. - pub fn is_32bit(self) -> bool { - matches!(self, PtrDirective::Long) - } - - /// Returns true if this is the RISC-V target directive. - /// RISC-V stores full IEEE binary128 long doubles in memory (allocas and globals). 
- pub fn is_riscv(self) -> bool { - matches!(self, PtrDirective::Dword) - } - - /// Returns true if this is the AArch64 target directive. - /// AArch64 stores full IEEE binary128 long doubles in memory (allocas and globals). - pub fn is_arm(self) -> bool { - matches!(self, PtrDirective::Xword) - } - - /// Convert a byte alignment value to the correct `.align` argument for this target. - /// On x86-64, `.align N` means N bytes. On ARM and RISC-V, `.align N` means 2^N bytes, - /// so we must emit log2(N) instead. - pub fn align_arg(self, bytes: usize) -> usize { - debug_assert!(bytes == 0 || bytes.is_power_of_two(), "alignment must be power of 2"); - match self { - PtrDirective::Quad | PtrDirective::Long => bytes, - PtrDirective::Xword | PtrDirective::Dword => { - if bytes <= 1 { 0 } else { bytes.trailing_zeros() as usize } - } - } - } -} - -/// Emit all data sections (rodata for string literals, .data and .bss for globals). -pub fn emit_data_sections(out: &mut AsmOutput, module: &IrModule, ptr_dir: PtrDirective) { - // String literals in .rodata - if !module.string_literals.is_empty() || !module.wide_string_literals.is_empty() - || !module.char16_string_literals.is_empty() { - out.emit(".section .rodata"); - for (label, value) in &module.string_literals { - out.emit_fmt(format_args!("{}:", label)); - emit_string_bytes(out, value); - } - // Wide string literals (L"..."): each char is a 4-byte wchar_t value - for (label, chars) in &module.wide_string_literals { - out.emit_fmt(format_args!(".align {}", ptr_dir.align_arg(4))); - out.emit_fmt(format_args!("{}:", label)); - for &ch in chars { - out.emit_fmt(format_args!(" .long {}", ch)); - } - } - // char16_t string literals (u"..."): each char is a 2-byte char16_t value - for (label, chars) in &module.char16_string_literals { - out.emit_fmt(format_args!(".align {}", ptr_dir.align_arg(2))); - out.emit_fmt(format_args!("{}:", label)); - for &ch in chars { - out.emit_fmt(format_args!(" .short {}", ch)); - } - } - 
out.emit(""); - } - - // Global variables - emit_globals(out, &module.globals, ptr_dir); -} - -/// Compute effective alignment for a global, promoting to 16 when size >= 16. -/// This matches GCC/Clang behavior on x86-64 and aarch64, enabling aligned SSE/NEON access. -/// Globals placed in custom sections are excluded from promotion because they may -/// form contiguous arrays (e.g. the kernel's __param or .init.setup sections) where -/// the linker expects elements at their natural stride with no extra padding. -/// Additionally, when the user explicitly specified an alignment via __attribute__((aligned(N))) -/// or _Alignas, we respect their choice and don't auto-promote. GCC behaves the same way: -/// explicit aligned(8) on a 24-byte struct gives 8-byte alignment, not 16. -fn effective_align(g: &IrGlobal) -> usize { - if g.section.is_some() || g.has_explicit_align { - return g.align; - } - if g.size >= 16 && g.align < 16 { - 16 - } else { - g.align - } -} - -/// Emit a zero-initialized global variable (used in .bss, .tbss, and custom section zero-init). -fn emit_zero_global(out: &mut AsmOutput, g: &IrGlobal, obj_type: &str, ptr_dir: PtrDirective) { - emit_symbol_directives(out, g); - out.emit_fmt(format_args!(".align {}", ptr_dir.align_arg(effective_align(g)))); - out.emit_fmt(format_args!(".type {}, {}", g.name, obj_type)); - out.emit_fmt(format_args!(".size {}, {}", g.name, g.size)); - out.emit_fmt(format_args!("{}:", g.name)); - out.emit_fmt(format_args!(" .zero {}", g.size)); -} - -/// Target section classification for a global variable. -/// -/// Each global is classified exactly once into one of these categories, -/// which determines which assembly section it belongs to. -#[derive(PartialEq, Eq)] -enum GlobalSection { - /// Extern (undefined) symbol -- only needs visibility directive, no storage. - Extern, - /// Has `__attribute__((section(...)))` -- emitted in its custom section. 
- Custom, - /// Const-qualified, non-TLS, initialized, non-zero-size -> `.rodata`. - Rodata, - /// Thread-local, initialized, non-zero-size -> `.tdata`. - Tdata, - /// Non-const, non-TLS, initialized, non-zero-size -> `.data`. - Data, - /// Zero-initialized, `is_common` flag set -> `.comm` directive. - Common, - /// Thread-local, zero-initialized (or zero-size) -> `.tbss`. - Tbss, - /// Non-TLS, zero-initialized (or zero-size with init) -> `.bss`. - Bss, -} - -/// Classify a global variable into the section it should be emitted to. -/// -/// The classification priority matches GCC behavior: -/// 1. Extern symbols get no storage (just visibility directives). -/// 2. Custom section overrides all other placement. -/// 3. TLS globals go to .tdata (initialized) or .tbss (zero-init). -/// 4. Const globals go to .rodata. -/// 5. Non-zero initialized non-const globals go to .data. -/// 6. Zero-initialized common globals go to .comm. -/// 7. Zero-initialized non-common globals go to .bss. -fn classify_global(g: &IrGlobal) -> GlobalSection { - if g.is_extern { - return GlobalSection::Extern; - } - if g.section.is_some() { - return GlobalSection::Custom; - } - let is_zero = matches!(g.init, GlobalInit::Zero); - let has_nonzero_init = !is_zero && g.size > 0; - if g.is_thread_local { - return if has_nonzero_init { GlobalSection::Tdata } else { GlobalSection::Tbss }; - } - if has_nonzero_init { - return if g.is_const { GlobalSection::Rodata } else { GlobalSection::Data }; - } - // Zero-initialized (or zero-size with init) - if g.is_common && is_zero { - return GlobalSection::Common; - } - GlobalSection::Bss -} - -/// Emit global variable definitions, grouped by target section. -/// -/// Classifies each global once via `classify_global`, then emits all globals -/// for each section in a fixed order: extern visibility, custom sections, -/// .rodata, .tdata, .data, .comm, .tbss, .bss. 
-fn emit_globals(out: &mut AsmOutput, globals: &[IrGlobal], ptr_dir: PtrDirective) { - // Phase 1: classify every global into its target section. - let classified: Vec = globals.iter().map(classify_global).collect(); - - // Phase 2: emit each section group in order. - - // Extern visibility directives (needed for PIC code so the assembler/linker knows - // these symbols are resolved within the link unit). - for (g, sect) in globals.iter().zip(&classified) { - if matches!(sect, GlobalSection::Extern) { - emit_visibility_directive(out, &g.name, &g.visibility); - // For extern TLS variables, emit .type @tls_object so the assembler - // creates a TLS-typed undefined symbol. Without this, the linker - // reports "TLS definition mismatches non-TLS reference" when the - // defining TU has the symbol in .tdata but this TU's reference - // lacks TLS type information (defaults to STT_NOTYPE). - if g.is_thread_local { - out.emit_fmt(format_args!(".type {}, @tls_object", g.name)); - } - } - } - - // Custom section globals: each gets its own .section directive since they - // may target different sections. - for (g, sect) in globals.iter().zip(&classified) { - if !matches!(sect, GlobalSection::Custom) { - continue; - } - let section_name = g.section.as_ref().expect("custom section must have a name"); - // Use "a" (read-only) for const-qualified globals or rodata sections, - // "aw" (writable) otherwise. GCC uses the const qualification of the - // variable to determine section flags, not just the section name. - // This matters for kernel sections like .modinfo which contain const data. 
- let flags = if g.is_const || section_name.contains("rodata") { "a" } else { "aw" }; - // Sections starting with ".bss" are NOBITS (no file space, BSS semantics) - let section_type = if section_name.starts_with(".bss") { "@nobits" } else { "@progbits" }; - out.emit_fmt(format_args!(".section {},\"{}\",{}", section_name, flags, section_type)); - if matches!(g.init, GlobalInit::Zero) || g.size == 0 { - emit_zero_global(out, g, "@object", ptr_dir); - } else { - emit_global_def(out, g, ptr_dir); - } - out.emit(""); - } - - // .rodata: const-qualified initialized globals (matches GCC -fno-PIE behavior; - // the linker handles relocations in .rodata fine, and kernel linker scripts - // don't recognize .data.rel.ro). - emit_section_group(out, globals, &classified, &GlobalSection::Rodata, - ".section .rodata", false, ptr_dir); - - // .tdata: thread-local initialized globals - emit_section_group(out, globals, &classified, &GlobalSection::Tdata, - ".section .tdata,\"awT\",@progbits", false, ptr_dir); - - // .data: non-const initialized globals - emit_section_group(out, globals, &classified, &GlobalSection::Data, - ".section .data", false, ptr_dir); - - // .comm: zero-initialized common globals (weak linkage, linker merges duplicates). - // .comm alignment is always in bytes on all platforms, unlike .align. - for (g, sect) in globals.iter().zip(&classified) { - if matches!(sect, GlobalSection::Common) { - out.emit_fmt(format_args!(".comm {},{},{}", g.name, g.size, effective_align(g))); - } - } - - // .tbss: thread-local zero-initialized globals - emit_section_group(out, globals, &classified, &GlobalSection::Tbss, - ".section .tbss,\"awT\",@nobits", true, ptr_dir); - - // .bss: non-TLS zero-initialized globals (includes zero-size globals with - // empty initializers like `Type arr[0] = {}` to avoid address overlap). 
- emit_section_group(out, globals, &classified, &GlobalSection::Bss, - ".section .bss", true, ptr_dir); -} - -/// Emit all globals matching `target` section, with a section header on first match. -/// If `is_zero` is true, emits as zero-initialized; otherwise as initialized data. -fn emit_section_group( - out: &mut AsmOutput, - globals: &[IrGlobal], - classified: &[GlobalSection], - target: &GlobalSection, - section_header: &str, - is_zero: bool, - ptr_dir: PtrDirective, -) { - let mut emitted_header = false; - for (g, sect) in globals.iter().zip(classified) { - if sect != target { - continue; - } - if !emitted_header { - out.emit(section_header); - emitted_header = true; - } - if is_zero { - let obj_type = if g.is_thread_local { "@tls_object" } else { "@object" }; - emit_zero_global(out, g, obj_type, ptr_dir); - } else { - emit_global_def(out, g, ptr_dir); - } - } - if emitted_header { - out.emit(""); - } -} - -/// Emit a visibility directive (.hidden, .protected, .internal) for a symbol if applicable. -fn emit_visibility_directive(out: &mut AsmOutput, name: &str, visibility: &Option) { - if let Some(ref vis) = visibility { - match vis.as_str() { - "hidden" => out.emit_fmt(format_args!(".hidden {}", name)), - "protected" => out.emit_fmt(format_args!(".protected {}", name)), - "internal" => out.emit_fmt(format_args!(".internal {}", name)), - _ => {} // "default" or unknown: no directive needed - } - } -} - -/// Emit linkage directives (.globl or .weak) for a non-static symbol. -fn emit_linkage_directive(out: &mut AsmOutput, name: &str, is_static: bool, is_weak: bool) { - if !is_static { - if is_weak { - out.emit_fmt(format_args!(".weak {}", name)); - } else { - out.emit_fmt(format_args!(".globl {}", name)); - } - } -} - -/// Emit both linkage (.globl/.weak) and visibility (.hidden/.protected/.internal) directives. 
-fn emit_symbol_directives(out: &mut AsmOutput, g: &IrGlobal) { - emit_linkage_directive(out, &g.name, g.is_static, g.is_weak); - emit_visibility_directive(out, &g.name, &g.visibility); -} - -/// Emit a single global variable definition. -fn emit_global_def(out: &mut AsmOutput, g: &IrGlobal, ptr_dir: PtrDirective) { - emit_symbol_directives(out, g); - out.emit_fmt(format_args!(".align {}", ptr_dir.align_arg(effective_align(g)))); - let obj_type = if g.is_thread_local { "@tls_object" } else { "@object" }; - out.emit_fmt(format_args!(".type {}, {}", g.name, obj_type)); - out.emit_fmt(format_args!(".size {}, {}", g.name, g.size)); - out.emit_fmt(format_args!("{}:", g.name)); - - emit_init_data(out, &g.init, g.ty, g.size, ptr_dir); -} - -/// Emit the data for a single GlobalInit element. -/// -/// Handles all init variants: scalars, arrays, strings, global addresses, label diffs, -/// and compound initializers (which recurse into this function for each element). -/// `fallback_ty` is the declared element type of the enclosing global/array, used to -/// widen narrow constants (e.g., IrConst::I32(0) in a pointer array emits .quad 0). -/// `total_size` is the declared size of the enclosing global for padding calculations. -fn emit_init_data(out: &mut AsmOutput, init: &GlobalInit, fallback_ty: IrType, total_size: usize, ptr_dir: PtrDirective) { - match init { - GlobalInit::Zero => { - out.emit_fmt(format_args!(" .zero {}", total_size)); - } - GlobalInit::Scalar(c) => { - emit_const_data(out, c, fallback_ty, ptr_dir); - } - GlobalInit::Array(values) => { - // Coalesce consecutive zero-valued elements into .zero directives - // to avoid emitting millions of individual `.byte 0` lines for - // large partially-initialized arrays like `char x[500000]={'a'}`. 
- let mut i = 0; - while i < values.len() { - let val = &values[i]; - let const_ty = const_natural_type(val, fallback_ty); - // Only widen integer constants to fallback_ty (e.g., I32(0) in a pointer - // array should emit .quad 0). Float constants (F32, F64, LongDouble) must - // keep their natural size -- complex arrays store F32 pairs where each zero - // imaginary slot is exactly 4 bytes, not pointer-sized. - let elem_ty = if fallback_ty.size() > const_ty.size() && const_ty.is_integer() { - fallback_ty - } else { - const_ty - }; - - if val.is_zero() { - // Count consecutive zero elements and emit as a single .zero - let elem_size = elem_ty.size(); - let mut zero_count = 1usize; - while i + zero_count < values.len() && values[i + zero_count].is_zero() { - zero_count += 1; - } - let zero_bytes = zero_count * elem_size; - if zero_bytes > 0 { - out.emit_fmt(format_args!(" .zero {}", zero_bytes)); - } - i += zero_count; - } else { - emit_const_data(out, val, elem_ty, ptr_dir); - i += 1; - } - } - } - GlobalInit::String(s) => { - let string_chars = s.chars().count(); - let string_bytes_with_nul = string_chars + 1; - if string_bytes_with_nul <= total_size { - // NUL terminator fits: use .asciz (emits string + NUL) - out.emit_fmt(format_args!(" .asciz \"{}\"", escape_string(s))); - if total_size > string_bytes_with_nul { - out.emit_fmt(format_args!(" .zero {}", total_size - string_bytes_with_nul)); - } - } else { - // NUL terminator doesn't fit (C11 6.7.9 p14): truncate to array size. - // Use .ascii (no implicit NUL) with the string truncated to total_size chars. 
- let truncated: String = s.chars().take(total_size).collect(); - out.emit_fmt(format_args!(" .ascii \"{}\"", escape_string(&truncated))); - } - } - GlobalInit::WideString(chars) => { - emit_wide_string(out, chars); - let wide_bytes = (chars.len() + 1) * 4; - if total_size > wide_bytes { - out.emit_fmt(format_args!(" .zero {}", total_size - wide_bytes)); - } - } - GlobalInit::Char16String(chars) => { - emit_char16_string(out, chars); - let char16_bytes = (chars.len() + 1) * 2; - if total_size > char16_bytes { - out.emit_fmt(format_args!(" .zero {}", total_size - char16_bytes)); - } - } - GlobalInit::GlobalAddr(label) => { - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), label)); - } - GlobalInit::GlobalAddrOffset(label, offset) => { - if *offset >= 0 { - out.emit_fmt(format_args!(" {} {}+{}", ptr_dir.as_str(), label, offset)); - } else { - out.emit_fmt(format_args!(" {} {}{}", ptr_dir.as_str(), label, offset)); - } - } - GlobalInit::GlobalLabelDiff(lab1, lab2, byte_size) => { - emit_label_diff(out, lab1, lab2, *byte_size); - } - GlobalInit::Compound(elements) => { - for elem in elements { - // Compound elements are self-typed: each element knows its own size. - // For Scalar elements, use the constant's natural type (falling back - // to the enclosing global's type for I64/wider constants). - emit_compound_element(out, elem, fallback_ty, ptr_dir); - } - } - } -} - -/// Emit a single element within a Compound initializer. -/// -/// Most variants delegate to the shared emit_init_data. Scalar elements use the -/// constant's natural type rather than the enclosing global's type, since compound -/// elements may have heterogeneous types (e.g., struct with int and pointer fields). -fn emit_compound_element(out: &mut AsmOutput, elem: &GlobalInit, fallback_ty: IrType, ptr_dir: PtrDirective) { - match elem { - GlobalInit::Scalar(c) => { - // In compound initializers, each element may have a different type. 
- // Use the constant's own type, falling back to fallback_ty for I64 and wider. - let elem_ty = const_natural_type(c, fallback_ty); - emit_const_data(out, c, elem_ty, ptr_dir); - } - GlobalInit::Zero => { - // Zero element in compound: emit a single pointer-sized zero - out.emit_fmt(format_args!(" {} 0", ptr_dir.as_str())); - } - GlobalInit::Compound(elements) => { - // Nested compound: recurse into each element - for inner in elements { - emit_compound_element(out, inner, fallback_ty, ptr_dir); - } - } - // All other variants (GlobalAddr, GlobalAddrOffset, WideString, etc.) - // delegate to the shared handler with zero total_size (no padding). - other => emit_init_data(out, other, fallback_ty, 0, ptr_dir), - } -} - -/// Get the natural IR type of a constant, falling back to `default_ty` for -/// types that don't have a narrower representation (I64, I128, etc.). -fn const_natural_type(c: &IrConst, default_ty: IrType) -> IrType { - match c { - IrConst::I8(_) => IrType::I8, - IrConst::I16(_) => IrType::I16, - IrConst::I32(_) => IrType::I32, - IrConst::F32(_) => IrType::F32, - IrConst::F64(_) => IrType::F64, - IrConst::LongDouble(..) => IrType::F128, - _ => default_ty, - } -} - -/// Emit a wide string (wchar_t) as .long directives with null terminator. -fn emit_wide_string(out: &mut AsmOutput, chars: &[u32]) { - for &ch in chars { - out.emit_fmt(format_args!(" .long {}", ch)); - } - out.emit(" .long 0"); // null terminator -} - -/// Emit a char16_t string as .short directives with null terminator. -fn emit_char16_string(out: &mut AsmOutput, chars: &[u16]) { - for &ch in chars { - out.emit_fmt(format_args!(" .short {}", ch)); - } - out.emit(" .short 0"); // null terminator -} - -/// Emit a label difference as a sized assembly directive (`.long lab1-lab2`, etc.). 
-fn emit_label_diff(out: &mut AsmOutput, lab1: &str, lab2: &str, byte_size: usize) { - let dir = match byte_size { - 1 => ".byte", - 2 => ".short", - 4 => ".long", - _ => ".quad", - }; - out.emit_fmt(format_args!(" {} {}-{}", dir, lab1, lab2)); -} - -/// Emit a 64-bit value as two `.long` directives in little-endian order. -/// Used on i686 (32-bit) targets where 64-bit values must be split. -#[inline] -fn emit_u64_as_long_pair(out: &mut AsmOutput, bits: u64) { - out.emit_fmt(format_args!(" .long {}", bits as u32)); - out.emit_fmt(format_args!(" .long {}", (bits >> 32) as u32)); -} - -pub fn emit_const_data(out: &mut AsmOutput, c: &IrConst, ty: IrType, ptr_dir: PtrDirective) { - match c { - // Integer constants: all share the same widening/narrowing logic. - // The value is sign-extended to i64, then emitted at the target type's width. - IrConst::I8(v) => emit_int_data(out, *v as i64, ty, ptr_dir), - IrConst::I16(v) => emit_int_data(out, *v as i64, ty, ptr_dir), - IrConst::I32(v) => emit_int_data(out, *v as i64, ty, ptr_dir), - IrConst::I64(v) => emit_int_data(out, *v, ty, ptr_dir), - IrConst::F32(v) => { - out.emit_fmt(format_args!(" .long {}", v.to_bits())); - } - IrConst::F64(v) => { - let bits = v.to_bits(); - if ptr_dir.is_32bit() { - emit_u64_as_long_pair(out, bits); - } else { - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), bits)); - } - } - IrConst::LongDouble(f64_val, f128_bytes) => { - if ptr_dir.is_x86() { - // x86: convert f128 bytes to x87 80-bit extended precision for emission. 
- // x87 80-bit format = 10 bytes: 8 bytes (significand+exp low) + 2 bytes (exp high+sign) - let x87 = crate::common::long_double::f128_bytes_to_x87_bytes(f128_bytes); - let lo = u64::from_le_bytes(x87[0..8].try_into().unwrap()); - let hi = u64::from_le_bytes([x87[8], x87[9], 0, 0, 0, 0, 0, 0]); - if ptr_dir.is_32bit() { - emit_u64_as_long_pair(out, lo); - // x87 80-bit: third .long holds the upper 2 bytes - out.emit_fmt(format_args!(" .long {}", hi as u32)); - } else { - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), lo as i64)); - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), hi as i64)); - } - } else if ptr_dir.is_riscv() || ptr_dir.is_arm() { - // RISC-V and ARM64: f128 bytes are already in IEEE 754 binary128 format. - let lo = u64::from_le_bytes(f128_bytes[0..8].try_into().unwrap()); - let hi = u64::from_le_bytes(f128_bytes[8..16].try_into().unwrap()); - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), lo as i64)); - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), hi as i64)); - } else { - // Fallback: store f64 approximation (should not normally be reached). - let f64_bits = f64_val.to_bits(); - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), f64_bits as i64)); - out.emit_fmt(format_args!(" {} 0", ptr_dir.as_str())); - } - } - IrConst::I128(v) => { - let lo = *v as u64; - let hi = (*v >> 64) as u64; - if ptr_dir.is_32bit() { - emit_u64_as_long_pair(out, lo); - emit_u64_as_long_pair(out, hi); - } else { - // 64-bit targets: emit as two 64-bit values (little-endian: low quad first) - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), lo as i64)); - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), hi as i64)); - } - } - IrConst::Zero => { - let size = ty.size(); - out.emit_fmt(format_args!(" .zero {}", if size == 0 { 4 } else { size })); - } - } -} - -/// Emit an integer constant at the width specified by `ty`. -/// Truncates or sign-extends `val` (an i64) as needed to match the target width. 
-fn emit_int_data(out: &mut AsmOutput, val: i64, ty: IrType, ptr_dir: PtrDirective) { - match ty { - IrType::I8 | IrType::U8 => out.emit_fmt(format_args!(" .byte {}", val as u8)), - IrType::I16 | IrType::U16 => out.emit_fmt(format_args!(" .short {}", val as u16)), - IrType::I32 | IrType::U32 => out.emit_fmt(format_args!(" .long {}", val as u32)), - // On i686 (32-bit), pointers are 4 bytes -- emit a single .long, not two. - IrType::Ptr if ptr_dir.is_32bit() => { - out.emit_fmt(format_args!(" .long {}", val as u32)); - } - _ => { - if ptr_dir.is_32bit() { - emit_u64_as_long_pair(out, val as u64); - } else { - out.emit_fmt(format_args!(" {} {}", ptr_dir.as_str(), val)); - } - } - } -} - -/// Emit string literal as .byte directives with null terminator. -/// Each char in the string is treated as a raw byte value (0-255), -/// not as a UTF-8 encoded character. This is correct for C narrow -/// string literals where \xNN escapes produce single bytes. -/// -/// Writes directly into the output buffer without any intermediate -/// heap allocations (no per-byte String, no Vec, no join). Uses -/// a pre-computed lookup table to convert bytes to decimal strings -/// without fmt::Write overhead. -pub fn emit_string_bytes(out: &mut AsmOutput, s: &str) { - // Chunk output into lines of at most 32 bytes each to avoid - // extremely long lines that can cause parser performance issues. - let mut count = 0; - for c in s.chars() { - if count % 32 == 0 { - if count > 0 { - out.buf.push('\n'); - } - out.buf.push_str(" .byte "); - } else { - out.buf.push_str(", "); - } - push_u8_decimal(&mut out.buf, c as u8); - count += 1; - } - // Null terminator - if count % 32 == 0 { - if count > 0 { - out.buf.push('\n'); - } - out.buf.push_str(" .byte 0\n"); - } else { - out.buf.push_str(", 0\n"); - } -} - -/// Append a u8 value as a decimal string directly into the buffer. -/// Avoids fmt::Write overhead by using direct digit extraction. 
-#[inline] -fn push_u8_decimal(buf: &mut String, v: u8) { - if v >= 100 { - buf.push((b'0' + v / 100) as char); - buf.push((b'0' + (v / 10) % 10) as char); - buf.push((b'0' + v % 10) as char); - } else if v >= 10 { - buf.push((b'0' + v / 10) as char); - buf.push((b'0' + v % 10) as char); - } else { - buf.push((b'0' + v) as char); - } -} - -/// Escape a string for use in assembly .asciz directives. -pub fn escape_string(s: &str) -> String { - let mut result = String::new(); - for c in s.chars() { - match c { - '\\' => result.push_str("\\\\"), - '"' => result.push_str("\\\""), - '\n' => result.push_str("\\n"), - '\t' => result.push_str("\\t"), - '\r' => result.push_str("\\r"), - '\0' => result.push_str("\\000"), - c if c.is_ascii_graphic() || c == ' ' => result.push(c), - c => { - // Emit the raw byte value (char as u8), not UTF-8 encoding - use std::fmt::Write; - let _ = write!(result, "\\{:03o}", c as u8); - } - } - } - result -} - diff --git a/src/backend/elf/archive.rs b/src/backend/elf/archive.rs deleted file mode 100644 index b2d26a0ac4..0000000000 --- a/src/backend/elf/archive.rs +++ /dev/null @@ -1,241 +0,0 @@ -//! Archive (.a) and linker script parsing. -//! -//! Handles both regular and thin GNU archives, plus GROUP/INPUT linker scripts. - -// ── Archive (.a) parsing ───────────────────────────────────────────────────── - -/// Returns true if `data` starts with the thin archive magic `!\n`. -pub fn is_thin_archive(data: &[u8]) -> bool { - data.len() >= 8 && &data[0..8] == b"!\n" -} - -/// Parse a GNU thin archive (.a file with `!\n` magic), returning member -/// filenames (relative to the archive directory). In thin archives, member data -/// is NOT stored inline — the archive only contains headers and a name table. -/// The caller must read each file from disk. 
-pub fn parse_thin_archive_members(data: &[u8]) -> Result, String> { - if data.len() < 8 || &data[0..8] != b"!\n" { - return Err("not a thin archive file".to_string()); - } - - let mut members = Vec::new(); - let mut pos = 8; - let mut extended_names: Option<&[u8]> = None; - - while pos + 60 <= data.len() { - let name_raw = &data[pos..pos + 16]; - let size_str = std::str::from_utf8(&data[pos + 48..pos + 58]) - .unwrap_or("") - .trim(); - let magic = &data[pos + 58..pos + 60]; - if magic != b"`\n" { - break; - } - - let size: usize = size_str.parse().unwrap_or(0); - let data_start = pos + 60; - let name_str = std::str::from_utf8(name_raw).unwrap_or("").trim_end(); - - if name_str == "/" || name_str == "/SYM64/" { - // Symbol table — data is stored inline even in thin archives - pos = data_start + size; - if pos % 2 != 0 { pos += 1; } - continue; - } else if name_str == "//" { - // Extended name table — also stored inline in thin archives - extended_names = Some(&data[data_start..(data_start + size).min(data.len())]); - pos = data_start + size; - if pos % 2 != 0 { pos += 1; } - continue; - } - - // Regular member — in thin archives, data is NOT inline - let member_name = if let Some(rest) = name_str.strip_prefix('/') { - if let Some(ext) = extended_names { - // The name field is like "/23607" or "/23607/" — extract just the digits - let num_str = rest.trim_end_matches('/').trim(); - let name_off: usize = num_str.parse().unwrap_or(0); - if name_off < ext.len() { - // In thin archives, names can contain '/' (path separators), - // so the terminator is the two-byte sequence "/\n", not just '/'. 
- let slice = &ext[name_off..]; - let end = slice.windows(2) - .position(|w| w == b"/\n") - .unwrap_or_else(|| { - // Fall back to null byte or end of table - slice.iter() - .position(|&b| b == 0) - .unwrap_or(slice.len()) - }); - String::from_utf8_lossy(&ext[name_off..name_off + end]).to_string() - } else { - name_str.to_string() - } - } else { - name_str.to_string() - } - } else { - name_str.trim_end_matches('/').to_string() - }; - - members.push(member_name); - - // In thin archives, member headers are consecutive (no inline data) - pos = data_start; - if pos % 2 != 0 { pos += 1; } - } - - Ok(members) -} - -/// Parse a GNU-format static archive (.a file), returning member entries as -/// `(name, data_offset, data_size)` tuples. The offsets are into the original -/// `data` slice, enabling zero-copy access. -/// -/// Handles extended name tables (`//`), symbol tables (`/`, `/SYM64/`), and -/// 2-byte alignment padding between members. -pub fn parse_archive_members(data: &[u8]) -> Result, String> { - if data.len() < 8 || &data[0..8] != b"!\n" { - return Err("not a valid archive file".to_string()); - } - - let mut members = Vec::new(); - let mut pos = 8; - let mut extended_names: Option<&[u8]> = None; - - while pos + 60 <= data.len() { - let name_raw = &data[pos..pos + 16]; - let size_str = std::str::from_utf8(&data[pos + 48..pos + 58]) - .unwrap_or("") - .trim(); - let magic = &data[pos + 58..pos + 60]; - if magic != b"`\n" { - break; - } - - let size: usize = size_str.parse().unwrap_or(0); - let data_start = pos + 60; - let name_str = std::str::from_utf8(name_raw).unwrap_or("").trim_end(); - - if name_str == "/" || name_str == "/SYM64/" { - // Symbol table — skip - } else if name_str == "//" { - // Extended name table - extended_names = Some(&data[data_start..(data_start + size).min(data.len())]); - } else { - let member_name = if let Some(rest) = name_str.strip_prefix('/') { - // Extended name: /offset into extended names table - if let Some(ext) = 
extended_names { - let name_off: usize = rest.trim_end_matches('/').parse().unwrap_or(0); - if name_off < ext.len() { - let end = ext[name_off..] - .iter() - .position(|&b| b == b'/' || b == b'\n' || b == 0) - .unwrap_or(ext.len() - name_off); - String::from_utf8_lossy(&ext[name_off..name_off + end]).to_string() - } else { - name_str.to_string() - } - } else { - name_str.to_string() - } - } else { - name_str.trim_end_matches('/').to_string() - }; - - if data_start + size <= data.len() { - members.push((member_name, data_start, size)); - } - } - - // Align to 2-byte boundary - pos = data_start + size; - if pos % 2 != 0 { - pos += 1; - } - } - - Ok(members) -} - -// ── Linker script parsing ──────────────────────────────────────────────────── - -/// An entry found in a GNU linker script directive (GROUP or INPUT). -#[derive(Debug, Clone)] -pub enum LinkerScriptEntry { - /// An absolute or relative file path (e.g. `/lib/x86_64-linux-gnu/libc.so.6` or `libncurses.so.6`) - Path(String), - /// A `-l` library reference (e.g. `-ltinfo` becomes `tinfo`) - Lib(String), -} - -/// Parse a GNU linker script looking for `GROUP ( ... )` or `INPUT ( ... )` directives. -/// Returns the list of entries referenced, or `None` if no directive found. -/// -/// Handles: -/// - `GROUP ( path1 path2 AS_NEEDED ( path3 ) )` - AS_NEEDED entries are skipped -/// - `INPUT ( libfoo.so.6 -lbar )` - both paths and `-l` library references -pub fn parse_linker_script(content: &str) -> Option> { - let entries = parse_linker_script_entries(content)?; - let paths: Vec = entries.into_iter().filter_map(|e| match e { - LinkerScriptEntry::Path(p) => Some(p), - LinkerScriptEntry::Lib(_) => None, - }).collect(); - if paths.is_empty() { None } else { Some(paths) } -} - -/// Parse a GNU linker script, returning all entries including `-l` library references. -/// This is the full-featured version that callers with library search path access should use. 
-pub fn parse_linker_script_entries(content: &str) -> Option> { - // Try GROUP first, then INPUT - let directive_start = content.find("GROUP") - .or_else(|| content.find("INPUT"))?; - - let rest = &content[directive_start..]; - let paren_start = rest.find('(')?; - - // Find matching closing paren (handle nested parens for AS_NEEDED) - let mut depth = 0; - let mut paren_end = None; - for (i, ch) in rest[paren_start..].char_indices() { - match ch { - '(' => depth += 1, - ')' => { - depth -= 1; - if depth == 0 { - paren_end = Some(paren_start + i); - break; - } - } - _ => {} - } - } - let paren_end = paren_end?; - let inside = &rest[paren_start + 1..paren_end]; - - let mut entries = Vec::new(); - let mut in_as_needed = false; - for token in inside.split_whitespace() { - match token { - "AS_NEEDED" => { in_as_needed = true; continue; } - "(" => continue, - ")" => { in_as_needed = false; continue; } - _ => {} - } - if in_as_needed { continue; } - - if let Some(lib_name) = token.strip_prefix("-l") { - // -ltinfo -> Lib("tinfo") - if !lib_name.is_empty() { - entries.push(LinkerScriptEntry::Lib(lib_name.to_string())); - } - } else if token.starts_with('/') || token.ends_with(".so") || token.ends_with(".a") - || token.contains(".so.") - || token.starts_with("lib") - { - entries.push(LinkerScriptEntry::Path(token.to_string())); - } - } - - if entries.is_empty() { None } else { Some(entries) } -} diff --git a/src/backend/elf/constants.rs b/src/backend/elf/constants.rs deleted file mode 100644 index 2941509084..0000000000 --- a/src/backend/elf/constants.rs +++ /dev/null @@ -1,187 +0,0 @@ -//! ELF format constants: identification bytes, section types, flags, symbol attributes, -//! program header types, dynamic tags, and standard structure sizes. -//! -//! These are the raw ELF spec constants used by all assembler and linker backends. -//! Organized by category matching the ELF specification sections. 
- -// ── ELF identification ─────────────────────────────────────────────────────── - -pub const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F']; - -// ELF class -pub const ELFCLASS32: u8 = 1; -pub const ELFCLASS64: u8 = 2; - -// Data encoding -pub const ELFDATA2LSB: u8 = 1; - -// Version -pub const EV_CURRENT: u8 = 1; - -// OS/ABI -pub const ELFOSABI_NONE: u8 = 0; - -// ── ELF object types ───────────────────────────────────────────────────────── - -pub const ET_REL: u16 = 1; -pub const ET_EXEC: u16 = 2; -pub const ET_DYN: u16 = 3; - -// ── Machine types ──────────────────────────────────────────────────────────── - -pub const EM_386: u16 = 3; -pub const EM_X86_64: u16 = 62; -pub const EM_AARCH64: u16 = 183; -pub const EM_RISCV: u16 = 243; - -// ── Section header types ───────────────────────────────────────────────────── - -pub const SHT_NULL: u32 = 0; -pub const SHT_PROGBITS: u32 = 1; -pub const SHT_SYMTAB: u32 = 2; -pub const SHT_STRTAB: u32 = 3; -pub const SHT_RELA: u32 = 4; -pub const SHT_HASH: u32 = 5; -pub const SHT_DYNAMIC: u32 = 6; -pub const SHT_NOTE: u32 = 7; -pub const SHT_NOBITS: u32 = 8; -pub const SHT_REL: u32 = 9; -pub const SHT_DYNSYM: u32 = 11; -pub const SHT_INIT_ARRAY: u32 = 14; -pub const SHT_FINI_ARRAY: u32 = 15; -pub const SHT_PREINIT_ARRAY: u32 = 16; -pub const SHT_GROUP: u32 = 17; - -/// COMDAT group flag: sections in this group are deduplicated by the linker. 
-pub const GRP_COMDAT: u32 = 1; -pub const SHT_GNU_HASH: u32 = 0x6fff_fff6; -pub const SHT_GNU_VERSYM: u32 = 0x6fff_ffff; -pub const SHT_GNU_VERNEED: u32 = 0x6fff_fffe; -pub const SHT_GNU_VERDEF: u32 = 0x6fff_fffd; - -// ── Section header flags ───────────────────────────────────────────────────── - -pub const SHF_WRITE: u64 = 0x1; -pub const SHF_ALLOC: u64 = 0x2; -pub const SHF_EXECINSTR: u64 = 0x4; -pub const SHF_MERGE: u64 = 0x10; -pub const SHF_STRINGS: u64 = 0x20; -pub const SHF_INFO_LINK: u64 = 0x40; -pub const SHF_GROUP: u64 = 0x200; -pub const SHF_TLS: u64 = 0x400; -pub const SHF_EXCLUDE: u64 = 0x8000_0000; - -// ── Symbol binding ─────────────────────────────────────────────────────────── - -pub const STB_LOCAL: u8 = 0; -pub const STB_GLOBAL: u8 = 1; -pub const STB_WEAK: u8 = 2; - -// ── Symbol types ───────────────────────────────────────────────────────────── - -pub const STT_NOTYPE: u8 = 0; -pub const STT_OBJECT: u8 = 1; -pub const STT_FUNC: u8 = 2; -pub const STT_SECTION: u8 = 3; -pub const STT_FILE: u8 = 4; -pub const STT_COMMON: u8 = 5; -pub const STT_TLS: u8 = 6; -pub const STT_GNU_IFUNC: u8 = 10; - -// ── Symbol visibility ──────────────────────────────────────────────────────── - -pub const STV_DEFAULT: u8 = 0; -pub const STV_INTERNAL: u8 = 1; -pub const STV_HIDDEN: u8 = 2; -pub const STV_PROTECTED: u8 = 3; - -// ── Special section indices ────────────────────────────────────────────────── - -pub const SHN_UNDEF: u16 = 0; -pub const SHN_ABS: u16 = 0xfff1; -pub const SHN_COMMON: u16 = 0xfff2; - -// ── Program header types ───────────────────────────────────────────────────── - -pub const PT_NULL: u32 = 0; -pub const PT_LOAD: u32 = 1; -pub const PT_DYNAMIC: u32 = 2; -pub const PT_INTERP: u32 = 3; -pub const PT_NOTE: u32 = 4; -pub const PT_PHDR: u32 = 6; -pub const PT_TLS: u32 = 7; -pub const PT_GNU_EH_FRAME: u32 = 0x6474_e550; -pub const PT_GNU_STACK: u32 = 0x6474_e551; -pub const PT_GNU_RELRO: u32 = 0x6474_e552; - -// ── Program header flags 
───────────────────────────────────────────────────── - -pub const PF_X: u32 = 0x1; -pub const PF_W: u32 = 0x2; -pub const PF_R: u32 = 0x4; - -// ── Dynamic section tags ───────────────────────────────────────────────────── - -pub const DT_NULL: i64 = 0; -pub const DT_NEEDED: i64 = 1; -pub const DT_PLTGOT: i64 = 3; -pub const DT_HASH: i64 = 4; -pub const DT_STRTAB: i64 = 5; -pub const DT_SYMTAB: i64 = 6; -pub const DT_RELA: i64 = 7; -pub const DT_RELASZ: i64 = 8; -pub const DT_RELAENT: i64 = 9; -pub const DT_STRSZ: i64 = 10; -pub const DT_SYMENT: i64 = 11; -pub const DT_INIT: i64 = 12; -pub const DT_FINI: i64 = 13; -pub const DT_SONAME: i64 = 14; -pub const DT_RPATH: i64 = 15; -pub const DT_REL: i64 = 17; -pub const DT_RELSZ: i64 = 18; -pub const DT_RELENT: i64 = 19; -pub const DT_JMPREL: i64 = 23; -pub const DT_PLTREL: i64 = 20; -pub const DT_PLTRELSZ: i64 = 2; -pub const DT_DEBUG: i64 = 21; -pub const DT_INIT_ARRAY: i64 = 25; -pub const DT_FINI_ARRAY: i64 = 26; -pub const DT_INIT_ARRAYSZ: i64 = 27; -pub const DT_FINI_ARRAYSZ: i64 = 28; -pub const DT_RUNPATH: i64 = 29; -pub const DT_FLAGS: i64 = 30; -pub const DF_BIND_NOW: i64 = 8; -pub const DT_PREINIT_ARRAY: i64 = 32; -pub const DT_PREINIT_ARRAYSZ: i64 = 33; -pub const DT_RELACOUNT: i64 = 0x6fff_fff9; -pub const DT_GNU_HASH: i64 = 0x6fff_fef5; -pub const DT_VERSYM: i64 = 0x6fff_fff0; -pub const DT_VERNEED: i64 = 0x6fff_fffe; -pub const DT_VERNEEDNUM: i64 = 0x6fff_ffff; -pub const DT_FLAGS_1: i64 = 0x6fff_fffb; -pub const DF_1_NOW: i64 = 1; - -// ── ELF sizes ──────────────────────────────────────────────────────────────── - -/// Size of ELF64 header in bytes. -pub const ELF64_EHDR_SIZE: usize = 64; -/// Size of ELF32 header in bytes. -pub const ELF32_EHDR_SIZE: usize = 52; -/// Size of ELF64 section header in bytes. -pub const ELF64_SHDR_SIZE: usize = 64; -/// Size of ELF32 section header in bytes. -pub const ELF32_SHDR_SIZE: usize = 40; -/// Size of ELF64 symbol table entry in bytes. 
-pub const ELF64_SYM_SIZE: usize = 24; -/// Size of ELF32 symbol table entry in bytes. -pub const ELF32_SYM_SIZE: usize = 16; -/// Size of ELF64 RELA relocation entry in bytes. -pub const ELF64_RELA_SIZE: usize = 24; -/// Size of ELF32 REL relocation entry in bytes. -pub const ELF32_REL_SIZE: usize = 8; -/// Size of ELF32 RELA relocation entry in bytes. -pub const ELF32_RELA_SIZE: usize = 12; -/// Size of ELF64 program header in bytes. -pub const ELF64_PHDR_SIZE: usize = 56; -/// Size of ELF32 program header in bytes. -pub const ELF32_PHDR_SIZE: usize = 32; diff --git a/src/backend/elf/io.rs b/src/backend/elf/io.rs deleted file mode 100644 index a0d3b78f8f..0000000000 --- a/src/backend/elf/io.rs +++ /dev/null @@ -1,204 +0,0 @@ -//! Binary read/write helpers for little-endian ELF fields, plus section header, -//! program header, symbol table entry, and relocation entry writers. - -// ── Binary read helpers (little-endian) ────────────────────────────────────── - -/// Read a little-endian u16 from `data` at `offset`. -#[inline] -pub fn read_u16(data: &[u8], offset: usize) -> u16 { - u16::from_le_bytes([data[offset], data[offset + 1]]) -} - -/// Read a little-endian u32 from `data` at `offset`. -#[inline] -pub fn read_u32(data: &[u8], offset: usize) -> u32 { - u32::from_le_bytes([ - data[offset], data[offset + 1], data[offset + 2], data[offset + 3], - ]) -} - -/// Read a little-endian u64 from `data` at `offset`. -#[inline] -pub fn read_u64(data: &[u8], offset: usize) -> u64 { - u64::from_le_bytes([ - data[offset], data[offset + 1], data[offset + 2], data[offset + 3], - data[offset + 4], data[offset + 5], data[offset + 6], data[offset + 7], - ]) -} - -/// Read a little-endian i32 from `data` at `offset`. -#[inline] -pub fn read_i32(data: &[u8], offset: usize) -> i32 { - i32::from_le_bytes([ - data[offset], data[offset + 1], data[offset + 2], data[offset + 3], - ]) -} - -/// Read a little-endian i64 from `data` at `offset`. 
-#[inline] -pub fn read_i64(data: &[u8], offset: usize) -> i64 { - i64::from_le_bytes([ - data[offset], data[offset + 1], data[offset + 2], data[offset + 3], - data[offset + 4], data[offset + 5], data[offset + 6], data[offset + 7], - ]) -} - -/// Read a null-terminated string from a byte slice starting at `offset`. -pub fn read_cstr(data: &[u8], offset: usize) -> String { - if offset >= data.len() { - return String::new(); - } - let end = data[offset..].iter().position(|&b| b == 0).unwrap_or(data.len() - offset); - String::from_utf8_lossy(&data[offset..offset + end]).into_owned() -} - -// ── Binary write helpers (little-endian, in-place) ─────────────────────────── - -/// Write a little-endian u16 into `buf` at `offset`. No-op if out of bounds. -#[inline] -pub fn w16(buf: &mut [u8], off: usize, val: u16) { - if off + 2 <= buf.len() { - buf[off..off + 2].copy_from_slice(&val.to_le_bytes()); - } -} - -/// Write a little-endian u32 into `buf` at `offset`. No-op if out of bounds. -#[inline] -pub fn w32(buf: &mut [u8], off: usize, val: u32) { - if off + 4 <= buf.len() { - buf[off..off + 4].copy_from_slice(&val.to_le_bytes()); - } -} - -/// Write a little-endian u64 into `buf` at `offset`. No-op if out of bounds. -#[inline] -pub fn w64(buf: &mut [u8], off: usize, val: u64) { - if off + 8 <= buf.len() { - buf[off..off + 8].copy_from_slice(&val.to_le_bytes()); - } -} - -/// Copy `data` into `buf` starting at `off`. No-op if out of bounds. -#[inline] -pub fn write_bytes(buf: &mut [u8], off: usize, data: &[u8]) { - let end = off + data.len(); - if end <= buf.len() { - buf[off..end].copy_from_slice(data); - } -} - -// ── Section header writing ─────────────────────────────────────────────────── - -/// Append an ELF64 section header to `buf`. 
-pub fn write_shdr64( - buf: &mut Vec, - sh_name: u32, sh_type: u32, sh_flags: u64, - sh_addr: u64, sh_offset: u64, sh_size: u64, - sh_link: u32, sh_info: u32, sh_addralign: u64, sh_entsize: u64, -) { - buf.extend_from_slice(&sh_name.to_le_bytes()); - buf.extend_from_slice(&sh_type.to_le_bytes()); - buf.extend_from_slice(&sh_flags.to_le_bytes()); - buf.extend_from_slice(&sh_addr.to_le_bytes()); - buf.extend_from_slice(&sh_offset.to_le_bytes()); - buf.extend_from_slice(&sh_size.to_le_bytes()); - buf.extend_from_slice(&sh_link.to_le_bytes()); - buf.extend_from_slice(&sh_info.to_le_bytes()); - buf.extend_from_slice(&sh_addralign.to_le_bytes()); - buf.extend_from_slice(&sh_entsize.to_le_bytes()); -} - -/// Append an ELF32 section header to `buf`. -pub fn write_shdr32( - buf: &mut Vec, - sh_name: u32, sh_type: u32, sh_flags: u32, - sh_addr: u32, sh_offset: u32, sh_size: u32, - sh_link: u32, sh_info: u32, sh_addralign: u32, sh_entsize: u32, -) { - buf.extend_from_slice(&sh_name.to_le_bytes()); - buf.extend_from_slice(&sh_type.to_le_bytes()); - buf.extend_from_slice(&sh_flags.to_le_bytes()); - buf.extend_from_slice(&sh_addr.to_le_bytes()); - buf.extend_from_slice(&sh_offset.to_le_bytes()); - buf.extend_from_slice(&sh_size.to_le_bytes()); - buf.extend_from_slice(&sh_link.to_le_bytes()); - buf.extend_from_slice(&sh_info.to_le_bytes()); - buf.extend_from_slice(&sh_addralign.to_le_bytes()); - buf.extend_from_slice(&sh_entsize.to_le_bytes()); -} - -/// Write an ELF64 program header to `buf` at offset `off`. 
-pub fn write_phdr64( - buf: &mut [u8], off: usize, - p_type: u32, p_flags: u32, p_offset: u64, - p_vaddr: u64, p_paddr: u64, p_filesz: u64, p_memsz: u64, p_align: u64, -) { - w32(buf, off, p_type); - w32(buf, off + 4, p_flags); - w64(buf, off + 8, p_offset); - w64(buf, off + 16, p_vaddr); - w64(buf, off + 24, p_paddr); - w64(buf, off + 32, p_filesz); - w64(buf, off + 40, p_memsz); - w64(buf, off + 48, p_align); -} - -/// Write an ELF64 program header with `p_paddr = p_vaddr` (the common case). -/// This is a convenience wrapper around `write_phdr64` used by multiple linker -/// backends to avoid repeating the vaddr twice. -#[inline] -pub fn wphdr(buf: &mut [u8], off: usize, pt: u32, flags: u32, foff: u64, va: u64, fsz: u64, msz: u64, align: u64) { - write_phdr64(buf, off, pt, flags, foff, va, va, fsz, msz, align); -} - -/// Write an ELF64 symbol table entry to `buf`. -pub fn write_sym64( - buf: &mut Vec, - st_name: u32, st_info: u8, st_other: u8, st_shndx: u16, - st_value: u64, st_size: u64, -) { - buf.extend_from_slice(&st_name.to_le_bytes()); - buf.push(st_info); - buf.push(st_other); - buf.extend_from_slice(&st_shndx.to_le_bytes()); - buf.extend_from_slice(&st_value.to_le_bytes()); - buf.extend_from_slice(&st_size.to_le_bytes()); -} - -/// Write an ELF32 symbol table entry to `buf`. -pub fn write_sym32( - buf: &mut Vec, - st_name: u32, st_value: u32, st_size: u32, - st_info: u8, st_other: u8, st_shndx: u16, -) { - buf.extend_from_slice(&st_name.to_le_bytes()); - buf.extend_from_slice(&st_value.to_le_bytes()); - buf.extend_from_slice(&st_size.to_le_bytes()); - buf.push(st_info); - buf.push(st_other); - buf.extend_from_slice(&st_shndx.to_le_bytes()); -} - -/// Write an ELF64 RELA relocation entry to `buf`. 
-pub fn write_rela64(buf: &mut Vec, r_offset: u64, r_sym: u32, r_type: u32, r_addend: i64) { - buf.extend_from_slice(&r_offset.to_le_bytes()); - let r_info: u64 = ((r_sym as u64) << 32) | (r_type as u64); - buf.extend_from_slice(&r_info.to_le_bytes()); - buf.extend_from_slice(&r_addend.to_le_bytes()); -} - -/// Write an ELF32 REL relocation entry to `buf`. -pub fn write_rel32(buf: &mut Vec, r_offset: u32, r_sym: u32, r_type: u8) { - buf.extend_from_slice(&r_offset.to_le_bytes()); - let r_info: u32 = (r_sym << 8) | (r_type as u32); - buf.extend_from_slice(&r_info.to_le_bytes()); -} - -/// Write an ELF32 RELA relocation entry to `buf`. -/// Used by architectures that require RELA even in 32-bit mode (e.g., RISC-V). -pub fn write_rela32(buf: &mut Vec, r_offset: u32, r_sym: u32, r_type: u8, r_addend: i32) { - buf.extend_from_slice(&r_offset.to_le_bytes()); - let r_info: u32 = (r_sym << 8) | (r_type as u32); - buf.extend_from_slice(&r_info.to_le_bytes()); - buf.extend_from_slice(&r_addend.to_le_bytes()); -} diff --git a/src/backend/elf/linker_symbols.rs b/src/backend/elf/linker_symbols.rs deleted file mode 100644 index 9cc8cbcb58..0000000000 --- a/src/backend/elf/linker_symbols.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! Linker-defined symbols and section name helpers. -//! -//! Provides the standard set of symbols that all backend linkers define -//! (e.g. `_edata`, `__bss_start`, `_end`), plus section name/flags lookup. - -use super::constants::*; - -// ── Linker-defined symbols ──────────────────────────────────────────────────── -// -// All four backend linkers (x86-64, i686, ARM, RISC-V) need to define a standard -// set of symbols that programs expect the linker to provide. Previously each -// backend had its own list with inconsistent names and values. This shared -// infrastructure ensures all backends define the same symbols with consistent -// semantics. - -/// Addresses that linker backends must provide for linker-defined symbol resolution. 
-/// -/// Each backend computes these from its own layout, then passes them to -/// `get_standard_linker_symbols()` to get the canonical symbol list. -pub struct LinkerSymbolAddresses { - /// Base address of the ELF executable (e.g., 0x400000 for x86-64). - pub base_addr: u64, - /// Address of the GOT or GOT.PLT section. - pub got_addr: u64, - /// Address of the .dynamic section (0 if static-only linking). - pub dynamic_addr: u64, - /// Start address of the BSS section. - pub bss_addr: u64, - /// Size of the BSS section in memory. - pub bss_size: u64, - /// End of the text (RX) segment. - pub text_end: u64, - /// Start of the data (RW) segment. - pub data_start: u64, - /// Start address of .init_array section (0 if absent). - pub init_array_start: u64, - /// Size of .init_array section in bytes. - pub init_array_size: u64, - /// Start address of .fini_array section (0 if absent). - pub fini_array_start: u64, - /// Size of .fini_array section in bytes. - pub fini_array_size: u64, - /// Start address of .preinit_array section (0 if absent). - pub preinit_array_start: u64, - /// Size of .preinit_array section in bytes. - pub preinit_array_size: u64, - /// Start address of .rela.iplt / .rel.iplt section (0 if absent). - pub rela_iplt_start: u64, - /// Size of .rela.iplt / .rel.iplt section in bytes. - pub rela_iplt_size: u64, -} - -/// A linker-defined symbol entry with name, value, and binding. -pub struct LinkerDefinedSym { - pub name: &'static str, - pub value: u64, - pub binding: u8, -} - -/// Return the standard set of linker-defined symbols that all backends should provide. -/// -/// This ensures consistent symbol definitions across x86-64, i686, ARM, and RISC-V. -/// Each backend may also define additional architecture-specific symbols (e.g., -/// `__global_pointer$` for RISC-V) after calling this function. 
-/// -/// The returned list uses the same semantics as GNU ld: -/// - `_edata` / `__bss_start` = start of BSS (end of initialized data) -/// - `_end` / `__end` = end of BSS (end of all data) -/// - `_etext` / `etext` = end of text segment -/// - `__dso_handle` = start of data segment -/// - `_DYNAMIC` = address of .dynamic section -/// - `data_start` is weak (can be overridden by object files) -pub fn get_standard_linker_symbols(addrs: &LinkerSymbolAddresses) -> Vec { - let end_addr = addrs.bss_addr + addrs.bss_size; - vec![ - // GOT / dynamic - LinkerDefinedSym { name: "_GLOBAL_OFFSET_TABLE_", value: addrs.got_addr, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "_DYNAMIC", value: addrs.dynamic_addr, binding: STB_GLOBAL }, - // BSS / data boundaries - LinkerDefinedSym { name: "__bss_start", value: addrs.bss_addr, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "_edata", value: addrs.bss_addr, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "_end", value: end_addr, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__end", value: end_addr, binding: STB_GLOBAL }, - // Text boundaries - LinkerDefinedSym { name: "_etext", value: addrs.text_end, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "etext", value: addrs.text_end, binding: STB_GLOBAL }, - // ELF header / executable start - LinkerDefinedSym { name: "__ehdr_start", value: addrs.base_addr, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__executable_start", value: addrs.base_addr, binding: STB_GLOBAL }, - // Data segment - LinkerDefinedSym { name: "__dso_handle", value: addrs.data_start, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__data_start", value: addrs.data_start, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "data_start", value: addrs.data_start, binding: STB_WEAK }, - // Init/fini/preinit arrays - LinkerDefinedSym { name: "__init_array_start", value: addrs.init_array_start, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__init_array_end", value: addrs.init_array_start + 
addrs.init_array_size, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__fini_array_start", value: addrs.fini_array_start, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__fini_array_end", value: addrs.fini_array_start + addrs.fini_array_size, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__preinit_array_start", value: addrs.preinit_array_start, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__preinit_array_end", value: addrs.preinit_array_start + addrs.preinit_array_size, binding: STB_GLOBAL }, - // IPLT relocation boundaries - LinkerDefinedSym { name: "__rela_iplt_start", value: addrs.rela_iplt_start, binding: STB_GLOBAL }, - LinkerDefinedSym { name: "__rela_iplt_end", value: addrs.rela_iplt_start + addrs.rela_iplt_size, binding: STB_GLOBAL }, - ] -} - -// ── Section name mapping ───────────────────────────────────────────────────── - -/// Map a symbol's section name to its index in the section header table. -/// -/// Handles special pseudo-sections used during assembly: -/// - `*COM*` → `SHN_COMMON` (0xFFF2) for COMMON symbols -/// - `*UND*` or empty → `SHN_UNDEF` (0) for undefined symbols -/// - Otherwise, looks up the section in the content section list (1-based index) -/// -/// `shndx_offset` is the number of section headers before the content sections -/// (typically 1 for NULL, or 1 + num_groups when COMDAT groups are present). -pub fn section_index(section_name: &str, content_sections: &[String], shndx_offset: u16) -> u16 { - if section_name == "*COM*" { - SHN_COMMON - } else if section_name == "*UND*" || section_name.is_empty() { - SHN_UNDEF - } else { - content_sections.iter().position(|s| s == section_name) - .map(|i| (i as u16) + shndx_offset) - .unwrap_or(SHN_UNDEF) - } -} - -/// Return default ELF section flags based on section name conventions. -/// -/// These are the standard mappings: `.text.*` → alloc+exec, `.data.*` → alloc+write, -/// `.rodata.*` → alloc, `.bss.*` → alloc+write, `.tdata`/`.tbss` → alloc+write+TLS, etc. 
-pub fn default_section_flags(name: &str) -> u64 { - if name == ".text" || name.starts_with(".text.") { - SHF_ALLOC | SHF_EXECINSTR - } else if name == ".data" || name.starts_with(".data.") - || name == ".bss" || name.starts_with(".bss.") { - SHF_ALLOC | SHF_WRITE - } else if name == ".rodata" || name.starts_with(".rodata.") { - SHF_ALLOC - } else if name == ".note.GNU-stack" { - 0 // Non-executable stack marker, no flags - } else if name.starts_with(".note") { - SHF_ALLOC - } else if name.starts_with(".tdata") || name.starts_with(".tbss") { - SHF_ALLOC | SHF_WRITE | SHF_TLS - } else if name.starts_with(".init") || name.starts_with(".fini") { - SHF_ALLOC | SHF_EXECINSTR - } else { - 0 - } -} diff --git a/src/backend/elf/mod.rs b/src/backend/elf/mod.rs deleted file mode 100644 index ddc8a2e1a7..0000000000 --- a/src/backend/elf/mod.rs +++ /dev/null @@ -1,81 +0,0 @@ -//! Shared ELF types, constants, and utilities used by all assembler and linker backends. -//! -//! Split into focused submodules: -//! -//! - `constants`: ELF format constants (section types, flags, symbol bindings, etc.) -//! - `string_table`: `StringTable` for building ELF string tables -//! - `io`: Binary read/write helpers and ELF structure writers -//! - `archive`: Archive (.a) and linker script parsing -//! - `linker_symbols`: Linker-defined symbols and section helpers -//! - `section_flags`: Section flags parsing for `.section` directives -//! - `parse_string`: String literal parser for assembler directives -//! - `object_writer`: Shared relocatable ELF object (.o) writer -//! - `numeric_labels`: Numeric local label resolution (x86/i686) -//! - `symbol_table`: Shared symbol table builder for assembler backends -//! 
- `writer_base`: `ElfWriterBase` for ARM/RISC-V assembler backends - -mod constants; -mod string_table; -mod io; -mod archive; -mod linker_symbols; -mod section_flags; -pub(crate) mod parse_string; -mod object_writer; -mod numeric_labels; -mod symbol_table; -mod writer_base; - -// Re-export everything at the elf:: level so existing `use crate::backend::elf::{...}` -// imports continue to work without any changes. - -// constants -#[allow(unused_imports)] -pub use constants::*; - -// string_table -#[allow(unused_imports)] -pub use string_table::StringTable; - -// io -#[allow(unused_imports)] -pub use io::*; - -// archive -#[allow(unused_imports)] -pub use archive::{ - is_thin_archive, parse_thin_archive_members, parse_archive_members, - LinkerScriptEntry, parse_linker_script, parse_linker_script_entries, -}; - -// linker_symbols -#[allow(unused_imports)] -pub use linker_symbols::{ - LinkerSymbolAddresses, LinkerDefinedSym, get_standard_linker_symbols, - section_index, default_section_flags, -}; - -// section_flags -pub use section_flags::parse_section_flags; - -// parse_string -pub use parse_string::parse_string_literal; - -// object_writer -pub use object_writer::{ - ElfConfig, ObjSection, ObjReloc, - write_relocatable_object, -}; - -// numeric_labels -#[allow(unused_imports)] -pub use numeric_labels::{ - is_numeric_label, parse_numeric_ref, resolve_numeric_labels, - resolve_numeric_name, resolve_numeric_refs_in_expr, -}; - -// symbol_table -pub use symbol_table::{ObjSymbol, SymbolTableInput, build_elf_symbol_table}; - -// writer_base -pub use writer_base::ElfWriterBase; diff --git a/src/backend/elf/numeric_labels.rs b/src/backend/elf/numeric_labels.rs deleted file mode 100644 index 70638a8741..0000000000 --- a/src/backend/elf/numeric_labels.rs +++ /dev/null @@ -1,285 +0,0 @@ -//! Numeric local label resolution for x86 and i686 assemblers. -//! -//! GNU assembler numeric labels (e.g., `1:`, `42:`) can be defined multiple -//! times. 
Forward references (`1f`) resolve to the NEXT definition, backward -//! references (`1b`) resolve to the MOST RECENT definition. -//! -//! This module provides the resolution logic shared by x86 and i686 ELF -//! writers. Both architectures use the same x86 parser AST types -//! (AsmItem, Operand, Displacement, DataValue, etc.), so these functions -//! operate on those types directly. -//! -//! ARM and RISC-V use different parser types and don't have this pattern -//! (ARM has no numeric labels; RISC-V has its own pre-pass). - -use std::collections::HashMap; -use crate::backend::x86::assembler::parser::{ - AsmItem, Instruction, Operand, MemoryOperand, Displacement, DataValue, ImmediateValue, -}; - -/// Check if a string is a numeric local label (just digits, e.g., "1", "42"). -pub fn is_numeric_label(name: &str) -> bool { - !name.is_empty() && name.bytes().all(|b| b.is_ascii_digit()) -} - -/// Check if a string is a numeric forward/backward reference like "1f" or "2b". -/// Returns Some((number_str, is_forward)) if it is, None otherwise. -pub fn parse_numeric_ref(name: &str) -> Option<(&str, bool)> { - if name.len() < 2 { - return None; - } - let last = name.as_bytes()[name.len() - 1]; - let num_part = &name[..name.len() - 1]; - if !num_part.bytes().all(|b| b.is_ascii_digit()) { - return None; - } - match last { - b'f' => Some((num_part, true)), - b'b' => Some((num_part, false)), - _ => None, - } -} - -/// Resolve numeric local labels (1:, 2:, etc.) and their references (1f, 1b) -/// into unique internal label names. -/// -/// GNU assembler numeric labels can be defined multiple times. Each forward -/// reference `Nf` refers to the next definition of `N`, and each backward -/// reference `Nb` refers to the most recent definition of `N`. -/// -/// This function renames each definition to a unique `.Lnum_N_K` name and -/// updates all references accordingly. Used by both x86 and i686 ELF writers. 
-pub fn resolve_numeric_labels(items: &[AsmItem]) -> Vec { - // First pass: find all numeric label definitions and assign unique names. - let mut defs: HashMap> = HashMap::new(); - let mut unique_counter: HashMap = HashMap::new(); - - for (i, item) in items.iter().enumerate() { - if let AsmItem::Label(name) = item { - if is_numeric_label(name) { - let count = unique_counter.entry(name.clone()).or_insert(0); - let unique_name = format!(".Lnum_{}_{}", name, *count); - *count += 1; - defs.entry(name.clone()).or_default().push((i, unique_name)); - } - } - } - - if defs.is_empty() { - return items.to_vec(); - } - - let mut result = Vec::with_capacity(items.len()); - for (i, item) in items.iter().enumerate() { - match item { - AsmItem::Label(name) if is_numeric_label(name) => { - if let Some(def_list) = defs.get(name) { - if let Some((_, unique_name)) = def_list.iter().find(|(idx, _)| *idx == i) { - result.push(AsmItem::Label(unique_name.clone())); - continue; - } - } - result.push(item.clone()); - } - AsmItem::Instruction(instr) => { - let new_ops: Vec = instr.operands.iter().map(|op| { - resolve_numeric_operand(op, i, &defs) - }).collect(); - result.push(AsmItem::Instruction(Instruction { - prefix: instr.prefix.clone(), - mnemonic: instr.mnemonic.clone(), - operands: new_ops, - })); - } - AsmItem::Short(vals) => { - result.push(AsmItem::Short(resolve_numeric_data_values(vals, i, &defs))); - } - AsmItem::Long(vals) => { - result.push(AsmItem::Long(resolve_numeric_data_values(vals, i, &defs))); - } - AsmItem::Quad(vals) => { - result.push(AsmItem::Quad(resolve_numeric_data_values(vals, i, &defs))); - } - AsmItem::Byte(vals) => { - result.push(AsmItem::Byte(resolve_numeric_data_values(vals, i, &defs))); - } - AsmItem::SkipExpr(expr, fill) => { - let new_expr = resolve_numeric_refs_in_expr(expr, i, &defs); - result.push(AsmItem::SkipExpr(new_expr, *fill)); - } - AsmItem::Org(sym, offset) => { - if let Some(resolved) = resolve_numeric_name(sym, i, &defs) { - 
result.push(AsmItem::Org(resolved, *offset)); - } else { - result.push(item.clone()); - } - } - _ => result.push(item.clone()), - } - } - - result -} - -/// Resolve numeric label references in a single operand. -fn resolve_numeric_operand( - op: &Operand, - current_idx: usize, - defs: &HashMap>, -) -> Operand { - match op { - Operand::Label(name) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - Operand::Label(resolved) - } else { - op.clone() - } - } - Operand::Memory(mem) => { - if let Some(new_disp) = resolve_numeric_displacement(&mem.displacement, current_idx, defs) { - Operand::Memory(MemoryOperand { - segment: mem.segment.clone(), - displacement: new_disp, - base: mem.base.clone(), - index: mem.index.clone(), - scale: mem.scale, - }) - } else { - op.clone() - } - } - Operand::Immediate(ImmediateValue::SymbolDiff(lhs, rhs)) => { - let new_lhs = resolve_numeric_name(lhs, current_idx, defs).unwrap_or_else(|| lhs.clone()); - let new_rhs = resolve_numeric_name(rhs, current_idx, defs).unwrap_or_else(|| rhs.clone()); - Operand::Immediate(ImmediateValue::SymbolDiff(new_lhs, new_rhs)) - } - Operand::Immediate(ImmediateValue::Symbol(name)) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - Operand::Immediate(ImmediateValue::Symbol(resolved)) - } else { - op.clone() - } - } - Operand::Indirect(inner) => { - let resolved_inner = resolve_numeric_operand(inner, current_idx, defs); - Operand::Indirect(Box::new(resolved_inner)) - } - _ => op.clone(), - } -} - -/// Resolve numeric label references in data values (.long, .quad, .byte directives). 
-fn resolve_numeric_data_values( - vals: &[DataValue], - current_idx: usize, - defs: &HashMap>, -) -> Vec { - vals.iter().map(|val| { - match val { - DataValue::Symbol(name) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - DataValue::Symbol(resolved) - } else { - val.clone() - } - } - DataValue::SymbolDiff(lhs, rhs) => { - let new_lhs = resolve_numeric_name(lhs, current_idx, defs).unwrap_or_else(|| lhs.clone()); - let new_rhs = resolve_numeric_name(rhs, current_idx, defs).unwrap_or_else(|| rhs.clone()); - DataValue::SymbolDiff(new_lhs, new_rhs) - } - DataValue::SymbolOffset(name, offset) => { - if let Some(resolved) = resolve_numeric_name(name, current_idx, defs) { - DataValue::SymbolOffset(resolved, *offset) - } else { - val.clone() - } - } - _ => val.clone(), - } - }).collect() -} - -/// Resolve a numeric label reference name (e.g., "1f" -> ".Lnum_1_0"). -pub fn resolve_numeric_name( - name: &str, - current_idx: usize, - defs: &HashMap>, -) -> Option { - let (num, is_forward) = parse_numeric_ref(name)?; - let def_list = defs.get(num)?; - - if is_forward { - def_list.iter() - .find(|(idx, _)| *idx > current_idx) - .map(|(_, name)| name.clone()) - } else { - def_list.iter() - .rev() - .find(|(idx, _)| *idx < current_idx) - .map(|(_, name)| name.clone()) - } -} - -/// Resolve numeric label references in a displacement. 
-fn resolve_numeric_displacement( - disp: &Displacement, - current_idx: usize, - defs: &HashMap>, -) -> Option { - match disp { - Displacement::Symbol(name) => { - resolve_numeric_name(name, current_idx, defs) - .map(Displacement::Symbol) - } - Displacement::SymbolAddend(name, addend) => { - resolve_numeric_name(name, current_idx, defs) - .map(|n| Displacement::SymbolAddend(n, *addend)) - } - Displacement::SymbolMod(name, modifier) => { - resolve_numeric_name(name, current_idx, defs) - .map(|n| Displacement::SymbolMod(n, modifier.clone())) - } - Displacement::SymbolPlusOffset(name, offset) => { - resolve_numeric_name(name, current_idx, defs) - .map(|n| Displacement::SymbolPlusOffset(n, *offset)) - } - _ => None, - } -} - -/// Resolve numeric label references (e.g., "6651f", "661b") within an expression string. -/// Scans for patterns like digits followed by 'f' or 'b' and replaces them with unique names. -pub fn resolve_numeric_refs_in_expr( - expr: &str, - current_idx: usize, - defs: &HashMap>, -) -> String { - let mut result = String::with_capacity(expr.len()); - let bytes = expr.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if bytes[i].is_ascii_digit() { - let start = i; - while i < bytes.len() && bytes[i].is_ascii_digit() { - i += 1; - } - if i < bytes.len() && (bytes[i] == b'f' || bytes[i] == b'b') { - let next = i + 1; - if next >= bytes.len() || !bytes[next].is_ascii_alphanumeric() { - let ref_name = &expr[start..=i]; - if let Some(resolved) = resolve_numeric_name(ref_name, current_idx, defs) { - result.push_str(&resolved); - i += 1; - continue; - } - } - } - result.push_str(&expr[start..i]); - } else { - result.push(bytes[i] as char); - i += 1; - } - } - result -} diff --git a/src/backend/elf/object_writer.rs b/src/backend/elf/object_writer.rs deleted file mode 100644 index 13756e9720..0000000000 --- a/src/backend/elf/object_writer.rs +++ /dev/null @@ -1,554 +0,0 @@ -//! Shared relocatable ELF object (.o) writer. -//! -//! 
Provides `write_relocatable_object` which serializes an ELF .o file from -//! architecture-independent section/symbol/reloc data. Handles ELF64+RELA -//! (x86-64, AArch64, RISC-V), ELF32+RELA (RISC-V RV32), and ELF32+REL (i686). -//! -//! Each backend's ElfWriter builds its own section/symbol/reloc data through -//! arch-specific logic (instruction encoding, branch resolution, etc.), then -//! calls `write_relocatable_object` for the final ELF serialization step. - -use std::collections::HashMap; -use super::constants::*; -use super::string_table::StringTable; -use super::io::*; -use super::linker_symbols::section_index; -use super::symbol_table::ObjSymbol; - -/// Configuration for ELF object file emission. Parameterizes the format -/// differences between architectures (machine type, ELF class, flags, etc). -pub struct ElfConfig { - /// ELF machine type (e.g., EM_X86_64, EM_AARCH64, EM_RISCV, EM_386) - pub e_machine: u16, - /// ELF flags (e.g., 0 for most, EF_RISCV_RVC | EF_RISCV_FLOAT_ABI_DOUBLE for RISC-V) - pub e_flags: u32, - /// ELF class: ELFCLASS64 or ELFCLASS32 - pub elf_class: u8, - /// Force RELA relocations even for ELF32 (needed by RISC-V which always uses RELA). - /// When false (default), ELF32 uses REL and ELF64 uses RELA. - pub force_rela: bool, -} - -/// A section in a relocatable object file being built by the assembler. -pub struct ObjSection { - pub name: String, - pub sh_type: u32, - pub sh_flags: u64, - pub data: Vec, - pub sh_addralign: u64, - /// Relocations targeting this section. - pub relocs: Vec, - /// If this section is part of a COMDAT group, the group signature symbol name. - pub comdat_group: Option, -} - -/// A relocation entry in a relocatable object file. -/// -/// Uses 64-bit offset/addend for all targets; the writer truncates to 32-bit -/// for ELF32/REL when needed. 
-#[derive(Clone)] -pub struct ObjReloc { - pub offset: u64, - pub reloc_type: u32, - pub symbol_name: String, - pub addend: i64, -} - -/// Internal symbol table entry used during serialization. -struct SymEntry { - st_name: u32, - st_info: u8, - st_other: u8, - st_shndx: u16, - st_value: u64, - st_size: u64, -} - -/// Write a relocatable ELF object file (.o) from assembled sections and symbols. -/// -/// This is the shared serialization pipeline used by all four backend assemblers. -/// The caller is responsible for: -/// - Building sections with encoded instructions and data -/// - Resolving local branches and patching instruction bytes -/// - Building the symbol list (defined labels, COMMON, aliases, undefined) -/// - Providing the correct `ElfConfig` for the target architecture -/// -/// The function handles the complete ELF layout and serialization: -/// 1. Build shstrtab/strtab string tables -/// 2. Build symbol table entries (NULL, section symbols, local, global) -/// 3. Compute section/rela/symtab/strtab layout offsets -/// 4. Write ELF header, section data, relocations, symtab, strtab, section headers -/// -/// Returns the serialized ELF bytes on success. -pub fn write_relocatable_object( - config: &ElfConfig, - section_order: &[String], - sections: &HashMap, - symbols: &[ObjSymbol], -) -> Result, String> { - let is_32bit = config.elf_class == ELFCLASS32; - // ELF64 always uses RELA; ELF32 defaults to REL but some architectures - // (e.g., RISC-V) always use RELA even in 32-bit mode. 
- let use_rela = !is_32bit || config.force_rela; - - let ehdr_size = if is_32bit { ELF32_EHDR_SIZE } else { ELF64_EHDR_SIZE }; - let shdr_size = if is_32bit { ELF32_SHDR_SIZE } else { ELF64_SHDR_SIZE }; - let sym_entry_size = if is_32bit { ELF32_SYM_SIZE } else { ELF64_SYM_SIZE }; - let reloc_entry_size = if use_rela { - if is_32bit { ELF32_RELA_SIZE } else { ELF64_RELA_SIZE } - } else { - ELF32_REL_SIZE - }; - let reloc_prefix = if use_rela { ".rela" } else { ".rel" }; - let reloc_sh_type = if use_rela { SHT_RELA } else { SHT_REL }; - let alignment_mask = if is_32bit { 3usize } else { 7usize }; // 4 or 8 byte alignment - - // ── Collect COMDAT groups ── - // Map: group_name -> list of member content section names - let mut comdat_groups: Vec<(String, Vec)> = Vec::new(); - { - let mut group_map: HashMap> = HashMap::new(); - let mut group_order: Vec = Vec::new(); - for sec_name in section_order { - if let Some(section) = sections.get(sec_name) { - if let Some(ref group_name) = section.comdat_group { - group_map.entry(group_name.clone()).or_insert_with(|| { - group_order.push(group_name.clone()); - Vec::new() - }).push(sec_name.clone()); - } - } - } - for gname in group_order { - if let Some(members) = group_map.remove(&gname) { - comdat_groups.push((gname, members)); - } - } - } - let num_groups = comdat_groups.len(); - - // ── Build string tables ── - let mut shstrtab = StringTable::new(); - let mut strtab = StringTable::new(); - - let content_sections: &[String] = section_order; - - // Add group section names to shstrtab - for _ in &comdat_groups { - shstrtab.add(".group"); - } - - // Add section names to shstrtab - for sec_name in content_sections { - shstrtab.add(sec_name); - } - shstrtab.add(".symtab"); - shstrtab.add(".strtab"); - shstrtab.add(".shstrtab"); - - // Build reloc section names - let mut reloc_section_names: Vec = Vec::new(); - for sec_name in content_sections { - if let Some(section) = sections.get(sec_name) { - if !section.relocs.is_empty() { - 
let reloc_name = format!("{}{}", reloc_prefix, sec_name); - shstrtab.add(&reloc_name); - reloc_section_names.push(reloc_name); - } - } - } - - // ── Build symbol table entries ── - let mut sym_entries: Vec = Vec::new(); - // Content sections start at shdr index: NULL + num_groups + content_index - let content_shndx_offset = (num_groups + 1) as u16; - - // NULL symbol (index 0) - sym_entries.push(SymEntry { - st_name: 0, st_info: 0, st_other: 0, - st_shndx: 0, st_value: 0, st_size: 0, - }); - - // Section symbols (one per content section) - // Per ELF convention, section symbols have st_name=0 (unnamed). - // Tools like the Linux kernel's modpost derive the name from the section - // header and expect st_name=0 so these don't appear in symbol searches. - for (i, sec_name) in content_sections.iter().enumerate() { - strtab.add(sec_name); - sym_entries.push(SymEntry { - st_name: 0, - st_info: (STB_LOCAL << 4) | STT_SECTION, - st_other: 0, - st_shndx: content_shndx_offset + i as u16, - st_value: 0, - st_size: 0, - }); - } - - // Separate local and global symbols - let mut local_syms: Vec<&ObjSymbol> = Vec::new(); - let mut global_syms: Vec<&ObjSymbol> = Vec::new(); - for sym in symbols { - if sym.binding == STB_LOCAL { - local_syms.push(sym); - } else { - global_syms.push(sym); - } - } - - let first_global_idx = sym_entries.len() + local_syms.len(); - - for sym in &local_syms { - let name_offset = strtab.add(&sym.name); - let shndx = section_index(&sym.section_name, content_sections, content_shndx_offset); - sym_entries.push(SymEntry { - st_name: name_offset, - st_info: (sym.binding << 4) | sym.sym_type, - st_other: sym.visibility, - st_shndx: shndx, - st_value: sym.value, - st_size: sym.size, - }); - } - - for sym in &global_syms { - let name_offset = strtab.add(&sym.name); - let shndx = section_index(&sym.section_name, content_sections, content_shndx_offset); - sym_entries.push(SymEntry { - st_name: name_offset, - st_info: (sym.binding << 4) | sym.sym_type, - 
st_other: sym.visibility, - st_shndx: shndx, - st_value: sym.value, - st_size: sym.size, - }); - } - - // ── Build COMDAT group section data ── - // Each group section contains: GRP_COMDAT flag (u32) + member section indices (u32 each) - let mut group_section_data: Vec> = Vec::new(); - for (_group_name, members) in &comdat_groups { - let mut data = Vec::with_capacity(4 + 4 * members.len()); - data.extend_from_slice(&GRP_COMDAT.to_le_bytes()); - for member_name in members { - // Find the section header index of this member - let member_idx = content_sections.iter().position(|s| s == member_name) - .map(|i| content_shndx_offset as u32 + i as u32) - .unwrap_or(0); - data.extend_from_slice(&member_idx.to_le_bytes()); - } - group_section_data.push(data); - } - - // ── Calculate layout ── - let mut offset = ehdr_size; - - // Group section offsets (come first, before content sections) - let mut group_offsets: Vec = Vec::new(); - for gdata in &group_section_data { - offset = (offset + 3) & !3; // align to 4 bytes - group_offsets.push(offset); - offset += gdata.len(); - } - - // Content section offsets - let mut section_offsets: Vec = Vec::new(); - for sec_name in content_sections { - let section = sections.get(sec_name).unwrap(); - let align = section.sh_addralign.max(1) as usize; - offset = (offset + align - 1) & !(align - 1); - section_offsets.push(offset); - if section.sh_type != SHT_NOBITS { - offset += section.data.len(); - } - } - - // Reloc section offsets - let mut reloc_offsets: Vec = Vec::new(); - for sec_name in content_sections { - if let Some(section) = sections.get(sec_name) { - if !section.relocs.is_empty() { - offset = (offset + alignment_mask) & !alignment_mask; - reloc_offsets.push(offset); - offset += section.relocs.len() * reloc_entry_size; - } - } - } - - // Symtab offset - offset = (offset + alignment_mask) & !alignment_mask; - let symtab_offset = offset; - let symtab_size = sym_entries.len() * sym_entry_size; - offset += symtab_size; - - // Strtab 
offset - let strtab_offset = offset; - let strtab_data = strtab.as_bytes().to_vec(); - offset += strtab_data.len(); - - // Shstrtab offset - let shstrtab_offset = offset; - let shstrtab_data = shstrtab.as_bytes().to_vec(); - offset += shstrtab_data.len(); - - // Section headers offset - offset = (offset + alignment_mask) & !alignment_mask; - let shdr_offset = offset; - - // Total section count: NULL + groups + content + relocs + symtab + strtab + shstrtab - let num_sections = 1 + num_groups + content_sections.len() + reloc_section_names.len() + 3; - let shstrtab_idx = num_sections - 1; - let symtab_shndx = 1 + num_groups + content_sections.len() + reloc_section_names.len(); - - // ── Write ELF ── - let total_size = shdr_offset + num_sections * shdr_size; - let mut elf = Vec::with_capacity(total_size); - - // ELF header (e_ident) - elf.extend_from_slice(&ELF_MAGIC); - elf.push(config.elf_class); - elf.push(ELFDATA2LSB); - elf.push(EV_CURRENT); - elf.push(ELFOSABI_NONE); - elf.extend_from_slice(&[0u8; 8]); // padding - - if is_32bit { - // ELF32 header - elf.extend_from_slice(&ET_REL.to_le_bytes()); - elf.extend_from_slice(&config.e_machine.to_le_bytes()); - elf.extend_from_slice(&1u32.to_le_bytes()); // e_version - elf.extend_from_slice(&0u32.to_le_bytes()); // e_entry - elf.extend_from_slice(&0u32.to_le_bytes()); // e_phoff - elf.extend_from_slice(&(shdr_offset as u32).to_le_bytes()); - elf.extend_from_slice(&config.e_flags.to_le_bytes()); - elf.extend_from_slice(&(ehdr_size as u16).to_le_bytes()); - elf.extend_from_slice(&0u16.to_le_bytes()); // e_phentsize - elf.extend_from_slice(&0u16.to_le_bytes()); // e_phnum - elf.extend_from_slice(&(shdr_size as u16).to_le_bytes()); - elf.extend_from_slice(&(num_sections as u16).to_le_bytes()); - elf.extend_from_slice(&(shstrtab_idx as u16).to_le_bytes()); - } else { - // ELF64 header - elf.extend_from_slice(&ET_REL.to_le_bytes()); - elf.extend_from_slice(&config.e_machine.to_le_bytes()); - 
elf.extend_from_slice(&1u32.to_le_bytes()); // e_version - elf.extend_from_slice(&0u64.to_le_bytes()); // e_entry - elf.extend_from_slice(&0u64.to_le_bytes()); // e_phoff - elf.extend_from_slice(&(shdr_offset as u64).to_le_bytes()); - elf.extend_from_slice(&config.e_flags.to_le_bytes()); - elf.extend_from_slice(&(ehdr_size as u16).to_le_bytes()); - elf.extend_from_slice(&0u16.to_le_bytes()); // e_phentsize - elf.extend_from_slice(&0u16.to_le_bytes()); // e_phnum - elf.extend_from_slice(&(shdr_size as u16).to_le_bytes()); - elf.extend_from_slice(&(num_sections as u16).to_le_bytes()); - elf.extend_from_slice(&(shstrtab_idx as u16).to_le_bytes()); - } - - debug_assert_eq!(elf.len(), ehdr_size); - - // ── Write group section data ── - for (gi, gdata) in group_section_data.iter().enumerate() { - while elf.len() < group_offsets[gi] { - elf.push(0); - } - elf.extend_from_slice(gdata); - } - - // ── Write content section data ── - for (i, sec_name) in content_sections.iter().enumerate() { - let section = sections.get(sec_name).unwrap(); - while elf.len() < section_offsets[i] { - elf.push(0); - } - if section.sh_type != SHT_NOBITS { - elf.extend_from_slice(§ion.data); - } - } - - // ── Write relocation section data ── - let mut reloc_idx = 0; - for sec_name in content_sections { - if let Some(section) = sections.get(sec_name) { - if !section.relocs.is_empty() { - while elf.len() < reloc_offsets[reloc_idx] { - elf.push(0); - } - for reloc in §ion.relocs { - let sym_idx = find_symbol_index_shared( - &reloc.symbol_name, &sym_entries, &strtab, content_sections, - ); - if use_rela && !is_32bit { - write_rela64(&mut elf, reloc.offset, sym_idx, reloc.reloc_type, reloc.addend); - } else if use_rela && is_32bit { - debug_assert!(reloc.reloc_type <= 255, "ELF32 reloc type must fit in u8"); - debug_assert!(reloc.addend >= i32::MIN as i64 && reloc.addend <= i32::MAX as i64, - "ELF32 RELA addend must fit in i32"); - write_rela32(&mut elf, reloc.offset as u32, sym_idx, reloc.reloc_type 
as u8, reloc.addend as i32); - } else { - debug_assert!(reloc.reloc_type <= 255, "ELF32 reloc type must fit in u8"); - write_rel32(&mut elf, reloc.offset as u32, sym_idx, reloc.reloc_type as u8); - } - } - reloc_idx += 1; - } - } - } - - // ── Write symtab ── - while elf.len() < symtab_offset { - elf.push(0); - } - for sym in &sym_entries { - if is_32bit { - write_sym32(&mut elf, sym.st_name, sym.st_value as u32, sym.st_size as u32, - sym.st_info, sym.st_other, sym.st_shndx); - } else { - write_sym64(&mut elf, sym.st_name, sym.st_info, sym.st_other, - sym.st_shndx, sym.st_value, sym.st_size); - } - } - - // ── Write strtab ── - debug_assert_eq!(elf.len(), strtab_offset); - elf.extend_from_slice(&strtab_data); - - // ── Write shstrtab ── - debug_assert_eq!(elf.len(), shstrtab_offset); - elf.extend_from_slice(&shstrtab_data); - - // ── Write section headers ── - while elf.len() < shdr_offset { - elf.push(0); - } - - let strtab_shndx = symtab_shndx + 1; - - if is_32bit { - // NULL - write_shdr32(&mut elf, 0, SHT_NULL, 0, 0, 0, 0, 0, 0, 0, 0); - // Group sections (COMDAT) - for (gi, (group_name, _members)) in comdat_groups.iter().enumerate() { - let sh_name = shstrtab.offset_of(".group"); - // sh_link = symtab index, sh_info = symbol index of group signature - let sig_sym_idx = find_symbol_index_shared(group_name, &sym_entries, &strtab, content_sections); - write_shdr32(&mut elf, sh_name, SHT_GROUP, 0, - 0, group_offsets[gi] as u32, group_section_data[gi].len() as u32, - symtab_shndx as u32, sig_sym_idx, - 4, 4); - } - // Content sections - for (i, sec_name) in content_sections.iter().enumerate() { - let section = sections.get(sec_name).unwrap(); - let sh_name = shstrtab.offset_of(sec_name); - let sh_offset = if section.sh_type == SHT_NOBITS { 0 } else { section_offsets[i] as u32 }; - write_shdr32(&mut elf, sh_name, section.sh_type, section.sh_flags as u32, - 0, sh_offset, section.data.len() as u32, - 0, 0, section.sh_addralign as u32, 0); - } - // Reloc sections - 
reloc_idx = 0; - for (i, sec_name) in content_sections.iter().enumerate() { - if let Some(section) = sections.get(sec_name) { - if !section.relocs.is_empty() { - let reloc_name = format!("{}{}", reloc_prefix, sec_name); - let sh_name = shstrtab.offset_of(&reloc_name); - let sh_offset = reloc_offsets[reloc_idx] as u32; - let sh_size = (section.relocs.len() * reloc_entry_size) as u32; - write_shdr32(&mut elf, sh_name, reloc_sh_type, 0, - 0, sh_offset, sh_size, - symtab_shndx as u32, content_shndx_offset as u32 + i as u32, - 4, reloc_entry_size as u32); - reloc_idx += 1; - } - } - } - // .symtab - write_shdr32(&mut elf, shstrtab.offset_of(".symtab"), SHT_SYMTAB, 0, - 0, symtab_offset as u32, symtab_size as u32, - strtab_shndx as u32, first_global_idx as u32, - 4, sym_entry_size as u32); - // .strtab - write_shdr32(&mut elf, shstrtab.offset_of(".strtab"), SHT_STRTAB, 0, - 0, strtab_offset as u32, strtab_data.len() as u32, 0, 0, 1, 0); - // .shstrtab - write_shdr32(&mut elf, shstrtab.offset_of(".shstrtab"), SHT_STRTAB, 0, - 0, shstrtab_offset as u32, shstrtab_data.len() as u32, 0, 0, 1, 0); - } else { - // NULL - write_shdr64(&mut elf, 0, SHT_NULL, 0, 0, 0, 0, 0, 0, 0, 0); - // Group sections (COMDAT) - for (gi, (group_name, _members)) in comdat_groups.iter().enumerate() { - let sh_name = shstrtab.offset_of(".group"); - let sig_sym_idx = find_symbol_index_shared(group_name, &sym_entries, &strtab, content_sections); - write_shdr64(&mut elf, sh_name, SHT_GROUP, 0, - 0, group_offsets[gi] as u64, group_section_data[gi].len() as u64, - symtab_shndx as u32, sig_sym_idx, - 4, 4); - } - // Content sections - for (i, sec_name) in content_sections.iter().enumerate() { - let section = sections.get(sec_name).unwrap(); - let sh_name = shstrtab.offset_of(sec_name); - let sh_offset = if section.sh_type == SHT_NOBITS { 0 } else { section_offsets[i] as u64 }; - write_shdr64(&mut elf, sh_name, section.sh_type, section.sh_flags, - 0, sh_offset, section.data.len() as u64, - 0, 0, 
section.sh_addralign, 0); - } - // Reloc sections - reloc_idx = 0; - for (i, sec_name) in content_sections.iter().enumerate() { - if let Some(section) = sections.get(sec_name) { - if !section.relocs.is_empty() { - let reloc_name = format!("{}{}", reloc_prefix, sec_name); - let sh_name = shstrtab.offset_of(&reloc_name); - let sh_offset = reloc_offsets[reloc_idx] as u64; - let sh_size = (section.relocs.len() * reloc_entry_size) as u64; - write_shdr64(&mut elf, sh_name, reloc_sh_type, SHF_INFO_LINK, - 0, sh_offset, sh_size, - symtab_shndx as u32, content_shndx_offset as u32 + i as u32, - 8, reloc_entry_size as u64); - reloc_idx += 1; - } - } - } - // .symtab - write_shdr64(&mut elf, shstrtab.offset_of(".symtab"), SHT_SYMTAB, 0, - 0, symtab_offset as u64, symtab_size as u64, - strtab_shndx as u32, first_global_idx as u32, - 8, sym_entry_size as u64); - // .strtab - write_shdr64(&mut elf, shstrtab.offset_of(".strtab"), SHT_STRTAB, 0, - 0, strtab_offset as u64, strtab_data.len() as u64, 0, 0, 1, 0); - // .shstrtab - write_shdr64(&mut elf, shstrtab.offset_of(".shstrtab"), SHT_STRTAB, 0, - 0, shstrtab_offset as u64, shstrtab_data.len() as u64, 0, 0, 1, 0); - } - - Ok(elf) -} - -/// Find a symbol's index in the ELF symbol table. -/// -/// Checks section names first (returns section symbol index), then searches -/// by string table offset for named symbols (excluding section symbols). 
-fn find_symbol_index_shared( - name: &str, - sym_entries: &[SymEntry], - strtab: &StringTable, - content_sections: &[String], -) -> u32 { - // Check if it's a section symbol - for (i, sec_name) in content_sections.iter().enumerate() { - if sec_name == name { - return (i + 1) as u32; // +1 for NULL entry - } - } - // Search named symbols - let name_offset = strtab.offset_of(name); - for (i, entry) in sym_entries.iter().enumerate() { - if entry.st_name == name_offset && entry.st_info & 0xF != STT_SECTION { - return i as u32; - } - } - 0 // undefined -} diff --git a/src/backend/elf/parse_string.rs b/src/backend/elf/parse_string.rs deleted file mode 100644 index 2c296615fd..0000000000 --- a/src/backend/elf/parse_string.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! String literal parser for assembler directives. -//! -//! Shared by all four assembler backends (x86, i686, ARM, RISC-V) to ensure -//! consistent handling of C/GNU assembler escape sequences. - -/// Parse a string literal with escape sequences, returning raw bytes. -/// -/// This is the **canonical** implementation shared by all assembler backends -/// (x86, i686, ARM, RISC-V). Having one implementation prevents bugs where -/// different backends handle escapes differently (e.g. returning `String` -/// instead of `Vec` causes multi-byte UTF-8 expansion of byte values > 127). -/// -/// Supports the standard C/GNU assembler escape sequences: -/// `\n` `\t` `\r` `\\` `\"` `\a` `\b` `\f` `\v` -/// Octal: `\0` .. `\377` (1-3 digits) -/// Hex: `\x00` .. `\xFF` (1-2 digits) -/// -/// The input `s` should be a trimmed string starting with `"`. The parser scans -/// character-by-character until the closing `"` (rather than assuming it is the -/// last character), which correctly handles edge cases where extra content -/// follows the string literal. 
-pub fn parse_string_literal(s: &str) -> Result, String> { - let s = s.trim(); - if !s.starts_with('"') { - return Err(format!("expected string literal: {}", s)); - } - - let mut bytes = Vec::new(); - let mut chars = s[1..].chars(); - loop { - match chars.next() { - None => return Err("unterminated string".to_string()), - Some('"') => break, - Some('\\') => { - match chars.next() { - None => return Err("unterminated escape".to_string()), - Some('n') => bytes.push(b'\n'), - Some('t') => bytes.push(b'\t'), - Some('r') => bytes.push(b'\r'), - Some('\\') => bytes.push(b'\\'), - Some('"') => bytes.push(b'"'), - Some('a') => bytes.push(7), // bell - Some('b') => bytes.push(8), // backspace - Some('f') => bytes.push(12), // form feed - Some('v') => bytes.push(11), // vertical tab - Some(c) if ('0'..='7').contains(&c) => { - // Octal escape: \N, \NN, or \NNN (up to 3 digits) - let mut val = c as u32 - '0' as u32; - for _ in 0..2 { - if let Some(&next) = chars.as_str().as_bytes().first() { - if (b'0'..=b'7').contains(&next) { - val = val * 8 + (next - b'0') as u32; - chars.next(); - } else { - break; - } - } - } - bytes.push(val as u8); - } - Some('x') => { - // Hex escape: \xNN (up to 2 digits) - let mut val = 0u32; - for _ in 0..2 { - if let Some(&next) = chars.as_str().as_bytes().first() { - if next.is_ascii_hexdigit() { - val = val * 16 + match next { - b'0'..=b'9' => (next - b'0') as u32, - b'a'..=b'f' => (next - b'a' + 10) as u32, - b'A'..=b'F' => (next - b'A' + 10) as u32, - _ => unreachable!(), - }; - chars.next(); - } else { - break; - } - } - } - bytes.push(val as u8); - } - Some(c) => { - // Unknown escape: emit the character as a raw byte. - // (GNU as treats unknown \X as literal X.) 
- bytes.push(c as u8); - } - } - } - Some(c) => { - // Regular character - encode as UTF-8 - let mut buf = [0u8; 4]; - let encoded = c.encode_utf8(&mut buf); - bytes.extend_from_slice(encoded.as_bytes()); - } - } - } - - Ok(bytes) -} diff --git a/src/backend/elf/section_flags.rs b/src/backend/elf/section_flags.rs deleted file mode 100644 index 5d0360a55e..0000000000 --- a/src/backend/elf/section_flags.rs +++ /dev/null @@ -1,70 +0,0 @@ -//! Section flags parsing for `.section` directives. -//! -//! Converts section name, flags string ("awx"), and type string ("@nobits") -//! into ELF `(sh_type, sh_flags)` tuples. Used by x86 and i686 ELF writers. - -use super::constants::*; - -/// Parse section name, flags string, and type into ELF section type and flags. -/// -/// Returns `(sh_type, sh_flags)` based on well-known section names (`.text`, -/// `.data`, `.bss`, etc.) and optional explicit flags/type strings from the -/// `.section` directive. -pub fn parse_section_flags(name: &str, flags_str: Option<&str>, type_str: Option<&str>) -> (u32, u64) { - let (default_type, default_flags) = match name { - ".text" => (SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR), - ".data" => (SHT_PROGBITS, SHF_ALLOC | SHF_WRITE), - ".bss" => (SHT_NOBITS, SHF_ALLOC | SHF_WRITE), - ".rodata" => (SHT_PROGBITS, SHF_ALLOC), - ".tdata" => (SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_TLS), - ".tbss" => (SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_TLS), - ".init" => (SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR), - ".fini" => (SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR), - ".init_array" => (SHT_INIT_ARRAY, SHF_ALLOC | SHF_WRITE), - ".fini_array" => (SHT_FINI_ARRAY, SHF_ALLOC | SHF_WRITE), - n if n.starts_with(".text.") => (SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR), - n if n.starts_with(".data.") => (SHT_PROGBITS, SHF_ALLOC | SHF_WRITE), - n if n.starts_with(".bss.") => (SHT_NOBITS, SHF_ALLOC | SHF_WRITE), - n if n.starts_with(".rodata.") => (SHT_PROGBITS, SHF_ALLOC), - n if n.starts_with(".note.") => (SHT_NOTE, 0), - _ 
=> (SHT_PROGBITS, 0), - }; - - if flags_str.is_none() && type_str.is_none() { - return (default_type, default_flags); - } - - let mut flags = 0u64; - if let Some(f) = flags_str { - for c in f.chars() { - match c { - 'a' => flags |= SHF_ALLOC, - 'w' => flags |= SHF_WRITE, - 'x' => flags |= SHF_EXECINSTR, - 'M' => flags |= SHF_MERGE, - 'S' => flags |= SHF_STRINGS, - 'T' => flags |= SHF_TLS, - 'G' => flags |= SHF_GROUP, - 'o' => {} // SHF_LINK_ORDER - handle later - _ => {} - } - } - } else { - flags = default_flags; - } - - let section_type = if let Some(t) = type_str { - match t { - "@progbits" => SHT_PROGBITS, - "@nobits" => SHT_NOBITS, - "@note" => SHT_NOTE, - "@init_array" => SHT_INIT_ARRAY, - "@fini_array" => SHT_FINI_ARRAY, - _ => default_type, - } - } else { - default_type - }; - - (section_type, flags) -} diff --git a/src/backend/elf/string_table.rs b/src/backend/elf/string_table.rs deleted file mode 100644 index 784517fcb0..0000000000 --- a/src/backend/elf/string_table.rs +++ /dev/null @@ -1,53 +0,0 @@ -//! ELF string table builder for .strtab, .shstrtab, and .dynstr sections. - -use std::collections::HashMap; - -/// ELF string table builder. Used for .strtab, .shstrtab, and .dynstr sections. -/// -/// Strings are stored as null-terminated entries. The table always starts with -/// a null byte (index 0 = empty string), matching ELF convention. -pub struct StringTable { - data: Vec, - offsets: HashMap, -} - -impl StringTable { - /// Create a new string table with the initial null byte. - pub fn new() -> Self { - Self { - data: vec![0], - offsets: HashMap::new(), - } - } - - /// Add a string to the table and return its offset. - /// Returns 0 for empty strings. Deduplicates repeated insertions. 
- pub fn add(&mut self, s: &str) -> u32 { - if s.is_empty() { - return 0; - } - if let Some(&offset) = self.offsets.get(s) { - return offset; - } - let offset = self.data.len() as u32; - self.data.extend_from_slice(s.as_bytes()); - self.data.push(0); - self.offsets.insert(s.to_string(), offset); - offset - } - - /// Look up the offset of a previously-added string. Returns 0 if not found. - pub fn offset_of(&self, s: &str) -> u32 { - self.offsets.get(s).copied().unwrap_or(0) - } - - /// Return the raw table bytes (including the leading null byte). - pub fn as_bytes(&self) -> &[u8] { - &self.data - } - - /// Return the size of the table in bytes. - pub fn len(&self) -> usize { - self.data.len() - } -} diff --git a/src/backend/elf/symbol_table.rs b/src/backend/elf/symbol_table.rs deleted file mode 100644 index abc59ba2f5..0000000000 --- a/src/backend/elf/symbol_table.rs +++ /dev/null @@ -1,183 +0,0 @@ -//! Shared symbol table builder for all backend ELF writers. -//! -//! All four backend assemblers (x86-64, i686, ARM, RISC-V) use this shared -//! `build_elf_symbol_table` function to construct their symbol tables from -//! labels, aliases, and relocation references. This eliminates duplicated -//! symbol table construction logic across the backends. -//! -//! The only architecture-specific difference is that RISC-V needs to include -//! referenced local labels (for pcrel_hi synthetic labels) in the symbol table. - -use std::collections::{HashMap, HashSet}; -use super::constants::*; -use super::object_writer::ObjSection; - -/// A symbol in a relocatable object file. -pub struct ObjSymbol { - pub name: String, - pub value: u64, - pub size: u64, - pub binding: u8, - pub sym_type: u8, - pub visibility: u8, - /// Section name, or "*COM*" for COMMON, "*UND*" or empty for undefined. - pub section_name: String, -} - -/// Parameters for the shared `build_elf_symbol_table` function. 
-/// Collects the state needed to build the symbol table without requiring -/// a specific ElfWriter struct type. -pub struct SymbolTableInput<'a> { - pub labels: &'a HashMap, - pub global_symbols: &'a HashMap, - pub weak_symbols: &'a HashMap, - pub symbol_types: &'a HashMap, - pub symbol_sizes: &'a HashMap, - pub symbol_visibility: &'a HashMap, - pub aliases: &'a HashMap, - pub sections: &'a HashMap, - /// If true, include .L* local labels that are referenced by relocations - /// in the symbol table (needed by RISC-V for pcrel_hi/pcrel_lo pairs). - pub include_referenced_locals: bool, -} - -/// Build a symbol table from labels, aliases, and relocation references. -/// -/// Returns a list of `ObjSymbol` entries ready for `write_relocatable_object`. -/// Handles: -/// - Defined labels (global, weak, local) -/// - .set/.equ aliases with chain resolution -/// - Undefined symbols (referenced in relocations but not defined) -/// - Optionally, referenced local labels (.L*) for RISC-V pcrel support -pub fn build_elf_symbol_table(input: &SymbolTableInput) -> Vec { - let mut symbols: Vec = Vec::new(); - - // Collect referenced local labels if needed (RISC-V pcrel_hi) - let mut referenced_local_labels: HashSet = HashSet::new(); - if input.include_referenced_locals { - for sec in input.sections.values() { - for reloc in &sec.relocs { - if reloc.symbol_name.starts_with(".L") || reloc.symbol_name.starts_with(".l") { - referenced_local_labels.insert(reloc.symbol_name.clone()); - } - } - } - } - - // Add defined labels as symbols - for (name, (section, offset)) in input.labels { - let is_local_label = name.starts_with(".L") || name.starts_with(".l"); - if is_local_label && !referenced_local_labels.contains(name) { - continue; - } - - let binding = if input.weak_symbols.contains_key(name) { - STB_WEAK - } else if input.global_symbols.contains_key(name) { - STB_GLOBAL - } else { - STB_LOCAL - }; - - symbols.push(ObjSymbol { - name: name.clone(), - value: *offset, - size: 
input.symbol_sizes.get(name).copied().unwrap_or(0), - binding, - sym_type: input.symbol_types.get(name).copied().unwrap_or(STT_NOTYPE), - visibility: input.symbol_visibility.get(name).copied().unwrap_or(STV_DEFAULT), - section_name: section.clone(), - }); - } - - // Add alias symbols from .set/.equ directives - let defined_names: HashMap = symbols.iter() - .enumerate() - .map(|(i, s)| (s.name.clone(), i)) - .collect(); - - for (alias, target) in input.aliases { - // Resolve through alias chains - let mut resolved = target.as_str(); - let mut seen = HashSet::new(); - seen.insert(target.as_str()); - while let Some(next) = input.aliases.get(resolved) { - if !seen.insert(next.as_str()) { - break; - } - resolved = next.as_str(); - } - - let alias_binding = if input.weak_symbols.contains_key(alias) { - Some(STB_WEAK) - } else if input.global_symbols.contains_key(alias) { - Some(STB_GLOBAL) - } else { - None - }; - let alias_type = input.symbol_types.get(alias).copied(); - let alias_vis = input.symbol_visibility.get(alias).copied(); - - if let Some(&idx) = defined_names.get(resolved) { - let target_sym = &symbols[idx]; - symbols.push(ObjSymbol { - name: alias.clone(), - value: target_sym.value, - size: target_sym.size, - binding: alias_binding.unwrap_or(target_sym.binding), - sym_type: alias_type.unwrap_or(target_sym.sym_type), - visibility: alias_vis.unwrap_or(target_sym.visibility), - section_name: target_sym.section_name.clone(), - }); - } else if let Some((section, offset)) = input.labels.get(resolved) { - symbols.push(ObjSymbol { - name: alias.clone(), - value: *offset, - size: 0, - binding: alias_binding.unwrap_or(STB_LOCAL), - sym_type: alias_type.unwrap_or(STT_NOTYPE), - visibility: alias_vis.unwrap_or(STV_DEFAULT), - section_name: section.clone(), - }); - } - } - - // Add undefined symbols (referenced in relocations but not defined) - let mut referenced: HashSet = HashSet::new(); - for sec in input.sections.values() { - for reloc in &sec.relocs { - if 
reloc.symbol_name.is_empty() { - continue; - } - if !reloc.symbol_name.starts_with(".L") && !reloc.symbol_name.starts_with(".l") { - referenced.insert(reloc.symbol_name.clone()); - } - } - } - - let defined: HashSet = symbols.iter().map(|s| s.name.clone()).collect(); - - for name in &referenced { - if input.sections.contains_key(name) { - continue; // Skip section names - } - if !defined.contains(name) { - let binding = if input.weak_symbols.contains_key(name) { - STB_WEAK - } else { - STB_GLOBAL - }; - symbols.push(ObjSymbol { - name: name.clone(), - value: 0, - size: 0, - binding, - sym_type: input.symbol_types.get(name).copied().unwrap_or(STT_NOTYPE), - visibility: input.symbol_visibility.get(name).copied().unwrap_or(STV_DEFAULT), - section_name: "*UND*".to_string(), - }); - } - } - - symbols -} diff --git a/src/backend/elf/writer_base.rs b/src/backend/elf/writer_base.rs deleted file mode 100644 index cc3be0e606..0000000000 --- a/src/backend/elf/writer_base.rs +++ /dev/null @@ -1,674 +0,0 @@ -//! Shared ELF writer base for ARM and RISC-V assembler backends. -//! -//! ARM and RISC-V ELF writers share ~400 lines of identical code for section -//! management, symbol tracking, relocation recording, alignment, directive -//! processing, data emission, and ELF serialization. -//! -//! This `ElfWriterBase` struct captures all of that shared state and logic. -//! Each arch-specific ElfWriter composes with this base and adds its own -//! instruction encoding, branch resolution, and other arch-specific features: -//! - ARM adds `pending_sym_diffs` and AArch64-specific branch resolution -//! 
- RISC-V adds `pcrel_hi_counter`, `numeric_labels`, and RV64C compression - -use std::collections::HashMap; -use super::constants::*; -use super::linker_symbols::default_section_flags; -use super::object_writer::{ElfConfig, ObjSection, ObjReloc}; -use super::symbol_table::{ObjSymbol, SymbolTableInput, build_elf_symbol_table}; -use super::object_writer::write_relocatable_object; - -/// Shared ELF writer state used by both ARM and RISC-V assembler backends. -/// -/// This struct manages sections, symbols, labels, and relocations using the -/// shared `ObjSection`/`ObjReloc`/`ObjSymbol` types directly, eliminating the -/// per-arch conversion step in `write_elf()`. -/// -/// Architecture-specific ElfWriters compose with this base: -/// - ARM adds `pending_sym_diffs` and AArch64-specific branch resolution -/// - RISC-V adds `pcrel_hi_counter`, `numeric_labels`, and RV64C compression -pub struct ElfWriterBase { - /// Current section we're emitting into - pub current_section: String, - /// All sections being built (using shared ObjSection directly) - pub sections: HashMap, - /// Section order (for deterministic output) - pub section_order: Vec, - /// Extra symbols (e.g., COMMON symbols from .comm directives) - pub extra_symbols: Vec, - /// Local labels -> (section, offset) for branch resolution - pub labels: HashMap, - /// Symbols that have been declared .globl - pub global_symbols: HashMap, - /// Symbols declared .weak - pub weak_symbols: HashMap, - /// Symbol types from .type directives - pub symbol_types: HashMap, - /// Symbol sizes from .size directives - pub symbol_sizes: HashMap, - /// Symbol visibility from .hidden/.protected/.internal - pub symbol_visibility: HashMap, - /// Symbol aliases from .set/.equ directives - pub aliases: HashMap, - /// Section stack for .pushsection/.popsection (saves both current and previous section) - section_stack: Vec<(String, String)>, - /// Previous section for .section/.previous swapping - previous_section: String, - /// NOP 
instruction bytes for code section alignment padding. - /// ARM: `[0x1f, 0x20, 0x03, 0xd5]` (d503201f), RISC-V: `[0x13, 0x00, 0x00, 0x00]` (00000013) - nop_bytes: [u8; 4], - /// Default text section alignment (4 for ARM, 2 for RISC-V with compressed instructions) - text_align: u64, -} - -impl ElfWriterBase { - pub fn new(nop_bytes: [u8; 4], text_align: u64) -> Self { - Self { - current_section: String::new(), - sections: HashMap::new(), - section_order: Vec::new(), - extra_symbols: Vec::new(), - labels: HashMap::new(), - global_symbols: HashMap::new(), - weak_symbols: HashMap::new(), - symbol_types: HashMap::new(), - symbol_sizes: HashMap::new(), - symbol_visibility: HashMap::new(), - aliases: HashMap::new(), - section_stack: Vec::new(), - previous_section: String::new(), - nop_bytes, - text_align, - } - } - - /// Ensure a section exists. If it doesn't, create it with the given properties. - pub fn ensure_section(&mut self, name: &str, sh_type: u32, sh_flags: u64, align: u64) { - if !self.sections.contains_key(name) { - self.sections.insert(name.to_string(), ObjSection { - name: name.to_string(), - sh_type, - sh_flags, - data: Vec::new(), - sh_addralign: align, - relocs: Vec::new(), - comdat_group: None, - }); - self.section_order.push(name.to_string()); - } - } - - /// Get the current write offset within the current section. - pub fn current_offset(&self) -> u64 { - self.sections.get(&self.current_section) - .map(|s| s.data.len() as u64) - .unwrap_or(0) - } - - /// Append raw bytes to the current section. - pub fn emit_bytes(&mut self, bytes: &[u8]) { - if let Some(section) = self.sections.get_mut(&self.current_section) { - section.data.extend_from_slice(bytes); - } - } - - /// Append a 16-bit little-endian value to the current section. - pub fn emit_u16_le(&mut self, val: u16) { - self.emit_bytes(&val.to_le_bytes()); - } - - /// Append a 32-bit little-endian value to the current section. 
- pub fn emit_u32_le(&mut self, val: u32) { - self.emit_bytes(&val.to_le_bytes()); - } - - /// Record a relocation at the current offset in the current section. - pub fn add_reloc(&mut self, reloc_type: u32, symbol: String, addend: i64) { - let offset = self.current_offset(); - let section = self.current_section.clone(); - if let Some(s) = self.sections.get_mut(§ion) { - s.relocs.push(ObjReloc { - offset, - reloc_type, - symbol_name: symbol, - addend, - }); - } - } - - /// Align the current section's data to the specified byte boundary. - /// - /// Code sections are NOP-padded using the architecture's NOP instruction; - /// data sections are zero-padded. - pub fn align_to(&mut self, align: u64) { - if align <= 1 { - return; - } - if let Some(section) = self.sections.get_mut(&self.current_section) { - let current = section.data.len() as u64; - let aligned = (current + align - 1) & !(align - 1); - let padding = (aligned - current) as usize; - if section.sh_flags & SHF_EXECINSTR != 0 && align >= 4 { - let full_nops = padding / 4; - let remainder = padding % 4; - for _ in 0..full_nops { - section.data.extend_from_slice(&self.nop_bytes); - } - section.data.extend(std::iter::repeat_n(0u8, remainder)); - } else { - section.data.extend(std::iter::repeat_n(0u8, padding)); - } - if align > section.sh_addralign { - section.sh_addralign = align; - } - } - } - - /// Ensure we're in a text section, creating one if needed. - pub fn ensure_text_section(&mut self) { - if self.current_section.is_empty() { - self.ensure_section(".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, self.text_align); - self.current_section = ".text".to_string(); - } - } - - /// Process a .section directive with parsed fields. - /// - /// `sec_name`: section name, `flags_str`: flag characters ("awx" etc.), - /// `flags_explicit`: whether flags were explicitly provided (vs default), - /// `sec_type_str`: optional type string ("@nobits", "@note", etc.) 
- pub fn process_section_directive( - &mut self, - sec_name: &str, - flags_str: &str, - flags_explicit: bool, - sec_type_str: Option<&str>, - ) { - let sh_type = match sec_type_str { - Some("@nobits") => SHT_NOBITS, - Some("@note") => SHT_NOTE, - _ => { - if sec_name == ".bss" || sec_name.starts_with(".bss.") || sec_name.starts_with(".tbss") { - SHT_NOBITS - } else { - SHT_PROGBITS - } - } - }; - - let mut sh_flags = 0u64; - if flags_str.contains('a') { sh_flags |= SHF_ALLOC; } - if flags_str.contains('w') { sh_flags |= SHF_WRITE; } - if flags_str.contains('x') { sh_flags |= SHF_EXECINSTR; } - if flags_str.contains('M') { sh_flags |= SHF_MERGE; } - if flags_str.contains('S') { sh_flags |= SHF_STRINGS; } - if flags_str.contains('T') { sh_flags |= SHF_TLS; } - if flags_str.contains('G') { sh_flags |= SHF_GROUP; } - - if sh_flags == 0 && !flags_explicit { - sh_flags = default_section_flags(sec_name); - } - - let align = if sh_flags & SHF_EXECINSTR != 0 { self.text_align } else { 1 }; - self.ensure_section(sec_name, sh_type, sh_flags, align); - self.previous_section = std::mem::replace(&mut self.current_section, sec_name.to_string()); - } - - /// Switch to a named standard section (.text, .data, .bss, .rodata). - pub fn switch_to_standard_section(&mut self, name: &str, sh_type: u32, sh_flags: u64) { - let align = if sh_flags & SHF_EXECINSTR != 0 { self.text_align } else { 1 }; - self.ensure_section(name, sh_type, sh_flags, align); - self.previous_section = std::mem::replace(&mut self.current_section, name.to_string()); - } - - /// Restore the previous section (for `.previous` directive). - /// Swaps current and previous sections, so repeated `.previous` toggles between two sections. - pub fn restore_previous_section(&mut self) { - if !self.previous_section.is_empty() { - std::mem::swap(&mut self.current_section, &mut self.previous_section); - } - } - - /// Push current section onto the stack and switch to a new section. 
- /// Saves both current_section and previous_section so that .popsection - /// fully restores the section state (matching GNU as behavior). - pub fn push_section(&mut self, name: &str, flags_str: &str, flags_explicit: bool, sec_type: Option<&str>) { - self.section_stack.push((self.current_section.clone(), self.previous_section.clone())); - self.process_section_directive(name, flags_str, flags_explicit, sec_type); - } - - /// Pop the section stack and restore both current and previous sections. - pub fn pop_section(&mut self) { - if let Some((saved_current, saved_previous)) = self.section_stack.pop() { - self.current_section = saved_current; - self.previous_section = saved_previous; - } - } - - /// Switch to a numbered subsection within the current section. - /// - /// `.subsection N` creates an internal section `PARENT.__subsection.N` that - /// gets merged back into the parent section after all statements are processed. - /// Subsections are concatenated in numeric order (0 first, then 1, 2, ...). 
- pub fn set_subsection(&mut self, n: u64) { - // Determine the parent section name (strip any existing subsection suffix) - let parent = if let Some(pos) = self.current_section.find(".__subsection.") { - self.current_section[..pos].to_string() - } else { - self.current_section.clone() - }; - - if n == 0 { - // Switch back to parent section (subsection 0 = parent itself) - self.previous_section = std::mem::replace(&mut self.current_section, parent); - } else { - // Switch to subsection N - let sub_name = format!("{}.__subsection.{}", parent, n); - // Inherit properties from parent section - if !self.sections.contains_key(&sub_name) { - if let Some(parent_sec) = self.sections.get(&parent) { - let sh_type = parent_sec.sh_type; - let sh_flags = parent_sec.sh_flags; - let align = parent_sec.sh_addralign; - self.ensure_section(&sub_name, sh_type, sh_flags, align); - } else { - // Parent doesn't exist yet; create subsection with code defaults - self.ensure_section(&sub_name, 1, 0x6, self.text_align); // SHT_PROGBITS, AX - } - } - self.previous_section = std::mem::replace(&mut self.current_section, sub_name); - } - } - - /// Merge all subsections back into their parent sections. - /// - /// After all statements are processed, subsections like `.text.__subsection.1` - /// are appended to their parent `.text` in numeric order. Labels and relocations - /// are adjusted to account for the new offsets. - /// - /// Returns a mapping from subsection name to (parent_name, offset_adjustment) - /// so callers can fix up any pending references that point to subsection names. 
- pub fn merge_subsections(&mut self) -> HashMap { - let mut remap = HashMap::new(); - - // Collect subsection names grouped by parent - let mut subsections: std::collections::BTreeMap> = - std::collections::BTreeMap::new(); - - for name in &self.section_order { - if let Some(pos) = name.find(".__subsection.") { - let parent = name[..pos].to_string(); - let num: u64 = name[pos + 14..].parse().unwrap_or(0); - subsections.entry(parent).or_default().insert(num, name.clone()); - } - } - - if subsections.is_empty() { - return remap; - } - - // For each parent, append subsections in order - for (parent, subs) in &subsections { - for sub_name in subs.values() { - let sub_data; - let sub_relocs; - { - let sub_sec = match self.sections.get(sub_name) { - Some(s) => s, - None => continue, - }; - sub_data = sub_sec.data.clone(); - sub_relocs = sub_sec.relocs.clone(); - } - - let parent_len = self.sections.get(parent) - .map(|s| s.data.len() as u64) - .unwrap_or(0); - - // Record the remapping for callers - remap.insert(sub_name.clone(), (parent.clone(), parent_len)); - - // Append data - if let Some(parent_sec) = self.sections.get_mut(parent) { - parent_sec.data.extend_from_slice(&sub_data); - // Append relocations with adjusted offsets - for mut reloc in sub_relocs { - reloc.offset += parent_len; - parent_sec.relocs.push(reloc); - } - } - - // Adjust labels that reference this subsection - let labels_to_update: Vec<(String, u64)> = self.labels.iter() - .filter(|(_, (sec, _))| sec == sub_name) - .map(|(name, (_, off))| (name.clone(), *off)) - .collect(); - - for (label_name, old_offset) in labels_to_update { - self.labels.insert(label_name, (parent.clone(), old_offset + parent_len)); - } - - // Remove the subsection - self.sections.remove(sub_name); - } - } - - // Remove subsection names from section_order - self.section_order.retain(|name| !name.contains(".__subsection.")); - - // Fix current_section if it pointed to a subsection - if 
self.current_section.contains(".__subsection.") { - if let Some(pos) = self.current_section.find(".__subsection.") { - self.current_section = self.current_section[..pos].to_string(); - } - } - if self.previous_section.contains(".__subsection.") { - if let Some(pos) = self.previous_section.find(".__subsection.") { - self.previous_section = self.previous_section[..pos].to_string(); - } - } - - remap - } - - /// Record .globl for a symbol. - pub fn set_global(&mut self, sym: &str) { - self.global_symbols.insert(sym.to_string(), true); - } - - /// Record .weak for a symbol. - pub fn set_weak(&mut self, sym: &str) { - self.weak_symbols.insert(sym.to_string(), true); - } - - /// Record symbol visibility (.hidden, .protected, .internal). - pub fn set_visibility(&mut self, sym: &str, vis: u8) { - self.symbol_visibility.insert(sym.to_string(), vis); - } - - /// Record .type for a symbol (STT_FUNC, STT_OBJECT, etc.). - pub fn set_symbol_type(&mut self, sym: &str, st: u8) { - self.symbol_types.insert(sym.to_string(), st); - } - - /// Record .size for a symbol. If `current_minus_label` is Some, computes - /// `current_offset - label_offset` in the same section. Otherwise uses the absolute value. - pub fn set_symbol_size(&mut self, sym: &str, current_minus_label: Option<&str>, absolute: Option) { - if let Some(label) = current_minus_label { - if let Some((section, label_offset)) = self.labels.get(label) { - if *section == self.current_section { - let current = self.current_offset(); - let size = current - label_offset; - self.symbol_sizes.insert(sym.to_string(), size); - } - } - } else if let Some(size) = absolute { - self.symbol_sizes.insert(sym.to_string(), size); - } - } - - /// Emit a .comm symbol (COMMON block). 
- pub fn emit_comm(&mut self, sym: &str, size: u64, align: u64) { - self.extra_symbols.push(ObjSymbol { - name: sym.to_string(), - value: align, - size, - binding: STB_GLOBAL, - sym_type: STT_OBJECT, - visibility: STV_DEFAULT, - section_name: "*COM*".to_string(), - }); - } - - /// Record a .set/.equ alias. - pub fn set_alias(&mut self, alias: &str, target: &str) { - self.aliases.insert(alias.to_string(), target.to_string()); - } - - /// Resolve .set/.equ aliases in an expression string. - /// Replaces symbol names (like `.L__gpr_num_t0`) with their numeric values. - pub fn resolve_expr_aliases(&self, expr: &str) -> String { - let mut result = String::with_capacity(expr.len()); - let bytes = expr.as_bytes(); - let mut i = 0; - while i < bytes.len() { - let c = bytes[i]; - // Symbol names start with a letter, underscore, or dot - if c == b'.' || c == b'_' || c.is_ascii_alphabetic() { - let start = i; - i += 1; - while i < bytes.len() && (bytes[i] == b'.' || bytes[i] == b'_' || bytes[i].is_ascii_alphanumeric()) { - i += 1; - } - let sym = &expr[start..i]; - // Chase alias chain - let mut resolved = sym; - let mut seen = 0; - while let Some(target) = self.aliases.get(resolved) { - resolved = target.as_str(); - seen += 1; - if seen > 20 { break; } - } - result.push_str(resolved); - } else { - result.push(c as char); - i += 1; - } - } - result - } - - /// Resolve label names in an expression to their numeric offsets. - /// This handles `.Ldot_N` synthetic labels (current position) and any - /// section-local labels that can be resolved to constant offsets. - pub fn resolve_expr_labels(&self, expr: &str) -> String { - let cur_section = &self.current_section; - let cur_offset = self.current_offset(); - let mut result = String::with_capacity(expr.len()); - let bytes = expr.as_bytes(); - let mut i = 0; - while i < bytes.len() { - let c = bytes[i]; - if c == b'.' || c == b'_' || c.is_ascii_alphabetic() { - let start = i; - i += 1; - while i < bytes.len() && (bytes[i] == b'.' 
|| bytes[i] == b'_' || bytes[i].is_ascii_alphanumeric()) { - i += 1; - } - let sym = &expr[start..i]; - // Standalone '.' means current position - if sym == "." { - result.push_str(&cur_offset.to_string()); - // Check if this is a .Ldot_N label (current position) - } else if sym.starts_with(".Ldot_") { - result.push_str(&cur_offset.to_string()); - } else if let Some((sec, off)) = self.labels.get(sym) { - if sec == cur_section { - result.push_str(&off.to_string()); - } else { - result.push_str(sym); - } - } else { - result.push_str(sym); - } - } else { - result.push(c as char); - i += 1; - } - } - result - } - - /// Resolve ALL label names in an expression to their numeric offsets, - /// using the specified section as context. Unlike `resolve_expr_labels`, - /// this also resolves `.Ldot_N` labels from their stored definitions - /// (not the current offset), making it suitable for deferred resolution. - pub fn resolve_expr_all_labels(&self, expr: &str, section: &str) -> String { - let mut result = String::with_capacity(expr.len()); - let bytes = expr.as_bytes(); - let mut i = 0; - while i < bytes.len() { - let c = bytes[i]; - if c == b'.' || c == b'_' || c.is_ascii_alphabetic() { - let start = i; - i += 1; - while i < bytes.len() && (bytes[i] == b'.' || bytes[i] == b'_' || bytes[i].is_ascii_alphanumeric()) { - i += 1; - } - let sym = &expr[start..i]; - if let Some((sec, off)) = self.labels.get(sym) { - if sec == section { - result.push_str(&off.to_string()); - } else { - result.push_str(sym); - } - } else { - result.push_str(sym); - } - } else { - result.push(c as char); - i += 1; - } - } - result - } - - /// Resolve label names in an expression to their offsets regardless of section. - /// - /// Unlike `resolve_expr_all_labels` which only resolves labels in the same - /// section, this resolves ALL labels to their offsets. 
This is safe for - /// expressions that compute differences between labels in the same section - /// (the section offsets cancel out), which is common in kernel ALTERNATIVE - /// macros (e.g., `889f - 888f` computing the size of alternative code). - pub fn resolve_expr_cross_section(&self, expr: &str) -> String { - let resolved = self.resolve_expr_aliases(expr); - let mut result = String::with_capacity(resolved.len()); - let bytes = resolved.as_bytes(); - let mut i = 0; - while i < bytes.len() { - let c = bytes[i]; - if c == b'.' || c == b'_' || c.is_ascii_alphabetic() { - let start = i; - i += 1; - while i < bytes.len() && (bytes[i] == b'.' || bytes[i] == b'_' || bytes[i].is_ascii_alphanumeric()) { - i += 1; - } - let sym = &resolved[start..i]; - if let Some((_sec, off)) = self.labels.get(sym) { - result.push_str(&off.to_string()); - } else { - result.push_str(sym); - } - } else { - result.push(c as char); - i += 1; - } - } - result - } - - /// Emit a plain integer value for .byte (size=1), .short (size=2), .long (size=4) or .quad (size=8). - pub fn emit_data_integer(&mut self, val: i64, size: usize) { - match size { - 1 => self.emit_bytes(&[val as u8]), - 2 => self.emit_bytes(&(val as u16).to_le_bytes()), - 4 => self.emit_bytes(&(val as u32).to_le_bytes()), - _ => self.emit_bytes(&(val as u64).to_le_bytes()), - } - } - - /// Emit a symbol reference with a relocation. - pub fn emit_data_symbol_ref(&mut self, sym: &str, addend: i64, size: usize, reloc_type: u32) { - self.add_reloc(reloc_type, sym.to_string(), addend); - match size { - 1 => self.emit_bytes(&[0u8]), - 2 => self.emit_bytes(&0u16.to_le_bytes()), - 4 => self.emit_bytes(&0u32.to_le_bytes()), - _ => self.emit_bytes(&0u64.to_le_bytes()), - } - } - - /// Emit placeholder bytes for a deferred value (symbol diff, etc.). 
- pub fn emit_placeholder(&mut self, size: usize) { - match size { - 1 => self.emit_bytes(&[0u8]), - 2 => self.emit_bytes(&0u16.to_le_bytes()), - 4 => self.emit_bytes(&0u32.to_le_bytes()), - _ => self.emit_bytes(&0u64.to_le_bytes()), - } - } - - /// Resolve local label references in data relocations. - /// - /// When a data directive like `.xword .Lstr0` references a local label - /// in a different section, the local label won't be in the symbol table. - /// Convert these to section_symbol + offset_of_label_in_section. - pub fn resolve_local_data_relocs(&mut self) { - let labels = self.labels.clone(); - for sec_name in &self.section_order.clone() { - if let Some(section) = self.sections.get_mut(sec_name) { - for reloc in &mut section.relocs { - // Skip pcrel_lo12 relocations — they must keep their - // .Lpcrel_hi label reference (not section+offset) - let is_pcrel_lo = reloc.reloc_type == 24 || reloc.reloc_type == 25; - if is_pcrel_lo { - continue; - } - if (reloc.symbol_name.starts_with(".L") || reloc.symbol_name.starts_with(".l")) - && !reloc.symbol_name.is_empty() - { - if let Some((label_section, label_offset)) = labels.get(&reloc.symbol_name) { - reloc.addend += *label_offset as i64; - reloc.symbol_name = label_section.clone(); - } - } - } - } - } - } - - /// Build the symbol table and serialize the ELF object file. 
- /// - /// `config`: ELF configuration (machine type, flags, class) - /// `include_referenced_locals`: whether to include .L* labels referenced - /// by relocations (needed by RISC-V for pcrel_hi/pcrel_lo pairs) - pub fn write_elf(&mut self, output_path: &str, config: &ElfConfig, include_referenced_locals: bool) -> Result<(), String> { - if !include_referenced_locals { - self.resolve_local_data_relocs(); - } - - let symtab_input = SymbolTableInput { - labels: &self.labels, - global_symbols: &self.global_symbols, - weak_symbols: &self.weak_symbols, - symbol_types: &self.symbol_types, - symbol_sizes: &self.symbol_sizes, - symbol_visibility: &self.symbol_visibility, - aliases: &self.aliases, - sections: &self.sections, - include_referenced_locals, - }; - - let mut symbols = build_elf_symbol_table(&symtab_input); - // Remove UND entries for any symbols that are also in extra_symbols (e.g. COMMON). - // A symbol that is both referenced in relocations and declared as COMMON should - // only appear once (as COMMON), not as both UND and COMMON. - for extra in &self.extra_symbols { - if extra.section_name == "*COM*" { - symbols.retain(|s| !(s.name == extra.name && s.section_name == "*UND*")); - } - } - symbols.append(&mut self.extra_symbols); - - let elf_bytes = write_relocatable_object( - config, - &self.section_order, - &self.sections, - &symbols, - )?; - - std::fs::write(output_path, &elf_bytes) - .map_err(|e| format!("failed to write ELF file: {}", e))?; - - Ok(()) - } -} diff --git a/src/backend/elf_writer_common.rs b/src/backend/elf_writer_common.rs deleted file mode 100644 index 2bd62e281d..0000000000 --- a/src/backend/elf_writer_common.rs +++ /dev/null @@ -1,1700 +0,0 @@ -//! Shared ELF relocatable object file writer for x86-64 and i686. -//! -//! Both x86-64 and i686 assemblers share ~90% of their ELF writer logic: -//! section management, label tracking, symbol attributes, jump relaxation, -//! relocation resolution, and ELF emission. 
This module extracts that -//! shared code into a generic `ElfWriterCore`, parameterized by an -//! `X86Arch` trait that provides the architecture-specific pieces: -//! -//! - Relocation type constants (R_X86_64_* vs R_386_*) -//! - ELF class and machine type (ELFCLASS64/EM_X86_64 vs ELFCLASS32/EM_386) -//! - Instruction encoding dispatch -//! - REL vs RELA format handling -//! -//! Both x86-64 and i686 support deferred `.skip` expressions and deferred -//! byte-sized symbol diffs (needed by the Linux kernel's alternatives -//! framework). These are handled as optional extensions controlled by -//! the `supports_deferred_skips()` trait method. - -use std::collections::HashMap; -use crate::backend::x86::assembler::parser::*; -use crate::backend::elf::{self as elf_mod, - SHT_PROGBITS, - SHF_ALLOC, SHF_EXECINSTR, - STB_LOCAL, STB_GLOBAL, STB_WEAK, - STT_NOTYPE, STT_OBJECT, STT_FUNC, STT_TLS, - STV_DEFAULT, STV_INTERNAL, STV_HIDDEN, STV_PROTECTED, - resolve_numeric_labels, parse_section_flags, - ElfConfig, ObjSection, ObjSymbol, ObjReloc, SymbolTableInput, -}; - -// ─── Architecture trait ─────────────────────────────────────────────── - -/// Architecture-specific behavior for x86-family ELF writers. -/// -/// Implemented by x86-64 and i686 to provide relocation types, -/// ELF constants, and instruction encoding. -pub trait X86Arch { - /// Encode an instruction, returning (bytes, relocations, optional jump info). - /// The `section_data_len` parameter is the current offset in the section. - fn encode_instruction( - instr: &Instruction, - section_data_len: u64, - ) -> Result; - - /// ELF machine type (EM_X86_64 or EM_386). - fn elf_machine() -> u16; - /// ELF class (ELFCLASS64 or ELFCLASS32). - fn elf_class() -> u8; - /// ELF flags (typically 0 for both). - fn elf_flags() -> u32 { 0 } - - /// Absolute relocation type for data (R_X86_64_32/R_X86_64_64 or R_386_32). - fn reloc_abs(size: usize) -> u32; - /// 64-bit absolute relocation (R_X86_64_64). 
Only meaningful for x86-64. - fn reloc_abs64() -> u32; - /// PC-relative relocation type (R_X86_64_PC32 or R_386_PC32). - fn reloc_pc32() -> u32; - /// PLT relocation type (R_X86_64_PLT32 or R_386_PLT32). - fn reloc_plt32() -> u32; - - /// Whether this architecture uses REL format (i686) vs RELA (x86-64). - /// When true, addends are patched into section data instead of being - /// stored in the relocation entry. - fn uses_rel_format() -> bool; - - /// Optional: PC8 internal relocation type for loop/jrcxz instructions. - /// Only x86-64 has this; i686 returns None. - fn reloc_pc8_internal() -> Option { None } - - /// Optional: absolute 32-bit relocation for local symbol references. - /// Only x86-64 uses R_X86_64_32 this way; i686 returns None since - /// its R_386_32 is handled by the general abs path. - fn reloc_abs32_for_internal() -> Option { None } - - /// Whether `.skip` expressions with label arithmetic are supported. - /// Both x86-64 and i686 enable this for the Linux kernel's ALTERNATIVES macros. - fn supports_deferred_skips() -> bool { false } - - /// Whether `.set` alias resolution for label-difference expressions - /// should be done during data value emission. Both x86-64 and i686 - /// enable this for DWARF debug info `.set .Lset0, .LECIE-.LSCIE` patterns. - fn resolve_set_aliases_in_data() -> bool { false } - - /// Default code mode for this architecture (64 for x86-64, 32 for i686). - fn default_code_mode() -> u8 { 64 } - - /// Encode an instruction in 64-bit mode. Used by the i686 assembler when - /// encountering `.code64` sections (e.g. kernel realmode trampoline code). - /// Default implementation delegates to the normal encode_instruction. - fn encode_instruction_code64( - instr: &Instruction, - section_data_len: u64, - ) -> Result { - Self::encode_instruction(instr, section_data_len) - } - -} - -/// Result of encoding a single instruction. 
-pub struct EncodeResult { - pub bytes: Vec, - pub relocations: Vec, - pub jump: Option, -} - -/// A relocation produced by the instruction encoder. -pub struct EncoderReloc { - pub offset: u64, - pub symbol: String, - pub reloc_type: u32, - pub addend: i64, - pub diff_symbol: Option, -} - -/// Jump instruction detected during encoding, eligible for relaxation. -pub struct JumpDetection { - pub is_conditional: bool, - /// Whether this is already in short form (e.g., jecxz, loop). - pub already_short: bool, -} - -// ─── Internal types ─────────────────────────────────────────────────── - -/// Tracks a jump instruction for relaxation (long -> short). -#[derive(Clone, Debug)] -struct JumpInfo { - offset: usize, - len: usize, - target: String, - is_conditional: bool, - relaxed: bool, -} - -/// Tracks an alignment or .org marker within a section. -/// Used to recalculate padding after jump relaxation. -#[derive(Clone, Debug)] -struct AlignMarker { - offset: usize, - padding: usize, - kind: AlignMarkerKind, -} - -#[derive(Clone, Debug)] -enum AlignMarkerKind { - /// .balign N — pad to N-byte boundary. - Align(u32), - /// .org label + offset — advance to a fixed position. - Org { label: String, addend: i64 }, -} - -/// A section being built during assembly. -struct Section { - name: String, - section_type: u32, - flags: u64, - data: Vec, - alignment: u64, - relocations: Vec, - jumps: Vec, - align_markers: Vec, - comdat_group: Option, -} - -#[derive(Clone)] -struct ElfRelocation { - offset: u64, - symbol: String, - reloc_type: u32, - addend: i64, - diff_symbol: Option, - /// Size of the data to patch (1, 2, 4, or 8 bytes). - patch_size: u8, -} - -/// Symbol info collected during assembly. 
-struct SymbolInfo { - name: String, - binding: u8, - sym_type: u8, - visibility: u8, - section: Option, - value: u64, - size: u64, - is_common: bool, - common_align: u32, -} - -// ─── Expression evaluator ───────────────────────────────────────────── - -/// Token in a deferred expression (for `.skip` with label arithmetic). -#[derive(Debug, Clone, PartialEq)] -enum ExprToken { - Number(i64), - Symbol(String), - Plus, - Minus, - Star, - LParen, - RParen, - Lt, - Gt, - And, - Or, - Xor, - Not, -} - -fn tokenize_expr(expr: &str) -> Result, String> { - let mut tokens = Vec::new(); - let bytes = expr.as_bytes(); - let mut i = 0; - - while i < bytes.len() { - match bytes[i] { - b' ' | b'\t' => { i += 1; } - b'+' => { tokens.push(ExprToken::Plus); i += 1; } - b'-' => { tokens.push(ExprToken::Minus); i += 1; } - b'*' => { tokens.push(ExprToken::Star); i += 1; } - b'(' => { tokens.push(ExprToken::LParen); i += 1; } - b')' => { tokens.push(ExprToken::RParen); i += 1; } - b'<' => { tokens.push(ExprToken::Lt); i += 1; } - b'>' => { tokens.push(ExprToken::Gt); i += 1; } - b'&' => { tokens.push(ExprToken::And); i += 1; } - b'|' => { tokens.push(ExprToken::Or); i += 1; } - b'^' => { tokens.push(ExprToken::Xor); i += 1; } - b'~' => { tokens.push(ExprToken::Not); i += 1; } - b'0'..=b'9' => { - let start = i; - if i + 1 < bytes.len() && bytes[i] == b'0' && (bytes[i+1] == b'x' || bytes[i+1] == b'X') { - i += 2; - while i < bytes.len() && bytes[i].is_ascii_hexdigit() { i += 1; } - } else { - while i < bytes.len() && bytes[i].is_ascii_digit() { i += 1; } - } - // Check for numeric label references: digits followed by 'b' or 'f' - // (e.g., "0b" = backward ref to label 0, "1f" = forward ref to label 1) - if i < bytes.len() && (bytes[i] == b'b' || bytes[i] == b'f') - && (i + 1 >= bytes.len() || !bytes[i + 1].is_ascii_alphanumeric()) - { - i += 1; // include the 'b' or 'f' suffix - tokens.push(ExprToken::Symbol(expr[start..i].to_string())); - } else { - let num_str = &expr[start..i]; - 
let val = if num_str.starts_with("0x") || num_str.starts_with("0X") { - i64::from_str_radix(&num_str[2..], 16) - .map_err(|_| format!("bad hex number: {}", num_str))? - } else { - num_str.parse::() - .map_err(|_| format!("bad number: {}", num_str))? - }; - tokens.push(ExprToken::Number(val)); - } - } - b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'.' => { - let start = i; - while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_' || bytes[i] == b'.') { - i += 1; - } - tokens.push(ExprToken::Symbol(expr[start..i].to_string())); - } - c => return Err(format!("unexpected character in expression: '{}' (0x{:02x})", c as char, c)), - } - } - - Ok(tokens) -} - -// ─── Core ELF writer ────────────────────────────────────────────────── - -/// Shared ELF writer for x86-family architectures. -/// -/// Contains all the common logic for building ELF relocatable objects -/// from parsed assembly items. Architecture-specific behavior is -/// provided through the `X86Arch` trait parameter. -pub struct ElfWriterCore { - sections: Vec
, - symbols: Vec, - section_map: HashMap, - symbol_map: HashMap, - current_section: Option, - previous_section: Option, - label_positions: HashMap, - numeric_label_positions: HashMap>, - pending_globals: Vec, - pending_weaks: Vec, - pending_types: HashMap, - pending_sizes: HashMap, - pending_hidden: Vec, - pending_protected: Vec, - pending_internal: Vec, - aliases: HashMap, - section_stack: Vec<(Option, Option)>, - /// Deferred `.skip` expressions: (section_index, offset, expression, fill_byte). - deferred_skips: Vec<(usize, usize, String, u8)>, - /// Deferred byte-sized symbol diffs: (section_index, offset, sym_a, sym_b, size, addend). - deferred_byte_diffs: Vec<(usize, usize, String, String, usize, i64)>, - /// Current code mode (16, 32, or 64). Affects instruction encoding. - /// Set by `.code16`, `.code32`, `.code64` directives. - code_mode: u8, - _arch: std::marker::PhantomData, -} - -impl ElfWriterCore { - pub fn new() -> Self { - ElfWriterCore { - sections: Vec::new(), - symbols: Vec::new(), - section_map: HashMap::new(), - symbol_map: HashMap::new(), - current_section: None, - previous_section: None, - label_positions: HashMap::new(), - numeric_label_positions: HashMap::new(), - pending_globals: Vec::new(), - pending_weaks: Vec::new(), - pending_types: HashMap::new(), - pending_sizes: HashMap::new(), - pending_hidden: Vec::new(), - pending_protected: Vec::new(), - pending_internal: Vec::new(), - aliases: HashMap::new(), - section_stack: Vec::new(), - deferred_skips: Vec::new(), - deferred_byte_diffs: Vec::new(), - code_mode: A::default_code_mode(), - _arch: std::marker::PhantomData, - } - } - - /// Build the ELF object file from parsed assembly items. 
- pub fn build(mut self, items: &[AsmItem]) -> Result, String> { - let items = resolve_numeric_labels(items); - for item in &items { - self.process_item(item)?; - } - self.emit_elf() - } - - fn get_or_create_section(&mut self, name: &str, section_type: u32, flags: u64, comdat_group: Option) -> usize { - if let Some(&idx) = self.section_map.get(name) { - return idx; - } - let idx = self.sections.len(); - self.sections.push(Section { - name: name.to_string(), - section_type, - flags, - data: Vec::new(), - alignment: if flags & SHF_EXECINSTR != 0 && name != ".init" && name != ".fini" { 16 } else { 1 }, - relocations: Vec::new(), - jumps: Vec::new(), - align_markers: Vec::new(), - comdat_group, - }); - self.section_map.insert(name.to_string(), idx); - idx - } - - fn current_section_mut(&mut self) -> Result<&mut Section, String> { - let idx = self.current_section.ok_or("no active section")?; - Ok(&mut self.sections[idx]) - } - - fn switch_section(&mut self, dir: &SectionDirective) { - let (section_type, flags) = parse_section_flags(&dir.name, dir.flags.as_deref(), dir.section_type.as_deref()); - let idx = self.get_or_create_section(&dir.name, section_type, flags, dir.comdat_group.clone()); - self.previous_section = self.current_section; - self.current_section = Some(idx); - } - - fn process_item(&mut self, item: &AsmItem) -> Result<(), String> { - match item { - AsmItem::Section(dir) => { - self.switch_section(dir); - } - AsmItem::PushSection(dir) => { - self.section_stack.push((self.current_section, self.previous_section)); - self.switch_section(dir); - } - AsmItem::PopSection => { - if let Some((saved_current, saved_previous)) = self.section_stack.pop() { - self.current_section = saved_current; - self.previous_section = saved_previous; - } - } - AsmItem::Previous => { - if self.previous_section.is_some() { - std::mem::swap(&mut self.current_section, &mut self.previous_section); - } - } - AsmItem::Global(name) => { - self.pending_globals.push(name.clone()); - } - 
AsmItem::Weak(name) => { - self.pending_weaks.push(name.clone()); - } - AsmItem::Hidden(name) => { - self.pending_hidden.push(name.clone()); - } - AsmItem::Protected(name) => { - self.pending_protected.push(name.clone()); - } - AsmItem::Internal(name) => { - self.pending_internal.push(name.clone()); - } - AsmItem::SymbolType(name, kind) => { - self.pending_types.insert(name.clone(), *kind); - } - AsmItem::Size(name, expr) => { - let resolved = match expr { - SizeExpr::CurrentMinusSymbol(start_sym) => { - if let Some(sec_idx) = self.current_section { - let current_off = self.sections[sec_idx].data.len() as u64; - let end_label = format!(".Lsize_end_{}", name); - self.label_positions.insert(end_label.clone(), (sec_idx, current_off)); - SizeExpr::SymbolDiff(end_label, start_sym.clone()) - } else { - expr.clone() - } - } - other => other.clone(), - }; - self.pending_sizes.insert(name.clone(), resolved); - } - AsmItem::Label(name) => { - self.ensure_section()?; - let sec_idx = self.current_section.unwrap(); - let offset = self.sections[sec_idx].data.len() as u64; - self.label_positions.insert(name.clone(), (sec_idx, offset)); - - if name.chars().all(|c| c.is_ascii_digit()) { - self.numeric_label_positions - .entry(name.clone()) - .or_default() - .push((sec_idx, offset)); - } - - self.ensure_symbol(name, sec_idx, offset); - } - AsmItem::Align(n) => { - if let Some(sec_idx) = self.current_section { - let section = &mut self.sections[sec_idx]; - let align = *n as u64; - if align > section.alignment { - section.alignment = align; - } - let current = section.data.len() as u64; - let aligned = (current + align - 1) & !(align - 1); - let padding = (aligned - current) as usize; - // Record alignment marker for post-relaxation fixup - if padding > 0 && align > 1 { - section.align_markers.push(AlignMarker { - offset: current as usize, - padding, - kind: AlignMarkerKind::Align(*n), - }); - } - if section.flags & SHF_EXECINSTR != 0 { - section.data.extend(std::iter::repeat_n(0x90, 
padding)); - } else { - section.data.extend(std::iter::repeat_n(0, padding)); - } - } - } - AsmItem::Byte(vals) => { - self.emit_data_values(vals, 1)?; - } - AsmItem::Short(vals) => { - self.emit_data_values(vals, 2)?; - } - AsmItem::Long(vals) => { - self.emit_data_values(vals, 4)?; - } - AsmItem::Quad(vals) => { - self.emit_data_values(vals, 8)?; - } - AsmItem::Zero(n) => { - self.ensure_section()?; - let section = self.current_section_mut()?; - section.data.extend(std::iter::repeat_n(0u8, *n as usize)); - } - AsmItem::Org(sym, offset) => { - self.process_org(sym, *offset)?; - } - AsmItem::SkipExpr(expr, fill) => { - self.ensure_section()?; - if A::supports_deferred_skips() { - let sec_idx = self.current_section.ok_or("no active section for .skip")?; - let offset = self.sections[sec_idx].data.len(); - self.deferred_skips.push((sec_idx, offset, expr.clone(), *fill)); - } else { - // Simple integer parse for architectures without deferred skip support - if let Ok(val) = expr.trim().parse::() { - let section = self.current_section_mut()?; - section.data.extend(std::iter::repeat_n(*fill, val as usize)); - } else { - return Err(format!("unsupported .skip expression: {}", expr)); - } - } - } - AsmItem::Asciz(bytes) | AsmItem::Ascii(bytes) => { - let section = self.current_section_mut()?; - section.data.extend_from_slice(bytes); - } - AsmItem::Comm(name, size, align) => { - let sym_idx = self.symbols.len(); - self.symbols.push(SymbolInfo { - name: name.clone(), - binding: STB_GLOBAL, - sym_type: STT_OBJECT, - visibility: STV_DEFAULT, - section: None, - value: *align as u64, - size: *size, - is_common: true, - common_align: *align, - }); - self.symbol_map.insert(name.clone(), sym_idx); - } - AsmItem::Set(alias, target) => { - self.aliases.insert(alias.clone(), target.clone()); - } - AsmItem::Symver(name, ver_string) => { - // .symver name, alias@@VERSION -> default version: create alias from "alias" to "name" - // .symver name, alias@VERSION -> compat version: create 
alias from "alias" to "name" - // Extract the unversioned alias name from the version string - if let Some(at_pos) = ver_string.find('@') { - let alias = &ver_string[..at_pos]; - if !alias.is_empty() { - self.aliases.insert(alias.to_string(), name.clone()); - } - } - } - AsmItem::Incbin { path, skip, count } => { - let data = std::fs::read(path) - .map_err(|e| format!(".incbin: failed to read '{}': {}", path, e))?; - let skip = *skip as usize; - let data = if skip < data.len() { &data[skip..] } else { &[] }; - let data = match count { - Some(c) => { - let c = *c as usize; - if c < data.len() { &data[..c] } else { data } - } - None => data, - }; - let section = self.current_section_mut()?; - section.data.extend_from_slice(data); - } - AsmItem::Instruction(instr) => { - self.encode_instruction(instr)?; - } - AsmItem::CodeMode(bits) => { - // Code mode is global state that persists across section switches, - // matching GNU as behavior (e.g. kernel trampoline_64.S uses - // .code16gcc/.code32/.code64 across .text/.text32/.text64 sections). 
- self.code_mode = *bits; - } - AsmItem::Cfi(_) | AsmItem::File(_, _) | AsmItem::Loc(_, _, _) - | AsmItem::OptionDirective(_) | AsmItem::Empty => {} - } - Ok(()) - } - - fn process_org(&mut self, sym: &str, offset: i64) -> Result<(), String> { - let sec_idx = match self.current_section { - Some(idx) => idx, - None => return Ok(()), - }; - let current = self.sections[sec_idx].data.len() as u64; - let target = if sym.is_empty() { - offset as u64 - } else if let Some(&(label_sec, label_off)) = self.label_positions.get(sym) { - if label_sec == sec_idx { - (label_off as i64 + offset) as u64 - } else { - return Err(format!(".org symbol {} not in current section", sym)); - } - } else if let Some((label_sec, label_off)) = self.resolve_numeric_label(sym, current, sec_idx) { - if label_sec == sec_idx { - (label_off as i64 + offset) as u64 - } else { - return Err(format!(".org symbol {} not in current section", sym)); - } - } else { - return Err(format!(".org: unknown symbol {}", sym)); - }; - let padding = if target > current { (target - current) as usize } else { 0 }; - // Record .org marker for post-relaxation fixup (even when padding == 0, - // because code before it may shrink during jump relaxation) - if !sym.is_empty() { - self.sections[sec_idx].align_markers.push(AlignMarker { - offset: current as usize, - padding, - kind: AlignMarkerKind::Org { - label: sym.to_string(), - addend: offset, - }, - }); - } - if padding > 0 { - let fill = if self.sections[sec_idx].flags & SHF_EXECINSTR != 0 { 0x90u8 } else { 0u8 }; - self.sections[sec_idx].data.extend(std::iter::repeat_n(fill, padding)); - } - Ok(()) - } - - fn ensure_section(&mut self) -> Result<(), String> { - if self.current_section.is_none() { - let idx = self.get_or_create_section(".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, None); - self.current_section = Some(idx); - } - Ok(()) - } - - fn ensure_symbol(&mut self, name: &str, sec_idx: usize, offset: u64) { - let sec_name = self.sections[sec_idx].name.clone(); - 
- if let Some(&sym_idx) = self.symbol_map.get(name) { - let sym = &mut self.symbols[sym_idx]; - sym.section = Some(sec_name); - sym.value = offset; - } else { - let binding = if self.pending_globals.contains(&name.to_string()) { - STB_GLOBAL - } else if self.pending_weaks.contains(&name.to_string()) { - STB_WEAK - } else { - STB_LOCAL - }; - - let sym_type = match self.pending_types.get(name) { - Some(SymbolKind::Function) => STT_FUNC, - Some(SymbolKind::Object) => STT_OBJECT, - Some(SymbolKind::TlsObject) => STT_TLS, - Some(SymbolKind::NoType) | None => STT_NOTYPE, - }; - - let visibility = if self.pending_hidden.contains(&name.to_string()) { - STV_HIDDEN - } else if self.pending_protected.contains(&name.to_string()) { - STV_PROTECTED - } else if self.pending_internal.contains(&name.to_string()) { - STV_INTERNAL - } else { - STV_DEFAULT - }; - - let sym_idx = self.symbols.len(); - self.symbols.push(SymbolInfo { - name: name.to_string(), - binding, - sym_type, - visibility, - section: Some(sec_name), - value: offset, - size: 0, - is_common: false, - common_align: 0, - }); - self.symbol_map.insert(name.to_string(), sym_idx); - } - } - - fn emit_data_values(&mut self, vals: &[DataValue], size: usize) -> Result<(), String> { - let sec_idx = self.current_section.ok_or("no active section")?; - - for val in vals { - match val { - DataValue::Integer(v) => { - let section = &mut self.sections[sec_idx]; - match size { - 1 => section.data.push(*v as u8), - 2 => section.data.extend_from_slice(&(*v as i16).to_le_bytes()), - 4 => section.data.extend_from_slice(&(*v as i32).to_le_bytes()), - _ => section.data.extend_from_slice(&v.to_le_bytes()), - } - } - DataValue::Symbol(sym) => { - // Resolve .set aliases for label-difference expressions (DWARF debug info) - if A::resolve_set_aliases_in_data() { - if let Some(target) = self.aliases.get(sym).cloned() { - if let Some(pos) = target.find('-') { - let a = target[..pos].trim().to_string(); - let b = 
target[pos+1..].trim().to_string(); - let offset = self.sections[sec_idx].data.len() as u64; - self.sections[sec_idx].relocations.push(ElfRelocation { - offset, - symbol: a, - reloc_type: if size <= 4 { A::reloc_pc32() } else { A::reloc_abs64() }, - addend: 0, - diff_symbol: Some(b), - patch_size: size as u8, - }); - let section = &mut self.sections[sec_idx]; - section.data.extend(std::iter::repeat_n(0, size)); - continue; - } - } - } - let offset = self.sections[sec_idx].data.len() as u64; - self.sections[sec_idx].relocations.push(ElfRelocation { - offset, - symbol: sym.clone(), - reloc_type: A::reloc_abs(size), - addend: 0, - diff_symbol: None, - patch_size: size as u8, - }); - let section = &mut self.sections[sec_idx]; - section.data.extend(std::iter::repeat_n(0, size)); - } - DataValue::SymbolOffset(sym, addend) => { - let offset = self.sections[sec_idx].data.len() as u64; - self.sections[sec_idx].relocations.push(ElfRelocation { - offset, - symbol: sym.clone(), - reloc_type: A::reloc_abs(size), - addend: *addend, - diff_symbol: None, - patch_size: size as u8, - }); - let section = &mut self.sections[sec_idx]; - section.data.extend(std::iter::repeat_n(0, size)); - } - DataValue::SymbolDiff(a, b) => { - self.emit_symbol_diff(sec_idx, a, b, size, 0)?; - } - DataValue::SymbolDiffAddend(a, b, addend) => { - self.emit_symbol_diff(sec_idx, a, b, size, *addend)?; - } - } - } - Ok(()) - } - - fn emit_symbol_diff(&mut self, sec_idx: usize, a: &str, b: &str, size: usize, addend: i64) -> Result<(), String> { - let offset = self.sections[sec_idx].data.len() as u64; - let a_resolved = self.aliases.get(a).cloned().unwrap_or_else(|| a.to_string()); - let b_resolved = self.aliases.get(b).cloned().unwrap_or_else(|| b.to_string()); - - if b_resolved == "." 
{ - // `sym - .` means PC-relative - self.sections[sec_idx].relocations.push(ElfRelocation { - offset, - symbol: a_resolved, - reloc_type: A::reloc_pc32(), - addend, - diff_symbol: None, - patch_size: size as u8, - }); - let section = &mut self.sections[sec_idx]; - section.data.extend(std::iter::repeat_n(0, size)); - } else if size <= 2 && A::supports_deferred_skips() { - // For byte/short-sized diffs, defer resolution until after - // deferred skips are inserted (skip insertion shifts offsets). - let offset_usize = self.sections[sec_idx].data.len(); - self.deferred_byte_diffs.push((sec_idx, offset_usize, a_resolved, b_resolved, size, addend)); - let section = &mut self.sections[sec_idx]; - section.data.extend(std::iter::repeat_n(0, size)); - } else { - self.sections[sec_idx].relocations.push(ElfRelocation { - offset, - symbol: a_resolved, - reloc_type: if size == 4 { A::reloc_pc32() } else { A::reloc_abs64() }, - addend, - diff_symbol: Some(b_resolved), - patch_size: size as u8, - }); - let section = &mut self.sections[sec_idx]; - section.data.extend(std::iter::repeat_n(0, size)); - } - Ok(()) - } - - fn encode_instruction(&mut self, instr: &Instruction) -> Result<(), String> { - self.ensure_section()?; - let sec_idx = self.current_section.unwrap(); - let base_offset = self.sections[sec_idx].data.len() as u64; - - // Use the appropriate encoder based on current code mode. - // When the i686 assembler is in .code64 mode, it delegates to - // the x86-64 encoder for 64-bit instruction encoding. - let result = if self.code_mode == 64 && A::default_code_mode() != 64 { - A::encode_instruction_code64(instr, base_offset)? - } else { - A::encode_instruction(instr, base_offset)? 
- }; - let instr_len = result.bytes.len(); - self.sections[sec_idx].data.extend_from_slice(&result.bytes); - - // Register jump for relaxation if detected - if let Some(jump_det) = result.jump { - if let Some(ref label) = self.get_jump_target_label(instr) { - if jump_det.already_short { - // Short-only jumps (jecxz/jcxz/loop) - already short, just need displacement patched - self.sections[sec_idx].jumps.push(JumpInfo { - offset: base_offset as usize, - len: instr_len, - target: label.clone(), - is_conditional: jump_det.is_conditional, - relaxed: true, - }); - } else { - let expected_len = if jump_det.is_conditional { 6 } else { 5 }; - if instr_len == expected_len { - self.sections[sec_idx].jumps.push(JumpInfo { - offset: base_offset as usize, - len: expected_len, - target: label.clone(), - is_conditional: jump_det.is_conditional, - relaxed: false, - }); - } - } - } - } - - // Copy relocations - for reloc in result.relocations { - self.sections[sec_idx].relocations.push(ElfRelocation { - offset: base_offset + reloc.offset, - symbol: reloc.symbol, - reloc_type: reloc.reloc_type, - addend: reloc.addend, - diff_symbol: reloc.diff_symbol, - patch_size: 4, - }); - } - - Ok(()) - } - - fn get_jump_target_label(&self, instr: &Instruction) -> Option { - let mnem = &instr.mnemonic; - let is_jump = mnem == "jmp" || mnem == "loop" - || (mnem.starts_with('j') && mnem.len() >= 2); - if !is_jump { return None; } - if instr.operands.len() != 1 { return None; } - if let Operand::Label(label) = &instr.operands[0] { - Some(label.clone()) - } else { - None - } - } - - // ─── Deferred skip resolution (x86-64 and i686) ────────────────── - - fn resolve_deferred_skips(&mut self) -> Result<(), String> { - let mut skips = std::mem::take(&mut self.deferred_skips); - skips.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1)).reverse()); - - for (sec_idx, offset, expr, fill) in &skips { - // Temporarily insert "." (current position) into label_positions so - // expressions like "0b + 16 - ." 
can reference the directive's offset. - self.label_positions.insert(".".to_string(), (*sec_idx, *offset as u64)); - // Pre-resolve numeric label references (e.g. "0b", "1f") in the expression - let resolved_expr = self.resolve_numeric_labels_in_expr(expr, *offset as u64, *sec_idx); - let val = self.evaluate_expr(&resolved_expr); - self.label_positions.remove("."); - let val = val?; - let count = if val < 0 { 0usize } else { val as usize }; - if count == 0 { continue; } - - let fill_bytes: Vec = vec![*fill; count]; - self.sections[*sec_idx].data.splice(*offset..*offset, fill_bytes); - - // Adjust label positions - for (_, (lsec, loff)) in self.label_positions.iter_mut() { - if *lsec == *sec_idx && (*loff as usize) >= *offset { - *loff += count as u64; - } - } - for (_, positions) in self.numeric_label_positions.iter_mut() { - for (lsec, loff) in positions.iter_mut() { - if *lsec == *sec_idx && (*loff as usize) >= *offset { - *loff += count as u64; - } - } - } - for reloc in self.sections[*sec_idx].relocations.iter_mut() { - if (reloc.offset as usize) >= *offset { - reloc.offset += count as u64; - } - } - for jump in self.sections[*sec_idx].jumps.iter_mut() { - if jump.offset >= *offset { - jump.offset += count; - } - } - for (bsec, boff, _, _, _, _) in self.deferred_byte_diffs.iter_mut() { - if *bsec == *sec_idx && *boff >= *offset { - *boff += count; - } - } - } - Ok(()) - } - - fn resolve_deferred_byte_diffs(&mut self) -> Result<(), String> { - let diffs = std::mem::take(&mut self.deferred_byte_diffs); - for (sec_idx, offset, sym_a, sym_b, size, addend) in &diffs { - let pos_a = self.label_positions.get(sym_a) - .ok_or_else(|| format!("undefined label in .byte diff: {}", sym_a))?; - let pos_b = self.label_positions.get(sym_b) - .ok_or_else(|| format!("undefined label in .byte diff: {}", sym_b))?; - - if pos_a.0 != pos_b.0 { - return Err(format!("cross-section .byte diff: {} - {}", sym_a, sym_b)); - } - - let diff = (pos_a.1 as i64) - (pos_b.1 as i64) + addend; - 
match size { - 1 => { - self.sections[*sec_idx].data[*offset] = diff as u8; - } - 2 => { - let bytes = (diff as i16).to_le_bytes(); - self.sections[*sec_idx].data[*offset] = bytes[0]; - self.sections[*sec_idx].data[*offset + 1] = bytes[1]; - } - _ => unreachable!(), - } - } - Ok(()) - } - - /// Pre-resolve numeric label references (e.g. `0b`, `1f`) in an expression string. - /// - /// GNU as numeric labels like `0:` can be referenced as `0b` (backward) or `0f` - /// (forward). The expression tokenizer doesn't handle these, so we substitute - /// them with their resolved byte offsets before evaluation. - fn resolve_numeric_labels_in_expr(&self, expr: &str, offset: u64, sec_idx: usize) -> String { - let bytes = expr.as_bytes(); - let mut result = String::with_capacity(expr.len()); - let mut i = 0; - while i < bytes.len() { - if bytes[i].is_ascii_digit() { - let start = i; - while i < bytes.len() && bytes[i].is_ascii_digit() { i += 1; } - if i < bytes.len() && (bytes[i] == b'b' || bytes[i] == b'f') - && (i + 1 >= bytes.len() || !bytes[i + 1].is_ascii_alphanumeric()) - { - // This is a numeric label reference like "0b" or "1f" - let label_ref = &expr[start..=i]; - i += 1; - if let Some((_, label_off)) = self.resolve_numeric_label(label_ref, offset, sec_idx) { - result.push_str(&label_off.to_string()); - } else { - // Can't resolve - keep the original text (will error during eval) - result.push_str(label_ref); - } - } else { - // Regular number - result.push_str(&expr[start..i]); - } - } else { - result.push(bytes[i] as char); - i += 1; - } - } - result - } - - // ─── Expression evaluator ───────────────────────────────────────── - - fn evaluate_expr(&self, expr: &str) -> Result { - let expr = expr.trim(); - let tokens = tokenize_expr(expr)?; - let mut pos = 0; - let result = self.parse_expr_or(&tokens, &mut pos)?; - if pos < tokens.len() { - return Err(format!("unexpected token in expression at position {}: {:?}", pos, tokens.get(pos))); - } - Ok(result) - } - - fn 
parse_expr_or(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = self.parse_expr_xor(tokens, pos)?; - while *pos < tokens.len() { - match tokens[*pos] { - ExprToken::Or => { *pos += 1; val |= self.parse_expr_xor(tokens, pos)?; } - _ => break, - } - } - Ok(val) - } - - fn parse_expr_xor(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = self.parse_expr_and(tokens, pos)?; - while *pos < tokens.len() { - match tokens[*pos] { - ExprToken::Xor => { *pos += 1; val ^= self.parse_expr_and(tokens, pos)?; } - _ => break, - } - } - Ok(val) - } - - fn parse_expr_and(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = self.parse_expr_cmp(tokens, pos)?; - while *pos < tokens.len() { - match tokens[*pos] { - ExprToken::And => { *pos += 1; val &= self.parse_expr_cmp(tokens, pos)?; } - _ => break, - } - } - Ok(val) - } - - fn parse_expr_cmp(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = self.parse_expr_add(tokens, pos)?; - while *pos < tokens.len() { - match tokens[*pos] { - ExprToken::Lt => { - *pos += 1; - let rhs = self.parse_expr_add(tokens, pos)?; - val = if val < rhs { -1 } else { 0 }; - } - ExprToken::Gt => { - *pos += 1; - let rhs = self.parse_expr_add(tokens, pos)?; - val = if val > rhs { -1 } else { 0 }; - } - _ => break, - } - } - Ok(val) - } - - fn parse_expr_add(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = self.parse_expr_mul(tokens, pos)?; - while *pos < tokens.len() { - match tokens[*pos] { - ExprToken::Plus => { *pos += 1; val = val.wrapping_add(self.parse_expr_mul(tokens, pos)?); } - ExprToken::Minus => { *pos += 1; val = val.wrapping_sub(self.parse_expr_mul(tokens, pos)?); } - _ => break, - } - } - Ok(val) - } - - fn parse_expr_mul(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - let mut val = self.parse_expr_unary(tokens, pos)?; - while *pos < tokens.len() { - match tokens[*pos] { - ExprToken::Star => { *pos += 1; val = 
val.wrapping_mul(self.parse_expr_unary(tokens, pos)?); } - _ => break, - } - } - Ok(val) - } - - fn parse_expr_unary(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - if *pos < tokens.len() { - match tokens[*pos] { - ExprToken::Minus => { - *pos += 1; - let val = self.parse_expr_unary(tokens, pos)?; - Ok(-val) - } - ExprToken::Plus => { - *pos += 1; - self.parse_expr_unary(tokens, pos) - } - ExprToken::Not => { - *pos += 1; - let val = self.parse_expr_unary(tokens, pos)?; - Ok(!val) - } - _ => self.parse_expr_primary(tokens, pos), - } - } else { - Err("unexpected end of expression".to_string()) - } - } - - fn parse_expr_primary(&self, tokens: &[ExprToken], pos: &mut usize) -> Result { - if *pos >= tokens.len() { - return Err("unexpected end of expression".to_string()); - } - match &tokens[*pos] { - ExprToken::Number(n) => { - *pos += 1; - Ok(*n) - } - ExprToken::Symbol(name) => { - *pos += 1; - if let Some(&(_, offset)) = self.label_positions.get(name.as_str()) { - Ok(offset as i64) - } else { - Err(format!("undefined symbol in expression: {}", name)) - } - } - ExprToken::LParen => { - *pos += 1; - let val = self.parse_expr_or(tokens, pos)?; - if *pos < tokens.len() && tokens[*pos] == ExprToken::RParen { - *pos += 1; - } else { - return Err("missing closing parenthesis".to_string()); - } - Ok(val) - } - other => Err(format!("unexpected token: {:?}", other)), - } - } - - // ─── ELF emission ───────────────────────────────────────────────── - - fn emit_elf(mut self) -> Result, String> { - // Relax long jumps to short form where possible. 
- self.relax_jumps(); - - // Resolve deferred .skip expressions (x86-64 and i686) - if A::supports_deferred_skips() { - self.resolve_deferred_skips()?; - self.resolve_deferred_byte_diffs()?; - } - - // Resolve internal relocations - self.resolve_internal_relocations(); - - // Convert to shared ObjSection/ObjSymbol format - let section_names: Vec = self.sections.iter().map(|s| s.name.clone()).collect(); - - let mut shared_sections: HashMap = HashMap::new(); - for sec in &self.sections { - let mut data = sec.data.clone(); - let mut relocs = Vec::new(); - - for reloc in &sec.relocations { - let (sym_name, mut addend) = if reloc.symbol.starts_with('.') { - if let Some(&(target_sec, target_off)) = self.label_positions.get(&reloc.symbol) { - (section_names[target_sec].clone(), reloc.addend + target_off as i64) - } else { - (reloc.symbol.clone(), reloc.addend) - } - } else { - (reloc.symbol.clone(), reloc.addend) - }; - - // Handle symbol-difference relocations (.long a - b) - if let Some(ref diff_sym) = reloc.diff_symbol { - if let Some(&(_b_sec, b_off)) = self.label_positions.get(diff_sym.as_str()) { - addend += reloc.offset as i64 - b_off as i64; - } - } - - // For REL format (i686): patch addend into section data - if A::uses_rel_format() { - let off = reloc.offset as usize; - if off + 4 <= data.len() { - let existing = i32::from_le_bytes([data[off], data[off+1], data[off+2], data[off+3]]); - let patched = existing.wrapping_add(addend as i32); - data[off..off+4].copy_from_slice(&patched.to_le_bytes()); - } - relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: sym_name, - addend: 0, - }); - } else { - relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: sym_name, - addend, - }); - } - } - - shared_sections.insert(sec.name.clone(), ObjSection { - name: sec.name.clone(), - sh_type: sec.section_type, - sh_flags: sec.flags, - data, - sh_addralign: sec.alignment, - relocs, - comdat_group: 
sec.comdat_group.clone(), - }); - } - - // Convert label positions - let labels: HashMap = self.label_positions.iter() - .map(|(name, &(sec_idx, offset))| { - (name.clone(), (section_names[sec_idx].clone(), offset)) - }) - .collect(); - - let global_symbols: HashMap = self.pending_globals.iter() - .map(|s| (s.clone(), true)) - .collect(); - let weak_symbols: HashMap = self.pending_weaks.iter() - .map(|s| (s.clone(), true)) - .collect(); - - let symbol_types: HashMap = self.pending_types.iter() - .map(|(name, kind)| { - let stt = match kind { - SymbolKind::Function => STT_FUNC, - SymbolKind::Object => STT_OBJECT, - SymbolKind::TlsObject => STT_TLS, - SymbolKind::NoType => STT_NOTYPE, - }; - (name.clone(), stt) - }) - .collect(); - - // Resolve pending_sizes to concrete u64 values - let symbol_sizes: HashMap = self.pending_sizes.iter() - .map(|(name, expr)| { - let size = match expr { - SizeExpr::Constant(v) => *v, - SizeExpr::CurrentMinusSymbol(start_sym) => { - if let Some(&(sec_idx, start_off)) = self.label_positions.get(start_sym) { - let end = self.sections[sec_idx].data.len() as u64; - end - start_off - } else { - 0 - } - } - SizeExpr::SymbolDiff(end_label, start_label) => { - let end_off = self.label_positions.get(end_label).map(|p| p.1).unwrap_or(0); - let start_off = self.label_positions.get(start_label).map(|p| p.1).unwrap_or(0); - end_off.wrapping_sub(start_off) - } - SizeExpr::SymbolRef(sym_ref) => { - if let Some(alias_target) = self.aliases.get(sym_ref) { - let normalized = alias_target.replace(' ', ""); - if let Some(rest) = normalized.strip_prefix(".-") { - if let Some(&(sec_idx, start_off)) = self.label_positions.get(rest) { - let end = self.sections[sec_idx].data.len() as u64; - end - start_off - } else { 0 } - } else { 0 } - } else { 0 } - } - }; - (name.clone(), size) - }) - .collect(); - - let mut symbol_visibility: HashMap = HashMap::new(); - for name in &self.pending_hidden { - symbol_visibility.insert(name.clone(), STV_HIDDEN); - } - for name 
in &self.pending_protected { - symbol_visibility.insert(name.clone(), STV_PROTECTED); - } - for name in &self.pending_internal { - symbol_visibility.insert(name.clone(), STV_INTERNAL); - } - - let symtab_input = SymbolTableInput { - labels: &labels, - global_symbols: &global_symbols, - weak_symbols: &weak_symbols, - symbol_types: &symbol_types, - symbol_sizes: &symbol_sizes, - symbol_visibility: &symbol_visibility, - aliases: &self.aliases, - sections: &shared_sections, - include_referenced_locals: false, - }; - - let mut shared_symbols = elf_mod::build_elf_symbol_table(&symtab_input); - - // Add COMMON symbols - for sym in &self.symbols { - if sym.is_common { - shared_symbols.retain(|s| !(s.name == sym.name && s.section_name == "*UND*")); - shared_symbols.push(ObjSymbol { - name: sym.name.clone(), - value: sym.common_align as u64, - size: sym.size, - binding: sym.binding, - sym_type: sym.sym_type, - visibility: sym.visibility, - section_name: "*COM*".to_string(), - }); - } - } - - let config = ElfConfig { - e_machine: A::elf_machine(), - e_flags: A::elf_flags(), - elf_class: A::elf_class(), - force_rela: false, - }; - - elf_mod::write_relocatable_object( - &config, - §ion_names, - &shared_sections, - &shared_symbols, - ) - } - - // ─── Numeric label resolution ───────────────────────────────────── - - fn resolve_numeric_label(&self, symbol: &str, reloc_offset: u64, sec_idx: usize) -> Option<(usize, u64)> { - let len = symbol.len(); - if len < 2 { return None; } - let suffix = symbol.as_bytes()[len - 1]; - if suffix != b'b' && suffix != b'f' { return None; } - let label_num = &symbol[..len - 1]; - if !label_num.chars().all(|c| c.is_ascii_digit()) { return None; } - - let positions = self.numeric_label_positions.get(label_num)?; - if suffix == b'b' { - let mut best: Option<(usize, u64)> = None; - for &(s_idx, off) in positions { - if s_idx == sec_idx && off <= reloc_offset - && (best.is_none() || off > best.unwrap().1) - { - best = Some((s_idx, off)); - } - } - best 
- } else { - let mut best: Option<(usize, u64)> = None; - for &(s_idx, off) in positions { - if s_idx == sec_idx && off > reloc_offset - && (best.is_none() || off < best.unwrap().1) - { - best = Some((s_idx, off)); - } - } - best - } - } - - // ─── Jump relaxation ────────────────────────────────────────────── - - fn relax_jumps(&mut self) { - for sec_idx in 0..self.sections.len() { - if self.sections[sec_idx].jumps.is_empty() { - continue; - } - - // Iterative relaxation until convergence - loop { - let mut any_relaxed = false; - let mut local_labels: HashMap = HashMap::new(); - for (name, &(s_idx, offset)) in &self.label_positions { - if s_idx == sec_idx { - local_labels.insert(name.clone(), offset as usize); - } - } - - let mut to_relax: Vec = Vec::new(); - for (j_idx, jump) in self.sections[sec_idx].jumps.iter().enumerate() { - if jump.relaxed { continue; } - let target_off_opt = local_labels.get(&jump.target).copied() - .or_else(|| { - self.resolve_numeric_label(&jump.target, jump.offset as u64, sec_idx) - .map(|(_, off)| off as usize) - }); - if let Some(target_off) = target_off_opt { - let short_end = jump.offset as i64 + 2; - let disp = target_off as i64 - short_end; - if (-128..=127).contains(&disp) { - to_relax.push(j_idx); - } - } - } - - if to_relax.is_empty() { break; } - - // Process from back to front so offsets stay valid - to_relax.sort_unstable(); - to_relax.reverse(); - - for &j_idx in &to_relax { - let jump = &self.sections[sec_idx].jumps[j_idx]; - let offset = jump.offset; - let old_len = jump.len; - let is_conditional = jump.is_conditional; - let new_len = 2usize; - let shrink = old_len - new_len; - - // Rewrite instruction bytes - let data = &mut self.sections[sec_idx].data; - if is_conditional { - let cc = data[offset + 1] - 0x80; - data[offset] = 0x70 + cc; - data[offset + 1] = 0; - } else { - data[offset] = 0xEB; - data[offset + 1] = 0; - } - - let remove_start = offset + new_len; - let remove_end = offset + old_len; - 
data.drain(remove_start..remove_end); - - // Update label positions - for (_, pos) in self.label_positions.iter_mut() { - if pos.0 == sec_idx && (pos.1 as usize) > offset { - pos.1 -= shrink as u64; - } - } - for (_, positions) in self.numeric_label_positions.iter_mut() { - for pos in positions.iter_mut() { - if pos.0 == sec_idx && (pos.1 as usize) > offset { - pos.1 -= shrink as u64; - } - } - } - - // Update relocations: remove the one for this jump, shift others - self.sections[sec_idx].relocations.retain_mut(|reloc| { - let reloc_off = reloc.offset as usize; - let old_reloc_pos = if is_conditional { offset + 2 } else { offset + 1 }; - if reloc_off == old_reloc_pos { - return false; - } - if reloc_off > offset { - reloc.offset -= shrink as u64; - } - true - }); - - // Update other jump offsets - for other_jump in self.sections[sec_idx].jumps.iter_mut() { - if other_jump.offset > offset { - other_jump.offset -= shrink; - } - } - - // Update alignment markers - for marker in self.sections[sec_idx].align_markers.iter_mut() { - if marker.offset > offset { - marker.offset -= shrink; - } - } - - // Update deferred skip offsets - for (s_idx, s_off, _, _) in self.deferred_skips.iter_mut() { - if *s_idx == sec_idx && *s_off > offset { - *s_off -= shrink; - } - } - - // Update deferred byte diff offsets - for (s_idx, s_off, _, _, _, _) in self.deferred_byte_diffs.iter_mut() { - if *s_idx == sec_idx && *s_off > offset { - *s_off -= shrink; - } - } - - self.sections[sec_idx].jumps[j_idx].relaxed = true; - self.sections[sec_idx].jumps[j_idx].len = new_len; - any_relaxed = true; - } - - if !any_relaxed { break; } - } - - // Post-relaxation fixup for alignment/org markers - self.fixup_alignment_markers(sec_idx); - - // Resolve short jump displacements - let mut local_labels: HashMap = HashMap::new(); - for (name, &(s_idx, offset)) in &self.label_positions { - if s_idx == sec_idx { - local_labels.insert(name.clone(), offset as usize); - } - } - - let patches: Vec<(usize, u8)> = 
self.sections[sec_idx].jumps.iter() - .filter(|j| j.relaxed) - .filter_map(|jump| { - let target = local_labels.get(&jump.target).copied() - .or_else(|| { - self.resolve_numeric_label(&jump.target, jump.offset as u64, sec_idx) - .map(|(_, off)| off as usize) - }); - target.map(|target_off| { - let end_of_instr = jump.offset + 2; - let disp = (target_off as i64 - end_of_instr as i64) as i8; - (jump.offset + 1, disp as u8) - }) - }) - .collect(); - - for (off, byte) in patches { - self.sections[sec_idx].data[off] = byte; - } - } - } - - fn fixup_alignment_markers(&mut self, sec_idx: usize) { - if self.sections[sec_idx].align_markers.is_empty() { - return; - } - - // Sort by offset to ensure front-to-back processing - self.sections[sec_idx].align_markers.sort_by_key(|m| m.offset); - - let is_exec = self.sections[sec_idx].flags & SHF_EXECINSTR != 0; - let fill_byte = if is_exec { 0x90u8 } else { 0u8 }; - - let mut marker_idx = 0; - loop { - if marker_idx >= self.sections[sec_idx].align_markers.len() { - break; - } - let current_offset = self.sections[sec_idx].align_markers[marker_idx].offset; - let kind = self.sections[sec_idx].align_markers[marker_idx].kind.clone(); - - let needed_end = match &kind { - AlignMarkerKind::Align(align) => { - let a = *align as usize; - if a <= 1 { marker_idx += 1; continue; } - (current_offset + a - 1) & !(a - 1) - } - AlignMarkerKind::Org { label, addend } => { - if label.is_empty() { - *addend as usize - } else if let Some(&(l_sec, l_off)) = self.label_positions.get(label.as_str()) { - if l_sec == sec_idx { - (l_off as i64 + *addend) as usize - } else { - marker_idx += 1; continue; - } - } else { - marker_idx += 1; continue; - } - } - }; - - let needed_padding = needed_end.saturating_sub(current_offset); - let existing_padding = self.sections[sec_idx].align_markers[marker_idx].padding; - - if needed_padding > existing_padding { - let insert_at = current_offset + existing_padding; - let extra = needed_padding - existing_padding; - let 
insert_bytes = vec![fill_byte; extra]; - self.sections[sec_idx].data.splice(insert_at..insert_at, insert_bytes); - self.shift_offsets_after(sec_idx, insert_at, extra as i64, marker_idx); - } else if needed_padding < existing_padding { - let remove_count = existing_padding - needed_padding; - let remove_start = current_offset + needed_padding; - let remove_end = remove_start + remove_count; - self.sections[sec_idx].data.drain(remove_start..remove_end); - self.shift_offsets_after(sec_idx, remove_start, -(remove_count as i64), marker_idx); - } - - marker_idx += 1; - } - } - - /// Shift all labels, relocations, jumps, alignment markers, deferred skips, - /// and deferred byte diffs in a section after an insertion or removal at `at_offset`. - fn shift_offsets_after(&mut self, sec_idx: usize, at_offset: usize, delta: i64, current_marker_idx: usize) { - if delta == 0 { return; } - for (_, pos) in self.label_positions.iter_mut() { - if pos.0 == sec_idx && (pos.1 as usize) >= at_offset { - pos.1 = (pos.1 as i64 + delta) as u64; - } - } - for (_, positions) in self.numeric_label_positions.iter_mut() { - for pos in positions.iter_mut() { - if pos.0 == sec_idx && (pos.1 as usize) >= at_offset { - pos.1 = (pos.1 as i64 + delta) as u64; - } - } - } - for reloc in self.sections[sec_idx].relocations.iter_mut() { - if (reloc.offset as usize) >= at_offset { - reloc.offset = (reloc.offset as i64 + delta) as u64; - } - } - for jump in self.sections[sec_idx].jumps.iter_mut() { - if jump.offset >= at_offset { - jump.offset = (jump.offset as i64 + delta) as usize; - } - } - for i in (current_marker_idx + 1)..self.sections[sec_idx].align_markers.len() { - if self.sections[sec_idx].align_markers[i].offset >= at_offset { - self.sections[sec_idx].align_markers[i].offset = - (self.sections[sec_idx].align_markers[i].offset as i64 + delta) as usize; - } - } - // Update deferred skips and byte diffs - for (skip_sec, skip_off, _, _) in self.deferred_skips.iter_mut() { - if *skip_sec == sec_idx && 
*skip_off >= at_offset { - *skip_off = (*skip_off as i64 + delta) as usize; - } - } - for (bd_sec, bd_off, _, _, _, _) in self.deferred_byte_diffs.iter_mut() { - if *bd_sec == sec_idx && *bd_off >= at_offset { - *bd_off = (*bd_off as i64 + delta) as usize; - } - } - } - - // ─── Symbol locality check ──────────────────────────────────────── - - fn is_local_symbol(&self, name: &str) -> bool { - if name.starts_with('.') { return true; } - if name.len() >= 2 { - let last = name.as_bytes()[name.len() - 1]; - if (last == b'f' || last == b'b') && name[..name.len()-1].chars().all(|c| c.is_ascii_digit()) { - return true; - } - } - if let Some(&sym_idx) = self.symbol_map.get(name) { - self.symbols[sym_idx].binding == STB_LOCAL - } else { - false - } - } - - // ─── Internal relocation resolution ─────────────────────────────── - - fn resolve_internal_relocations(&mut self) { - for sec_idx in 0..self.sections.len() { - let mut resolved: Vec<(usize, i64, usize)> = Vec::new(); // (offset, value, patch_size) - let mut pc8_patches: Vec<(usize, u8)> = Vec::new(); - let mut unresolved = Vec::new(); - - for reloc in &self.sections[sec_idx].relocations { - // Handle SymbolDiff relocations - if reloc.diff_symbol.is_some() { - if let Some(ref diff_sym) = reloc.diff_symbol { - if let (Some(&(a_sec, a_off)), Some(&(b_sec, b_off))) = ( - self.label_positions.get(&reloc.symbol), - self.label_positions.get(diff_sym), - ) { - if a_sec == b_sec { - let val = a_off as i64 - b_off as i64; - resolved.push((reloc.offset as usize, val, reloc.patch_size as usize)); - continue; - } - } - } - unresolved.push(reloc.clone()); - continue; - } - - let label_pos = self.label_positions.get(&reloc.symbol).copied() - .or_else(|| self.resolve_numeric_label(&reloc.symbol, reloc.offset, sec_idx)); - - if let Some((target_sec, target_off)) = label_pos { - let is_local = self.is_local_symbol(&reloc.symbol); - - // Handle PC8 internal relocations (x86-64 loop/jrcxz) - if let Some(pc8_type) = 
A::reloc_pc8_internal() { - if reloc.reloc_type == pc8_type && target_sec == sec_idx { - let rel = (target_off as i64) + reloc.addend - (reloc.offset as i64); - if (-128..=127).contains(&rel) { - pc8_patches.push((reloc.offset as usize, rel as u8)); - } - continue; - } - } - - if target_sec == sec_idx && is_local - && (reloc.reloc_type == A::reloc_pc32() || reloc.reloc_type == A::reloc_plt32()) - { - let rel = (target_off as i64) + reloc.addend - (reloc.offset as i64); - resolved.push((reloc.offset as usize, rel, reloc.patch_size as usize)); - } else if let Some(abs32_type) = A::reloc_abs32_for_internal() { - if target_sec == sec_idx && is_local && reloc.reloc_type == abs32_type { - let val = (target_off as i64) + reloc.addend; - resolved.push((reloc.offset as usize, val, reloc.patch_size as usize)); - } else { - unresolved.push(reloc.clone()); - } - } else { - unresolved.push(reloc.clone()); - } - } else { - unresolved.push(reloc.clone()); - } - } - - // Patch resolved relocations into section data - for (offset, value, psz) in resolved { - if psz == 1 { - self.sections[sec_idx].data[offset] = value as u8; - } else if psz == 2 { - let bytes = (value as i16).to_le_bytes(); - self.sections[sec_idx].data[offset..offset + 2].copy_from_slice(&bytes); - } else { - let bytes = (value as i32).to_le_bytes(); - self.sections[sec_idx].data[offset..offset + 4].copy_from_slice(&bytes); - } - } - for (offset, value) in pc8_patches { - self.sections[sec_idx].data[offset] = value; - } - - self.sections[sec_idx].relocations = unresolved; - } - } -} diff --git a/src/backend/f128_softfloat.rs b/src/backend/f128_softfloat.rs deleted file mode 100644 index 348cf5c37b..0000000000 --- a/src/backend/f128_softfloat.rs +++ /dev/null @@ -1,786 +0,0 @@ -//! Shared F128 (IEEE 754 binary128) soft-float orchestration for ARM and RISC-V. -//! -//! ARM and RISC-V both lack hardware quad-precision FP, so all F128 operations -//! go through compiler-rt/libgcc soft-float library calls. 
The orchestration -//! logic (load operand, save to temp, shuffle args, call libcall, convert result) -//! is identical between the two; only the register names, instruction mnemonics, -//! and F128 register representation differ: -//! -//! - **ARM**: F128 lives in a single NEON Q register (q0/q1). Moving between -//! arg positions is `mov v1.16b, v0.16b`. Sign bit flip uses `mov`+`eor`+`mov` -//! on the high lane. -//! - **RISC-V**: F128 lives in a GP register pair (a0:a1 / a2:a3). Moving between -//! arg positions is `mv a2, a0; mv a3, a1`. Sign bit flip uses `li`+`slli`+`xor` -//! on the high register. -//! -//! The `F128SoftFloat` trait captures arch-specific primitives, and the `f128_*` -//! free functions implement the shared orchestration once. This covers: -//! -//! - **Operand loading** (`f128_operand_to_arg1`): load F128 with full precision -//! - **Store/load dispatch** (`f128_emit_store`, `f128_emit_load`, etc.): the -//! SlotAddr 4-way dispatch for F128 store/load/store_with_offset/load_with_offset -//! - **Cast dispatch** (`f128_emit_cast`): int<->F128 and float<->F128 casts -//! - **Binop dispatch** (`f128_emit_binop`): F128 arithmetic via libcalls -//! - **Comparison** (`f128_cmp`): F128 comparison via libcalls -//! - **Negation** (`f128_neg`): sign bit flip - -use crate::ir::reexports::{ - IrCmpOp, - IrConst, - Operand, - Value, -}; -use crate::common::types::IrType; -use crate::backend::state::{StackSlot, SlotAddr}; -use crate::backend::cast::FloatOp; - -/// Arch-specific primitives for F128 soft-float operations. -/// -/// Each method emits a small sequence of instructions (1-5 lines) specific to -/// the architecture. The shared orchestration functions call these in the right -/// order to implement full-precision F128 loads, stores, arithmetic, and comparisons. -pub trait F128SoftFloat { - // --- State access --- - - /// Access the codegen state (for emit, get_slot, get_f128_source, etc.). 
- fn state(&mut self) -> &mut crate::backend::state::CodegenState; - - /// Get the stack slot for a value (delegates to state().get_slot). - fn f128_get_slot(&self, val_id: u32) -> Option; - - /// Get the f128 load source tracking for a value. - fn f128_get_source(&self, val_id: u32) -> Option<(u32, i64, bool)>; - - /// Resolve a value's slot address (Direct/Indirect/OverAligned). - fn f128_resolve_slot_addr(&self, val_id: u32) -> Option; - - // --- Loading f128 constants --- - - /// Load an f128 constant (given as lo:hi u64 halves) into the first argument - /// position (ARM: q0 via x0/x1+fmov; RISC-V: a0:a1 via li). - fn f128_load_const_to_arg1(&mut self, lo: u64, hi: u64); - - // --- Loading f128 from memory --- - - /// Load f128 (16 bytes) from an indirect source into arg1 position. - /// The address register (ARM: x17, RISC-V: t5) already points to the data. - fn f128_load_16b_from_addr_reg_to_arg1(&mut self); - - /// Load f128 from a slot at the given frame-relative offset into arg1 position. - /// (ARM: `ldr q0, [sp, #offset]`; RISC-V: `ld a0, offset(s0); ld a1, offset+8(s0)`) - fn f128_load_from_frame_offset_to_arg1(&mut self, offset: i64); - - // --- Address computation --- - - /// Load a pointer from a slot into the address register (ARM: x17, RISC-V: t5). - /// For allocas, computes the address of the alloca; for non-allocas, loads the pointer value. - fn f128_load_ptr_to_addr_reg(&mut self, slot: StackSlot, val_id: u32); - - /// Add an offset to the address register (ARM: x17, RISC-V: t5). - fn f128_add_offset_to_addr_reg(&mut self, offset: i64); - - /// Compute the aligned address for an over-aligned alloca into the address register. - fn f128_alloca_aligned_addr(&mut self, slot: StackSlot, val_id: u32); - - // --- Fallback: f64 -> f128 conversion --- - - /// Load an operand as an f64 bit pattern into the accumulator register - /// (ARM: operand_to_x0; RISC-V: operand_to_t0), then convert to f128 via - /// __extenddftf2. 
After this call, arg1 holds the f128 value. - fn f128_load_operand_and_extend(&mut self, op: &Operand); - - // --- Arg shuffling --- - - /// Move the f128 value from arg1 position to arg2 position. - /// (ARM: `mov v1.16b, v0.16b`; RISC-V: `mv a2, a0; mv a3, a1`) - fn f128_move_arg1_to_arg2(&mut self); - - /// Save the f128 in arg1 to the stack pointer (16 bytes at sp). - /// Used as temp storage when we need to load both operands. - /// (ARM: `str q0, [sp]`; RISC-V: `sd a0, 0(sp); sd a1, 8(sp)`) - fn f128_save_arg1_to_sp(&mut self); - - /// Reload the f128 from the stack pointer back into arg1. - /// (ARM: `ldr q0, [sp]`; RISC-V: `ld a0, 0(sp); ld a1, 8(sp)`) - fn f128_reload_arg1_from_sp(&mut self); - - // --- Stack temp allocation --- - - /// Allocate temp stack space (16 bytes). (ARM: `sub sp, sp, #16`; RISC-V: `addi sp, sp, -16`) - fn f128_alloc_temp_16(&mut self); - - /// Free temp stack space (16 bytes). (ARM: `add sp, sp, #16`; RISC-V: `addi sp, sp, 16`) - fn f128_free_temp_16(&mut self); - - // --- Calls --- - - /// Emit a call to a named library function. - /// (ARM: `bl `; RISC-V: `call `) - fn f128_call(&mut self, name: &str); - - // --- Result handling --- - - /// Convert f128 result (in arg1) to f64 approximation and move to accumulator. - /// Calls __trunctfdf2, then moves the f64 from float reg to GP acc. - /// (ARM: `bl __trunctfdf2; fmov x0, d0`; RISC-V: `call __trunctfdf2; fmv.x.d t0, fa0`) - fn f128_truncate_result_to_acc(&mut self); - - // --- F128 store to slot --- - - /// Store f128 constant halves (lo, hi) directly to a stack slot. - /// (ARM: load imm + str; RISC-V: li + sd) - fn f128_store_const_halves_to_slot(&mut self, lo: u64, hi: u64, slot: StackSlot); - - /// Store f128 from arg1 to a stack slot (16 bytes). - /// (ARM: `str q0, [sp, #offset]`; RISC-V: `sd a0, offset(s0); sd a1, offset+8(s0)`) - fn f128_store_arg1_to_slot(&mut self, slot: StackSlot); - - /// Load f128 (16 bytes) from a source slot at offset, store to dest slot. 
- /// This is a direct memory-to-memory copy (load then store, 16 bytes). - fn f128_copy_slot_to_slot(&mut self, src_offset: i64, dest_slot: StackSlot); - - /// Load f128 from addr_reg, store to dest slot. - fn f128_copy_addr_reg_to_slot(&mut self, dest_slot: StackSlot); - - // --- F128 store to address --- - - /// Store f128 constant halves to address in addr_reg. - fn f128_store_const_halves_to_addr(&mut self, lo: u64, hi: u64); - - /// Save addr_reg to a scratch register before potentially clobbering it. - fn f128_save_addr_reg(&mut self); - - /// Load f128 from source slot offset, store to saved addr. - fn f128_copy_slot_to_saved_addr(&mut self, src_offset: i64); - - /// Load f128 from addr_reg (source), store to saved addr (dest). - /// The source address is in addr_reg, the dest address was saved by f128_save_addr_reg. - fn f128_copy_addr_reg_to_saved_addr(&mut self); - - /// Store the f128 in arg1 to the saved address (from f128_save_addr_reg). - fn f128_store_arg1_to_saved_addr(&mut self); - - // --- F128 negation --- - - /// Flip the sign bit of the f128 in arg1. - /// (ARM: `mov x0, v0.d[1]; eor x0, x0, #0x80...; mov v0.d[1], x0`) - /// (RISC-V: `li t0, 1; slli t0, t0, 63; xor a1, a1, t0`) - fn f128_flip_sign_bit(&mut self); - - // --- Comparison result mapping --- - - /// Map a comparison libcall result to a boolean in the accumulator. - /// ARM uses `cmp w0, #0; cset x0, `. - /// RISC-V uses seqz/snez/slti/slt/xori sequences. - fn f128_cmp_result_to_bool(&mut self, kind: crate::backend::cast::F128CmpKind); - - // --- Result store --- - - /// Store the accumulator to dest (ARM: store_x0_to; RISC-V: store_t0_to). - fn f128_store_acc_to_dest(&mut self, dest: &Value); - - /// Track that dest has full f128 data in its own slot (for subsequent loads). - fn f128_track_self(&mut self, dest_id: u32); - - /// Set the accumulator cache to hold the given value (without writing to slot). 
- fn f128_set_acc_cache(&mut self, dest_id: u32); - - /// Has dynamic alloca flag (needed for ARM's SP-relative addressing workaround). - /// Returns the current value and sets it to the given value. - fn f128_set_dyn_alloca(&mut self, val: bool) -> bool; - - // --- Store/Load dispatch primitives (used by shared emit_store/emit_load) --- - - /// Move a callee-saved register value into the address register (ARM: x17, RISC-V: t5). - /// Called when a pointer is register-allocated. - fn f128_move_callee_reg_to_addr_reg(&mut self, val_id: u32) -> bool; - - /// Move the computed aligned address into the address register. - /// ARM needs `mov x17, x9` because x9 is the alloca addr register and x17 is the - /// F128 addr register. RISC-V uses t5 for both, so this is a no-op. - fn f128_move_aligned_to_addr_reg(&mut self) {} - - /// Load a pointer from a (non-alloca) slot into the address register. - /// This differs from `f128_load_ptr_to_addr_reg` in that it always - /// loads the pointer value, never computes an alloca address. - fn f128_load_indirect_ptr_to_addr_reg(&mut self, slot: StackSlot, val_id: u32); - - /// Load f128 from addr_reg, convert to f64 approx, store to dest. - /// This is the "load through pointer" path: load 16 bytes from the address - /// register, call __trunctfdf2, move result to accumulator, store to dest. - fn f128_load_from_addr_reg_to_acc(&mut self, dest: &Value); - - /// Load f128 from a direct alloca slot, convert to f64 approx, store to accumulator. - fn f128_load_from_direct_slot_to_acc(&mut self, slot: StackSlot); - - /// Store arg1 (f128 result) to dest slot and produce f64 approximation. - /// This is the common epilogue for cast/binop results: store full f128 to - /// dest slot, track self, call __trunctfdf2, update cache. Does NOT store - /// f64 back to the slot (that would overwrite the full-precision f128). 
- fn f128_store_result_and_truncate(&mut self, dest: &Value); - - /// Load the accumulator value and move it to the first integer argument register. - /// (ARM: already in x0; RISC-V: `mv a0, t0`) - fn f128_move_acc_to_arg0(&mut self); - - /// Move the f128 return value from arg1 to accumulator as f64 result. - /// (ARM: result already in x0 from __fixtfdi; RISC-V: `mv t0, a0`) - fn f128_move_arg0_to_acc(&mut self); - - /// Load an operand into the accumulator. - /// (ARM: operand_to_x0; RISC-V: operand_to_t0) - fn f128_load_operand_to_acc(&mut self, op: &Operand); - - /// Sign-extend a sub-64-bit signed integer in the accumulator. - /// (ARM: sxtb/sxth/sxtw; RISC-V: slli+srai/sext.w) - fn f128_sign_extend_acc(&mut self, from_size: usize); - - /// Zero-extend a sub-64-bit unsigned integer in the accumulator. - /// (ARM: and/mov w0,w0; RISC-V: andi/slli+srli) - fn f128_zero_extend_acc(&mut self, from_size: usize); - - /// Narrow the accumulator to a smaller integer type using emit_cast_instrs. - fn f128_narrow_acc(&mut self, to_ty: IrType); - - /// Move a float value from the accumulator to the float argument register - /// and extend it from F32 to F128 or F64 to F128. - /// (ARM: `fmov s0/d0, w0/x0; bl __extendsftf2/__extenddftf2`) - /// (RISC-V: `fmv.w.x/fmv.d.x fa0, t0; call __extendsftf2/__extenddftf2`) - fn f128_extend_float_to_f128(&mut self, from_ty: IrType); - - /// Convert an F128 in arg1 to F32 or F64 and move to accumulator. - /// (ARM: `bl __trunctfsf2; fmov w0, s0` or `bl __trunctfdf2; fmov x0, d0`) - /// (RISC-V: `call __trunctfsf2; fmv.x.w t0, fa0` or `call __trunctfdf2; fmv.x.d t0, fa0`) - fn f128_truncate_to_float_acc(&mut self, to_ty: IrType); - - /// Check if this backend has the `is_alloca` method accessible. - /// Both ARM and RISC-V do, so this just delegates. 
- fn f128_is_alloca(&self, val_id: u32) -> bool; -} - -// ============================================================================= -// Shared orchestration functions -// ============================================================================= - -/// Load an F128 operand into the first argument position with full precision. -/// -/// Three paths: -/// 1. **Constant**: load f128 bytes directly as lo:hi. -/// 2. **Tracked value**: load full 16-byte f128 from the original memory location. -/// 3. **Fallback**: load f64 approximation and extend via __extenddftf2. -pub fn f128_operand_to_arg1(cg: &mut T, op: &Operand) { - // Path 1: F128 constant with full-precision f128 bytes. - if let Operand::Const(IrConst::LongDouble(_, f128_bytes)) = op { - let lo = u64::from_le_bytes(f128_bytes[0..8].try_into().unwrap()); - let hi = u64::from_le_bytes(f128_bytes[8..16].try_into().unwrap()); - cg.f128_load_const_to_arg1(lo, hi); - // The accumulator register (ARM: x0) was clobbered with constant data. - // Invalidate the cache so subsequent loads don't get a stale hit. - cg.state().reg_cache.invalidate_all(); - return; - } - - // Path 2: Value with tracked f128 source (preserves full precision). 
- if let Operand::Value(v) = op { - if let Some((src_id, offset, is_indirect)) = cg.f128_get_source(v.0) { - if is_indirect { - if let Some(slot) = cg.f128_get_slot(src_id) { - cg.f128_load_ptr_to_addr_reg(slot, src_id); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_load_16b_from_addr_reg_to_arg1(); - return; - } - } else { - let addr = cg.f128_resolve_slot_addr(src_id); - if let Some(addr) = addr { - match addr { - SlotAddr::Direct(slot) | SlotAddr::Indirect(slot) => { - let effective = slot.0 + offset; - cg.f128_load_from_frame_offset_to_arg1(effective); - } - SlotAddr::OverAligned(slot, id) => { - cg.f128_alloca_aligned_addr(slot, id); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_load_16b_from_addr_reg_to_arg1(); - } - } - return; - } - } - } - } - - // Path 3: Fallback - load f64 approximation and convert to f128. - cg.f128_load_operand_and_extend(op); -} - -/// Store an F128 value to a direct stack slot. -/// -/// Three paths: -/// 1. **Constant**: store lo:hi halves directly. -/// 2. **Tracked value**: copy 16 bytes from the tracked source. -/// 3. **Fallback**: convert f64 to f128 via __extenddftf2, store result. -pub fn f128_store_to_slot(cg: &mut T, val: &Operand, slot: StackSlot) { - // Path 1: F128 constant. - if let Some((lo, hi)) = crate::backend::cast::f128_const_halves(val) { - cg.f128_store_const_halves_to_slot(lo, hi, slot); - // The accumulator register was clobbered with constant data. - cg.state().reg_cache.invalidate_all(); - return; - } - - // Path 2: Tracked value with full f128 source. 
- if let Operand::Value(v) = val { - if let Some((src_id, offset, is_indirect)) = cg.f128_get_source(v.0) { - if is_indirect { - if let Some(src_slot) = cg.f128_get_slot(src_id) { - cg.f128_load_ptr_to_addr_reg(src_slot, src_id); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_copy_addr_reg_to_slot(slot); - return; - } - } else if let Some(src_slot) = cg.f128_get_slot(src_id) { - let src_off = src_slot.0 + offset; - cg.f128_copy_slot_to_slot(src_off, slot); - return; - } - } - } - - // Path 3: Fallback - extend f64 to f128, store result. - cg.f128_load_operand_and_extend(val); - cg.f128_store_arg1_to_slot(slot); - cg.state().reg_cache.invalidate_all(); -} - -/// Store an F128 value to the address in the addr register (ARM: x17, RISC-V: t5). -/// -/// Three paths: -/// 1. **Constant**: store lo:hi halves directly to addr. -/// 2. **Tracked value**: copy 16 bytes from tracked source to addr. -/// 3. **Fallback**: convert f64 to f128, store result to saved addr. -pub fn f128_store_to_addr_reg(cg: &mut T, val: &Operand) { - // Path 1: F128 constant. - if let Some((lo, hi)) = crate::backend::cast::f128_const_halves(val) { - cg.f128_store_const_halves_to_addr(lo, hi); - // The accumulator register was clobbered with constant data. - cg.state().reg_cache.invalidate_all(); - return; - } - - // Path 2: Tracked value with full f128 source. 
- if let Operand::Value(v) = val { - if let Some((src_id, offset, is_indirect)) = cg.f128_get_source(v.0) { - if is_indirect { - if let Some(src_slot) = cg.f128_get_slot(src_id) { - cg.f128_save_addr_reg(); - cg.f128_load_ptr_to_addr_reg(src_slot, src_id); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_copy_addr_reg_to_saved_addr(); - return; - } - } else if let Some(src_slot) = cg.f128_get_slot(src_id) { - let src_off = src_slot.0 + offset; - cg.f128_save_addr_reg(); - cg.f128_copy_slot_to_saved_addr(src_off); - return; - } - } - } - - // Path 3: Fallback - save addr, convert f64 to f128, store to saved addr. - cg.f128_save_addr_reg(); - cg.f128_load_operand_and_extend(val); - cg.f128_store_arg1_to_saved_addr(); - cg.state().reg_cache.invalidate_all(); -} - -/// Negate an F128 value with full precision by flipping the IEEE 754 sign bit. -/// -/// 1. Load full f128 into arg1. -/// 2. XOR the sign bit (bit 127). -/// 3. Store full f128 result to dest slot. -/// 4. Convert to f64 approximation for register-based data flow. -pub fn f128_neg(cg: &mut T, dest: &Value, src: &Operand) { - // Step 1: Load full-precision f128 into arg1. - f128_operand_to_arg1(cg, src); - // Step 2: Flip the sign bit. - cg.f128_flip_sign_bit(); - // Step 3: Store full f128 result to dest slot. - if let Some(dest_slot) = cg.f128_get_slot(dest.0) { - cg.f128_store_arg1_to_slot(dest_slot); - cg.f128_track_self(dest.0); - } - // Step 4: Convert to f64 approximation in accumulator. - cg.f128_truncate_result_to_acc(); - cg.state().reg_cache.invalidate_all(); - cg.f128_set_acc_cache(dest.0); -} - -/// F128 comparison via soft-float libcalls with full precision. -/// -/// 1. Load LHS f128 into arg1, save to stack temp. -/// 2. Load RHS f128 into arg1, move to arg2. -/// 3. Reload LHS from stack temp into arg1. -/// 4. Call comparison libcall. -/// 5. Map result to boolean in accumulator. 
-pub fn f128_cmp( - cg: &mut T, - dest: &Value, - op: IrCmpOp, - lhs: &Operand, - rhs: &Operand, -) { - // Force frame-pointer-relative addressing during temp allocation - // (ARM needs this because sub sp breaks sp-relative slot addressing). - let saved = cg.f128_set_dyn_alloca(true); - - // Step 1: Allocate temp, load LHS, save to sp. - cg.f128_alloc_temp_16(); - f128_operand_to_arg1(cg, lhs); - cg.f128_save_arg1_to_sp(); - - // Step 2: Load RHS, move to arg2. - f128_operand_to_arg1(cg, rhs); - cg.f128_move_arg1_to_arg2(); - - // Step 3: Reload LHS from sp into arg1. - cg.f128_reload_arg1_from_sp(); - - // Step 4: Free temp, restore dyn_alloca flag. - cg.f128_free_temp_16(); - cg.f128_set_dyn_alloca(saved); - - // Step 5: Call comparison libcall and map result. - let (libcall, kind) = crate::backend::cast::f128_cmp_libcall(op); - cg.f128_call(libcall); - cg.f128_cmp_result_to_bool(kind); - - cg.state().reg_cache.invalidate_all(); - cg.f128_store_acc_to_dest(dest); -} - -// ============================================================================= -// Shared store/load dispatch orchestration -// ============================================================================= - -/// F128 store dispatch: resolve the pointer's SlotAddr and store 16 bytes. -/// -/// Handles four cases: register-allocated pointer, Direct alloca, OverAligned -/// alloca, and Indirect (non-alloca pointer in slot). Each case resolves to -/// either a direct slot store or an address-register store. -pub fn f128_emit_store( - cg: &mut T, - val: &Operand, - ptr: &Value, -) { - let is_indirect = !cg.f128_is_alloca(ptr.0); - - // Check if the pointer lives in a callee-saved register. 
- if cg.f128_move_callee_reg_to_addr_reg(ptr.0) { - f128_store_to_addr_reg(cg, val); - return; - } - - let addr = cg.f128_resolve_slot_addr(ptr.0); - if let Some(addr) = addr { - match addr { - SlotAddr::Direct(slot) if !is_indirect => { - f128_store_to_slot(cg, val, slot); - } - SlotAddr::Direct(slot) | SlotAddr::Indirect(slot) => { - cg.f128_load_indirect_ptr_to_addr_reg(slot, ptr.0); - f128_store_to_addr_reg(cg, val); - } - SlotAddr::OverAligned(slot, id) => { - cg.f128_alloca_aligned_addr(slot, id); - cg.f128_move_aligned_to_addr_reg(); - f128_store_to_addr_reg(cg, val); - } - } - } -} - -/// F128 load dispatch: resolve the pointer's SlotAddr, load 16 bytes, -/// convert to f64 approximation, and store to dest. -/// -/// Also tracks the f128 source for full-precision reloads. -pub fn f128_emit_load( - cg: &mut T, - dest: &Value, - ptr: &Value, -) { - cg.state().track_f128_load(dest.0, ptr.0, 0); - let is_indirect = !cg.f128_is_alloca(ptr.0); - - // Check if the pointer lives in a callee-saved register. - if cg.f128_move_callee_reg_to_addr_reg(ptr.0) { - cg.f128_load_from_addr_reg_to_acc(dest); - return; - } - - let addr = cg.f128_resolve_slot_addr(ptr.0); - if let Some(addr) = addr { - match addr { - SlotAddr::Direct(slot) if !is_indirect => { - cg.f128_load_from_direct_slot_to_acc(slot); - } - SlotAddr::Direct(slot) | SlotAddr::Indirect(slot) => { - cg.f128_load_indirect_ptr_to_addr_reg(slot, ptr.0); - cg.f128_load_from_addr_reg_to_acc(dest); - return; - } - SlotAddr::OverAligned(slot, id) => { - cg.f128_alloca_aligned_addr(slot, id); - cg.f128_move_aligned_to_addr_reg(); - cg.f128_load_from_addr_reg_to_acc(dest); - return; - } - } - } else { - return; - } - // Convert f128 to f64, store to dest. - cg.f128_truncate_result_to_acc(); - cg.state().reg_cache.invalidate_all(); - cg.f128_store_acc_to_dest(dest); -} - -/// F128 store with constant offset dispatch. -/// -/// Resolves the base pointer's SlotAddr and stores 16 bytes at base + offset. 
-pub fn f128_emit_store_with_offset( - cg: &mut T, - val: &Operand, - base: &Value, - offset: i64, -) { - let is_indirect = !cg.f128_is_alloca(base.0); - - // Check if the base pointer lives in a callee-saved register. - if cg.f128_move_callee_reg_to_addr_reg(base.0) { - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - f128_store_to_addr_reg(cg, val); - return; - } - - let addr = cg.f128_resolve_slot_addr(base.0); - if let Some(addr) = addr { - match addr { - SlotAddr::Direct(slot) if !is_indirect => { - let folded_slot = StackSlot(slot.0 + offset); - f128_store_to_slot(cg, val, folded_slot); - } - SlotAddr::Direct(slot) | SlotAddr::Indirect(slot) => { - cg.f128_load_indirect_ptr_to_addr_reg(slot, base.0); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - f128_store_to_addr_reg(cg, val); - } - SlotAddr::OverAligned(slot, id) => { - cg.f128_alloca_aligned_addr(slot, id); - cg.f128_move_aligned_to_addr_reg(); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - f128_store_to_addr_reg(cg, val); - } - } - } -} - -/// F128 load with constant offset dispatch. -/// -/// Resolves the base pointer's SlotAddr, loads 16 bytes at base + offset, -/// converts to f64 approximation, and stores to dest. -pub fn f128_emit_load_with_offset( - cg: &mut T, - dest: &Value, - base: &Value, - offset: i64, -) { - cg.state().track_f128_load(dest.0, base.0, offset); - let is_indirect = !cg.f128_is_alloca(base.0); - - // Check if the base pointer lives in a callee-saved register. 
- let loaded = if cg.f128_move_callee_reg_to_addr_reg(base.0) { - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_load_from_addr_reg_to_acc(dest); - return; // load_from_addr_reg_to_acc handles truncation + store - } else { - let addr = cg.f128_resolve_slot_addr(base.0); - if let Some(addr) = addr { - match addr { - SlotAddr::Direct(slot) if !is_indirect => { - let folded_slot = StackSlot(slot.0 + offset); - cg.f128_load_from_direct_slot_to_acc(folded_slot); - } - SlotAddr::Direct(slot) | SlotAddr::Indirect(slot) => { - cg.f128_load_indirect_ptr_to_addr_reg(slot, base.0); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_load_from_addr_reg_to_acc(dest); - return; - } - SlotAddr::OverAligned(slot, id) => { - cg.f128_alloca_aligned_addr(slot, id); - cg.f128_move_aligned_to_addr_reg(); - if offset != 0 { - cg.f128_add_offset_to_addr_reg(offset); - } - cg.f128_load_from_addr_reg_to_acc(dest); - return; - } - } - true - } else { - false - } - }; - if loaded { - cg.f128_truncate_result_to_acc(); - cg.state().reg_cache.invalidate_all(); - cg.f128_store_acc_to_dest(dest); - } -} - -// ============================================================================= -// Shared cast orchestration -// ============================================================================= - -/// F128 cast dispatch: handles all F128-related casts (int<->F128, float<->F128). -/// -/// Returns `true` if the cast was handled, `false` if the caller should use the -/// default cast path. 
-pub fn f128_emit_cast( - cg: &mut T, - dest: &Value, - src: &Operand, - from_ty: IrType, - to_ty: IrType, -) -> bool { - let is_i128 = |ty: IrType| ty == IrType::I128 || ty == IrType::U128; - - // int -> F128 - if to_ty == IrType::F128 && !from_ty.is_float() && !is_i128(from_ty) { - cg.f128_load_operand_to_acc(src); - if from_ty.is_signed() { - cg.f128_sign_extend_acc(from_ty.size()); - cg.f128_move_acc_to_arg0(); - cg.f128_call("__floatditf"); - } else { - cg.f128_zero_extend_acc(from_ty.size()); - cg.f128_move_acc_to_arg0(); - cg.f128_call("__floatunditf"); - } - cg.state().reg_cache.invalidate_all(); - cg.f128_store_result_and_truncate(dest); - return true; - } - - // F128 -> int - if from_ty == IrType::F128 && !to_ty.is_float() && !is_i128(to_ty) { - f128_operand_to_arg1(cg, src); - if to_ty.is_unsigned() || to_ty == IrType::Ptr { - cg.f128_call("__fixunstfdi"); - } else { - cg.f128_call("__fixtfdi"); - } - cg.f128_move_arg0_to_acc(); - cg.state().reg_cache.invalidate_all(); - if to_ty.size() < 8 { - cg.f128_narrow_acc(to_ty); - } - cg.f128_store_acc_to_dest(dest); - return true; - } - - // float -> F128 - if to_ty == IrType::F128 && from_ty.is_float() { - cg.f128_load_operand_to_acc(src); - cg.f128_extend_float_to_f128(from_ty); - cg.state().reg_cache.invalidate_all(); - cg.f128_store_result_and_truncate(dest); - return true; - } - - // F128 -> float - if from_ty == IrType::F128 && to_ty.is_float() { - f128_operand_to_arg1(cg, src); - cg.f128_truncate_to_float_acc(to_ty); - cg.state().reg_cache.invalidate_all(); - cg.f128_store_acc_to_dest(dest); - return true; - } - - false -} - -// ============================================================================= -// Shared binop orchestration -// ============================================================================= - -/// F128 binary operation via soft-float libcalls with full precision. -/// -/// 1. Allocate stack temp. -/// 2. Load LHS f128 into arg1, save to temp. -/// 3. 
Load RHS f128 into arg1, move to arg2. -/// 4. Reload LHS from temp into arg1. -/// 5. Call arithmetic libcall. -/// 6. Free temp. -/// 7. Store full f128 result to dest slot, produce f64 approximation. -pub fn f128_emit_binop( - cg: &mut T, - dest: &Value, - op: FloatOp, - lhs: &Operand, - rhs: &Operand, -) { - let libcall = match op { - FloatOp::Add => "__addtf3", - FloatOp::Sub => "__subtf3", - FloatOp::Mul => "__multf3", - FloatOp::Div => "__divtf3", - }; - - // Force frame-pointer-relative addressing during temp allocation. - let saved = cg.f128_set_dyn_alloca(true); - - // Step 1: Allocate temp stack space for saving LHS. - cg.f128_alloc_temp_16(); - - // Step 2: Load LHS f128, save to temp. - f128_operand_to_arg1(cg, lhs); - cg.f128_save_arg1_to_sp(); - - // Step 3: Load RHS f128, move to arg2. - f128_operand_to_arg1(cg, rhs); - cg.f128_move_arg1_to_arg2(); - - // Step 4: Reload LHS from temp. - cg.f128_reload_arg1_from_sp(); - - // Step 5: Free temp, restore flag. - cg.f128_free_temp_16(); - cg.f128_set_dyn_alloca(saved); - - // Step 6: Call the arithmetic libcall. - cg.f128_call(libcall); - - // Step 7: Store full f128 result and produce f64 approximation. - cg.f128_store_result_and_truncate(dest); -} diff --git a/src/backend/generation.rs b/src/backend/generation.rs deleted file mode 100644 index 27d6398df2..0000000000 --- a/src/backend/generation.rs +++ /dev/null @@ -1,1406 +0,0 @@ -//! Module, function, and instruction generation dispatch. -//! -//! This module contains the top-level entry points that drive code generation: -//! - `generate_module`: emits data sections and iterates over functions -//! - `generate_function`: emits prologue, basic blocks, and epilogue -//! - `generate_instruction`: dispatches each IR instruction to arch trait methods -//! - `generate_terminator`: dispatches terminators to arch trait methods -//! -//! These functions are arch-independent — they use the `ArchCodegen` trait to call -//! 
into the backend-specific implementations. - -use crate::ir::reexports::{ - BasicBlock, - GlobalInit, - Instruction, - IrConst, - IrFunction, - IrModule, - Operand, - Terminator, - Value, -}; -use crate::common::types::{AddressSpace, IrType}; -use crate::common::source::{Span, SourceManager}; -use crate::common::fx_hash::{FxHashMap, FxHashSet}; -use super::common; -use super::traits::ArchCodegen; -use super::liveness::{for_each_operand_in_instruction, for_each_value_use_in_instruction, for_each_operand_in_terminator}; - -/// Information about a GEP with a constant offset that can be folded into -/// Load/Store addressing modes. Instead of computing `base + offset` as a -/// separate instruction and spilling to stack, the constant offset is merged -/// directly into the memory operand of the subsequent load/store. -#[derive(Debug, Clone, Copy)] -pub(super) struct GepFoldInfo { - /// The base pointer value (an alloca or previously-computed pointer). - pub(super) base: Value, - /// The constant byte offset to add to the base address. - pub(super) offset: i64, -} - -/// Build a map of GEP destinations that can be folded into Load/Store instructions. -/// -/// A GEP is foldable when: -/// 1. Its offset is a compile-time constant (Operand::Const) -/// 2. The constant fits in a 32-bit signed displacement (x86 addressing limit) -/// 3. The GEP result is only used as the ptr operand of Load/Store instructions -/// (not used by other instructions, terminators, or as a value operand) -/// -/// When all conditions are met, the GEP instruction is skipped during codegen, -/// and each Load/Store that uses it receives the (base, offset) directly. -fn build_gep_fold_map(func: &IrFunction, use_counts: &[u32]) -> FxHashMap { - let mut gep_map: FxHashMap = FxHashMap::default(); - - // Phase 1: Collect all GEPs with constant offsets. - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::GetElementPtr { dest, base, offset: Operand::Const(c), .. 
} = inst { - let offset_val = match c.to_i64() { - Some(v) => v, - None => continue, - }; - // Offset must fit in 32-bit signed displacement for x86. - // Also reasonable for ARM (signed 9-bit unscaled or 12-bit scaled) - // and RISC-V (signed 12-bit). - // Use i32 range as the safe common limit. - // Unsigned type constants (e.g. U32 -1 = 4294967295) are sign-narrowed. - let offset_val = if offset_val >= i32::MIN as i64 && offset_val <= i32::MAX as i64 { - offset_val - } else if offset_val > i32::MAX as i64 && offset_val <= u32::MAX as i64 { - offset_val as i32 as i64 - } else { - continue; - }; - gep_map.insert(dest.0, GepFoldInfo { base: *base, offset: offset_val }); - } - } - } - - if gep_map.is_empty() { - return gep_map; - } - - // Phase 2: Verify that each candidate GEP dest is ONLY used as Load/Store ptr. - // If it's used anywhere else (as a value operand, in a call, in a terminator, - // or as a base of another GEP), we cannot fold it. - // - // Strategy: Load.ptr and Store.ptr are the ONLY foldable use positions. - // - Load: ptr is a Value (visited by for_each_value_use), no Operand uses → skip entirely. - // - Store: ptr (Value) is foldable, but val (Operand) is NOT → check only Operand uses. - // - All other instructions: ANY reference to a GEP dest invalidates folding. - let mut non_ptr_uses: FxHashSet = FxHashSet::default(); - - // Helper: mark a GEP dest as non-foldable if used outside Load/Store ptr position. - let mut mark_non_ptr = |id: u32| { - if gep_map.contains_key(&id) { - non_ptr_uses.insert(id); - } - }; - - for block in &func.blocks { - for inst in &block.instructions { - match inst { - // Load.ptr is foldable — UNLESS: - // - The load type is i128/u128: the i128 load path doesn't - // support GEP folding and falls through to emit_load. 
- // - The load has a segment override (%gs:/%fs:): the segment- - // overridden load path (emit_seg_load) returns early before - // the GEP fold check, so it needs the pointer value to be - // computed by the GEP instruction (not folded away). - Instruction::Load { ptr, ty, seg_override, .. } => { - if matches!(ty, IrType::I128 | IrType::U128) - || *seg_override != AddressSpace::Default { - mark_non_ptr(ptr.0); - } - } - // Store.ptr is foldable, but Store.val is an Operand that is NOT foldable. - // Also invalidate if the store type is i128/u128 or has a segment - // override, for the same reasons as Load above. - Instruction::Store { val, ptr, ty, seg_override, .. } => { - if let Operand::Value(v) = val { mark_non_ptr(v.0); } - if matches!(ty, IrType::I128 | IrType::U128) - || *seg_override != AddressSpace::Default { - mark_non_ptr(ptr.0); - } - } - // All other instructions: any reference invalidates folding. - _ => { - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { mark_non_ptr(v.0); } - }); - for_each_value_use_in_instruction(inst, |v| mark_non_ptr(v.0)); - } - } - } - for_each_operand_in_terminator(&block.terminator, |op| { - if let Operand::Value(v) = op { mark_non_ptr(v.0); } - }); - } - - // Remove GEPs that have non-ptr uses. - for val_id in &non_ptr_uses { - gep_map.remove(val_id); - } - - // Also remove GEPs that are unused (use_count == 0). - gep_map.retain(|val_id, _| { - (*val_id as usize) < use_counts.len() && use_counts[*val_id as usize] > 0 - }); - - gep_map -} - -/// Build a map from Value IDs to global symbol names (with optional offsets). -/// Maps values produced by `GlobalAddr { name }` to `"name"`, and values -/// produced by `GEP(GlobalAddr { name }, const_offset)` to `"name+offset"`. -/// Used to emit direct symbol(%rip) references for segment-overridden loads/stores. -/// TLS symbols are excluded because they require special access patterns -/// (%fs:/@TPOFF on x86-64, %gs:/@NTPOFF on i686, etc.) 
and must not be -/// folded into plain RIP-relative accesses. -fn build_global_addr_map(func: &IrFunction, tls_symbols: &FxHashSet) -> FxHashMap { - let mut map: FxHashMap = FxHashMap::default(); - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::GlobalAddr { dest, name } => { - // Skip TLS symbols - they must go through emit_tls_global_addr - if !tls_symbols.contains(name.as_str()) { - map.insert(dest.0, name.clone()); - } - } - Instruction::GetElementPtr { dest, base, offset: Operand::Const(c), .. } => { - if let Some(base_name) = map.get(&base.0) { - let offset_val = match c.to_i64() { - Some(v) => v, - None => continue, - }; - let sym = if offset_val == 0 { - base_name.clone() - } else if offset_val > 0 { - format!("{}+{}", base_name, offset_val) - } else { - format!("{}{}", base_name, offset_val) - }; - map.insert(dest.0, sym); - } - } - _ => {} - } - } - } - map -} - -/// Build a set of GlobalAddr value IDs that are "dead" after the fold optimization. -/// A GlobalAddr is dead when ALL of its uses are as `ptr` in Load/Store instructions -/// that will be folded into direct `symbol(%rip)` accesses. In that case, the -/// `lea symbol(%rip), %rax` instruction for the GlobalAddr is unnecessary. -fn build_foldable_global_addr_set( - func: &IrFunction, - global_addr_map: &FxHashMap, -) -> FxHashSet { - // Collect all GlobalAddr dest value IDs - let mut global_addr_ids: FxHashSet = FxHashSet::default(); - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::GlobalAddr { dest, .. } = inst { - global_addr_ids.insert(dest.0); - } - } - } - if global_addr_ids.is_empty() { - return FxHashSet::default(); - } - - // Track which GlobalAddr values have non-foldable uses. - // A use is "foldable" if it's the `ptr` of a Load/Store AND the ptr is in - // global_addr_map AND the type is foldable (not wide/F128). 
- let mut has_non_foldable_use: FxHashSet = FxHashSet::default(); - - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::Load { ptr, ty, seg_override, .. } => { - // The ptr use is foldable if it's in global_addr_map and type is supported - let is_foldable = global_addr_ids.contains(&ptr.0) - && global_addr_map.contains_key(&ptr.0) - && !is_wide_int_type(*ty) - && *ty != IrType::F128 - && *seg_override == AddressSpace::Default; - if !is_foldable && global_addr_ids.contains(&ptr.0) { - has_non_foldable_use.insert(ptr.0); - } - } - Instruction::Store { val, ptr, ty, seg_override } => { - let is_ptr_foldable = global_addr_ids.contains(&ptr.0) - && global_addr_map.contains_key(&ptr.0) - && !is_wide_int_type(*ty) - && *ty != IrType::F128 - && *seg_override == AddressSpace::Default; - if !is_ptr_foldable && global_addr_ids.contains(&ptr.0) { - has_non_foldable_use.insert(ptr.0); - } - // If Store's val references a GlobalAddr, that's a non-foldable use - if let Operand::Value(v) = val { - if global_addr_ids.contains(&v.0) { - has_non_foldable_use.insert(v.0); - } - } - } - // Any other instruction using a GlobalAddr value means it's not dead - _ => { - for v in inst.used_values() { - if global_addr_ids.contains(&v) { - has_non_foldable_use.insert(v); - } - } - } - } - } - // Check terminator uses too - for v in block.terminator.used_values() { - if global_addr_ids.contains(&v) { - has_non_foldable_use.insert(v); - } - } - } - - // Return GlobalAddr values that have NO non-foldable uses - global_addr_ids.difference(&has_non_foldable_use).copied().collect() -} - -/// Build a set of GlobalAddr value IDs that are used as Load/Store pointers. -/// In kernel code model, GlobalAddr values used only as integer values -/// (e.g., `(unsigned long)_text`) need absolute addressing (R_X86_64_32S) -/// to produce the linked virtual address. 
But GlobalAddr values used as -/// Load/Store pointers need RIP-relative addressing so they work at any -/// physical/virtual address during early boot. -fn build_global_addr_ptr_set(func: &IrFunction) -> FxHashSet { - // First collect all GlobalAddr dest values - let mut global_addrs: FxHashSet = FxHashSet::default(); - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::GlobalAddr { dest, .. } = inst { - global_addrs.insert(dest.0); - } - } - } - // Now find which ones (or values derived from them) are used as memory ptrs. - // Track derivation through Copy, Cast, GEP, Phi, and Select so that a - // GlobalAddr flowing through intermediate values to a Load/Store/Atomic - // ptr is still caught. - let mut ptr_set: FxHashSet = FxHashSet::default(); - let mut derived_from: FxHashMap = FxHashMap::default(); // derived_dest -> original GlobalAddr - - // Helper: if `id` is a GlobalAddr or derived from one, mark it as pointer use - let mark_val = |id: u32, global_addrs: &FxHashSet, derived_from: &FxHashMap, ptr_set: &mut FxHashSet| { - if global_addrs.contains(&id) { - ptr_set.insert(id); - } else if let Some(&orig) = derived_from.get(&id) { - ptr_set.insert(orig); - } - }; - // Helper: same but for Operand (skips constants) - let mark_op = |op: &Operand, global_addrs: &FxHashSet, derived_from: &FxHashMap, ptr_set: &mut FxHashSet| { - if let Operand::Value(v) = op { - if global_addrs.contains(&v.0) { - ptr_set.insert(v.0); - } else if let Some(&orig) = derived_from.get(&v.0) { - ptr_set.insert(orig); - } - } - }; - // Helper: if src_id is a GlobalAddr or derived from one, record dest_id as derived - let track_val = |dest_id: u32, src_id: u32, global_addrs: &FxHashSet, derived_from: &mut FxHashMap| { - if global_addrs.contains(&src_id) { - derived_from.insert(dest_id, src_id); - } else if let Some(&orig) = derived_from.get(&src_id) { - derived_from.insert(dest_id, orig); - } - }; - // Helper: same but for Operand - let track_op = 
|dest_id: u32, op: &Operand, global_addrs: &FxHashSet, derived_from: &mut FxHashMap| { - if let Operand::Value(v) = op { - if global_addrs.contains(&v.0) { - derived_from.insert(dest_id, v.0); - } else if let Some(&orig) = derived_from.get(&v.0) { - derived_from.insert(dest_id, orig); - } - } - }; - - for block in &func.blocks { - for inst in &block.instructions { - match inst { - // Track derivation: these instructions produce a value that may - // carry a GlobalAddr through to a later pointer use. - Instruction::GetElementPtr { dest, base, .. } => { - track_val(dest.0, base.0, &global_addrs, &mut derived_from); - } - Instruction::Copy { dest, src } => { - track_op(dest.0, src, &global_addrs, &mut derived_from); - } - Instruction::Cast { dest, src, .. } => { - track_op(dest.0, src, &global_addrs, &mut derived_from); - } - Instruction::Phi { dest, incoming, .. } => { - for (op, _) in incoming { - if let Operand::Value(v) = op { - if global_addrs.contains(&v.0) || derived_from.contains_key(&v.0) { - track_val(dest.0, v.0, &global_addrs, &mut derived_from); - break; - } - } - } - } - Instruction::Select { dest, true_val, false_val, .. } => { - // If either branch carries a GlobalAddr, track the result - track_op(dest.0, true_val, &global_addrs, &mut derived_from); - if !derived_from.contains_key(&dest.0) { - track_op(dest.0, false_val, &global_addrs, &mut derived_from); - } - } - // Mark pointer uses: Load, Store, Memcpy, and atomic operations - Instruction::Load { ptr, .. } => { - mark_val(ptr.0, &global_addrs, &derived_from, &mut ptr_set); - } - Instruction::Store { ptr, .. } => { - mark_val(ptr.0, &global_addrs, &derived_from, &mut ptr_set); - } - Instruction::Memcpy { dest, src, .. } => { - mark_val(dest.0, &global_addrs, &derived_from, &mut ptr_set); - mark_val(src.0, &global_addrs, &derived_from, &mut ptr_set); - } - Instruction::AtomicLoad { ptr, .. } => { - mark_op(ptr, &global_addrs, &derived_from, &mut ptr_set); - } - Instruction::AtomicStore { ptr, .. 
} => { - mark_op(ptr, &global_addrs, &derived_from, &mut ptr_set); - } - Instruction::AtomicRmw { ptr, .. } => { - mark_op(ptr, &global_addrs, &derived_from, &mut ptr_set); - } - Instruction::AtomicCmpxchg { ptr, .. } => { - mark_op(ptr, &global_addrs, &derived_from, &mut ptr_set); - } - // Conservatively mark GlobalAddr passed to function calls as pointer use, - // since the callee may dereference it - Instruction::Call { info, .. } | Instruction::CallIndirect { info, .. } => { - for arg in &info.args { - mark_op(arg, &global_addrs, &derived_from, &mut ptr_set); - } - } - _ => {} - } - } - } - ptr_set -} - -/// Returns the number of times each IR Value is used as an operand in -/// instructions or terminators. Indexed by Value ID; used to identify -/// single-use values eligible for compare-branch fusion. -fn count_value_uses(func: &IrFunction) -> Vec { - // Find the max value ID to size the vector. - let mut max_id: u32 = 0; - for block in &func.blocks { - for inst in &block.instructions { - if let Some(dest) = inst.dest() { - max_id = max_id.max(dest.0); - } - } - } - let mut counts = vec![0u32; max_id as usize + 1]; - - // Helper: increment use count for a value ID, bounds-checked. - let mut count_id = |id: u32| { - if (id as usize) < counts.len() { - counts[id as usize] += 1; - } - }; - - for block in &func.blocks { - for inst in &block.instructions { - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { count_id(v.0); } - }); - for_each_value_use_in_instruction(inst, |v| count_id(v.0)); - } - for_each_operand_in_terminator(&block.terminator, |op| { - if let Operand::Value(v) = op { count_id(v.0); } - }); - } - counts -} - -/// Detect if a block's last instruction is a Cmp whose result is only used -/// by the block's CondBranch terminator. Returns the index of the Cmp if -/// fusion is possible, None otherwise. 
-fn detect_cmp_branch_fusion(block: &BasicBlock, use_counts: &[u32]) -> Option { - // Terminator must be a CondBranch - let (cond, _, _) = match &block.terminator { - Terminator::CondBranch { cond, true_label, false_label } => (cond, true_label, false_label), - _ => return None, - }; - - // The condition must be a Value (not a constant) - let cond_val = match cond { - Operand::Value(v) => v, - _ => return None, - }; - - // Find the last instruction that is a Cmp producing this value - let last_idx = block.instructions.len().checked_sub(1)?; - let last_inst = &block.instructions[last_idx]; - - let (dest, _op, _lhs, _rhs, ty) = match last_inst { - Instruction::Cmp { dest, op, lhs, rhs, ty } => (dest, op, lhs, rhs, ty), - _ => return None, - }; - - // The Cmp dest must be the same as the CondBranch cond - if dest.0 != cond_val.0 { - return None; - } - - // Don't fuse wide-int or float comparisons (they have special codegen paths). - // On 32-bit targets, also exclude I64/U64: the fused compare-and-branch - // uses 32-bit cmpl which only tests the low half of a 64-bit value. - if is_wide_int_type(*ty) || ty.is_float() { - return None; - } - if crate::common::types::target_is_32bit() && matches!(ty, IrType::I64 | IrType::U64) { - return None; - } - - // The Cmp result must be used exactly once (by the CondBranch terminator) - if (cond_val.0 as usize) < use_counts.len() && use_counts[cond_val.0 as usize] == 1 { - Some(last_idx) - } else { - None - } -} - -/// Generate assembly for a module using the given architecture's codegen. -/// Generate assembly for an IR module with debug info support. -/// Sets `debug_info` on the codegen state before proceeding. 
-pub fn generate_module_with_debug( - cg: &mut dyn ArchCodegen, - module: &IrModule, - debug_info: bool, - source_mgr: Option<&crate::common::source::SourceManager>, -) -> String { - cg.state().debug_info = debug_info; - generate_module(cg, module, source_mgr) -} - -pub fn generate_module(cg: &mut dyn ArchCodegen, module: &IrModule, source_mgr: Option<&crate::common::source::SourceManager>) -> String { - pre_size_output_buffer(cg, module); - collect_symbol_sets(cg, module); - let file_table = build_and_emit_dwarf_file_table(cg, module, source_mgr); - - let ptr_dir = cg.ptr_directive(); - common::emit_data_sections(&mut cg.state().out, module, ptr_dir); - - // Emit top-level asm("...") directives verbatim (e.g., musl's _start definition). - // Switch to .text first so that labels/code in the asm land in the correct section. - if !module.toplevel_asm.is_empty() { - cg.state().emit(".text"); - for asm_str in &module.toplevel_asm { - cg.state().emit(asm_str); - } - } - - let referenced_symbols = collect_referenced_symbols(module); - emit_extern_visibility_directives(cg, module, &referenced_symbols); - emit_functions_and_sections(cg, module, source_mgr, &file_table); - emit_aliases(cg, module); - emit_symver_directives(cg, module); - emit_symbol_attrs(cg, module, &referenced_symbols); - emit_init_fini_arrays(cg, module, ptr_dir); - - // Emit architecture-specific runtime helper stubs (e.g., i686 __divdi3) - cg.emit_runtime_stubs(); - - // Emit .note.GNU-stack section to indicate non-executable stack - cg.state().emit(""); - cg.state().emit(".section .note.GNU-stack,\"\",@progbits"); - - std::mem::take(&mut cg.state().out.buf) -} - -/// Pre-size the output buffer based on total IR instruction count to avoid -/// repeated reallocations. Each IR instruction typically generates ~40 bytes -/// of assembly text. 
-fn pre_size_output_buffer(cg: &mut dyn ArchCodegen, module: &IrModule) { - let total_insts: usize = module.functions.iter() - .map(|f| f.blocks.iter().map(|b| b.instructions.len()).sum::()) - .sum(); - let estimated_bytes = (total_insts * 40).clamp(256 * 1024, 64 * 1024 * 1024); - let state = cg.state(); - if state.out.buf.capacity() < estimated_bytes { - state.out.buf.reserve(estimated_bytes - state.out.buf.capacity()); - } -} - -/// Build the sets of locally-defined, thread-local, and weak extern symbols. -/// Local symbols (static or hidden/internal/protected visibility) don't need -/// GOT/PLT indirection in PIC mode. TLS symbols need TLS access patterns. -/// Weak extern symbols need GOT indirection on AArch64. -fn collect_symbol_sets(cg: &mut dyn ArchCodegen, module: &IrModule) { - let state = cg.state(); - for func in &module.functions { - if func.is_static || matches!(func.visibility.as_deref(), Some("hidden" | "internal" | "protected")) { - state.local_symbols.insert(func.name.clone()); - } - } - for global in &module.globals { - if global.is_static || matches!(global.visibility.as_deref(), Some("hidden" | "internal" | "protected")) { - state.local_symbols.insert(global.name.clone()); - } - if global.is_thread_local { - state.tls_symbols.insert(global.name.clone()); - } - if global.is_weak && global.is_extern { - state.weak_extern_symbols.insert(global.name.clone()); - } - } - for (name, is_weak, visibility) in &module.symbol_attrs { - if matches!(visibility.as_deref(), Some("hidden" | "internal" | "protected")) { - state.local_symbols.insert(name.clone()); - } - if *is_weak { - state.weak_extern_symbols.insert(name.clone()); - } - } - for (label, _) in &module.string_literals { - state.local_symbols.insert(label.clone()); - } - for (label, _) in &module.wide_string_literals { - state.local_symbols.insert(label.clone()); - } -} - -/// Build the DWARF file table and emit .file directives when debug info is enabled. 
-/// Scans all spans in the module, resolves filenames via SourceManager, -/// and assigns each unique filename a DWARF file number (1-based). -fn build_and_emit_dwarf_file_table( - cg: &mut dyn ArchCodegen, - module: &IrModule, - source_mgr: Option<&crate::common::source::SourceManager>, -) -> FxHashMap { - if !cg.state_ref().debug_info { - return FxHashMap::default(); - } - let sm = match source_mgr { - Some(sm) => sm, - None => return FxHashMap::default(), - }; - - let mut table: FxHashMap = FxHashMap::default(); - let mut next_id: u32 = 1; - for func in &module.functions { - if func.is_declaration { continue; } - for block in &func.blocks { - for span in &block.source_spans { - if span.start == 0 && span.end == 0 { continue; } - let loc = sm.resolve_span(*span); - if let std::collections::hash_map::Entry::Vacant(e) = table.entry(loc.file) { - e.insert(next_id); - next_id += 1; - } - } - } - } - - if !table.is_empty() { - let mut entries: Vec<(&String, &u32)> = table.iter().collect(); - entries.sort_by_key(|(_name, id)| *id); - for (name, id) in entries { - cg.state().emit_fmt(format_args!(".file {} \"{}\"", id, name)); - } - } - table -} - -/// Collect the set of symbols actually referenced in this translation unit. -/// We only emit .weak/.hidden directives for referenced symbols, matching GCC behavior. -fn collect_referenced_symbols(module: &IrModule) -> FxHashSet { - let mut refs = FxHashSet::default(); - - // Symbols referenced in function bodies - for func in &module.functions { - if func.is_declaration { continue; } - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::Call { func: callee, .. } => { - refs.insert(callee.clone()); - } - Instruction::GlobalAddr { name, .. } => { - refs.insert(name.clone()); - } - Instruction::InlineAsm { input_symbols, .. 
} => { - for s in input_symbols.iter().flatten() { - let base = s.split('+').next().unwrap_or(s); - refs.insert(base.to_string()); - } - } - _ => {} - } - } - } - } - - // Symbols referenced in global initializers - for global in &module.globals { - fn collect_global_refs(init: &GlobalInit, refs: &mut FxHashSet) { - match init { - GlobalInit::GlobalAddr(name) | GlobalInit::GlobalAddrOffset(name, _) => { - refs.insert(name.clone()); - } - GlobalInit::GlobalLabelDiff(a, b, _) => { - refs.insert(a.clone()); - refs.insert(b.clone()); - } - GlobalInit::Compound(inits) => { - for sub in inits { - collect_global_refs(sub, refs); - } - } - _ => {} - } - } - collect_global_refs(&global.init, &mut refs); - } - - // Symbols referenced in toplevel asm (conservative substring match) - for asm_str in &module.toplevel_asm { - for (sym_name, _, _) in &module.symbol_attrs { - if asm_str.contains(sym_name.as_str()) { - refs.insert(sym_name.clone()); - } - } - } - - // Defined functions and globals are always considered referenced - for func in &module.functions { - if !func.is_declaration { - refs.insert(func.name.clone()); - } - } - for global in &module.globals { - if !global.is_extern { - refs.insert(global.name.clone()); - } - } - refs -} - -/// Emit visibility directives for declaration-only (extern) functions with -/// non-default visibility, but only if they are actually referenced. -fn emit_extern_visibility_directives(cg: &mut dyn ArchCodegen, module: &IrModule, referenced_symbols: &FxHashSet) { - for func in &module.functions { - if func.is_declaration && referenced_symbols.contains(&func.name) { - cg.state().emit_visibility(&func.name, &func.visibility); - } - } -} - -/// Emit text section, handle custom sections, and generate code for each function. -/// When `-ffunction-sections` is enabled, each function without a custom section -/// attribute gets its own `.text.funcname` section, enabling `--gc-sections` to -/// discard unreferenced functions at link time. 
-fn emit_functions_and_sections( - cg: &mut dyn ArchCodegen, - module: &IrModule, - source_mgr: Option<&crate::common::source::SourceManager>, - file_table: &FxHashMap, -) { - let function_sections = cg.state().function_sections; - if !function_sections { - cg.state().emit(".section .text"); - } - let mut in_custom_section = false; - for func in &module.functions { - if !func.is_declaration { - if let Some(ref sect) = func.section { - cg.state().emit_fmt(format_args!(".section {},\"ax\",@progbits", sect)); - cg.state().current_text_section = sect.clone(); - in_custom_section = true; - } else if function_sections { - // -ffunction-sections: each function gets its own section - let sect_name = format!(".text.{}", func.name); - cg.state().emit_fmt(format_args!(".section {},\"ax\",@progbits", sect_name)); - cg.state().current_text_section = sect_name; - in_custom_section = false; - } else if in_custom_section { - cg.state().emit(".section .text"); - cg.state().current_text_section = ".text".to_string(); - in_custom_section = false; - } else { - cg.state().current_text_section = ".text".to_string(); - } - generate_function(cg, func, source_mgr, file_table); - } - } -} - -/// Emit symbol aliases from __attribute__((alias("target"))). -fn emit_aliases(cg: &mut dyn ArchCodegen, module: &IrModule) { - for (alias_name, target_name, is_weak) in &module.aliases { - cg.state().emit(""); - if *is_weak { - cg.state().emit_fmt(format_args!(".weak {}", alias_name)); - } else { - cg.state().emit_fmt(format_args!(".globl {}", alias_name)); - } - cg.state().emit_fmt(format_args!(".set {},{}", alias_name, target_name)); - } -} - -/// Emit .symver directives from __attribute__((symver("name@@VERSION"))). 
-fn emit_symver_directives(cg: &mut dyn ArchCodegen, module: &IrModule) { - for (func_name, symver_str) in &module.symver_directives { - cg.state().emit_fmt(format_args!(".symver {},{}", func_name, symver_str)); - } -} - -/// Emit .weak/.hidden directives for declaration symbols that are referenced. -fn emit_symbol_attrs(cg: &mut dyn ArchCodegen, module: &IrModule, referenced_symbols: &FxHashSet) { - for (name, is_weak, visibility) in &module.symbol_attrs { - if !referenced_symbols.contains(name) { - continue; - } - if *is_weak { - cg.state().emit_fmt(format_args!(".weak {}", name)); - } - cg.state().emit_visibility(name, visibility); - } -} - -/// Emit .init_array and .fini_array sections for constructor/destructor functions. -fn emit_init_fini_arrays(cg: &mut dyn ArchCodegen, module: &IrModule, ptr_dir: super::common::PtrDirective) { - let align = crate::common::types::target_ptr_size(); - for ctor in &module.constructors { - cg.state().emit(""); - cg.state().emit(".section .init_array,\"aw\",@init_array"); - cg.state().emit_fmt(format_args!(".align {}", ptr_dir.align_arg(align))); - cg.state().emit_fmt(format_args!("{} {}", ptr_dir.as_str(), ctor)); - } - for dtor in &module.destructors { - cg.state().emit(""); - cg.state().emit(".section .fini_array,\"aw\",@fini_array"); - cg.state().emit_fmt(format_args!(".align {}", ptr_dir.align_arg(align))); - cg.state().emit_fmt(format_args!("{} {}", ptr_dir.as_str(), dtor)); - } -} - -/// Generate code for a single function. -fn generate_function(cg: &mut dyn ArchCodegen, func: &IrFunction, source_mgr: Option<&SourceManager>, file_table: &FxHashMap) { - cg.state().reset_for_function(); - - let type_dir = cg.function_type_directive(); - cg.state().emit_linkage(&func.name, func.is_static, func.is_weak); - cg.state().emit_visibility(&func.name, &func.visibility); - - // Emit patchable function entry NOP padding (-fpatchable-function-entry=N,M). - // This is used by the Linux kernel for ftrace and static call patching. 
- // Format: M NOPs before the entry point, (N-M) NOPs after, plus a - // __patchable_function_entries section pointing to the NOP area. - // - // Skip patchable entries for inline functions: our compiler emits all static - // inline functions from headers as separate definitions (since we don't inline - // them yet). Emitting __patchable_function_entries for each of these would create - // thousands of entries per file (~1400 instead of ~5), overwhelming the kernel's - // ftrace initialization and causing boot hangs. GCC avoids this by inlining - // static inline functions so they never get their own patchable entries. - let emit_patchable = !func.is_inline; - if emit_patchable { - if let Some((total, before)) = cg.state().patchable_function_entry { - if total > 0 { - let pfe_id = cg.state().next_label_id(); - let pfe_label = format!(".LPFE{}", pfe_id); - - // Emit __patchable_function_entries section with a pointer to the NOP area - cg.state().emit_fmt(format_args!( - ".section __patchable_function_entries,\"awo\",@progbits,{}", - pfe_label - )); - let pfe_align = crate::common::types::target_ptr_size(); - let pfe_dir = cg.ptr_directive(); - cg.state().emit_fmt(format_args!(".align {}", pfe_align)); - cg.state().emit_fmt(format_args!("{} {}", pfe_dir.as_str(), pfe_label)); - - // Switch back to the function's section (custom or .text) - if let Some(ref sect) = func.section { - cg.state().emit_fmt(format_args!(".section {},\"ax\",@progbits", sect)); - } else { - cg.state().emit(".text"); - } - - // Emit the LPFE label and M NOPs before the function entry point - cg.state().emit_fmt(format_args!("{}:", pfe_label)); - for _ in 0..before { - cg.state().emit("nop"); - } - } - } - } - - cg.state().emit_fmt(format_args!(".type {}, {}", func.name, type_dir)); - cg.state().emit_fmt(format_args!("{}:", func.name)); - let emit_cfi = cg.state().emit_cfi; - if emit_cfi { - cg.state().emit(".cfi_startproc"); - } - - // Emit (N-M) NOPs after the function entry point for 
patchable function entry - if emit_patchable { - if let Some((total, before)) = cg.state().patchable_function_entry { - let after = total.saturating_sub(before); - for _ in 0..after { - cg.state().emit("nop"); - } - } - } - - // Naked functions: emit only inline asm blocks, no prologue/epilogue/params. - if func.is_naked { - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::InlineAsm { template, .. } = inst { - cg.emit_raw_inline_asm(template); - } - } - } - if emit_cfi { - cg.state().emit(".cfi_endproc"); - } - cg.state().emit_fmt(format_args!(".size {}, .-{}", func.name, func.name)); - cg.state().emit(""); - return; - } - - // Pre-scan for DynAlloca/StackRestore: if present, the epilogue must restore SP from - // the frame pointer instead of adding back the compile-time frame size. - let has_dyn_alloca = func.blocks.iter().any(|block| { - block.instructions.iter().any(|inst| matches!(inst, Instruction::DynAlloca { .. } | Instruction::StackRestore { .. })) - }); - cg.state().has_dyn_alloca = has_dyn_alloca; - cg.state().uses_sret = func.uses_sret; - - // Calculate stack space and emit prologue - let raw_space = cg.calculate_stack_space(func); - let frame_size = cg.aligned_frame_size(raw_space); - cg.emit_prologue(func, frame_size); - - // Store parameters - cg.emit_store_params(func); - - // Generate basic blocks - let entry_label = func.blocks.first().map(|b| b.label); - - // Pre-scan: count uses of each Value across the entire function to identify - // single-use Cmp results eligible for compare-branch fusion. - let value_use_counts = count_value_uses(func); - - // Pre-scan: identify GEPs with constant offsets that can be folded into - // Load/Store addressing modes, eliminating the GEP instruction entirely. - let gep_fold_map = build_gep_fold_map(func, &value_use_counts); - - // Pre-scan: map Value IDs to global symbol names (with offsets from GEP). 
- // Used to emit direct symbol(%rip) references for segment-overridden loads/stores. - let global_addr_map = build_global_addr_map(func, &cg.state_ref().tls_symbols); - - // Pre-scan: identify GlobalAddr values used as Load/Store pointers. - // In kernel code model, non-pointer GlobalAddr values use absolute addressing - // (R_X86_64_32S) for the linked virtual address, while pointer GlobalAddr - // values use RIP-relative addressing for position-independent memory access. - let global_addr_ptr_set = if cg.state_ref().code_model_kernel { - build_global_addr_ptr_set(func) - } else { - FxHashSet::default() - }; - - // Pre-scan: identify GlobalAddr values that can be skipped because ALL of - // their uses are Load/Store pointers that will be folded into direct - // `symbol(%rip)` accesses by the generate_load/generate_store fold. - let dead_global_addrs = if cg.supports_global_addr_fold() { - build_foldable_global_addr_set(func, &global_addr_map) - } else { - FxHashSet::default() - }; - - // Debug info state: track last emitted file/line to suppress redundant .loc directives. - let emit_debug = cg.state_ref().debug_info && source_mgr.is_some() && !file_table.is_empty(); - let mut last_debug_file: u32 = 0; - let mut last_debug_line: u32 = 0; - - for block in &func.blocks { - if Some(block.label) != entry_label { - // Invalidate register cache at block boundaries: a value in a register - // from the previous block's fall-through is not guaranteed to be valid - // if control arrives from a different predecessor. - cg.state().reg_cache.invalidate_all(); - cg.state().out.emit_block_label(block.label.0); - } - - // Check for compare-branch fusion opportunity: - // If the last instruction is a Cmp whose result is only used by the - // CondBranch terminator, emit a fused compare-and-conditional-jump - // instead of materializing the boolean result to a register/stack slot. 
- let fuse_idx = detect_cmp_branch_fusion(block, &value_use_counts); - - for (idx, inst) in block.instructions.iter().enumerate() { - if Some(idx) == fuse_idx { - // Skip this Cmp -- it will be emitted fused with the terminator - continue; - } - // Skip GEP instructions whose offset has been folded into Load/Store. - // Safe to skip when: - // 1. Base is an alloca (Direct or OverAligned): alloca slots are stable - // and never reused by liveness packing. - // 2. Base has a register assignment: the liveness analysis has been - // extended to keep the base alive through all Load/Store uses of - // this GEP result (see extend_gep_base_liveness in liveness.rs), - // so the register holds the correct value at the use points. - if let Instruction::GetElementPtr { dest, base, .. } = inst { - if gep_fold_map.contains_key(&dest.0) && - (cg.state_ref().is_alloca(base.0) || cg.get_phys_reg_for_value(base.0).is_some()) { - continue; - } - } - - // Emit .loc directive if source location changed. - if emit_debug { - if let Some(span) = block.source_spans.get(idx) { - emit_loc_directive(cg, span, source_mgr.expect("debug mode requires source manager"), file_table, - &mut last_debug_file, &mut last_debug_line); - } - } - - generate_instruction(cg, inst, &gep_fold_map, &global_addr_map, &global_addr_ptr_set, &dead_global_addrs); - } - - if let Some(fi) = fuse_idx { - // Emit fused compare-and-branch: cmp + jCC directly - if let Instruction::Cmp { dest: _, op, lhs, rhs, ty } = &block.instructions[fi] { - if let Terminator::CondBranch { cond: _, true_label, false_label } = &block.terminator { - cg.emit_fused_cmp_branch_blocks(*op, lhs, rhs, *ty, *true_label, *false_label); - } - } - } else { - generate_terminator(cg, &block.terminator, frame_size); - } - } - - if emit_cfi { - cg.state().emit(".cfi_endproc"); - } - cg.state().emit_fmt(format_args!(".size {}, .-{}", func.name, func.name)); - cg.state().emit(""); -} - -/// Emit a `.loc` directive if the source location for this 
instruction differs -/// from the previously emitted location. Suppresses redundant directives and -/// skips dummy spans (start==0, end==0). -fn emit_loc_directive( - cg: &mut dyn ArchCodegen, - span: &Span, - source_mgr: &SourceManager, - file_table: &FxHashMap, - last_file: &mut u32, - last_line: &mut u32, -) { - // Skip dummy spans - if span.start == 0 && span.end == 0 { - return; - } - let loc = source_mgr.resolve_span(*span); - if let Some(&dwarf_file_id) = file_table.get(&loc.file) { - if dwarf_file_id != *last_file || loc.line != *last_line { - cg.state().emit_fmt(format_args!(".loc {} {} {}", dwarf_file_id, loc.line, loc.column)); - *last_file = dwarf_file_id; - *last_line = loc.line; - } - } -} - -/// Dispatch a single IR instruction to the appropriate arch method. -/// -/// Register cache management strategy: -/// The cache tracks which IR value is currently in the accumulator register -/// (rax on x86, x0 on ARM, t0 on RISC-V). -/// -/// Many instructions follow the pattern: load operand(s) → compute → store_result(dest), -/// which means the accumulator holds dest's value when the instruction completes. -/// For these "acc-preserving" instructions, we keep the cache valid so the next -/// instruction can skip reloading the result. -/// -/// Instructions that clobber the accumulator unpredictably (calls, stores, atomics, -/// inline asm, va_arg, memcpy, etc.) invalidate the cache after execution. -fn generate_instruction(cg: &mut dyn ArchCodegen, inst: &Instruction, gep_fold_map: &FxHashMap, global_addr_map: &FxHashMap, global_addr_ptr_set: &FxHashSet, dead_global_addrs: &FxHashSet) { - match inst { - Instruction::Alloca { .. 
} => { - // Space already allocated in prologue; does not touch registers - } - Instruction::Copy { dest, src } => { - generate_copy(cg, dest, src); - } - - // ── Acc-preserving instructions ────────────────────────────────── - // These all end with emit_store_result(dest) or store_rax_to(dest), - // which sets the reg cache correctly. The accumulator holds dest's - // value after execution, so we do NOT invalidate. - - Instruction::Load { dest, ptr, ty, seg_override } => { - generate_load(cg, dest, ptr, *ty, *seg_override, gep_fold_map, global_addr_map); - } - Instruction::BinOp { dest, op, lhs, rhs, ty } => { - cg.emit_binop(dest, *op, lhs, rhs, *ty); - if is_wide_int_type(*ty) { - cg.state().reg_cache.invalidate_all(); - } - } - Instruction::UnaryOp { dest, op, src, ty } => { - cg.emit_unaryop(dest, *op, src, *ty); - if is_wide_int_type(*ty) { - cg.state().reg_cache.invalidate_all(); - } - } - Instruction::Cmp { dest, op, lhs, rhs, ty } => { - cg.emit_cmp(dest, *op, lhs, rhs, *ty); - } - Instruction::Cast { dest, src, from_ty, to_ty } => { - cg.emit_cast(dest, src, *from_ty, *to_ty); - if is_wide_int_type(*to_ty) || is_wide_int_type(*from_ty) { - cg.state().reg_cache.invalidate_all(); - } - } - Instruction::GetElementPtr { dest, base, offset, .. } => { - cg.emit_gep(dest, base, offset); - } - Instruction::GlobalAddr { dest, name } => { - // Skip GlobalAddr when all its uses are folded into direct symbol(%rip) - // loads/stores by generate_load/generate_store. The needs_got check - // ensures we don't skip when GOT indirection is required. - // needs_got_for_addr is used because x86-64 needs GOT for external - // symbol addresses even in non-PIC mode (for PIE compatibility). - // TLS symbols must never be folded: they need %fs:sym@TPOFF access, - // not symbol(%rip). 
- let is_dead = dead_global_addrs.contains(&dest.0) - && !cg.state_ref().needs_got_for_addr(name) - && !cg.state_ref().tls_symbols.contains(name.as_str()); - if !is_dead { - if cg.state_ref().tls_symbols.contains(name.as_str()) { - cg.emit_tls_global_addr(dest, name); - } else if cg.state_ref().code_model_kernel && !global_addr_ptr_set.contains(&dest.0) { - cg.emit_global_addr_absolute(dest, name); - } else { - cg.emit_global_addr(dest, name); - } - } - } - Instruction::Select { dest, cond, true_val, false_val, ty } => { - cg.emit_select(dest, cond, true_val, false_val, *ty); - } - Instruction::LabelAddr { dest, label } => { - cg.emit_label_addr(dest, &label.as_label()); - } - - // ── Cache-invalidating instructions ────────────────────────────── - // These clobber the accumulator unpredictably or don't produce a - // simple acc → dest result. Each arm invalidates the reg cache. - - Instruction::Store { val, ptr, ty, seg_override } => { - generate_store(cg, val, ptr, *ty, *seg_override, gep_fold_map, global_addr_map); - cg.state().reg_cache.invalidate_all(); - } - Instruction::DynAlloca { dest, size, align } => { - cg.emit_dyn_alloca(dest, size, *align); - cg.state().reg_cache.invalidate_all(); - } - Instruction::Call { func, info } => { - cg.emit_call(&info.args, &info.arg_types, Some(func), None, info.dest, info.return_type, info.is_variadic, info.num_fixed_args, &info.struct_arg_sizes, &info.struct_arg_aligns, &info.struct_arg_classes, &info.struct_arg_riscv_float_classes, info.is_sret, info.is_fastcall, &info.ret_eightbyte_classes); - cg.state().reg_cache.invalidate_all(); - } - Instruction::CallIndirect { func_ptr, info } => { - cg.emit_call(&info.args, &info.arg_types, None, Some(func_ptr), info.dest, info.return_type, info.is_variadic, info.num_fixed_args, &info.struct_arg_sizes, &info.struct_arg_aligns, &info.struct_arg_classes, &info.struct_arg_riscv_float_classes, info.is_sret, info.is_fastcall, &info.ret_eightbyte_classes); - 
cg.state().reg_cache.invalidate_all(); - } - Instruction::Memcpy { dest, src, size } => { - cg.emit_memcpy(dest, src, *size); - cg.state().reg_cache.invalidate_all(); - } - Instruction::VaArg { dest, va_list_ptr, result_ty } => { - cg.emit_va_arg(dest, va_list_ptr, *result_ty); - cg.state().reg_cache.invalidate_all(); - } - Instruction::VaStart { va_list_ptr } => { - cg.emit_va_start(va_list_ptr); - cg.state().reg_cache.invalidate_all(); - } - Instruction::VaEnd { va_list_ptr } => { - cg.emit_va_end(va_list_ptr); - cg.state().reg_cache.invalidate_all(); - } - Instruction::VaCopy { dest_ptr, src_ptr } => { - cg.emit_va_copy(dest_ptr, src_ptr); - cg.state().reg_cache.invalidate_all(); - } - Instruction::VaArgStruct { dest_ptr, va_list_ptr, size, ref eightbyte_classes } => { - cg.emit_va_arg_struct_ex(dest_ptr, va_list_ptr, *size, eightbyte_classes); - cg.state().reg_cache.invalidate_all(); - } - Instruction::AtomicRmw { dest, op, ptr, val, ty, ordering } => { - cg.emit_atomic_rmw(dest, *op, ptr, val, *ty, *ordering); - cg.state().reg_cache.invalidate_all(); - } - Instruction::AtomicCmpxchg { dest, ptr, expected, desired, ty, success_ordering, failure_ordering, returns_bool } => { - cg.emit_atomic_cmpxchg(dest, ptr, expected, desired, *ty, *success_ordering, *failure_ordering, *returns_bool); - cg.state().reg_cache.invalidate_all(); - } - Instruction::AtomicLoad { dest, ptr, ty, ordering } => { - cg.emit_atomic_load(dest, ptr, *ty, *ordering); - cg.state().reg_cache.invalidate_all(); - } - Instruction::AtomicStore { ptr, val, ty, ordering } => { - cg.emit_atomic_store(ptr, val, *ty, *ordering); - cg.state().reg_cache.invalidate_all(); - } - Instruction::Fence { ordering } => { - cg.emit_fence(*ordering); - cg.state().reg_cache.invalidate_all(); - } - Instruction::Phi { .. 
} => { /* resolved before codegen */ } - Instruction::GetReturnF64Second { dest } => { - cg.emit_get_return_f64_second(dest); - cg.state().reg_cache.invalidate_all(); - } - Instruction::SetReturnF64Second { src } => { - cg.emit_set_return_f64_second(src); - cg.state().reg_cache.invalidate_all(); - } - Instruction::GetReturnF32Second { dest } => { - cg.emit_get_return_f32_second(dest); - cg.state().reg_cache.invalidate_all(); - } - Instruction::SetReturnF32Second { src } => { - cg.emit_set_return_f32_second(src); - cg.state().reg_cache.invalidate_all(); - } - Instruction::GetReturnF128Second { dest } => { - cg.emit_get_return_f128_second(dest); - cg.state().reg_cache.invalidate_all(); - } - Instruction::SetReturnF128Second { src } => { - cg.emit_set_return_f128_second(src); - cg.state().reg_cache.invalidate_all(); - } - Instruction::InlineAsm { template, outputs, inputs, clobbers, operand_types, goto_labels, input_symbols, seg_overrides } => { - cg.emit_inline_asm_with_segs(template, outputs, inputs, clobbers, operand_types, goto_labels, input_symbols, seg_overrides); - cg.state().reg_cache.invalidate_all(); - } - Instruction::Intrinsic { dest, op, dest_ptr, args } => { - cg.emit_intrinsic(dest, op, dest_ptr, args); - cg.state().reg_cache.invalidate_all(); - } - Instruction::StackSave { dest } => { - cg.emit_stack_save(dest); - cg.state().reg_cache.invalidate_all(); - } - Instruction::StackRestore { ptr } => { - cg.emit_stack_restore(ptr); - cg.state().reg_cache.invalidate_all(); - } - Instruction::ParamRef { dest, param_idx, ty } => { - cg.emit_param_ref(dest, *param_idx, *ty); - cg.state().reg_cache.invalidate_all(); - } - } -} - -/// Generate a Copy instruction, handling coalesced slots, i128, and wide values. -fn generate_copy(cg: &mut dyn ArchCodegen, dest: &Value, src: &Operand) { - // Skip Copy when dest and src share the same stack slot (from copy coalescing). 
- if let Operand::Value(src_val) = src { - let dest_slot = cg.state_ref().get_slot(dest.0); - let src_slot = cg.state_ref().get_slot(src_val.0); - if let (Some(ds), Some(ss)) = (dest_slot, src_slot) { - if ds.0 == ss.0 { - if cg.state_ref().reg_cache.acc_has(src_val.0, false) { - cg.state().reg_cache.set_acc(dest.0, false); - } - return; - } - } - } - - let is_i128_copy = match src { - Operand::Value(v) => cg.state_ref().is_i128_value(v.0), - Operand::Const(IrConst::I128(_)) => true, - _ => false, - }; - if is_i128_copy { - cg.state().i128_values.insert(dest.0); - cg.emit_copy_i128(dest, src); - cg.state().reg_cache.invalidate_all(); - return; - } - - // Propagate wide value status through Copy chains on 32-bit targets. - // IrConst::I64 is the universal container for ALL integer constants, - // so only mark as wide if the value doesn't fit in 32 bits. - let is_wide = match src { - Operand::Value(v) => cg.state_ref().is_wide_value(v.0), - Operand::Const(IrConst::F64(_)) => crate::common::types::target_is_32bit(), - Operand::Const(IrConst::I64(val)) => { - crate::common::types::target_is_32bit() - && (*val < i32::MIN as i64 || *val > u32::MAX as i64) - } - _ => false, - }; - if is_wide { - cg.state().wide_values.insert(dest.0); - } - cg.emit_copy_value(dest, src); -} - -/// Generate a Load instruction with segment override, kernel code model, -/// and GEP folding support. -fn generate_load( - cg: &mut dyn ArchCodegen, - dest: &Value, ptr: &Value, ty: IrType, seg_override: AddressSpace, - gep_fold_map: &FxHashMap, - global_addr_map: &FxHashMap, -) { - if seg_override != AddressSpace::Default { - if let Some(sym) = global_addr_map.get(&ptr.0) { - cg.emit_seg_load_symbol(dest, sym, ty, seg_override); - } else { - cg.emit_seg_load(dest, ptr, ty, seg_override); - } - return; - } - // Fold GlobalAddr + Load into a direct PC-relative memory access. - // On x86-64 this emits `movl symbol(%rip), %eax` instead of separate - // `leaq symbol(%rip), %rax` + `movl (%rax), %eax`. 
- // Works for kernel and default code models. Skipped for symbols - // that require GOT indirection (the pointer comes from the GOT), and - // for TLS symbols which require %fs:sym@TPOFF access patterns. - // Uses needs_got_for_addr to block folding for external symbols even - // in non-PIC mode (x86-64 needs GOTPCREL for PIE compatibility). - if cg.supports_global_addr_fold() && !is_wide_int_type(ty) && ty != IrType::F128 { - if let Some(sym) = global_addr_map.get(&ptr.0) { - if !cg.state_ref().needs_got_for_addr(sym) && !cg.state_ref().tls_symbols.contains(sym.as_str()) { - cg.emit_global_load_rip_rel(dest, sym, ty); - return; - } - } - } - // Fold GEP with constant offset into Load addressing mode. - if let Some(gep_info) = gep_fold_map.get(&ptr.0) { - if !is_wide_int_type(ty) && - (cg.state_ref().is_alloca(gep_info.base.0) || cg.get_phys_reg_for_value(gep_info.base.0).is_some()) { - cg.emit_load_with_const_offset(dest, &gep_info.base, gep_info.offset, ty); - return; - } - } - cg.emit_load(dest, ptr, ty); - if is_wide_int_type(ty) { - cg.state().reg_cache.invalidate_all(); - } -} - -/// Generate a Store instruction with segment override, kernel code model, -/// and GEP folding support. -fn generate_store( - cg: &mut dyn ArchCodegen, - val: &Operand, ptr: &Value, ty: IrType, seg_override: AddressSpace, - gep_fold_map: &FxHashMap, - global_addr_map: &FxHashMap, -) { - if seg_override != AddressSpace::Default { - if let Some(sym) = global_addr_map.get(&ptr.0) { - cg.emit_seg_store_symbol(val, sym, ty, seg_override); - } else { - cg.emit_seg_store(val, ptr, ty, seg_override); - } - return; - } - // Fold GlobalAddr + Store into a direct PC-relative memory access. - // Skipped for TLS symbols which require %fs:sym@TPOFF access patterns. - // Uses needs_got_for_addr: same as Load fold above. 
- if cg.supports_global_addr_fold() && !is_wide_int_type(ty) && ty != IrType::F128 { - if let Some(sym) = global_addr_map.get(&ptr.0) { - if !cg.state_ref().needs_got_for_addr(sym) && !cg.state_ref().tls_symbols.contains(sym.as_str()) { - cg.emit_global_store_rip_rel(val, sym, ty); - return; - } - } - } - // Fold GEP with constant offset into Store addressing mode. - if let Some(gep_info) = gep_fold_map.get(&ptr.0) { - if !is_wide_int_type(ty) && - (cg.state_ref().is_alloca(gep_info.base.0) || cg.get_phys_reg_for_value(gep_info.base.0).is_some()) { - cg.emit_store_with_const_offset(val, &gep_info.base, gep_info.offset, ty); - return; - } - } - cg.emit_store(val, ptr, ty); -} - -/// Dispatch a terminator to the appropriate arch method. -fn generate_terminator(cg: &mut dyn ArchCodegen, term: &Terminator, frame_size: i64) { - match term { - Terminator::Return(val) => { - cg.emit_return(val.as_ref(), frame_size); - } - Terminator::Branch(label) => { - cg.emit_branch_to_block(*label); - } - Terminator::CondBranch { cond, true_label, false_label } => { - cg.emit_cond_branch_blocks(cond, *true_label, *false_label); - } - Terminator::IndirectBranch { target, .. } => { - cg.emit_indirect_branch(target); - } - Terminator::Switch { val, cases, default, ty } => { - cg.emit_switch(val, cases, default, *ty); - } - Terminator::Unreachable => { - cg.emit_unreachable(); - } - } -} - -/// Check if an IR type is a 128-bit integer type (I128 or U128). -pub fn is_i128_type(ty: IrType) -> bool { - matches!(ty, IrType::I128 | IrType::U128) -} - -/// Check if a type is "wide" — needs register-pair operations on the current target. -/// -/// Only I128/U128 on all targets. On i686, I64/U64 BinOps are handled via -/// the i686-specific `emit_binop`/`emit_cmp`/`emit_unaryop` overrides which -/// route them through register-pair arithmetic. 
We don't include I64/U64 here -/// because the framework-level effects (disabling GEP folding, fused branches, -/// cache invalidation) would cause excessive overhead on the common case of -/// widened I32 arithmetic. -pub fn is_wide_int_type(ty: IrType) -> bool { - matches!(ty, IrType::I128 | IrType::U128) -} - -// Re-export stack layout functions so existing `crate::backend::generation::X` imports -// continue to work without changes to downstream code. -pub use super::stack_layout::{ - collect_inline_asm_callee_saved, - collect_inline_asm_callee_saved_with_generic, - run_regalloc_and_merge_clobbers, - filter_available_regs, - calculate_stack_space_common, - find_param_alloca, -}; - diff --git a/src/backend/i686/README.md b/src/backend/i686/README.md deleted file mode 100644 index 29ae21a991..0000000000 --- a/src/backend/i686/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# i686 Backend -- 32-bit x86 - -The i686 backend targets 32-bit x86 (IA-32) with the System V i386 ABI -(cdecl). It covers the full pipeline from IR to ELF executable: code -generation (instruction selection, register allocation, peephole -optimization), a builtin assembler (reuses the x86-64 AT&T parser with a -32-bit encoder, producing ELFCLASS32 objects), and a builtin linker -(32-bit ELF, `.rel` relocations, dynamic or static linking). 
- -## Directory Structure - -``` -i686/ - codegen/ Code generation and peephole optimizer - assembler/ Builtin i686 assembler (shared x86 parser, 32-bit encoder) - linker/ Builtin i686 linker (32-bit ELF, R_386 relocations) -``` - -## Sub-Module Documentation - -| Module | README | -|--------|--------| -| Code generation | [`codegen/README.md`](codegen/README.md) | -| Assembler | [`assembler/README.md`](assembler/README.md) | -| Linker | [`linker/README.md`](linker/README.md) | - -## Key Characteristics - -- **ABI**: cdecl (all arguments on stack), with `-mregparm=N` and fastcall - support -- **ILP32 type model**: Pointers are 4 bytes, `long` is 4 bytes, - `long long` requires `eax:edx` register pairs -- **Accumulator model**: Values flow through `%eax`; only 3 callee-saved - registers (`ebx`, `esi`, `edi`) plus `ebp` with `-fomit-frame-pointer` -- **64-bit splitting**: 64-bit arithmetic splits across `eax:edx` pairs - with carry/borrow propagation; 64-bit values stored in 8-byte stack slots -- **F128 (long double)**: Native x87 80-bit extended precision in 12-byte - stack slots -- **PIC mode**: GOT-relative addressing via `%ebx` as GOT base register -- **Assembler**: Reuses x86-64 AT&T parser, 32-bit instruction encoding -- **Linker**: Dynamic and static linking with PLT/GOT, 32-bit ELF, - `.rel` (not `.rela`) -- **`-m16` / `.code16gcc`**: Supports 16-bit real-mode code generation for - Linux kernel early boot diff --git a/src/backend/i686/assembler/README.md b/src/backend/i686/assembler/README.md deleted file mode 100644 index 215f04a2dc..0000000000 --- a/src/backend/i686/assembler/README.md +++ /dev/null @@ -1,440 +0,0 @@ -# i686 Built-in Assembler -- Design Document - -## Overview - -The i686 built-in assembler translates AT&T-syntax assembly text into 32-bit ELF -relocatable object files (`.o`). 
It replaces the external GNU assembler (`as`) -when the compiler is configured to use its own toolchain, giving the compiler a -self-contained build path for the `i686-linux-gnu` target. - -The assembler is structured as a three-stage pipeline: - -``` - AT&T assembly text - | - v - +------------------+ - | Parser | Reused from the x86-64 backend - | (x86/parser.rs) | Produces Vec - +------------------+ - | - v - +------------------+ - | Encoder | i686-specific: no REX, 32-bit default operand size - | encoder/ | Produces machine-code bytes + Relocation entries - +------------------+ - | - v - +---------------------+ - | ELF Writer | elf_writer.rs (i686 adapter) + - | + ElfWriterCore | elf_writer_common.rs (shared logic) - | | Produces ELFCLASS32 / EM_386 / Elf32_Rel .o files - +---------------------+ - | - v - .o file on disk -``` - -The entry point is the `assemble()` function in `mod.rs`, which wires the three -stages together: parse, build, write. - - -## Relationship to the x86-64 Assembler - -The i686 and x86-64 backends share two major components: - -1. **AT&T syntax parser** (`crate::backend::x86::assembler::parser`). The parsed - representation (`AsmItem`, `Instruction`, `Operand`, etc.) is - architecture-neutral -- the parser does not make assumptions about register - width or operand-size defaults. - -2. **ELF writer core** (`crate::backend::elf_writer_common::ElfWriterCore`). - Section management, symbol tables, jump relaxation, numeric label resolution, - and internal relocation resolution are all generic over an `X86Arch` trait. - The i686 adapter (`elf_writer.rs`) plugs in i686-specific constants and - the instruction encoder; the shared core handles everything else. 
- -Everything else is i686-specific: - -| Concern | x86-64 | i686 | -|-------------------------|-------------------------------|-------------------------------| -| Default operand size | 64-bit (for GP instrs) | 32-bit | -| REX prefix | Required for r8-r15, 64-bit | Not used | -| Register file | 16 GP + 16 XMM | 8 GP + 8 XMM | -| Addressing modes | RIP-relative (`%rip`) | Absolute displacement only | -| ELF class | ELFCLASS64, Elf64_Sym (24 B) | ELFCLASS32, Elf32_Sym (16 B) | -| Relocation format | RELA (Elf64_Rela, 24 B) | REL (Elf32_Rel, 8 B) | -| Relocation types | R_X86_64_* | R_386_* | -| ELF machine | EM_X86_64 (62) | EM_386 (3) | -| `inc`/`dec` encoding | ModR/M form (0xFF /0, /1) | Compact form (0x40+r, 0x48+r) | -| Mnemonic `q` suffix | 64-bit operations | Mapped to 32-bit gracefully | - - -## Key Data Structures - -### Parser types (from `x86::assembler::parser`) - -| Type | Role | -|---------------------|---------------------------------------------------------| -| `AsmItem` | One parsed assembly line: directive, label, instruction | -| `Instruction` | Mnemonic + optional prefix + operand list | -| `Operand` | Register, Immediate, Memory, Label, or Indirect | -| `MemoryOperand` | `disp(%base, %index, scale)` with optional segment | -| `Displacement` | Integer, symbol, symbol+addend, or `sym@MODIFIER` | -| `SectionDirective` | `.section name,"flags",@type` | -| `DataValue` | Integer, symbol, `sym+offset`, or `sym1-sym2` | -| `SizeExpr` | Constant, `.-sym`, or `end-start` for `.size` directive | -| `SymbolKind` | Function, Object, TlsObject, NoType | - -### Encoder types (`encoder/`) - -| Type | Role | -|------------------------|------------------------------------------------------------| -| `InstructionEncoder` | Stateful encoder; accumulates bytes and relocations | -| `Relocation` | Offset + symbol + R_386 type + addend + optional diff_symbol| - -The encoder's `bytes: Vec` collects the raw machine code for one -instruction. 
The `offset: u64` field tracks the current position within the -section so that relocation offsets are computed correctly. - -### ELF writer types - -The i686 `elf_writer.rs` is a thin adapter (see `I686Arch`) that plugs into the -shared `ElfWriterCore` from `elf_writer_common.rs`. The shared -core defines the types that drive ELF emission: - -| Type (in `elf_writer_common`) | Role | -|-------------------------------|-----------------------------------------------------| -| `ElfWriterCore` | Top-level builder: sections, symbols, label positions| -| `Section` | In-progress section: name, type, flags, data, relocs| -| `ElfRelocation` | Section-local relocation (offset, symbol, type, addend)| -| `SymbolInfo` | Binding, type, visibility, section, value, size | -| `JumpInfo` | Tracks a jump for short-form relaxation | - -The i686 adapter (`elf_writer.rs`) defines: - -| Type | Role | -|---------------------|--------------------------------------------------------------| -| `I686Arch` | Implements `X86Arch`: encoder dispatch, ELF constants, REL format| -| `ElfWriter` | Type alias for `ElfWriterCore` | - -String tables (`StringTable`) live in `backend::elf` and are used during final -serialization. - - -## Processing Algorithm - -### Stage 1: Parsing - -`parse_asm(text)` iterates over lines and produces a `Vec`. Each line -is classified as one of: - -1. **Empty / comment** -- stripped away. -2. **Label** -- `name:` at the start of a line. -3. **Directive** -- lines starting with `.` (`.section`, `.globl`, `.align`, - `.byte`, `.long`, `.asciz`, `.type`, `.size`, `.comm`, `.set`, CFI, etc.). -4. **Prefixed instruction** -- `lock`, `rep`, `repz`, `repnz` followed by a - mnemonic. -5. **Instruction** -- mnemonic + comma-separated operands in AT&T order - (source, destination). - -Lines containing `;` are split into multiple items (GAS multi-statement syntax). -Comments starting with `#` are stripped. 
String literals are respected during -both comment stripping and semicolon splitting. - -### Numeric Label Resolution (pre-pass) - -Before encoding begins, the ELF writer core runs a numeric label resolution -pre-pass (`resolve_numeric_labels`). GNU assembler numeric labels (`1:`, `2:`, -etc.) can be defined multiple times; forward references (`1f`) refer to the next -definition, and backward references (`1b`) refer to the most recent. - -The pre-pass renames each numeric label definition to a unique internal name -(`.Lnum_N_K`) and updates all instruction operands and data directives (`Byte`, -`Long`, `Quad`, `SkipExpr`) that reference them. This converts inherently -ambiguous references into unique `.L`-prefixed labels that the rest of the -pipeline handles normally. - -As a defense-in-depth measure, the ELF writer also tracks numeric label -positions at runtime for fallback resolution during jump relaxation and -relocation processing. - -### Stage 2: Instruction Encoding - -The `InstructionEncoder` converts each `Instruction` into machine-code bytes. -The main dispatch function `encode_mnemonic()` is a large `match` that covers: - -- **Data movement**: `mov`, `movsx`/`movzx`, `lea`, `push`, `pop`, `xchg` -- **ALU**: `add`, `sub`, `and`, `or`, `xor`, `cmp`, `test` (8 ALU group ops) -- **Multiply/divide**: `imul` (1/2/3 operand), `mul`, `div`, `idiv` -- **Unary**: `neg`, `not`, `inc`, `dec` -- **Shifts**: `shl`/`shr`/`sar`/`rol`/`ror`/`rcl`/`rcr`, `shld`/`shrd` -- **Bit operations**: `bt`/`bts`/`btr`/`btc`, `bsf`/`bsr`, `lzcnt`/`tzcnt`/`popcnt` -- **Sign extension**: `cdq`, `cwde`, `cbw`, `cwd` -- **Conditional**: `setcc`, `cmovcc` -- **Control flow**: `jmp`, `jcc`, `jecxz`, `loop`, `call`, `ret` -- **Atomics**: `cmpxchg`, `xadd`, `cmpxchg8b` -- **String ops**: `movsb`/`movsl`, `stosb`/`stosl`, `cmpsb`/`cmpsl`, etc. -- **I/O**: `inb`/`outb`, `insb`/`outsb`, etc. 
-- **SSE/SSE2**: Scalar and packed float/integer ops, shuffles, conversions, - comparisons, non-temporal stores -- **SSE3/SSSE3/SSE4**: Horizontal ops, blends, rounds, dot products, `ptest` -- **AES-NI**: `aesenc`, `aesdec`, `aeskeygenassist`, `pclmulqdq` -- **x87 FPU**: Load/store, arithmetic, transcendentals, control word, `fcomip` -- **System**: `int`, `cpuid`, `rdtsc`, `syscall`, `sysenter`, `hlt`, `mfence`, - `rdmsr`/`wrmsr`, `bswap`, `ud2`, `endbr32` -- **Prefixes**: `lock`, `rep`/`repe`/`repnz` (both as prefixes and standalone) - -#### ModR/M and SIB encoding - -`encode_modrm_mem()` is the central memory-operand encoder. It handles: - -- **No base, no index**: `mod=00, rm=5` with disp32 (absolute addressing). -- **Base only**: Direct ModR/M when base is not ESP/EBP-special. -- **Base + index + scale**: SIB byte. ESP (reg 4) as base always triggers SIB. -- **Symbol displacements**: Always use `mod=10` (disp32) and emit a relocation - pointing at the displacement bytes. - -Displacement sizes are chosen automatically: - -``` - disp == 0 && base != EBP -> mod=00 (no displacement bytes) - -128 <= disp <= 127 -> mod=01 (disp8) - otherwise -> mod=10 (disp32) - symbol reference -> mod=10 (disp32 + relocation) -``` - -Relocations are deferred until after the ModR/M and SIB bytes are emitted so -that the relocation offset points to the displacement field, not the ModR/M -byte. This is essential for the REL format where the addend is embedded inline. - -#### Key i686-specific encoding decisions - -- **No REX prefix**. The register file is limited to 8 GP registers (0-7) and - 8 XMM registers (0-7). The 3-bit `reg_num()` function maps register names - directly without any extension bit. - -- **32-bit default operand size**. Unsuffixed instructions default to 32-bit. - The `l` suffix is standard; `q` suffix is mapped to 32-bit with a graceful - fallback. - -- **Compact `inc`/`dec`**. 
Unlike x86-64 (where 0x40-0x4F are REX prefixes), - i686 uses the single-byte `0x40+r` (inc) and `0x48+r` (dec) encodings for - 32-bit registers. - -- **Absolute addressing for calls/jumps**. All branch instructions (`call`, - `jmp`, `jcc`) emit `R_386_PLT32` relocations for label targets, matching - modern GCC/binutils behavior. The `@PLT` suffix is stripped from symbol names - but does not affect the relocation type (always PLT32). - -- **Operand-size prefix** (`0x66`) is emitted for 16-bit operations. Segment - override prefixes (`0x64` for `%fs`, `0x65` for `%gs`) are emitted for - segment-prefixed memory operands. - -#### TLS and GOT relocation mapping - -The encoder maps AT&T `@MODIFIER` syntax to i386 relocation types: - -| AT&T modifier | Relocation constant | Usage | -|------------------|----------------------|------------------------------------| -| `@NTPOFF` | `R_386_TLS_LE_32` | Negative TP offset (Local Exec) | -| `@TPOFF` | `R_386_32S` | TP offset (Local Exec) | -| `@TLSGD` | `R_386_TLS_GD` | General Dynamic TLS | -| `@TLSLDM` | `R_386_TLS_LDM` | Local Dynamic TLS | -| `@DTPOFF` | `R_386_TLS_LDO_32` | DTP-relative offset | -| `@GOT` | `R_386_GOT32` | GOT entry | -| `@GOTOFF` | `R_386_GOTOFF` | Offset from GOT base | -| `@PLT` | `R_386_PLT32` | PLT-relative call | -| `@GOTPC` | `R_386_GOTPC` | PC-relative to GOT base | -| `@GOTNTPOFF` | `R_386_TLS_IE` | IE model via GOT | -| `@INDNTPOFF` | `R_386_TLS_IE` | IE model via GOT (alias) | - -### Stage 3: ELF Object File Emission - -The `ElfWriterCore` (parameterized with `I686Arch`) processes all `AsmItem`s in -order, building up sections, symbols, and relocations, then serializes them into -an ELF32 relocatable object. 
- -#### Item processing - -``` - for each AsmItem: - Section(dir) -> switch to / create section - Global(name) -> mark symbol as STB_GLOBAL (pending) - Weak(name) -> mark symbol as STB_WEAK (pending) - Hidden(name) -> mark symbol visibility STV_HIDDEN (pending) - Label(name) -> record label position; create/update SymbolInfo - Align(n) -> pad with NOP (text) or 0x00 (data) to alignment - Byte/Short/Long/ -> append data bytes; emit R_386_32 relocs for symbols - Quad/Zero/Asciz - Comm(n,s,a) -> create COMMON symbol (SHN_COMMON) - Set(alias,target) -> record symbol alias - Instruction(instr) -> encode via InstructionEncoder; copy bytes & relocs - SymbolType/Size -> deferred; applied after encoding - Cfi/File/Loc/Empty -> ignored (debug info not emitted by built-in assembler) -``` - -#### Jump relaxation - -After all items are processed, the ELF writer runs a **jump relaxation pass**. -Jumps are initially encoded in their long form: - -- Unconditional `jmp`: `E9 rel32` (5 bytes) -- Conditional `jcc`: `0F 8x rel32` (6 bytes) - -The relaxation algorithm iterates until convergence: - -``` - loop: - for each jump in section: - if jump is not yet relaxed AND target is in same section: - compute displacement assuming short encoding (2 bytes) - if displacement fits in [-128, 127]: - mark jump for relaxation - if no new relaxations: break - for each newly relaxed jump (processed back-to-front): - rewrite opcode to short form: - jmp -> EB disp8 (2 bytes) - jcc -> 7x disp8 (2 bytes) - remove excess bytes from section data - adjust all label positions after this jump - adjust all relocation offsets after this jump - remove the now-unnecessary PC32 relocation for this jump - adjust offsets of other tracked jumps -``` - -After relaxation, short jump displacements are patched with the final -`disp8` values. - -#### Internal relocation resolution - -Same-section, PC-relative relocations to **local** symbols (STB_LOCAL or `.L*` -labels) are resolved inline before serialization. 
The resolved value is written -directly into the section data, and the relocation entry is removed. This -avoids emitting relocations that the linker would just resolve to the same -object anyway. - -Global and weak symbols always keep their relocations so the linker can handle -symbol interposition and PLT redirection. - -#### Symbol table construction - -The ELF symbol table is built in the standard order: - -1. **Null symbol** (index 0) -2. **Section symbols** (STT_SECTION, STB_LOCAL) -- one per section -3. **Local defined symbols** (STB_LOCAL, excluding `.L*` labels) -4. **Global and weak symbols** -- the `sh_info` field of `.symtab` records the - index of the first global symbol -5. **Alias symbols** (from `.set` directives) -- cloned from their targets -6. **Undefined external symbols** -- created on demand when a relocation - references a symbol not defined in this object - -Size expressions (`.-symbol`, `end-start`) are resolved after jump relaxation -so that function sizes account for any shortened jumps. - -#### ELF32 file layout - -``` - +-----------------------------+ offset 0 - | ELF32 Header (52 bytes) | e_ident[EI_CLASS] = ELFCLASS32 - | e_machine = EM_386 | e_ident[EI_DATA] = ELFDATA2LSB - | e_type = ET_REL | - +-----------------------------+ - | Section data | .text, .data, .rodata, .bss, ... - | (each aligned per | (SHT_NOBITS sections occupy no space) - | section requirements) | - +-----------------------------+ - | .symtab | Elf32_Sym entries (16 bytes each) - | (4-byte aligned) | - +-----------------------------+ - | .strtab | NUL-terminated symbol name strings - +-----------------------------+ - | .shstrtab | NUL-terminated section name strings - +-----------------------------+ - | .rel.text, .rel.data, ... 
| Elf32_Rel entries (8 bytes each) - | (4-byte aligned) | r_info = (sym << 8) | type - +-----------------------------+ - | Section Header Table | Elf32_Shdr entries (40 bytes each) - | (4-byte aligned) | Null + data sections + symtab + - | | strtab + shstrtab + rel sections - +-----------------------------+ -``` - -The critical difference from the x86-64 ELF writer: this uses **Elf32_Rel** -(8 bytes: `r_offset` + `r_info`) rather than **Elf64_Rela** (24 bytes: -`r_offset` + `r_info` + `r_addend`). In the REL format, the addend is -embedded in the instruction bytes at the relocation site. The ELF writer -patches these implicit addends into the section data during serialization. - -The `r_info` field is packed as `(symbol_index << 8) | reloc_type` (32-bit -format), compared to x86-64's `(symbol_index << 32) | reloc_type` (64-bit -format). - - -## Key Design Decisions and Trade-offs - -1. **Parser reuse**. Sharing the AT&T parser between i686 and x86-64 eliminates - duplicated parsing logic. The parser is architecture-neutral by design: - register names, mnemonic suffixes, and memory operand syntax are identical in - AT&T notation. The cost is that the parser accepts some x86-64-only - constructs (like `%rax`) that the encoder will reject. - -2. **REL vs. RELA**. i386 ELF conventionally uses REL relocations. This - requires the assembler to embed addends in the instruction stream, and the - ELF writer to patch them during serialization. The x86-64 backend uses RELA - (explicit addends in the relocation entry), which is simpler to implement. - The REL approach here adds complexity but produces standard-conforming i386 - objects that work with any ELF linker. - -3. **Shared ELF writer infrastructure**. The `ElfWriterCore` is generic over an - `X86Arch` trait, so the i686 and x86-64 backends share all section/symbol - management, jump relaxation, and ELF serialization logic. 
The i686 adapter - (`elf_writer.rs`) only needs to provide architecture-specific constants and - wire up the instruction encoder. - -4. **Eager long encoding + relaxation**. Instructions are initially encoded in - their longest form. A post-encoding relaxation pass shortens jumps that can - reach their targets with 8-bit displacements. This avoids the complexity of - multi-pass encoding (where shortening one jump might allow others to shorten) - while still producing reasonably compact code. - -5. **Inline resolution of local relocations**. Same-section PC-relative - relocations to local symbols are resolved by the assembler, not deferred to - the linker. This reduces the number of relocations in the output and avoids - unnecessary linker work. Only global/weak symbols retain relocations. - -6. **No `.eh_frame` / DWARF generation**. The assembler ignores CFI directives - and debug metadata. This simplifies the implementation at the cost of no - stack unwinding or debug info in the output. The linker can still link - objects that contain `.eh_frame` from other sources (e.g., CRT objects). - -7. **Compact `inc`/`dec` encoding**. The i686 backend uses the single-byte - `0x40+r` / `0x48+r` forms for 32-bit `inc`/`dec`, which are unavailable on - x86-64 (where those bytes are REX prefixes). This produces smaller code. 
- - -## File Inventory - -| File | Lines | Role | -|-----------------------------|--------|-------------------------------------------------| -| `mod.rs` | ~30 | Module root; `assemble()` entry point | -| `encoder/` | ~3520 | i686 instruction encoder (split into focused submodules, see below) | -| `elf_writer.rs` | ~170 | `I686Arch` adapter for `ElfWriterCore` | -| *(shared with x86-64)* | | | -| `x86/assembler/parser.rs` | ~2180 | AT&T syntax parser; data types; directives | -| `elf_writer_common.rs` | ~1700 | Section/symbol/jump relax/ELF32 serialization | - -### Encoder Submodules (`encoder/`) - -The instruction encoder is organized as a directory of focused submodules: - -| File | Lines | Role | -|------|-------|------| -| `mod.rs` | ~770 | `InstructionEncoder` struct, `encode()` entry point, `encode_mnemonic()` dispatch match, `Relocation` type, relocation constants, `split_label_offset()` helper | -| `registers.rs` | ~138 | Register number mapping (`reg_num`), segment register mapping (`seg_reg_num`), XMM detection, suffix inference, x87 ST parsing, condition code parsing | -| `core.rs` | ~180 | Low-level encoding primitives: ModR/M + SIB byte construction, segment prefix emission, memory operand encoding (`encode_modrm_mem`), relocation helpers | -| `gp_integer.rs` | ~1390 | General-purpose integer instructions: MOV, ALU ops, shifts, IMUL, PUSH/POP, LEA, XCHG, CMPXCHG, conditional moves/sets, JMP/CALL/RET, string ops | -| `sse.rs` | ~385 | SSE/SSE2/SSE3/SSSE3/SSE4.1 and MMX instructions: scalar/packed float, integer SIMD, shuffles, conversions, AES-NI | -| `x87.rs` | ~279 | x87 FPU instructions: FLD/FSTP, arithmetic (FADD/FSUB/FMUL/FDIV), transcendentals, control word, FCOMIP | -| `system.rs` | ~379 | System and privileged instructions: INT, CPUID, RDTSC, SYSENTER, HLT, MFENCE, RDMSR/WRMSR, BSWAP, I/O, segment/control register moves | diff --git a/src/backend/i686/assembler/elf_writer.rs b/src/backend/i686/assembler/elf_writer.rs deleted file mode 100644 
index 340a0a5875..0000000000 --- a/src/backend/i686/assembler/elf_writer.rs +++ /dev/null @@ -1,170 +0,0 @@ -//! 32-bit ELF relocatable object file writer for i686. -//! -//! Thin wrapper around `ElfWriterCore` that provides i686-specific -//! instruction encoding and relocation types. Uses ELFCLASS32, EM_386, -//! and REL (not RELA) relocation format. All shared logic lives in -//! `backend::elf_writer_common`. - -use crate::backend::x86::assembler::parser::*; -use super::encoder::*; -use crate::backend::elf::{ELFCLASS32, EM_386}; -use crate::backend::elf_writer_common::{ - X86Arch, ElfWriterCore, EncodeResult, EncoderReloc, JumpDetection, -}; -use crate::backend::x86::assembler::encoder::{ - InstructionEncoder as X86_64Encoder, - R_X86_64_64, R_X86_64_PC32, R_X86_64_PLT32, R_X86_64_32, R_X86_64_32S, -}; - -/// i686 architecture implementation for the shared ELF writer. -pub struct I686Arch; - -impl X86Arch for I686Arch { - fn encode_instruction( - instr: &Instruction, - section_data_len: u64, - ) -> Result { - let mut encoder = InstructionEncoder::new(); - encoder.offset = section_data_len; - encoder.encode(instr)?; - - let instr_len = encoder.bytes.len(); - - // Detect jump instructions for relaxation - let jump = { - let mnem = &instr.mnemonic; - let is_jump = mnem == "jmp" || mnem == "loop" - || (mnem.starts_with('j') && mnem.len() >= 2); - if is_jump && instr.operands.len() == 1 { - if let Operand::Label(_) = &instr.operands[0] { - let is_short_only = matches!(mnem.as_str(), "jecxz" | "jcxz" | "loop"); - let is_conditional = mnem != "jmp"; - if is_short_only && instr_len == 2 { - // Short-only jumps have no long form; register as already relaxed - Some(JumpDetection { - is_conditional: true, - already_short: true, - }) - } else { - let expected_len = if is_conditional { 6 } else { 5 }; - if instr_len == expected_len { - Some(JumpDetection { - is_conditional, - already_short: false, - }) - } else { - None - } - } - } else { - None - } - } else { - None - } - 
}; - - let relocations = encoder.relocations.into_iter().map(|r| { - EncoderReloc { - offset: r.offset, - symbol: r.symbol, - reloc_type: r.reloc_type, - addend: r.addend, - diff_symbol: r.diff_symbol, - } - }).collect(); - - Ok(EncodeResult { - bytes: encoder.bytes, - relocations, - jump, - }) - } - - fn elf_machine() -> u16 { EM_386 } - fn elf_class() -> u8 { ELFCLASS32 } - - fn reloc_abs(size: usize) -> u32 { - let _ = size; // i686 always uses R_386_32 for absolute - R_386_32 - } - fn reloc_abs64() -> u32 { R_386_32 } // i686 doesn't have 64-bit relocs - fn reloc_pc32() -> u32 { R_386_PC32 } - fn reloc_plt32() -> u32 { R_386_PLT32 } - - fn uses_rel_format() -> bool { true } - fn supports_deferred_skips() -> bool { true } - fn resolve_set_aliases_in_data() -> bool { true } - - fn default_code_mode() -> u8 { 32 } - - /// Encode an instruction using the x86-64 encoder for .code64 sections. - /// This is needed for kernel realmode trampoline code (trampoline_64.S) - /// which is compiled with -m16 but has .code64 sections containing - /// 64-bit instructions like jmpq, lidt with RIP-relative addressing, etc. - fn encode_instruction_code64( - instr: &Instruction, - _section_data_len: u64, - ) -> Result { - let mut encoder = X86_64Encoder::new(); - // Set offset to 0 so relocation offsets are relative to instruction start. - // The ElfWriterCore will add base_offset when recording the relocations. 
- encoder.offset = 0; - encoder.encode(instr)?; - - let instr_len = encoder.bytes.len(); - - // Detect jump instructions for relaxation (same logic as x86-64) - let jump = { - let mnem = &instr.mnemonic; - let is_jump = mnem.starts_with('j') && mnem.len() >= 2; - if is_jump && instr.operands.len() == 1 { - if let Operand::Label(_) = &instr.operands[0] { - let is_conditional = mnem != "jmp"; - let expected_len = if is_conditional { 6 } else { 5 }; - if instr_len == expected_len { - Some(JumpDetection { - is_conditional, - already_short: false, - }) - } else { - None - } - } else { - None - } - } else { - None - } - }; - - // Convert x86-64 relocations. Since we're in a .code64 section of an - // i686 object, we need to keep i686 relocation types (R_386_*) because - // the object file is still ELF32. The linker (ld -m elf_i386) expects - // R_386_* relocations. - let relocations = encoder.relocations.into_iter().map(|r| { - // Map x86-64 reloc types to i686 equivalents - let reloc_type = match r.reloc_type { - R_X86_64_PC32 | R_X86_64_PLT32 => R_386_PC32, - R_X86_64_64 | R_X86_64_32 | R_X86_64_32S => R_386_32, - other => other, - }; - EncoderReloc { - offset: r.offset, - symbol: r.symbol, - reloc_type, - addend: r.addend, - diff_symbol: None, - } - }).collect(); - - Ok(EncodeResult { - bytes: encoder.bytes, - relocations, - jump, - }) - } -} - -/// Builds a 32-bit ELF relocatable object file from parsed assembly items. -pub type ElfWriter = ElfWriterCore; diff --git a/src/backend/i686/assembler/encoder/core.rs b/src/backend/i686/assembler/encoder/core.rs deleted file mode 100644 index 4070af1d47..0000000000 --- a/src/backend/i686/assembler/encoder/core.rs +++ /dev/null @@ -1,180 +0,0 @@ -//! Core encoding helpers for i686 instruction encoding. -//! -//! ModR/M, SIB, segment prefixes, memory operand encoding, and relocation helpers. - -use super::*; - -impl super::InstructionEncoder { - /// Encode ModR/M byte. 
- pub(super) fn modrm(&self, mod_: u8, reg: u8, rm: u8) -> u8 { - (mod_ << 6) | ((reg & 7) << 3) | (rm & 7) - } - - /// Encode SIB byte. - pub(super) fn sib(&self, scale: u8, index: u8, base: u8) -> u8 { - let scale_bits = match scale { - 1 => 0, - 2 => 1, - 4 => 2, - 8 => 3, - _ => 0, - }; - (scale_bits << 6) | ((index & 7) << 3) | (base & 7) - } - - /// Encode ModR/M + SIB + displacement for a memory operand. - /// - /// For i686 REL relocations, the relocation offset must point to the - /// displacement field (where the addend is embedded), not to the ModR/M - /// byte. So we defer add_relocation() until after emitting ModR/M and SIB. - /// Emit segment override prefix if the memory operand has a segment. - pub(super) fn emit_segment_prefix(&mut self, mem: &MemoryOperand) { - if let Some(ref seg) = mem.segment { - match seg.as_str() { - "fs" => self.bytes.push(0x64), - "gs" => self.bytes.push(0x65), - "es" => self.bytes.push(0x26), - "cs" => self.bytes.push(0x2E), - "ss" => self.bytes.push(0x36), - "ds" => self.bytes.push(0x3E), - _ => {} - } - } - } - - pub(super) fn encode_modrm_mem(&mut self, reg_field: u8, mem: &MemoryOperand) -> Result<(), String> { - let base = mem.base.as_ref(); - let index = mem.index.as_ref(); - - // Parse displacement but defer relocation until after ModR/M+SIB bytes - let (disp_val, has_symbol, pending_reloc) = match &mem.displacement { - Displacement::None => (0i64, false, None), - Displacement::Integer(v) => (*v, false, None), - Displacement::Symbol(sym) => { - (0i64, true, Some((sym.clone(), R_386_32, 0i64))) - } - Displacement::SymbolAddend(sym, addend) => { - (0i64, true, Some((sym.clone(), R_386_32, *addend))) - } - Displacement::SymbolPlusOffset(sym, offset) => { - (0i64, true, Some((sym.clone(), R_386_32, *offset))) - } - Displacement::SymbolMod(sym, modifier) => { - let reloc_type = self.tls_reloc_type(modifier); - (0i64, true, Some((sym.clone(), reloc_type, 0i64))) - } - }; - - // No base register - need SIB with no-base 
encoding or direct displacement - if base.is_none() && index.is_none() { - // Direct memory reference - mod=00, rm=101 (disp32) - self.bytes.push(self.modrm(0, reg_field, 5)); - // Emit relocation now, pointing at the displacement bytes - if let Some((sym, reloc_type, addend)) = pending_reloc { - self.add_relocation(&sym, reloc_type, addend); - } - self.bytes.extend_from_slice(&(disp_val as i32).to_le_bytes()); - return Ok(()); - } - - let base_reg = base.map(|r| &r.name as &str).unwrap_or(""); - let base_num = if !base_reg.is_empty() { reg_num(base_reg).unwrap_or(0) } else { 5 }; - - // Determine if we need SIB - let need_sib = index.is_some() - || (base_num & 7) == 4 // ESP always needs SIB - || base.is_none(); - - // Determine displacement size - let (mod_bits, disp_size) = if has_symbol { - (2, 4) // always use disp32 for symbols - } else if disp_val == 0 && (base_num & 7) != 5 { - // No displacement (EBP always needs at least disp8) - (0, 0) - } else if (-128..=127).contains(&disp_val) { - (1, 1) // disp8 - } else { - (2, 4) // disp32 - }; - - if need_sib { - let idx = index.as_ref(); - let idx_num = idx.map(|r| reg_num(&r.name).unwrap_or(4)).unwrap_or(4); - let scale = mem.scale.unwrap_or(1); - - if base.is_none() { - // No base - disp32 with SIB - self.bytes.push(self.modrm(0, reg_field, 4)); - self.bytes.push(self.sib(scale, idx_num, 5)); - // Emit relocation after ModR/M+SIB, before displacement - if let Some((sym, reloc_type, addend)) = pending_reloc { - self.add_relocation(&sym, reloc_type, addend); - } - self.bytes.extend_from_slice(&(disp_val as i32).to_le_bytes()); - } else { - self.bytes.push(self.modrm(mod_bits, reg_field, 4)); - self.bytes.push(self.sib(scale, idx_num, base_num)); - // Emit relocation after ModR/M+SIB, before displacement - if let Some((sym, reloc_type, addend)) = pending_reloc { - self.add_relocation(&sym, reloc_type, addend); - } - match disp_size { - 0 => {} - 1 => self.bytes.push(disp_val as u8), - 4 => 
self.bytes.extend_from_slice(&(disp_val as i32).to_le_bytes()), - _ => unreachable!(), - } - } - } else { - self.bytes.push(self.modrm(mod_bits, reg_field, base_num)); - // Emit relocation after ModR/M, before displacement - if let Some((sym, reloc_type, addend)) = pending_reloc { - self.add_relocation(&sym, reloc_type, addend); - } - match disp_size { - 0 => {} - 1 => self.bytes.push(disp_val as u8), - 4 => self.bytes.extend_from_slice(&(disp_val as i32).to_le_bytes()), - _ => unreachable!(), - } - } - - Ok(()) - } - - /// Add a relocation relative to current position. - pub(super) fn add_relocation(&mut self, symbol: &str, reloc_type: u32, addend: i64) { - // Strip @PLT suffix from symbol names - the suffix only affects relocation type, - // not the symbol name in the ELF symbol table. - let (sym, rtype) = if let Some(base) = symbol.strip_suffix("@PLT") { - let plt_type = if reloc_type == R_386_PC32 { R_386_PLT32 } else { reloc_type }; - (base, plt_type) - } else { - (symbol, reloc_type) - }; - self.relocations.push(Relocation { - offset: self.bytes.len() as u64, - symbol: sym.to_string(), - reloc_type: rtype, - addend, - diff_symbol: None, - }); - } - - /// Add a relocation for a label that may contain `symbol+offset` or `symbol-offset`. - /// Splits the label string and extracts the addend if present. 
- pub(super) fn add_relocation_for_label(&mut self, label: &str, reloc_type: u32) { - let (sym, addend) = split_label_offset(label); - self.add_relocation(sym, reloc_type, addend); - } - - pub(super) fn add_relocation_with_diff(&mut self, symbol: &str, reloc_type: u32, addend: i64, diff_sym: &str) { - self.relocations.push(Relocation { - offset: self.bytes.len() as u64, - symbol: symbol.to_string(), - reloc_type, - addend, - diff_symbol: Some(diff_sym.to_string()), - }); - } -} diff --git a/src/backend/i686/assembler/encoder/gp_integer.rs b/src/backend/i686/assembler/encoder/gp_integer.rs deleted file mode 100644 index 9838b56dbb..0000000000 --- a/src/backend/i686/assembler/encoder/gp_integer.rs +++ /dev/null @@ -1,1390 +0,0 @@ -//! General-purpose integer instruction encoders for i686. -//! -//! MOV, LEA, PUSH/POP, ALU, TEST, IMUL, shifts, bit operations, -//! conditional set/move, jumps, calls, exchange, and misc GP instructions. - -use super::*; - -impl super::InstructionEncoder { - // ---- Instruction-specific encoders ---- - - pub(super) fn encode_mov(&mut self, ops: &[Operand], size: u8) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("mov requires 2 operands, got {}", ops.len())); - } - - // Check for control register moves - if let (Operand::Register(r1), Operand::Register(r2)) = (&ops[0], &ops[1]) { - if is_control_reg(&r1.name) || is_control_reg(&r2.name) { - return self.encode_mov_cr(ops); - } - if is_segment_reg(&r1.name) || is_segment_reg(&r2.name) { - return self.encode_mov_seg(ops); - } - } - // Check for segment register moves involving memory - if let (Operand::Register(r), Operand::Memory(_)) = (&ops[0], &ops[1]) { - if is_segment_reg(&r.name) { - return self.encode_mov_seg(ops); - } - } - if let (Operand::Memory(_), Operand::Register(r)) = (&ops[0], &ops[1]) { - if is_segment_reg(&r.name) { - return self.encode_mov_seg(ops); - } - } - - match (&ops[0], &ops[1]) { - (Operand::Immediate(imm), Operand::Register(dst)) => { - 
self.encode_mov_imm_reg(imm, dst, size) - } - (Operand::Register(src), Operand::Register(dst)) => { - self.encode_mov_rr(src, dst, size) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - self.encode_mov_mem_reg(mem, dst, size) - } - (Operand::Register(src), Operand::Memory(mem)) => { - self.encode_mov_reg_mem(src, mem, size) - } - (Operand::Immediate(imm), Operand::Memory(mem)) => { - self.encode_mov_imm_mem(imm, mem, size) - } - // Label as memory source: movl symbol, %reg (absolute address) - (Operand::Label(label), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or_else(|| format!("bad register: {}", dst.name))?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x8A } else { 0x8B }); - // mod=00, rm=101 for disp32 (no base) - self.bytes.push(self.modrm(0, dst_num, 5)); - // Check if label is a numeric literal (absolute address) - if let Ok(addr) = label.parse::() { - self.bytes.extend_from_slice(&(addr as i32).to_le_bytes()); - } else { - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - } - Ok(()) - } - // Label as memory destination: movl %reg, symbol - (Operand::Register(src), Operand::Label(label)) => { - let src_num = reg_num(&src.name).ok_or_else(|| format!("bad register: {}", src.name))?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x88 } else { 0x89 }); - self.bytes.push(self.modrm(0, src_num, 5)); - if let Ok(addr) = label.parse::() { - self.bytes.extend_from_slice(&(addr as i32).to_le_bytes()); - } else { - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - } - Ok(()) - } - // movl $imm, symbol (immediate to memory at absolute address) - (Operand::Immediate(imm), Operand::Label(label)) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xC6 } else { 0xC7 }); - self.bytes.push(self.modrm(0, 0, 5)); - if let Ok(addr) = label.parse::() { - 
self.bytes.extend_from_slice(&(addr as i32).to_le_bytes()); - } else { - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - } - match imm { - ImmediateValue::Integer(val) => { - match size { - 1 => self.bytes.push(*val as u8), - 2 => self.bytes.extend_from_slice(&(*val as i16).to_le_bytes()), - 4 => self.bytes.extend_from_slice(&(*val as i32).to_le_bytes()), - _ => unreachable!(), - } - } - _ => return Err("unsupported immediate for mov to label address".to_string()), - } - Ok(()) - } - _ => Err("unsupported mov operand combination".to_string()), - } - } - - /// Handle unsuffixed `mov` from inline asm - infer size from operands - pub(super) fn encode_mov_infer_size(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("mov requires 2 operands, got {}", ops.len())); - } - // Infer size from register operands - let size = match (&ops[0], &ops[1]) { - (Operand::Register(r), _) => reg_size(&r.name), - (_, Operand::Register(r)) => reg_size(&r.name), - _ => 4, // default to 32-bit - }; - self.encode_mov(ops, size) - } - - fn encode_mov_imm_reg(&mut self, imm: &ImmediateValue, dst: &Register, size: u8) -> Result<(), String> { - let dst_num = reg_num(&dst.name).ok_or_else(|| format!("bad register: {}", dst.name))?; - - match imm { - ImmediateValue::Integer(val) => { - let val = *val; - if size == 4 { - // movl $imm32, %reg - use compact B8+rd encoding - self.bytes.push(0xB8 + dst_num); - self.bytes.extend_from_slice(&(val as i32).to_le_bytes()); - } else if size == 2 { - self.bytes.push(0x66); - self.bytes.push(0xB8 + dst_num); - self.bytes.extend_from_slice(&(val as i16).to_le_bytes()); - } else { - // 8-bit - self.bytes.push(0xB0 + dst_num); - self.bytes.push(val as u8); - } - } - ImmediateValue::Symbol(sym) | ImmediateValue::SymbolPlusOffset(sym, _) => { - let addend = if let ImmediateValue::SymbolPlusOffset(_, a) = imm { *a } else { 0 }; - if size == 4 { - self.bytes.push(0xB8 + 
dst_num); - self.add_relocation(sym, R_386_32, addend); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - } else { - return Err("symbol immediate only supported for 32-bit mov".to_string()); - } - } - ImmediateValue::SymbolMod(_, _) | ImmediateValue::SymbolDiff(_, _) => { - return Err("unsupported immediate type for mov".to_string()); - } - } - Ok(()) - } - - fn encode_mov_rr(&mut self, src: &Register, dst: &Register, size: u8) -> Result<(), String> { - // Handle segment register moves - if let Some(seg_num) = seg_reg_num(&dst.name) { - // mov %r16, %sreg (8E /r) - let src_num = reg_num(&src.name).ok_or_else(|| format!("bad register: {}", src.name))?; - self.bytes.push(0x8E); - self.bytes.push(self.modrm(3, seg_num, src_num)); - return Ok(()); - } - if let Some(seg_num) = seg_reg_num(&src.name) { - // mov %sreg, %r16 (8C /r) - let dst_num = reg_num(&dst.name).ok_or_else(|| format!("bad register: {}", dst.name))?; - self.bytes.push(0x8C); - self.bytes.push(self.modrm(3, seg_num, dst_num)); - return Ok(()); - } - - let src_num = reg_num(&src.name).ok_or_else(|| format!("bad register: {}", src.name))?; - let dst_num = reg_num(&dst.name).ok_or_else(|| format!("bad register: {}", dst.name))?; - - if size == 2 { - self.bytes.push(0x66); - } - if size == 1 { - self.bytes.push(0x88); - } else { - self.bytes.push(0x89); - } - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - - fn encode_mov_mem_reg(&mut self, mem: &MemoryOperand, dst: &Register, size: u8) -> Result<(), String> { - let dst_num = reg_num(&dst.name).ok_or_else(|| format!("bad register: {}", dst.name))?; - - if let Some(ref seg) = mem.segment { - match seg.as_str() { - "fs" => self.bytes.push(0x64), - "gs" => self.bytes.push(0x65), - _ => return Err(format!("unsupported segment: {}", seg)), - } - } - - if size == 2 { - self.bytes.push(0x66); - } - if size == 1 { - self.bytes.push(0x8A); - } else { - self.bytes.push(0x8B); - } - self.encode_modrm_mem(dst_num, mem) - } - - fn encode_mov_reg_mem(&mut 
self, src: &Register, mem: &MemoryOperand, size: u8) -> Result<(), String> { - let src_num = reg_num(&src.name).ok_or_else(|| format!("bad register: {}", src.name))?; - - if let Some(ref seg) = mem.segment { - match seg.as_str() { - "fs" => self.bytes.push(0x64), - "gs" => self.bytes.push(0x65), - _ => return Err(format!("unsupported segment: {}", seg)), - } - } - - if size == 2 { - self.bytes.push(0x66); - } - if size == 1 { - self.bytes.push(0x88); - } else { - self.bytes.push(0x89); - } - self.encode_modrm_mem(src_num, mem) - } - - fn encode_mov_imm_mem(&mut self, imm: &ImmediateValue, mem: &MemoryOperand, size: u8) -> Result<(), String> { - if size == 2 { - self.bytes.push(0x66); - } - if size == 1 { - self.bytes.push(0xC6); - } else { - self.bytes.push(0xC7); - } - self.encode_modrm_mem(0, mem)?; - - match imm { - ImmediateValue::Integer(val) => { - match size { - 1 => self.bytes.push(*val as u8), - 2 => self.bytes.extend_from_slice(&(*val as i16).to_le_bytes()), - 4 => self.bytes.extend_from_slice(&(*val as i32).to_le_bytes()), - _ => unreachable!(), - } - } - ImmediateValue::Symbol(sym) | ImmediateValue::SymbolPlusOffset(sym, _) => { - let addend = if let ImmediateValue::SymbolPlusOffset(_, a) = imm { *a } else { 0 }; - if size == 4 { - self.add_relocation(sym, R_386_32, addend); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - } else { - return Err("symbol immediate only supported for 32-bit mov to memory".to_string()); - } - } - _ => return Err("unsupported immediate for mov to memory".to_string()), - } - Ok(()) - } - - pub(super) fn encode_movsx(&mut self, ops: &[Operand], src_size: u8, dst_size: u8) -> Result<(), String> { - if ops.len() != 2 { - return Err("movsx requires 2 operands".to_string()); - } - - if dst_size == 2 { self.bytes.push(0x66); } - - let opcode = match src_size { - 1 => vec![0x0F, 0xBE], - 2 => vec![0x0F, 0xBF], - _ => return Err(format!("unsupported movsx src size: {}", src_size)), - }; - - match (&ops[0], &ops[1]) { - 
(Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad src register")?; - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - self.bytes.extend_from_slice(&opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - self.bytes.extend_from_slice(&opcode); - self.encode_modrm_mem(dst_num, mem)?; - } - _ => return Err("unsupported movsx operands".to_string()), - } - Ok(()) - } - - pub(super) fn encode_movzx(&mut self, ops: &[Operand], src_size: u8, dst_size: u8) -> Result<(), String> { - if ops.len() != 2 { - return Err("movzx requires 2 operands".to_string()); - } - - if dst_size == 2 { self.bytes.push(0x66); } - - let opcode = match src_size { - 1 => vec![0x0F, 0xB6], - 2 => vec![0x0F, 0xB7], - _ => return Err(format!("unsupported movzx src size: {}", src_size)), - }; - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad src register")?; - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - self.bytes.extend_from_slice(&opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - self.bytes.extend_from_slice(&opcode); - self.encode_modrm_mem(dst_num, mem)?; - } - _ => return Err("unsupported movzx operands".to_string()), - } - Ok(()) - } - - pub(super) fn encode_lea(&mut self, ops: &[Operand], _size: u8) -> Result<(), String> { - if ops.len() != 2 { - return Err("lea requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - self.bytes.push(0x8D); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err("lea requires memory source and register 
destination".to_string()), - } - } - - pub(super) fn encode_push(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("push requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.push(0x50 + num); - Ok(()) - } - Operand::Immediate(ImmediateValue::Integer(val)) => { - if *val >= -128 && *val <= 127 { - self.bytes.push(0x6A); - self.bytes.push(*val as u8); - } else { - self.bytes.push(0x68); - self.bytes.extend_from_slice(&(*val as i32).to_le_bytes()); - } - Ok(()) - } - Operand::Immediate(ImmediateValue::Symbol(sym)) | - Operand::Immediate(ImmediateValue::SymbolPlusOffset(sym, _)) => { - let addend = if let Operand::Immediate(ImmediateValue::SymbolPlusOffset(_, a)) = &ops[0] { *a } else { 0 }; - self.bytes.push(0x68); - self.add_relocation(sym, R_386_32, addend); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - Operand::Memory(mem) => { - self.bytes.push(0xFF); - self.encode_modrm_mem(6, mem) - } - _ => Err("unsupported push operand".to_string()), - } - } - - pub(super) fn encode_push16(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("pushw requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Immediate(ImmediateValue::Integer(val)) => { - self.bytes.push(0x66); - if *val >= -128 && *val <= 127 { - self.bytes.push(0x6A); - self.bytes.push(*val as u8); - } else { - self.bytes.push(0x68); - self.bytes.extend_from_slice(&(*val as i16).to_le_bytes()); - } - Ok(()) - } - _ => Err("unsupported pushw operand".to_string()), - } - } - - pub(super) fn encode_pop(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("pop requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - if is_segment_reg(®.name) { - // Pop to segment register - match reg.name.as_str() { - "es" => { self.bytes.push(0x07); Ok(()) } - "ss" => { 
self.bytes.push(0x17); Ok(()) } - "ds" => { self.bytes.push(0x1F); Ok(()) } - "fs" => { self.bytes.extend_from_slice(&[0x0F, 0xA1]); Ok(()) } - "gs" => { self.bytes.extend_from_slice(&[0x0F, 0xA9]); Ok(()) } - _ => Err(format!("cannot pop to {}", reg.name)), - } - } else { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.push(0x58 + num); - Ok(()) - } - } - Operand::Memory(mem) => { - // pop m32: 0x8F /0 - self.bytes.push(0x8F); - self.encode_modrm_mem(0, mem) - } - _ => Err("unsupported pop operand".to_string()), - } - } - - pub(super) fn encode_alu(&mut self, ops: &[Operand], mnemonic: &str, alu_op: u8) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - - let size = mnemonic_size_suffix(mnemonic).unwrap_or(4); - - match (&ops[0], &ops[1]) { - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Register(dst)) => { - let val = *val; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - - if size == 2 { self.bytes.push(0x66); } - - if size == 1 { - self.bytes.push(0x80); - self.bytes.push(self.modrm(3, alu_op, dst_num)); - self.bytes.push(val as u8); - } else if (-128..=127).contains(&val) { - self.bytes.push(0x83); - self.bytes.push(self.modrm(3, alu_op, dst_num)); - self.bytes.push(val as u8); - } else { - if dst_num == 0 { - // Short form: op eax, imm32 - self.bytes.push(if size == 1 { 0x04 } else { 0x05 } + alu_op * 8); - } else { - self.bytes.push(0x81); - self.bytes.push(self.modrm(3, alu_op, dst_num)); - } - if size == 2 { - self.bytes.extend_from_slice(&(val as i16).to_le_bytes()); - } else { - self.bytes.extend_from_slice(&(val as i32).to_le_bytes()); - } - } - Ok(()) - } - (Operand::Immediate(ImmediateValue::Symbol(sym)), Operand::Register(dst)) | - (Operand::Immediate(ImmediateValue::SymbolPlusOffset(sym, _)), Operand::Register(dst)) => { - let addend = match &ops[0] { Operand::Immediate(ImmediateValue::SymbolPlusOffset(_, a)) => *a, _ => 0 }; - let dst_num = 
reg_num(&dst.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - let opcode_len = if dst_num == 0 { - self.bytes.push(0x05 + alu_op * 8); - 1u32 - } else { - self.bytes.push(0x81); - self.bytes.push(self.modrm(3, alu_op, dst_num)); - 2u32 - }; - // _GLOBAL_OFFSET_TABLE_ requires R_386_GOTPC (PC-relative to GOT). - // The implicit addend = opcode length so the PC correction works: - // ebx (= return addr of thunk call) + (GOT + addend - P) = GOT - if sym == "_GLOBAL_OFFSET_TABLE_" { - self.add_relocation(sym, R_386_GOTPC, 0); - self.bytes.extend_from_slice(&opcode_len.to_le_bytes()); - } else { - self.add_relocation(sym, R_386_32, addend); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - } - Ok(()) - } - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad src register")?; - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x00 } else { 0x01 } + alu_op * 8); - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x02 } else { 0x03 } + alu_op * 8); - self.encode_modrm_mem(dst_num, mem) - } - (Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad src register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x00 } else { 0x01 } + alu_op * 8); - self.encode_modrm_mem(src_num, mem) - } - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Memory(mem)) => { - let val = *val; - if size == 2 { self.bytes.push(0x66); } - - if size == 1 { - self.bytes.push(0x80); - self.encode_modrm_mem(alu_op, mem)?; - self.bytes.push(val as u8); - } else if (-128..=127).contains(&val) { - self.bytes.push(0x83); - self.encode_modrm_mem(alu_op, mem)?; - 
self.bytes.push(val as u8); - } else { - self.bytes.push(0x81); - self.encode_modrm_mem(alu_op, mem)?; - if size == 2 { - self.bytes.extend_from_slice(&(val as i16).to_le_bytes()); - } else { - self.bytes.extend_from_slice(&(val as i32).to_le_bytes()); - } - } - Ok(()) - } - (Operand::Immediate(ImmediateValue::SymbolMod(sym, modifier)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - let reloc_type = self.tls_reloc_type(modifier); - if dst_num == 0 { - self.bytes.push(0x05 + alu_op * 8); - } else { - self.bytes.push(0x81); - self.bytes.push(self.modrm(3, alu_op, dst_num)); - } - self.add_relocation(sym, reloc_type, 0); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - (Operand::Immediate(ImmediateValue::SymbolMod(sym, modifier)), Operand::Memory(mem)) => { - if size == 2 { self.bytes.push(0x66); } - let reloc_type = self.tls_reloc_type(modifier); - self.bytes.push(0x81); - self.encode_modrm_mem(alu_op, mem)?; - self.add_relocation(sym, reloc_type, 0); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Symbol(sym)), Operand::Memory(mem)) | - (Operand::Immediate(ImmediateValue::SymbolPlusOffset(sym, _)), Operand::Memory(mem)) => { - let addend = match &ops[0] { Operand::Immediate(ImmediateValue::SymbolPlusOffset(_, a)) => *a, _ => 0 }; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(0x81); - self.encode_modrm_mem(alu_op, mem)?; - self.add_relocation(sym, R_386_32, addend); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - // Symbol difference immediate: e.g. 
addl $_DYNAMIC-1b, (%esp) - // Uses R_386_PC32 with diff_symbol so the ELF writer resolves A - B - (Operand::Immediate(ImmediateValue::SymbolDiff(sym_a, sym_b)), Operand::Memory(mem)) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(0x81); - self.encode_modrm_mem(alu_op, mem)?; - self.add_relocation_with_diff(sym_a, R_386_PC32, 0, sym_b); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - (Operand::Immediate(ImmediateValue::SymbolDiff(sym_a, sym_b)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - if dst_num == 0 { - self.bytes.push(0x05 + alu_op * 8); - } else { - self.bytes.push(0x81); - self.bytes.push(self.modrm(3, alu_op, dst_num)); - } - self.add_relocation_with_diff(sym_a, R_386_PC32, 0, sym_b); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - // Label as memory reference: addl %reg, symbol - (Operand::Register(src), Operand::Label(label)) => { - let src_num = reg_num(&src.name).ok_or("bad src register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x00 } else { 0x01 } + alu_op * 8); - // Encode as disp32 (mod=00, rm=101) - self.bytes.push(self.modrm(0, src_num, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - (Operand::Label(label), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x02 } else { 0x03 } + alu_op * 8); - self.bytes.push(self.modrm(0, dst_num, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - // Immediate to label-as-memory: addl $1, global_counter - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Label(label)) => { - let val = *val; - if size == 2 { self.bytes.push(0x66); } - - if size == 1 { - self.bytes.push(0x80); - } else if 
(-128..=127).contains(&val) { - self.bytes.push(0x83); - } else { - self.bytes.push(0x81); - } - // mod=00, rm=101 for disp32 (no base) - self.bytes.push(self.modrm(0, alu_op, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - if size == 1 || (-128..=127).contains(&val) { - self.bytes.push(val as u8); - } else if size == 2 { - self.bytes.extend_from_slice(&(val as i16).to_le_bytes()); - } else { - self.bytes.extend_from_slice(&(val as i32).to_le_bytes()); - } - Ok(()) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - pub(super) fn encode_test(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - - let size = mnemonic_size_suffix(mnemonic).unwrap_or(4); - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad src register")?; - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x84 } else { 0x85 }); - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Register(dst)) => { - let val = *val; - let dst_num = reg_num(&dst.name).ok_or("bad dst register")?; - if size == 2 { self.bytes.push(0x66); } - - if size == 1 { - if dst_num == 0 { - self.bytes.push(0xA8); - } else { - self.bytes.push(0xF6); - self.bytes.push(self.modrm(3, 0, dst_num)); - } - self.bytes.push(val as u8); - } else { - if dst_num == 0 { - self.bytes.push(0xA9); - } else { - self.bytes.push(0xF7); - self.bytes.push(self.modrm(3, 0, dst_num)); - } - if size == 2 { - self.bytes.extend_from_slice(&(val as i16).to_le_bytes()); - } else { - self.bytes.extend_from_slice(&(val as i32).to_le_bytes()); - } - } - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Memory(mem)) => { - let 
val = *val; - if size == 2 { self.bytes.push(0x66); } - if size == 1 { - self.bytes.push(0xF6); - } else { - self.bytes.push(0xF7); - } - self.encode_modrm_mem(0, mem)?; - if size == 1 { - self.bytes.push(val as u8); - } else if size == 2 { - self.bytes.extend_from_slice(&(val as i16).to_le_bytes()); - } else { - self.bytes.extend_from_slice(&(val as i32).to_le_bytes()); - } - Ok(()) - } - _ => Err("unsupported test operands".to_string()), - } - } - - pub(super) fn encode_imul(&mut self, ops: &[Operand], size: u8) -> Result<(), String> { - match ops.len() { - 1 => self.encode_unary_rm(ops, 5, size), - 2 => { - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0xAF]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0xAF]); - self.encode_modrm_mem(dst_num, mem) - } - // imul $imm, %reg => imul $imm, %reg, %reg (dst = src * imm) - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if *val >= -128 && *val <= 127 { - self.bytes.push(0x6B); - self.bytes.push(self.modrm(3, dst_num, dst_num)); - self.bytes.push(*val as u8); - } else { - self.bytes.push(0x69); - self.bytes.push(self.modrm(3, dst_num, dst_num)); - self.bytes.extend_from_slice(&(*val as i32).to_le_bytes()); - } - Ok(()) - } - _ => Err("unsupported imul operands".to_string()), - } - } - 3 => { - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if *val >= -128 
&& *val <= 127 { - self.bytes.push(0x6B); - self.bytes.push(self.modrm(3, dst_num, src_num)); - self.bytes.push(*val as u8); - } else { - self.bytes.push(0x69); - self.bytes.push(self.modrm(3, dst_num, src_num)); - self.bytes.extend_from_slice(&(*val as i32).to_le_bytes()); - } - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if *val >= -128 && *val <= 127 { - self.bytes.push(0x6B); - self.encode_modrm_mem(dst_num, mem)?; - self.bytes.push(*val as u8); - } else { - self.bytes.push(0x69); - self.encode_modrm_mem(dst_num, mem)?; - self.bytes.extend_from_slice(&(*val as i32).to_le_bytes()); - } - Ok(()) - } - _ => Err("unsupported imul operands".to_string()), - } - } - _ => Err("imul requires 1-3 operands".to_string()), - } - } - - pub(super) fn encode_unary_rm(&mut self, ops: &[Operand], op_ext: u8, size: u8) -> Result<(), String> { - if ops.len() != 1 { - return Err("unary op requires 1 operand".to_string()); - } - if size == 2 { self.bytes.push(0x66); } - match &ops[0] { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.push(if size == 1 { 0xF6 } else { 0xF7 }); - self.bytes.push(self.modrm(3, op_ext, num)); - Ok(()) - } - Operand::Memory(mem) => { - self.bytes.push(if size == 1 { 0xF6 } else { 0xF7 }); - self.encode_modrm_mem(op_ext, mem) - } - _ => Err("unsupported unary operand".to_string()), - } - } - - /// Encode inc/dec instructions. - /// In 32-bit mode, inc/dec have compact single-byte encodings for 32-bit registers: - /// inc: 0x40+reg, dec: 0x48+reg - /// For memory operands or byte/word sizes, use opcode 0xFE (byte) / 0xFF (word/dword) - /// with modrm extension /0 (inc) or /1 (dec). 
- pub(super) fn encode_inc_dec(&mut self, ops: &[Operand], op_ext: u8, size: u8) -> Result<(), String> { - if ops.len() != 1 { - return Err("inc/dec requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - if size == 4 { - // Use compact single-byte encoding: 0x40+reg (inc) or 0x48+reg (dec) - let base = if op_ext == 0 { 0x40 } else { 0x48 }; - self.bytes.push(base + num); - } else if size == 2 { - // 16-bit: operand size prefix + 0x40+reg (inc) or 0x48+reg (dec) - self.bytes.push(0x66); - let base = if op_ext == 0 { 0x40 } else { 0x48 }; - self.bytes.push(base + num); - } else { - // 8-bit: use 0xFE /0 (inc) or 0xFE /1 (dec) with modrm - self.bytes.push(0xFE); - self.bytes.push(self.modrm(3, op_ext, num)); - } - Ok(()) - } - Operand::Memory(mem) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xFE } else { 0xFF }); - self.encode_modrm_mem(op_ext, mem) - } - // Label as memory reference: incl symbol or incl symbol+4 - Operand::Label(label) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xFE } else { 0xFF }); - // Encode as disp32 (mod=00, rm=101) - self.bytes.push(self.modrm(0, op_ext, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - _ => Err("unsupported inc/dec operand".to_string()), - } - } - - pub(super) fn encode_shift(&mut self, ops: &[Operand], mnemonic: &str, shift_op: u8) -> Result<(), String> { - let size = mnemonic_size_suffix(mnemonic).unwrap_or(4); - - // Handle 1-operand form: shrl %eax means shift right by 1 - if ops.len() == 1 { - match &ops[0] { - Operand::Register(dst) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xD0 } else { 0xD1 }); - self.bytes.push(self.modrm(3, shift_op, dst_num)); - return Ok(()); - } - Operand::Memory(mem) => { - if 
size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xD0 } else { 0xD1 }); - return self.encode_modrm_mem(shift_op, mem); - } - _ => return Err(format!("unsupported {} operand", mnemonic)), - } - } - - if ops.len() != 2 { - return Err(format!("{} requires 1 or 2 operands", mnemonic)); - } - - match (&ops[0], &ops[1]) { - (Operand::Immediate(ImmediateValue::Integer(count)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - let count = *count as u8; - - if size == 2 { self.bytes.push(0x66); } - - if count == 1 { - self.bytes.push(if size == 1 { 0xD0 } else { 0xD1 }); - self.bytes.push(self.modrm(3, shift_op, dst_num)); - } else { - self.bytes.push(if size == 1 { 0xC0 } else { 0xC1 }); - self.bytes.push(self.modrm(3, shift_op, dst_num)); - self.bytes.push(count); - } - Ok(()) - } - (Operand::Register(cl), Operand::Register(dst)) if cl.name == "cl" => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xD2 } else { 0xD3 }); - self.bytes.push(self.modrm(3, shift_op, dst_num)); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(count)), Operand::Memory(mem)) => { - let count = *count as u8; - if size == 2 { self.bytes.push(0x66); } - if count == 1 { - self.bytes.push(if size == 1 { 0xD0 } else { 0xD1 }); - self.encode_modrm_mem(shift_op, mem)?; - } else { - self.bytes.push(if size == 1 { 0xC0 } else { 0xC1 }); - self.encode_modrm_mem(shift_op, mem)?; - self.bytes.push(count); - } - Ok(()) - } - (Operand::Register(cl), Operand::Memory(mem)) if cl.name == "cl" => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xD2 } else { 0xD3 }); - self.encode_modrm_mem(shift_op, mem) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - pub(super) fn encode_double_shift(&mut self, ops: &[Operand], opcode: u8, _size: u8) -> Result<(), String> { - if ops.len() != 3 { - return Err("double 
shift requires 3 operands".to_string()); - } - - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(count)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, opcode]); - self.bytes.push(self.modrm(3, src_num, dst_num)); - self.bytes.push(*count as u8); - Ok(()) - } - (Operand::Register(cl), Operand::Register(src), Operand::Register(dst)) if cl.name == "cl" => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, opcode + 1]); - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - _ => Err("unsupported double shift operands".to_string()), - } - } - - pub(super) fn encode_bswap(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("bswap requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0xC8 + num]); - Ok(()) - } - _ => Err("bswap requires register operand".to_string()), - } - } - - pub(super) fn encode_bit_count(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - - let (prefix, opcode) = match mnemonic { - "lzcntl" => (0xF3u8, [0x0F, 0xBD]), - "tzcntl" => (0xF3, [0x0F, 0xBC]), - "popcntl" => (0xF3, [0x0F, 0xB8]), - _ => return Err(format!("unknown bit count: {}", mnemonic)), - }; - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.push(prefix); - self.bytes.extend_from_slice(&opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - 
Ok(()) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - pub(super) fn encode_bsr_bsf(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - - let opcode = match mnemonic { - "bsrl" | "bsr" => [0x0F, 0xBD], - "bsfl" | "bsf" => [0x0F, 0xBC], - _ => return Err(format!("unknown bit scan: {}", mnemonic)), - }; - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&opcode); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - pub(super) fn encode_bt(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - - let (opcode_rr, ext) = match mnemonic { - "btl" | "bt" => (0xA3u8, 4u8), - "btsl" | "bts" => (0xAB, 5), - "btrl" | "btr" => (0xB3, 6), - "btcl" | "btc" => (0xBB, 7), - _ => return Err(format!("unknown bt instruction: {}", mnemonic)), - }; - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, opcode_rr]); - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - (Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, opcode_rr]); - self.encode_modrm_mem(src_num, mem) - } - 
(Operand::Immediate(ImmediateValue::Integer(val)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0xBA]); - self.bytes.push(self.modrm(3, ext, dst_num)); - self.bytes.push(*val as u8); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Memory(mem)) => { - self.bytes.extend_from_slice(&[0x0F, 0xBA]); - self.encode_modrm_mem(ext, mem)?; - self.bytes.push(*val as u8); - Ok(()) - } - // bt $imm, label (treat label as absolute memory reference) - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Label(label)) => { - self.bytes.extend_from_slice(&[0x0F, 0xBA]); - // mod=00, rm=101 for disp32 (no base register) - self.bytes.push(self.modrm(0, ext, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - self.bytes.push(*val as u8); - Ok(()) - } - // bt %reg, label (treat label as absolute memory reference) - (Operand::Register(src), Operand::Label(label)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, opcode_rr]); - // mod=00, rm=101 for disp32 (no base register) - self.bytes.push(self.modrm(0, src_num, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - pub(super) fn encode_setcc(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 1 { - return Err("setcc requires 1 operand".to_string()); - } - - let cc = cc_from_mnemonic(&mnemonic[3..])?; - - match &ops[0] { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0x90 + cc]); - self.bytes.push(self.modrm(3, 0, num)); - Ok(()) - } - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x90 + cc]); - self.encode_modrm_mem(0, mem) - } - _ => Err("setcc requires register 
or memory operand".to_string()), - } - } - - pub(super) fn encode_cmovcc(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err("cmovcc requires 2 operands".to_string()); - } - - let without_prefix = &mnemonic[4..]; - // Strip size suffix if present, otherwise use as-is (unsuffixed = 32-bit default) - let (cc_str, is_16bit) = if without_prefix.ends_with('w') - && without_prefix != "w" - && cc_from_mnemonic(&without_prefix[..without_prefix.len()-1]).is_ok() - { - (&without_prefix[..without_prefix.len()-1], true) - } else if without_prefix.ends_with('l') - && without_prefix != "l" - && cc_from_mnemonic(&without_prefix[..without_prefix.len()-1]).is_ok() - { - (&without_prefix[..without_prefix.len()-1], false) - } else { - (without_prefix, false) - }; - let cc = cc_from_mnemonic(cc_str)?; - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if is_16bit { self.bytes.push(0x66); } - self.bytes.extend_from_slice(&[0x0F, 0x40 + cc]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if is_16bit { self.bytes.push(0x66); } - self.bytes.extend_from_slice(&[0x0F, 0x40 + cc]); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err("unsupported cmov operands".to_string()), - } - } - - pub(super) fn encode_jmp(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("jmp requires 1 operand".to_string()); - } - - match &ops[0] { - Operand::Label(label) => { - self.bytes.push(0xE9); - // Always use R_386_PLT32 for branch targets, matching modern GCC/binutils. - // R_386_PC32 is rejected by ld for PIE executables calling shared lib functions. 
- let sym = label.strip_suffix("@PLT").unwrap_or(label.as_str()); - let reloc_type = R_386_PLT32; - self.add_relocation(sym, reloc_type, -4); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - Operand::Indirect(inner) => { - match inner.as_ref() { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.push(0xFF); - self.bytes.push(self.modrm(3, 4, num)); - Ok(()) - } - Operand::Memory(mem) => { - self.emit_segment_prefix(mem); - self.bytes.push(0xFF); - self.encode_modrm_mem(4, mem) - } - _ => Err("unsupported indirect jmp target".to_string()), - } - } - _ => Err("unsupported jmp operand".to_string()), - } - } - - /// Encode far jump (ljmpl/ljmp): direct or indirect - pub(super) fn encode_ljmp(&mut self, ops: &[Operand]) -> Result<(), String> { - match ops.len() { - // ljmpl *mem - indirect far jump through memory (FF /5) - 1 => { - match &ops[0] { - Operand::Indirect(inner) => { - match inner.as_ref() { - Operand::Memory(mem) => { - self.emit_segment_prefix(mem); - self.bytes.push(0xFF); - self.encode_modrm_mem(5, mem) - } - Operand::Label(label) => { - // ljmpl *symbol - indirect far jump via label - self.bytes.push(0xFF); - self.bytes.push(self.modrm(0, 5, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - _ => Err("ljmp indirect requires memory or label operand".to_string()), - } - } - Operand::Memory(mem) => { - // ljmp *mem (without explicit indirect prefix) - self.emit_segment_prefix(mem); - self.bytes.push(0xFF); - self.encode_modrm_mem(5, mem) - } - _ => Err("ljmp requires indirect memory or segment:offset operands".to_string()), - } - } - // ljmpl $segment, $offset - direct far jump (opcode 0xEA) - 2 => { - match (&ops[0], &ops[1]) { - (Operand::Immediate(ImmediateValue::Integer(seg)), Operand::Immediate(ImmediateValue::Integer(off))) => { - self.bytes.push(0xEA); - self.bytes.extend_from_slice(&(*off as u32).to_le_bytes()); - 
self.bytes.extend_from_slice(&(*seg as u16).to_le_bytes()); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(seg)), Operand::Immediate(ImmediateValue::Symbol(sym))) => { - self.bytes.push(0xEA); - self.add_relocation(sym, R_386_32, 0); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - self.bytes.extend_from_slice(&(*seg as u16).to_le_bytes()); - Ok(()) - } - _ => Err("ljmp requires $segment, $offset operands".to_string()), - } - } - _ => Err("ljmp requires 1 or 2 operands".to_string()), - } - } - - pub(super) fn encode_jcc(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 1 { - return Err("jcc requires 1 operand".to_string()); - } - - let cc = cc_from_mnemonic(&mnemonic[1..])?; - - match &ops[0] { - Operand::Label(label) => { - self.bytes.extend_from_slice(&[0x0F, 0x80 + cc]); - // Always use R_386_PLT32 for branch targets, matching modern GCC/binutils. - let sym = label.strip_suffix("@PLT").unwrap_or(label.as_str()); - let reloc_type = R_386_PLT32; - self.add_relocation(sym, reloc_type, -4); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - _ => Err("jcc requires label operand".to_string()), - } - } - - pub(super) fn encode_call(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("call requires 1 operand".to_string()); - } - - match &ops[0] { - Operand::Label(label) => { - self.bytes.push(0xE8); - // Always use R_386_PLT32 for branch targets, matching modern GCC/binutils. 
- let reloc_type = R_386_PLT32; - let sym = label.strip_suffix("@PLT").unwrap_or(label.as_str()); - self.add_relocation(sym, reloc_type, -4); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - Operand::Indirect(inner) => { - match inner.as_ref() { - Operand::Register(reg) => { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.push(0xFF); - self.bytes.push(self.modrm(3, 2, num)); - Ok(()) - } - Operand::Memory(mem) => { - self.emit_segment_prefix(mem); - self.bytes.push(0xFF); - self.encode_modrm_mem(2, mem) - } - _ => Err("unsupported indirect call target".to_string()), - } - } - _ => Err("unsupported call operand".to_string()), - } - } - - pub(super) fn encode_xchg(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err("xchg requires 2 operands".to_string()); - } - let size = mnemonic_size_suffix(mnemonic).unwrap_or(4); - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x86 } else { 0x87 }); - self.encode_modrm_mem(src_num, mem) - } - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0x86 } else { 0x87 }); - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - _ => Err("unsupported xchg operands".to_string()), - } - } - - pub(super) fn encode_cmpxchg(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err("cmpxchg requires 2 operands".to_string()); - } - let size = mnemonic_size_suffix(mnemonic).unwrap_or(4); - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - if size == 2 { 
self.bytes.push(0x66); } - self.bytes.extend_from_slice(&[0x0F, if size == 1 { 0xB0 } else { 0xB1 }]); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("unsupported cmpxchg operands".to_string()), - } - } - - pub(super) fn encode_xadd(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err("xadd requires 2 operands".to_string()); - } - let size = mnemonic_size_suffix(mnemonic).unwrap_or(4); - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - if size == 2 { self.bytes.push(0x66); } - self.bytes.extend_from_slice(&[0x0F, if size == 1 { 0xC0 } else { 0xC1 }]); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("unsupported xadd operands".to_string()), - } - } - - pub(super) fn encode_clflush(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("clflush requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0xAE]); - self.encode_modrm_mem(7, mem) - } - _ => Err("clflush requires memory operand".to_string()), - } - } - - /// Encode SSE memory-only instructions (ldmxcsr, stmxcsr). 
- /// Format: 0F AE /ext mem - pub(super) fn encode_sse_mem_only(&mut self, ops: &[Operand], ext: u8) -> Result<(), String> { - if ops.len() != 1 { - return Err("SSE mem-only op requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0xAE]); - self.encode_modrm_mem(ext, mem) - } - _ => Err("SSE mem-only op requires memory operand".to_string()), - } - } - - pub(super) fn encode_int(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("int requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Immediate(ImmediateValue::Integer(val)) => { - if *val == 3 { - self.bytes.push(0xCC); - } else { - self.bytes.push(0xCD); - self.bytes.push(*val as u8); - } - Ok(()) - } - _ => Err("int requires immediate operand".to_string()), - } - } -} diff --git a/src/backend/i686/assembler/encoder/mod.rs b/src/backend/i686/assembler/encoder/mod.rs deleted file mode 100644 index f8b768e677..0000000000 --- a/src/backend/i686/assembler/encoder/mod.rs +++ /dev/null @@ -1,770 +0,0 @@ -//! i686 (32-bit x86) instruction encoder. -//! -//! Encodes parsed i686 instructions into machine code bytes. -//! Similar to the x86-64 encoder but without REX prefixes and with -//! 32-bit default operand size. Uses R_386_* relocation types. - -mod registers; -mod core; -mod gp_integer; -mod sse; -mod x87; -mod system; - -pub(crate) use registers::*; - -use crate::backend::x86::assembler::parser::*; - -/// Split a label string like `"pa_tr_efer + 4"` or `"symbol-8"` into (symbol, addend). -/// Returns the original string with addend 0 if no offset is found. -fn split_label_offset(label: &str) -> (&str, i64) { - // Scan for '+' or '-' that separates the symbol from the offset. - // Skip the first character to avoid splitting on leading sign/dot. 
- for (i, c) in label.char_indices().skip(1) { - if c == '+' || c == '-' { - let left = label[..i].trim(); - if left.is_empty() { - continue; - } - // Extract the numeric part (including sign for '-') - let num_str = if c == '+' { - label[i + 1..].trim() - } else { - // Keep the '-' sign - label[i..].trim() - }; - if let Ok(offset) = num_str.parse::() { - return (left, offset); - } - } - } - (label, 0) -} - -/// Relocation entry for the linker to resolve. -#[derive(Debug, Clone)] -pub struct Relocation { - /// Offset within the section where the relocation applies. - pub offset: u64, - /// Symbol name to resolve. - pub symbol: String, - /// Relocation type (ELF R_386_* constants). - pub reloc_type: u32, - /// Addend for the relocation (used in RELA; for REL format, embedded in instruction). - pub addend: i64, - /// For symbol difference expressions (A - B): the subtracted symbol. - pub diff_symbol: Option, -} - -// ELF i386 relocation types -pub const R_386_32: u32 = 1; -pub const R_386_PC32: u32 = 2; -pub const R_386_GOT32: u32 = 3; -pub const R_386_PLT32: u32 = 4; -pub const R_386_GOTOFF: u32 = 9; -pub const R_386_GOTPC: u32 = 10; -pub const R_386_TLS_LE_32: u32 = 37; -pub const R_386_TLS_IE: u32 = 15; -pub const R_386_TLS_GD: u32 = 18; -pub const R_386_TLS_LDM: u32 = 19; -pub const R_386_TLS_LDO_32: u32 = 32; -#[allow(dead_code)] // ELF standard constant; not yet emitted by assembler but used by linker -pub const R_386_TLS_GOTIE: u32 = 16; -pub const R_386_32S: u32 = 38; // R_386_TLS_LE (negative offset from TP) - -/// Instruction encoding context for i686. -pub struct InstructionEncoder { - /// Output bytes. - pub bytes: Vec, - /// Relocations generated during encoding. - pub relocations: Vec, - /// Current offset within the section. - pub offset: u64, - /// Whether we are in .code16gcc mode (16-bit real mode with 32-bit instructions). 
- /// Currently .code16gcc is handled at the assembly text level (prepended to asm output); - /// this field is infrastructure for future per-instruction operand size overrides. - #[allow(dead_code)] - pub code16gcc: bool, -} - -impl InstructionEncoder { - pub fn new() -> Self { - InstructionEncoder { - bytes: Vec::new(), - relocations: Vec::new(), - offset: 0, - code16gcc: false, - } - } - - /// Encode a single instruction and append bytes. - pub fn encode(&mut self, instr: &Instruction) -> Result<(), String> { - let start_len = self.bytes.len(); - - // Handle prefix - if let Some(ref prefix) = instr.prefix { - match prefix.as_str() { - "lock" => self.bytes.push(0xF0), - "rep" | "repz" | "repe" => self.bytes.push(0xF3), - "repnz" | "repne" => self.bytes.push(0xF2), - _ => return Err(format!("unknown prefix: {}", prefix)), - } - } - - let result = self.encode_mnemonic(instr); - - if result.is_ok() { - self.offset += (self.bytes.len() - start_len) as u64; - } - - result - } - - /// Main mnemonic dispatch. 
- fn encode_mnemonic(&mut self, instr: &Instruction) -> Result<(), String> { - let mnemonic = instr.mnemonic.as_str(); - let ops = &instr.operands; - - match mnemonic { - // Data movement - "movl" => self.encode_mov(ops, 4), - "movw" => self.encode_mov(ops, 2), - "movb" => self.encode_mov(ops, 1), - // Unsuffixed mov from inline asm - infer size from operands - "mov" => self.encode_mov_infer_size(ops), - "movsbl" => self.encode_movsx(ops, 1, 4), - "movswl" => self.encode_movsx(ops, 2, 4), - "movsbw" => self.encode_movsx(ops, 1, 2), - "movzbl" => self.encode_movzx(ops, 1, 4), - "movzwl" => self.encode_movzx(ops, 2, 4), - "movzbw" => self.encode_movzx(ops, 1, 2), - - // LEA - "leal" | "lea" => self.encode_lea(ops, 4), - - // Stack ops (32-bit default) - "pushl" | "push" => self.encode_push(ops), - "popl" | "pop" => self.encode_pop(ops), - // Also handle pushw/popw for 16-bit variants - "pushw" => self.encode_push16(ops), - "popw" => self.encode_pop16(ops), - - // Arithmetic - "addl" | "addw" | "addb" | "add" => self.encode_alu(ops, mnemonic, 0), - "orl" | "orw" | "orb" | "or" => self.encode_alu(ops, mnemonic, 1), - "adcl" | "adcw" | "adcb" | "adc" => self.encode_alu(ops, mnemonic, 2), - "sbbl" | "sbbw" | "sbbb" | "sbb" => self.encode_alu(ops, mnemonic, 3), - "andl" | "andw" | "andb" | "and" => self.encode_alu(ops, mnemonic, 4), - "subl" | "subw" | "subb" | "sub" => self.encode_alu(ops, mnemonic, 5), - "xorl" | "xorw" | "xorb" | "xor" => self.encode_alu(ops, mnemonic, 6), - "cmpl" | "cmpw" | "cmpb" | "cmp" => self.encode_alu(ops, mnemonic, 7), - "testl" | "testw" | "testb" | "test" => self.encode_test(ops, mnemonic), - - // Multiply/divide - "imull" | "imul" => self.encode_imul(ops, 4), - "mull" | "mul" => self.encode_unary_rm(ops, 4, 4), - "divl" | "div" => self.encode_unary_rm(ops, 6, 4), - "idivl" | "idiv" => self.encode_unary_rm(ops, 7, 4), - - // Unary - "negl" | "neg" => self.encode_unary_rm(ops, 3, 4), - "negw" => self.encode_unary_rm(ops, 3, 2), - "negb" => 
self.encode_unary_rm(ops, 3, 1), - "notl" | "not" => self.encode_unary_rm(ops, 2, 4), - "notw" => self.encode_unary_rm(ops, 2, 2), - "notb" => self.encode_unary_rm(ops, 2, 1), - "incl" | "inc" => self.encode_inc_dec(ops, 0, 4), - "incw" => self.encode_inc_dec(ops, 0, 2), - "incb" => self.encode_inc_dec(ops, 0, 1), - "decl" | "dec" => self.encode_inc_dec(ops, 1, 4), - "decw" => self.encode_inc_dec(ops, 1, 2), - "decb" => self.encode_inc_dec(ops, 1, 1), - - // Shifts - "shll" | "shlw" | "shlb" | "shl" => self.encode_shift(ops, mnemonic, 4), - "shrl" | "shrw" | "shrb" | "shr" => self.encode_shift(ops, mnemonic, 5), - "sarl" | "sarw" | "sarb" | "sar" => self.encode_shift(ops, mnemonic, 7), - "roll" | "rolw" | "rolb" | "rol" => self.encode_shift(ops, mnemonic, 0), - "rorl" | "rorw" | "rorb" | "ror" => self.encode_shift(ops, mnemonic, 1), - - // Double-precision shifts - "shldl" | "shld" => self.encode_double_shift(ops, 0xA4, 4), - "shrdl" | "shrd" => self.encode_double_shift(ops, 0xAC, 4), - - // Sign extension - "cltd" | "cdq" => { self.bytes.push(0x99); Ok(()) } - "cwtl" | "cwde" => { self.bytes.push(0x98); Ok(()) } - "cbtw" | "cbw" => { self.bytes.extend_from_slice(&[0x66, 0x98]); Ok(()) } - "cwtd" | "cwd" => { self.bytes.extend_from_slice(&[0x66, 0x99]); Ok(()) } - - // Byte swap - "bswapl" | "bswap" => self.encode_bswap(ops), - - // Bit operations - "lzcntl" | "tzcntl" | "popcntl" => self.encode_bit_count(ops, mnemonic), - "bsrl" | "bsfl" | "bsr" | "bsf" => self.encode_bsr_bsf(ops, mnemonic), - "bsrw" | "bsfw" => self.encode_bsr_bsf_16(ops, mnemonic), - "btl" | "btsl" | "btrl" | "btcl" | "bt" | "bts" | "btr" | "btc" => self.encode_bt(ops, mnemonic), - - // Conditional set - "sete" | "setz" | "setne" | "setnz" | "setl" | "setle" | "setg" | "setge" - | "setb" | "setc" | "setbe" | "seta" | "setae" | "setnc" | "setnp" | "setp" - | "sets" | "setns" | "seto" | "setno" => self.encode_setcc(ops, mnemonic), - - // Conditional move - "cmovel" | "cmovnel" | "cmovll" | 
"cmovlel" | "cmovgl" | "cmovgel" - | "cmovbl" | "cmovbel" | "cmoval" | "cmovael" - | "cmovsl" | "cmovnsl" | "cmovzl" | "cmovnzl" | "cmovpl" | "cmovnpl" - | "cmovol" | "cmovnol" | "cmovcl" | "cmovncl" - | "cmovew" | "cmovnew" | "cmovlw" | "cmovlew" | "cmovgw" | "cmovgew" - | "cmovbw" | "cmovbew" | "cmovaw" | "cmovaew" - | "cmovsw" | "cmovnsw" | "cmovzw" | "cmovnzw" | "cmovpw" | "cmovnpw" - | "cmovow" | "cmovnow" | "cmovcw" | "cmovncw" - | "cmove" | "cmovne" | "cmovl" | "cmovle" | "cmovg" | "cmovge" - | "cmovb" | "cmovbe" | "cmova" | "cmovae" - | "cmovs" | "cmovns" | "cmovz" | "cmovnz" | "cmovp" | "cmovnp" - | "cmovo" | "cmovno" | "cmovc" | "cmovnc" => self.encode_cmovcc(ops, mnemonic), - - // Jumps - "jmp" => self.encode_jmp(ops), - "je" | "jz" | "jne" | "jnz" | "jl" | "jle" | "jg" | "jge" - | "jb" | "jbe" | "ja" | "jae" | "js" | "jns" | "jo" | "jno" | "jp" | "jnp" - | "jc" | "jnc" => { - self.encode_jcc(ops, mnemonic) - } - // jecxz/jcxz - short jump only (no long form) - "jecxz" | "jcxz" => { - if ops.len() != 1 { - return Err("jecxz requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Label(_) => { - // E3 cb - Jump short if ECX register is 0 - self.bytes.extend_from_slice(&[0xE3, 0x00]); - Ok(()) - } - _ => Err("jecxz requires label operand".to_string()), - } - } - // loop - short jump only (dec ECX, jump if non-zero) - "loop" => { - if ops.len() != 1 { - return Err("loop requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Label(_) => { - // E2 cb - Dec ECX; jump short if ECX != 0 - self.bytes.extend_from_slice(&[0xE2, 0x00]); - Ok(()) - } - _ => Err("loop requires label operand".to_string()), - } - } - - // Call/return - "call" => self.encode_call(ops), - "ret" => { - if ops.is_empty() { - self.bytes.push(0xC3); - } else if let Some(Operand::Immediate(ImmediateValue::Integer(val))) = ops.first() { - // ret $imm16 - pop return address and deallocate imm16 bytes - self.bytes.push(0xC2); - self.bytes.extend_from_slice(&(*val as 
u16).to_le_bytes()); - } else { - return Err("unsupported ret operand".to_string()); - } - Ok(()) - } - // Far jump - "ljmpl" | "ljmpw" | "ljmp" => self.encode_ljmp(ops), - // Far return - "lret" | "lretl" => { - if ops.is_empty() { - self.bytes.push(0xCB); - } else if let Some(Operand::Immediate(ImmediateValue::Integer(val))) = ops.first() { - self.bytes.push(0xCA); - self.bytes.extend_from_slice(&(*val as u16).to_le_bytes()); - } else { - return Err("unsupported lret operand".to_string()); - } - Ok(()) - } - // 64-bit far return (in .code64 sections, encoded with REX.W prefix) - "lretq" => { - self.bytes.extend_from_slice(&[0x48, 0xCB]); // REX.W + lret - Ok(()) - } - - // No-ops and misc - "nop" => { self.bytes.push(0x90); Ok(()) } - "ud2" => { self.bytes.extend_from_slice(&[0x0F, 0x0B]); Ok(()) } - "pause" => { self.bytes.extend_from_slice(&[0xF3, 0x90]); Ok(()) } - "mfence" => { self.bytes.extend_from_slice(&[0x0F, 0xAE, 0xF0]); Ok(()) } - "lfence" => { self.bytes.extend_from_slice(&[0x0F, 0xAE, 0xE8]); Ok(()) } - "sfence" => { self.bytes.extend_from_slice(&[0x0F, 0xAE, 0xF8]); Ok(()) } - "clflush" => self.encode_clflush(ops), - "ldmxcsr" => self.encode_sse_mem_only(ops, 2), - "stmxcsr" => self.encode_sse_mem_only(ops, 3), - "int" => self.encode_int(ops), - "cpuid" => { self.bytes.extend_from_slice(&[0x0F, 0xA2]); Ok(()) } - "rdtsc" => { self.bytes.extend_from_slice(&[0x0F, 0x31]); Ok(()) } - "rdtscp" => { self.bytes.extend_from_slice(&[0x0F, 0x01, 0xF9]); Ok(()) } - "xgetbv" => { self.bytes.extend_from_slice(&[0x0F, 0x01, 0xD0]); Ok(()) } - "syscall" => { self.bytes.extend_from_slice(&[0x0F, 0x05]); Ok(()) } - "sysenter" => { self.bytes.extend_from_slice(&[0x0F, 0x34]); Ok(()) } - "hlt" => { self.bytes.push(0xF4); Ok(()) } - "emms" => { self.bytes.extend_from_slice(&[0x0F, 0x77]); Ok(()) } - "cmpxchg8b" => self.encode_cmpxchg8b(ops), - "rdmsr" => { self.bytes.extend_from_slice(&[0x0F, 0x32]); Ok(()) } - "wrmsr" => { self.bytes.extend_from_slice(&[0x0F, 
0x30]); Ok(()) } - "rdpmc" => { self.bytes.extend_from_slice(&[0x0F, 0x33]); Ok(()) } - "wbinvd" => { self.bytes.extend_from_slice(&[0x0F, 0x09]); Ok(()) } - "invlpg" => self.encode_invlpg(ops), - "verw" => self.encode_verw(ops), - "lsl" => self.encode_lsl(ops), - "sgdt" | "sgdtl" | "sidt" | "sidtl" | "lgdt" | "lgdtl" | "lidt" | "lidtl" => self.encode_system_table(ops, mnemonic), - "lmsw" => self.encode_lmsw(ops), - "smsw" => self.encode_smsw(ops), - - // Standalone prefix mnemonics (e.g. from "rep; nop" split on semicolon) - "lock" if ops.is_empty() => { self.bytes.push(0xF0); Ok(()) } - "rep" | "repe" | "repz" if ops.is_empty() => { self.bytes.push(0xF3); Ok(()) } - "repnz" | "repne" if ops.is_empty() => { self.bytes.push(0xF2); Ok(()) } - - // String ops - "movsb" => { self.bytes.push(0xA4); Ok(()) } - "movsl" if ops.is_empty() => { self.bytes.push(0xA5); Ok(()) } - "stosb" => { self.bytes.push(0xAA); Ok(()) } - "stosl" => { self.bytes.push(0xAB); Ok(()) } - "cmpsb" => { self.bytes.push(0xA6); Ok(()) } - "cmpsl" => { self.bytes.push(0xA7); Ok(()) } - "scasb" => { self.bytes.push(0xAE); Ok(()) } - "scasl" => { self.bytes.push(0xAF); Ok(()) } - "lodsb" => { self.bytes.push(0xAC); Ok(()) } - "lodsl" => { self.bytes.push(0xAD); Ok(()) } - - // I/O string ops - "insb" => { self.bytes.push(0x6C); Ok(()) } - "insw" => { self.bytes.extend_from_slice(&[0x66, 0x6D]); Ok(()) } - "insl" => { self.bytes.push(0x6D); Ok(()) } - "outsb" => { self.bytes.push(0x6E); Ok(()) } - "outsw" => { self.bytes.extend_from_slice(&[0x66, 0x6F]); Ok(()) } - "outsl" => { self.bytes.push(0x6F); Ok(()) } - - // Port I/O instructions - "outb" | "outw" | "outl" => self.encode_out(ops, mnemonic), - "inb" | "inw" | "inl" => self.encode_in(ops, mnemonic), - - // Atomic exchange - "xchgb" | "xchgw" | "xchgl" | "xchg" => self.encode_xchg(ops, mnemonic), - - // Lock-prefixed atomics - "cmpxchgb" | "cmpxchgw" | "cmpxchgl" | "cmpxchg" => self.encode_cmpxchg(ops, mnemonic), - "xaddb" | "xaddw" | "xaddl" | 
"xadd" => self.encode_xadd(ops, mnemonic), - - // SSE/SSE2 floating-point - "movss" => self.encode_sse_rr_rm(ops, &[0xF3, 0x0F, 0x10], &[0xF3, 0x0F, 0x11]), - "movsd" => self.encode_sse_rr_rm(ops, &[0xF2, 0x0F, 0x10], &[0xF2, 0x0F, 0x11]), - "movd" => self.encode_movd(ops), - "movq" => self.encode_movq(ops), - "movdqu" => self.encode_sse_rr_rm(ops, &[0xF3, 0x0F, 0x6F], &[0xF3, 0x0F, 0x7F]), - "movupd" => self.encode_sse_rr_rm(ops, &[0x66, 0x0F, 0x10], &[0x66, 0x0F, 0x11]), - "movups" => self.encode_sse_rr_rm(ops, &[0x0F, 0x10], &[0x0F, 0x11]), - "movaps" => self.encode_sse_rr_rm(ops, &[0x0F, 0x28], &[0x0F, 0x29]), - "movdqa" => self.encode_sse_rr_rm(ops, &[0x66, 0x0F, 0x6F], &[0x66, 0x0F, 0x7F]), - "movlps" => self.encode_sse_op(ops, &[0x0F, 0x12]), - "movhps" => self.encode_sse_op(ops, &[0x0F, 0x16]), - "movlpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x12]), - "movhpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x16]), - - // Non-temporal stores - "movnti" | "movntil" => self.encode_movnti(ops), - "movntdq" => self.encode_sse_store_only(ops, &[0x66, 0x0F, 0xE7]), - - "addsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x58]), - "subsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x5C]), - "mulsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x59]), - "divsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x5E]), - "addss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x58]), - "subss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x5C]), - "mulss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x59]), - "divss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x5E]), - "sqrtsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x51]), - "sqrtss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x51]), - "sqrtps" => self.encode_sse_op(ops, &[0x0F, 0x51]), - "sqrtpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x51]), - "rsqrtss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x52]), - "rsqrtps" => self.encode_sse_op(ops, &[0x0F, 0x52]), - "rcpss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x53]), - "rcpps" => 
self.encode_sse_op(ops, &[0x0F, 0x53]), - "maxsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x5F]), - "maxss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x5F]), - "minsd" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x5D]), - "minss" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x5D]), - "ucomisd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x2E]), - "ucomiss" => self.encode_sse_op(ops, &[0x0F, 0x2E]), - "comisd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x2F]), - "comiss" => self.encode_sse_op(ops, &[0x0F, 0x2F]), - "xorpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x57]), - "xorps" => self.encode_sse_op(ops, &[0x0F, 0x57]), - "andpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x54]), - "andps" => self.encode_sse_op(ops, &[0x0F, 0x54]), - "andnpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x55]), - "andnps" => self.encode_sse_op(ops, &[0x0F, 0x55]), - "orpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x56]), - "orps" => self.encode_sse_op(ops, &[0x0F, 0x56]), - "unpcklps" => self.encode_sse_op(ops, &[0x0F, 0x14]), - "unpckhps" => self.encode_sse_op(ops, &[0x0F, 0x15]), - "unpcklpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x14]), - "unpckhpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x15]), - "shufps" => self.encode_sse_op_imm8(ops, &[0x0F, 0xC6]), - "shufpd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0xC6]), - "cmpsd" => self.encode_sse_op_imm8(ops, &[0xF2, 0x0F, 0xC2]), - "cmpss" => self.encode_sse_op_imm8(ops, &[0xF3, 0x0F, 0xC2]), - "cmppd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0xC2]), - "cmpps" => self.encode_sse_op_imm8(ops, &[0x0F, 0xC2]), - "pclmulqdq" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x44]), - - // AES-NI - "aesenc" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0xDC]), - "aesenclast" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0xDD]), - "aesdec" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0xDE]), - "aesdeclast" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0xDF]), - "aesimc" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0xDB]), - 
"aeskeygenassist" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0xDF]), - - // SSE conversions (32-bit integer operand size for i686) - "cvtsd2ss" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x5A]), - "cvtss2sd" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x5A]), - "cvtsi2sdl" | "cvtsi2sd" => self.encode_sse_cvt_gp_to_xmm(ops, &[0xF2, 0x0F, 0x2A]), - "cvtsi2ssl" | "cvtsi2ss" => self.encode_sse_cvt_gp_to_xmm(ops, &[0xF3, 0x0F, 0x2A]), - "cvttsd2sil" | "cvttsd2si" | "cvtsd2sil" | "cvtsd2si" => self.encode_sse_cvt_xmm_to_gp(ops, &[0xF2, 0x0F, 0x2C]), - "cvttss2sil" | "cvttss2si" | "cvtss2sil" | "cvtss2si" => self.encode_sse_cvt_xmm_to_gp(ops, &[0xF3, 0x0F, 0x2C]), - "cvtps2pd" => self.encode_sse_op(ops, &[0x0F, 0x5A]), - "cvtpd2ps" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x5A]), - "cvtdq2ps" => self.encode_sse_op(ops, &[0x0F, 0x5B]), - "cvtps2dq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x5B]), - "cvttps2dq" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x5B]), - "cvtdq2pd" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0xE6]), - "cvtpd2dq" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0xE6]), - - // SSE packed integer - "pshufd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x70]), - "pshuflw" => self.encode_sse_op_imm8(ops, &[0xF2, 0x0F, 0x70]), - "pshufhw" => self.encode_sse_op_imm8(ops, &[0xF3, 0x0F, 0x70]), - "pxor" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xEF]), - "pand" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xDB]), - "por" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xEB]), - "pandn" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xDF]), - "pcmpeqb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x74]), - "pcmpeqd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x76]), - "pcmpeqw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x75]), - "pcmpgtb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x64]), - "pcmpgtd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x66]), - "pcmpgtw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x65]), - "pmovmskb" => self.encode_sse_cvt_xmm_to_gp(ops, &[0x66, 0x0F, 0xD7]), - "movmskps" 
=> self.encode_sse_cvt_xmm_to_gp(ops, &[0x0F, 0x50]), - "movmskpd" => self.encode_sse_cvt_xmm_to_gp(ops, &[0x66, 0x0F, 0x50]), - - // SSE packed arithmetic - "paddb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xFC]), - "paddw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xFD]), - "paddd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xFE]), - "paddq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xD4]), - "psubb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xF8]), - "psubw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xF9]), - "psubd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xFA]), - "psubq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xFB]), - "pmullw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xD5]), - "pmulld" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x40]), - "pmulhw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xE5]), - "pmulhuw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xE4]), - "pmuludq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xF4]), - "paddusb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xDC]), - "paddusw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xDD]), - "psubusb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xD8]), - "psubusw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xD9]), - "paddsb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xEC]), - "paddsw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xED]), - "psubsb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xE8]), - "psubsw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xE9]), - "pmaxub" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xDE]), - "pmaxsw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xEE]), - "pminub" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xDA]), - "pminsw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xEA]), - "pavgb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xE0]), - "pavgw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xE3]), - "psadbw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xF6]), - "pmaddwd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xF5]), - "pabsb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x1C]), - "pabsw" => 
self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x1D]), - "pabsd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x1E]), - - // SSE pack/unpack - "punpcklbw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x60]), - "punpcklwd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x61]), - "punpckldq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x62]), - "punpcklqdq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x6C]), - "punpckhbw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x68]), - "punpckhwd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x69]), - "punpckhdq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x6A]), - "punpckhqdq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x6D]), - "packsswb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x63]), - "packssdw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x6B]), - "packuswb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x67]), - "packusdw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x2B]), - - // SSE insert/extract - "pextrw" => self.encode_pextrw(ops), - "pinsrw" => self.encode_pinsrw(ops), - - // SSE shifts - "pslld" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xF2], 6, &[0x66, 0x0F, 0x72]), - "psrld" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xD2], 2, &[0x66, 0x0F, 0x72]), - "psrad" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xE2], 4, &[0x66, 0x0F, 0x72]), - "psllq" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xF3], 6, &[0x66, 0x0F, 0x73]), - "psrlq" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xD3], 2, &[0x66, 0x0F, 0x73]), - "psllw" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xF1], 6, &[0x66, 0x0F, 0x71]), - "psrlw" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xD1], 2, &[0x66, 0x0F, 0x71]), - "psraw" => self.encode_sse_shift(ops, &[0x66, 0x0F, 0xE1], 4, &[0x66, 0x0F, 0x71]), - // pslldq/psrldq: byte shifts (only immediate form) - "pslldq" => self.encode_sse_byte_shift(ops, 7), - "psrldq" => self.encode_sse_byte_shift(ops, 3), - - // x87 FPU - "fldt" => self.encode_x87_mem(ops, &[0xDB], 5), - "fstpt" => self.encode_x87_mem(ops, &[0xDB], 7), - "fldl" => 
self.encode_x87_mem(ops, &[0xDD], 0), - "flds" => self.encode_x87_mem(ops, &[0xD9], 0), - "fstpl" => self.encode_x87_mem(ops, &[0xDD], 3), - "fstps" => self.encode_x87_mem(ops, &[0xD9], 3), - "fstl" => self.encode_x87_mem(ops, &[0xDD], 2), - "fsts" => self.encode_x87_mem(ops, &[0xD9], 2), - "fildl" => self.encode_x87_mem(ops, &[0xDB], 0), - "fildq" => self.encode_x87_mem(ops, &[0xDF], 5), - "filds" => self.encode_x87_mem(ops, &[0xDF], 0), - "fistpl" => self.encode_x87_mem(ops, &[0xDB], 3), - "fistpq" => self.encode_x87_mem(ops, &[0xDF], 7), - "fisttpq" => self.encode_x87_mem(ops, &[0xDD], 1), - "fisttpl" => self.encode_x87_mem(ops, &[0xDB], 1), - "faddp" => { self.bytes.extend_from_slice(&[0xDE, 0xC1]); Ok(()) } - // Note: AT&T syntax swaps the meaning of fsub/fsubr and fdiv/fdivr - // relative to Intel mnemonics for the *p (pop) forms. - // GAS: fsubp = DE E1, fsubrp = DE E9, fdivp = DE F1, fdivrp = DE F9 - "fsubp" => { self.bytes.extend_from_slice(&[0xDE, 0xE1]); Ok(()) } - "fsubrp" => { self.bytes.extend_from_slice(&[0xDE, 0xE9]); Ok(()) } - "fmulp" => { self.bytes.extend_from_slice(&[0xDE, 0xC9]); Ok(()) } - "fdivp" => { self.bytes.extend_from_slice(&[0xDE, 0xF1]); Ok(()) } - "fdivrp" => { self.bytes.extend_from_slice(&[0xDE, 0xF9]); Ok(()) } - "fchs" => { self.bytes.extend_from_slice(&[0xD9, 0xE0]); Ok(()) } - "fabs" => { self.bytes.extend_from_slice(&[0xD9, 0xE1]); Ok(()) } - "fsqrt" => { self.bytes.extend_from_slice(&[0xD9, 0xFA]); Ok(()) } - "fsin" => { self.bytes.extend_from_slice(&[0xD9, 0xFE]); Ok(()) } - "fcos" => { self.bytes.extend_from_slice(&[0xD9, 0xFF]); Ok(()) } - "fpatan" => { self.bytes.extend_from_slice(&[0xD9, 0xF3]); Ok(()) } - "fptan" => { self.bytes.extend_from_slice(&[0xD9, 0xF2]); Ok(()) } - "fprem" => { self.bytes.extend_from_slice(&[0xD9, 0xF8]); Ok(()) } - "fprem1" => { self.bytes.extend_from_slice(&[0xD9, 0xF5]); Ok(()) } - "frndint" => { self.bytes.extend_from_slice(&[0xD9, 0xFC]); Ok(()) } - "fscale" => { 
self.bytes.extend_from_slice(&[0xD9, 0xFD]); Ok(()) } - "f2xm1" => { self.bytes.extend_from_slice(&[0xD9, 0xF0]); Ok(()) } - "fyl2x" => { self.bytes.extend_from_slice(&[0xD9, 0xF1]); Ok(()) } - "fyl2xp1" => { self.bytes.extend_from_slice(&[0xD9, 0xF9]); Ok(()) } - "fld1" => { self.bytes.extend_from_slice(&[0xD9, 0xE8]); Ok(()) } - "fldl2e" => { self.bytes.extend_from_slice(&[0xD9, 0xEA]); Ok(()) } - "fldl2t" => { self.bytes.extend_from_slice(&[0xD9, 0xE9]); Ok(()) } - "fldlg2" => { self.bytes.extend_from_slice(&[0xD9, 0xEC]); Ok(()) } - "fldln2" => { self.bytes.extend_from_slice(&[0xD9, 0xED]); Ok(()) } - "fldpi" => { self.bytes.extend_from_slice(&[0xD9, 0xEB]); Ok(()) } - "fldz" => { self.bytes.extend_from_slice(&[0xD9, 0xEE]); Ok(()) } - "fnstsw" => self.encode_fnstsw(ops), - "fnstcw" => self.encode_x87_mem(ops, &[0xD9], 7), - "fldcw" => self.encode_x87_mem(ops, &[0xD9], 5), - "fwait" | "wait" => { self.bytes.push(0x9B); Ok(()) } - "fnclex" => { self.bytes.extend_from_slice(&[0xDB, 0xE2]); Ok(()) } - "fninit" => { self.bytes.extend_from_slice(&[0xDB, 0xE3]); Ok(()) } - "ftst" => { self.bytes.extend_from_slice(&[0xD9, 0xE4]); Ok(()) } - "fxam" => { self.bytes.extend_from_slice(&[0xD9, 0xE5]); Ok(()) } - "fcomip" => self.encode_fcomip(ops), - "fucomip" => self.encode_fucomip(ops), - "fucomi" => self.encode_fucomi(ops), - "fucomp" => self.encode_fucomp(ops), - "fucom" => self.encode_fucom(ops), - "fld" => self.encode_fld_st(ops), - "fstp" => self.encode_fstp_st(ops), - "fxch" => self.encode_fxch(ops), - "faddl" => self.encode_x87_mem(ops, &[0xDC], 0), - "fadds" => self.encode_x87_mem(ops, &[0xD8], 0), - "fsubl" => self.encode_x87_mem(ops, &[0xDC], 4), - "fsubs" => self.encode_x87_mem(ops, &[0xD8], 4), - "fmull" => self.encode_x87_mem(ops, &[0xDC], 1), - "fmuls" => self.encode_x87_mem(ops, &[0xD8], 1), - "fdivl" => self.encode_x87_mem(ops, &[0xDC], 6), - "fdivs" => self.encode_x87_mem(ops, &[0xD8], 6), - "fsubrl" => self.encode_x87_mem(ops, &[0xDC], 5), - "fdivrl" => 
self.encode_x87_mem(ops, &[0xDC], 7), - "fsubrs" => self.encode_x87_mem(ops, &[0xD8], 5), - "fdivrs" => self.encode_x87_mem(ops, &[0xD8], 7), - - // x87 register-register arithmetic (fadd/fmul/fsub/fdiv with st(i) operands) - "fadd" => self.encode_x87_arith_reg(ops, 0xD8, 0xDC, 0xC0), - "fmul" => self.encode_x87_arith_reg(ops, 0xD8, 0xDC, 0xC8), - "fsub" => self.encode_x87_arith_reg(ops, 0xD8, 0xDC, 0xE0), - "fdiv" => self.encode_x87_arith_reg(ops, 0xD8, 0xDC, 0xF0), - - // x87 additional - "fxtract" => { self.bytes.extend_from_slice(&[0xD9, 0xF4]); Ok(()) } - "fnstenv" => self.encode_x87_mem(ops, &[0xD9], 6), - "fldenv" => self.encode_x87_mem(ops, &[0xD9], 4), - "fistl" => self.encode_x87_mem(ops, &[0xDB], 2), - "fistps" => self.encode_x87_mem(ops, &[0xDF], 3), - "fildll" => self.encode_x87_mem(ops, &[0xDF], 5), - "fisttpll" => self.encode_x87_mem(ops, &[0xDD], 1), - "fistpll" => self.encode_x87_mem(ops, &[0xDF], 7), - "fstcw" => { - // fstcw = fwait + fnstcw - self.bytes.push(0x9B); // FWAIT - self.encode_x87_mem(ops, &[0xD9], 7) - } - - // Flag manipulation - "cld" => { self.bytes.push(0xFC); Ok(()) } - "std" => { self.bytes.push(0xFD); Ok(()) } - "clc" => { self.bytes.push(0xF8); Ok(()) } - "stc" => { self.bytes.push(0xF9); Ok(()) } - "cmc" => { self.bytes.push(0xF5); Ok(()) } - "cli" => { self.bytes.push(0xFA); Ok(()) } - "sti" => { self.bytes.push(0xFB); Ok(()) } - "sahf" => { self.bytes.push(0x9E); Ok(()) } - "lahf" => { self.bytes.push(0x9F); Ok(()) } - "pushf" | "pushfl" => { self.bytes.push(0x9C); Ok(()) } - "popf" | "popfl" => { self.bytes.push(0x9D); Ok(()) } - - // Leave (stack frame teardown) - "leave" => { self.bytes.push(0xC9); Ok(()) } - - // int3 (explicit breakpoint mnemonic) - "int3" => { self.bytes.push(0xCC); Ok(()) } - - // Endbr32 (CET) - "endbr32" => { self.bytes.extend_from_slice(&[0xF3, 0x0F, 0x1E, 0xFB]); Ok(()) } - - // SSE packed float arithmetic - "addpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x58]), - "subpd" => 
self.encode_sse_op(ops, &[0x66, 0x0F, 0x5C]), - "mulpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x59]), - "divpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x5E]), - "addps" => self.encode_sse_op(ops, &[0x0F, 0x58]), - "subps" => self.encode_sse_op(ops, &[0x0F, 0x5C]), - "mulps" => self.encode_sse_op(ops, &[0x0F, 0x59]), - "divps" => self.encode_sse_op(ops, &[0x0F, 0x5E]), - - // SSE3 horizontal operations - "haddpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x7C]), - "hsubpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x7D]), - "haddps" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x7C]), - "hsubps" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x7D]), - "addsubpd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0xD0]), - "addsubps" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0xD0]), - - // SSSE3 - "palignr" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x0F]), - "pshufb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x00]), - "phaddw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x01]), - "phaddd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x02]), - "phsubw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x05]), - "phsubd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x06]), - "pmulhrsw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x0B]), - - // SSE4.1 blend - "blendvpd" => { let ops2 = if ops.len() == 3 { &ops[1..] } else { ops }; self.encode_sse_op(ops2, &[0x66, 0x0F, 0x38, 0x15]) } - "blendvps" => { let ops2 = if ops.len() == 3 { &ops[1..] } else { ops }; self.encode_sse_op(ops2, &[0x66, 0x0F, 0x38, 0x14]) } - "pblendvb" => { let ops2 = if ops.len() == 3 { &ops[1..] 
} else { ops }; self.encode_sse_op(ops2, &[0x66, 0x0F, 0x38, 0x10]) } - "roundsd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x0B]), - "roundss" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x0A]), - "roundpd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x09]), - "roundps" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x08]), - "pblendw" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x0E]), - "blendpd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x0D]), - "blendps" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x0C]), - "dpps" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x40]), - "dppd" => self.encode_sse_op_imm8(ops, &[0x66, 0x0F, 0x3A, 0x41]), - - // SSE4.1 test / min-max - "ptest" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x17]), - "pminsb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x38]), - "pmaxsb" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x3C]), - "pminuw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x3A]), - "pmaxuw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x3E]), - "pminud" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x3B]), - "pmaxud" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x3F]), - "pminsd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x39]), - "pmaxsd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x3D]), - "phminposuw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x41]), - - // SSE4.1 insert/extract (32-bit and byte) - "pinsrd" => self.encode_sse_insert(ops, &[0x66, 0x0F, 0x3A, 0x22]), - "pextrd" => self.encode_sse_extract(ops, &[0x66, 0x0F, 0x3A, 0x16]), - "pinsrb" => self.encode_sse_insert(ops, &[0x66, 0x0F, 0x3A, 0x20]), - "pextrb" => self.encode_sse_extract(ops, &[0x66, 0x0F, 0x3A, 0x14]), - - // SSE4.1/SSE4.2 packed integer extensions - "pcmpgtq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x37]), - - // SSE4.1 zero/sign extend - "pmovzxbw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x30]), - "pmovzxbd" => self.encode_sse_op(ops, &[0x66, 0x0F, 
0x38, 0x31]), - "pmovzxbq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x32]), - "pmovzxwd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x33]), - "pmovzxwq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x34]), - "pmovzxdq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x35]), - "pmovsxbw" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x20]), - "pmovsxbd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x21]), - "pmovsxbq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x22]), - "pmovsxwd" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x23]), - "pmovsxwq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x24]), - "pmovsxdq" => self.encode_sse_op(ops, &[0x66, 0x0F, 0x38, 0x25]), - - // SSE data movement (missing from i686) - "movapd" => self.encode_sse_rr_rm(ops, &[0x66, 0x0F, 0x28], &[0x66, 0x0F, 0x29]), - "movhlps" => self.encode_sse_op(ops, &[0x0F, 0x12]), - "movlhps" => self.encode_sse_op(ops, &[0x0F, 0x16]), - "movddup" => self.encode_sse_op(ops, &[0xF2, 0x0F, 0x12]), - "movshdup" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x16]), - "movsldup" => self.encode_sse_op(ops, &[0xF3, 0x0F, 0x12]), - "movntps" => self.encode_sse_store_only(ops, &[0x0F, 0x2B]), - "movntpd" => self.encode_sse_store_only(ops, &[0x66, 0x0F, 0x2B]), - - // Prefetch instructions - "prefetcht0" => self.encode_prefetch(ops, 1), - "prefetcht1" => self.encode_prefetch(ops, 2), - "prefetcht2" => self.encode_prefetch(ops, 3), - "prefetchnta" => self.encode_prefetch(ops, 0), - "prefetchw" => self.encode_prefetch_0f0d(ops, 1), - - // Rotate through carry - "rclb" | "rclw" | "rcll" | "rcl" => self.encode_shift(ops, mnemonic, 2), - "rcrb" | "rcrw" | "rcrl" | "rcr" => self.encode_shift(ops, mnemonic, 3), - - // 16-bit string operations - "movsw" => { self.bytes.extend_from_slice(&[0x66, 0xA5]); Ok(()) } - "stosw" => { self.bytes.extend_from_slice(&[0x66, 0xAB]); Ok(()) } - "lodsw" => { self.bytes.extend_from_slice(&[0x66, 0xAD]); Ok(()) } - "scasw" => { self.bytes.extend_from_slice(&[0x66, 
0xAF]); Ok(()) } - "cmpsw" => { self.bytes.extend_from_slice(&[0x66, 0xA7]); Ok(()) } - - // Additional multiply/divide sizes - "mulb" => self.encode_unary_rm(ops, 4, 1), - "mulw" => { self.bytes.push(0x66); self.encode_unary_rm(ops, 4, 2) } - "divw" => { self.bytes.push(0x66); self.encode_unary_rm(ops, 6, 2) } - "divb" => self.encode_unary_rm(ops, 6, 1), - "idivw" => { self.bytes.push(0x66); self.encode_unary_rm(ops, 7, 2) } - "idivb" => self.encode_unary_rm(ops, 7, 1), - "imulw" => self.encode_imul(ops, 2), - - _ => { - Err(format!("unhandled i686 instruction: {} {:?}", mnemonic, ops)) - } - } - } -} diff --git a/src/backend/i686/assembler/encoder/registers.rs b/src/backend/i686/assembler/encoder/registers.rs deleted file mode 100644 index d5b932317a..0000000000 --- a/src/backend/i686/assembler/encoder/registers.rs +++ /dev/null @@ -1,138 +0,0 @@ -//! Register helper functions for i686 instruction encoding. - -/// Register encoding (3-bit register number in ModR/M and SIB). -pub(crate) fn reg_num(name: &str) -> Option { - match name { - "al" | "ax" | "eax" | "xmm0" | "mm0" | "st" | "st(0)" | "ymm0" => Some(0), - "cl" | "cx" | "ecx" | "xmm1" | "mm1" | "st(1)" | "ymm1" => Some(1), - "dl" | "dx" | "edx" | "xmm2" | "mm2" | "st(2)" | "ymm2" => Some(2), - "bl" | "bx" | "ebx" | "xmm3" | "mm3" | "st(3)" | "ymm3" => Some(3), - "ah" | "sp" | "esp" | "xmm4" | "mm4" | "st(4)" | "ymm4" => Some(4), - "ch" | "bp" | "ebp" | "xmm5" | "mm5" | "st(5)" | "ymm5" => Some(5), - "dh" | "si" | "esi" | "xmm6" | "mm6" | "st(6)" | "ymm6" => Some(6), - "bh" | "di" | "edi" | "xmm7" | "mm7" | "st(7)" | "ymm7" => Some(7), - _ => None, - } -} - -/// Get segment register number (es=0, cs=1, ss=2, ds=3, fs=4, gs=5). -pub(crate) fn seg_reg_num(name: &str) -> Option { - match name { - "es" => Some(0), - "cs" => Some(1), - "ss" => Some(2), - "ds" => Some(3), - "fs" => Some(4), - "gs" => Some(5), - _ => None, - } -} - -/// Is this a segment register? 
-pub(crate) fn is_segment_reg(name: &str) -> bool { - matches!(name, "es" | "cs" | "ss" | "ds" | "fs" | "gs") -} - -/// Is this a control register? -pub(crate) fn is_control_reg(name: &str) -> bool { - matches!(name, "cr0" | "cr2" | "cr3" | "cr4") -} - -/// Get control register number. -pub(crate) fn control_reg_num(name: &str) -> Option { - match name { - "cr0" => Some(0), - "cr2" => Some(2), - "cr3" => Some(3), - "cr4" => Some(4), - _ => None, - } -} - -/// Is this an XMM register? -pub(crate) fn is_xmm(name: &str) -> bool { - name.starts_with("xmm") -} - -/// Is this an MM (MMX) register? -pub(crate) fn is_mm(name: &str) -> bool { - name.starts_with("mm") && !name.starts_with("mmx") -} - -/// Infer operand size from register name for unsuffixed instructions. -pub(crate) fn reg_size(name: &str) -> u8 { - match name { - "al" | "ah" | "bl" | "bh" | "cl" | "ch" | "dl" | "dh" => 1, - "ax" | "bx" | "cx" | "dx" | "sp" | "bp" | "si" | "di" => 2, - "es" | "cs" | "ss" | "ds" | "fs" | "gs" => 2, - _ => 4, // eax, ebx, etc. default to 32-bit on i686 - } -} - -/// Get operand size from mnemonic suffix. 
-pub(crate) fn mnemonic_size_suffix(mnemonic: &str) -> Option { - match mnemonic { - "cltd" | "cdq" | "ret" | "nop" | "ud2" | "pause" - | "mfence" | "lfence" | "sfence" | "clflush" - | "ldmxcsr" | "stmxcsr" - | "syscall" | "sysenter" | "cpuid" | "rdtsc" | "rdtscp" | "xgetbv" - // Base ALU/shift mnemonics whose last letter is NOT a size suffix - | "sub" | "sbb" | "add" | "and" | "shl" | "rol" | "xadd" - | "insb" | "insw" | "insl" | "outsb" | "outsw" | "outsl" - | "outb" | "outw" | "outl" | "inb" | "inw" | "inl" - | "verw" | "lsl" | "sgdt" | "sidt" | "lgdt" | "lidt" - | "sgdtl" | "sidtl" | "lgdtl" | "lidtl" - | "lmsw" | "smsw" - | "wbinvd" | "invlpg" | "rdpmc" - | "ljmpl" | "ljmpw" | "ljmp" | "lret" | "lretl" | "lretq" => return None, - _ => {} - } - let last = mnemonic.as_bytes().last()?; - match last { - b'b' => Some(1), - b'w' => Some(2), - b'l' | b'd' => Some(4), - // i686 shouldn't have 'q' suffix for GP instructions, but handle gracefully - b'q' => Some(4), - _ => None, - } -} - -/// Parse x87 register number: "st(0)" -> 0, "st" -> 0, "st(1)" -> 1, etc. -pub(crate) fn parse_st_num(name: &str) -> Result { - if name == "st" || name == "st(0)" { - return Ok(0); - } - if name.starts_with("st(") && name.ends_with(')') { - let n: u8 = name[3..name.len()-1].parse() - .map_err(|_| format!("bad st register: {}", name))?; - if n > 7 { - return Err(format!("st register out of range: {}", name)); - } - return Ok(n); - } - Err(format!("not an st register: {}", name)) -} - -/// Map condition code suffix to encoding. 
-pub(crate) fn cc_from_mnemonic(cc_str: &str) -> Result { - match cc_str { - "o" => Ok(0), - "no" => Ok(1), - "b" | "c" | "nae" => Ok(2), - "nb" | "nc" | "ae" => Ok(3), - "e" | "z" => Ok(4), - "ne" | "nz" => Ok(5), - "be" | "na" => Ok(6), - "nbe" | "a" => Ok(7), - "s" => Ok(8), - "ns" => Ok(9), - "p" | "pe" => Ok(10), - "np" | "po" => Ok(11), - "l" | "nge" => Ok(12), - "nl" | "ge" => Ok(13), - "le" | "ng" => Ok(14), - "nle" | "g" => Ok(15), - _ => Err(format!("unknown condition code: {}", cc_str)), - } -} diff --git a/src/backend/i686/assembler/encoder/sse.rs b/src/backend/i686/assembler/encoder/sse.rs deleted file mode 100644 index 5f1707ad91..0000000000 --- a/src/backend/i686/assembler/encoder/sse.rs +++ /dev/null @@ -1,385 +0,0 @@ -//! SSE/SSE2/SSE3/SSSE3/SSE4/MMX instruction encoders for i686. -//! -//! Handles SIMD data movement, arithmetic, comparisons, conversions, -//! shifts, pack/unpack, insert/extract, and related operations. - -use super::*; - -impl super::InstructionEncoder { - // ---- SSE encoding helpers (same opcodes, no REX) ---- - - pub(super) fn encode_sse_rr_rm(&mut self, ops: &[Operand], load_opcode: &[u8], store_opcode: &[u8]) -> Result<(), String> { - if ops.len() != 2 { - return Err("SSE mov requires 2 operands".to_string()); - } - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) if is_xmm(&src.name) && is_xmm(&dst.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(load_opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) if is_xmm(&dst.name) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(load_opcode); - self.encode_modrm_mem(dst_num, mem) - } - (Operand::Register(src), Operand::Memory(mem)) if is_xmm(&src.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - 
self.bytes.extend_from_slice(store_opcode); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("unsupported SSE mov operands".to_string()), - } - } - - pub(super) fn encode_sse_op(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 2 { - return Err("SSE op requires 2 operands".to_string()); - } - - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err("unsupported SSE op operands".to_string()), - } - } - - pub(super) fn encode_sse_op_imm8(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 3 { - return Err("SSE op+imm8 requires 3 operands".to_string()); - } - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(dst_num, mem)?; - self.bytes.push(*imm as u8); - Ok(()) - } - _ => Err("unsupported SSE op+imm8 operands".to_string()), - } - } - - pub(super) fn encode_movd(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err("movd requires 2 
operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) if is_xmm(&dst.name) && !is_xmm(&src.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0x6E]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Register(src), Operand::Register(dst)) if is_xmm(&src.name) && !is_xmm(&dst.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0x7E]); - self.bytes.push(self.modrm(3, src_num, dst_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) if is_xmm(&dst.name) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0x6E]); - self.encode_modrm_mem(dst_num, mem) - } - (Operand::Register(src), Operand::Memory(mem)) if is_xmm(&src.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0x7E]); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("unsupported movd operands".to_string()), - } - } - - pub(super) fn encode_sse_cvt_gp_to_xmm(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 2 { - return Err("cvt requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) if is_xmm(&dst.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) if is_xmm(&dst.name) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(dst_num, mem) - } - _ => 
Err("unsupported cvt operands".to_string()), - } - } - - pub(super) fn encode_sse_cvt_xmm_to_gp(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 2 { - return Err("cvt requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err("unsupported cvt operands".to_string()), - } - } - - pub(super) fn encode_sse_shift(&mut self, ops: &[Operand], _reg_opcode: &[u8], imm_ext: u8, imm_opcode: &[u8]) -> Result<(), String> { - if ops.len() != 2 { - return Err("SSE shift requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(imm_opcode); - self.bytes.push(self.modrm(3, imm_ext, dst_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - (Operand::Register(src), Operand::Register(dst)) if is_xmm(&src.name) => { - self.encode_sse_op(&[ops[0].clone(), ops[1].clone()], _reg_opcode) - } - _ => Err("unsupported SSE shift operands".to_string()), - } - } - - /// Encode movq for MMX/SSE: 64-bit move between MMX/XMM registers and memory. 
- pub(super) fn encode_movq(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err("movq requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - // movq xmm -> xmm or mem -> xmm (load): F3 0F 7E - (Operand::Register(src), Operand::Register(dst)) if is_xmm(&src.name) && is_xmm(&dst.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0xF3, 0x0F, 0x7E]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) if is_xmm(&dst.name) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0xF3, 0x0F, 0x7E]); - self.encode_modrm_mem(dst_num, mem) - } - // movq xmm -> mem (store): 66 0F D6 - (Operand::Register(src), Operand::Memory(mem)) if is_xmm(&src.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0xD6]); - self.encode_modrm_mem(src_num, mem) - } - // MMX movq: mm -> mm, mem -> mm, mm -> mem - (Operand::Register(src), Operand::Register(dst)) if is_mm(&src.name) || is_mm(&dst.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - if is_mm(&dst.name) { - // load: 0F 6F - self.bytes.extend_from_slice(&[0x0F, 0x6F]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - } else { - // store: 0F 7F - self.bytes.extend_from_slice(&[0x0F, 0x7F]); - self.bytes.push(self.modrm(3, src_num, dst_num)); - } - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) if is_mm(&dst.name) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0x6F]); - self.encode_modrm_mem(dst_num, mem) - } - (Operand::Register(src), Operand::Memory(mem)) if is_mm(&src.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - 
self.bytes.extend_from_slice(&[0x0F, 0x7F]); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("unsupported movq operands".to_string()), - } - } - - /// Encode movnti: non-temporal store from GP register to memory. - pub(super) fn encode_movnti(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err("movnti requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0xC3]); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("movnti requires register source, memory destination".to_string()), - } - } - - /// Encode SSE store-only instructions (xmm -> mem). - pub(super) fn encode_sse_store_only(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 2 { - return Err("SSE store requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Memory(mem)) if is_xmm(&src.name) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(src_num, mem) - } - _ => Err("SSE store requires xmm source and memory destination".to_string()), - } - } - - /// Encode pslldq/psrldq (byte shifts, immediate-only). - pub(super) fn encode_sse_byte_shift(&mut self, ops: &[Operand], ext: u8) -> Result<(), String> { - if ops.len() != 2 { - return Err("pslldq/psrldq requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0x73]); - self.bytes.push(self.modrm(3, ext, dst_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - _ => Err("pslldq/psrldq requires immediate and xmm register".to_string()), - } - } - - /// Encode cmpxchg8b (compare and exchange 8 bytes). 
- pub(super) fn encode_cmpxchg8b(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("cmpxchg8b requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0xC7]); - self.encode_modrm_mem(1, mem) - } - _ => Err("cmpxchg8b requires memory operand".to_string()), - } - } - - /// Encode pextrw (extract word from XMM). - pub(super) fn encode_pextrw(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 3 { - return Err("pextrw requires 3 operands".to_string()); - } - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0xC5]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - _ => Err("unsupported pextrw operands".to_string()), - } - } - - /// Encode pinsrw (insert word into XMM). 
- pub(super) fn encode_pinsrw(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 3 { - return Err("pinsrw requires 3 operands".to_string()); - } - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0xC4]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x66, 0x0F, 0xC4]); - self.encode_modrm_mem(dst_num, mem)?; - self.bytes.push(*imm as u8); - Ok(()) - } - _ => Err("unsupported pinsrw operands".to_string()), - } - } - - /// Encode SSE4.1 insert (pinsrd, pinsrb): $imm8, r/m32, xmm - pub(super) fn encode_sse_insert(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 3 { - return Err("pinsrX requires 3 operands".to_string()); - } - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(dst_num, mem)?; - self.bytes.push(*imm as u8); - Ok(()) - } - _ => Err("unsupported pinsrX operands".to_string()), - } - } - - /// Encode SSE4.1 extract (pextrd, pextrb): $imm8, xmm, r/m32 
- pub(super) fn encode_sse_extract(&mut self, ops: &[Operand], opcode: &[u8]) -> Result<(), String> { - if ops.len() != 3 { - return Err("pextrX requires 3 operands".to_string()); - } - match (&ops[0], &ops[1], &ops[2]) { - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.bytes.push(self.modrm(3, src_num, dst_num)); - self.bytes.push(*imm as u8); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(imm)), Operand::Register(src), Operand::Memory(mem)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(src_num, mem)?; - self.bytes.push(*imm as u8); - Ok(()) - } - _ => Err("unsupported pextrX operands".to_string()), - } - } -} diff --git a/src/backend/i686/assembler/encoder/system.rs b/src/backend/i686/assembler/encoder/system.rs deleted file mode 100644 index d891b5cc06..0000000000 --- a/src/backend/i686/assembler/encoder/system.rs +++ /dev/null @@ -1,379 +0,0 @@ -//! System and privileged instruction encoders for i686. -//! -//! Handles prefetch, port I/O, INVLPG, VERW, LSL, descriptor table -//! operations, control register moves, segment register moves, -//! and other system-level instructions. 
- -use super::*; - -impl super::InstructionEncoder { - /// Encode prefetch instructions (0F 18 /hint) - pub(super) fn encode_prefetch(&mut self, ops: &[Operand], hint: u8) -> Result<(), String> { - if ops.len() != 1 { - return Err("prefetch requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x18]); - self.encode_modrm_mem(hint, mem) - } - _ => Err("prefetch requires memory operand".to_string()), - } - } - - /// Encode prefetchw (0F 0D /1) - pub(super) fn encode_prefetch_0f0d(&mut self, ops: &[Operand], hint: u8) -> Result<(), String> { - if ops.len() != 1 { - return Err("prefetchw requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x0D]); - self.encode_modrm_mem(hint, mem) - } - _ => Err("prefetchw requires memory operand".to_string()), - } - } - - /// Encode OUT instruction: outb/outw/outl - pub(super) fn encode_out(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - let size: u8 = match mnemonic { - "outb" => 1, - "outw" => 2, - "outl" => 4, - _ => return Err(format!("unknown out mnemonic: {}", mnemonic)), - }; - - // Handle zero-operand form (implicit operands) - if ops.is_empty() { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xEE } else { 0xEF }); - return Ok(()); - } - - if ops.len() != 2 { - return Err(format!("{} requires 0 or 2 operands", mnemonic)); - } - - match (&ops[0], &ops[1]) { - (Operand::Register(_src), Operand::Register(_dst)) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xEE } else { 0xEF }); - Ok(()) - } - (Operand::Register(_src), Operand::Immediate(ImmediateValue::Integer(val))) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xE6 } else { 0xE7 }); - self.bytes.push(*val as u8); - Ok(()) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - /// Encode IN instruction: 
inb/inw/inl - pub(super) fn encode_in(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - let size: u8 = match mnemonic { - "inb" => 1, - "inw" => 2, - "inl" => 4, - _ => return Err(format!("unknown in mnemonic: {}", mnemonic)), - }; - - if ops.is_empty() { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xEC } else { 0xED }); - return Ok(()); - } - - if ops.len() != 2 { - return Err(format!("{} requires 0 or 2 operands", mnemonic)); - } - - match (&ops[0], &ops[1]) { - (Operand::Register(_src), Operand::Register(_dst)) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xEC } else { 0xED }); - Ok(()) - } - (Operand::Immediate(ImmediateValue::Integer(val)), Operand::Register(_dst)) => { - if size == 2 { self.bytes.push(0x66); } - self.bytes.push(if size == 1 { 0xE4 } else { 0xE5 }); - self.bytes.push(*val as u8); - Ok(()) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } - - /// Encode INVLPG: 0F 01 /7 (memory operand) - pub(super) fn encode_invlpg(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("invlpg requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x01]); - self.encode_modrm_mem(7, mem) - } - _ => Err("invlpg requires memory operand".to_string()), - } - } - - /// Encode VERW: 0F 00 /5 - pub(super) fn encode_verw(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("verw requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x00]); - self.encode_modrm_mem(5, mem) - } - Operand::Register(reg) => { - let rm = reg_num(®.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0x00]); - self.bytes.push(self.modrm(3, 5, rm)); - Ok(()) - } - _ => Err("verw requires memory or register operand".to_string()), - } - } - - /// Encode LSL (Load Segment 
Limit): 0F 03 /r - pub(super) fn encode_lsl(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err("lsl requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - let is_16 = matches!(src.name.as_str(), "ax"|"bx"|"cx"|"dx"|"si"|"di"|"sp"|"bp"); - if is_16 { - self.bytes.push(0x66); - } - self.bytes.extend_from_slice(&[0x0F, 0x03]); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0x03]); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err("unsupported lsl operands".to_string()), - } - } - - /// Encode SGDT/SIDT/LGDT/LIDT: 0F 01 /N (memory operand) - pub(super) fn encode_system_table(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 1 { - return Err(format!("{} requires 1 operand", mnemonic)); - } - // Strip optional 'l' suffix (e.g., "lgdtl" -> "lgdt") - let base = mnemonic.strip_suffix('l').unwrap_or(mnemonic); - let reg_ext = match base { - "sgdt" => 0, - "sidt" => 1, - "lgdt" => 2, - "lidt" => 3, - _ => return Err(format!("unknown system table instruction: {}", mnemonic)), - }; - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x01]); - self.encode_modrm_mem(reg_ext, mem) - } - // Label as absolute memory reference: lgdtl tr_gdt - Operand::Label(label) => { - self.bytes.extend_from_slice(&[0x0F, 0x01]); - // mod=00, rm=101 for disp32 (no base register) - self.bytes.push(self.modrm(0, reg_ext, 5)); - self.add_relocation_for_label(label, R_386_32); - self.bytes.extend_from_slice(&[0, 0, 0, 0]); - Ok(()) - } - _ => Err(format!("{} requires memory operand", mnemonic)), - } - } - - /// Encode LMSW (Load Machine Status 
Word): 0F 01 /6 - /// Accepts a 16-bit register or memory operand. - pub(super) fn encode_lmsw(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("lmsw requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let rm = reg_num(®.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0x01]); - self.bytes.push(self.modrm(3, 6, rm)); - Ok(()) - } - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x01]); - self.encode_modrm_mem(6, mem) - } - _ => Err("lmsw requires register or memory operand".to_string()), - } - } - - /// Encode SMSW (Store Machine Status Word): 0F 01 /4 - /// Accepts a 16-bit register or memory operand. - /// Register form gets a 66h prefix for 16-bit operand size. - pub(super) fn encode_smsw(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("smsw requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let rm = reg_num(®.name).ok_or("bad register")?; - // 16-bit register form needs operand size prefix - let is_16 = matches!(reg.name.as_str(), "ax"|"bx"|"cx"|"dx"|"si"|"di"|"sp"|"bp"); - if is_16 { - self.bytes.push(0x66); - } - self.bytes.extend_from_slice(&[0x0F, 0x01]); - self.bytes.push(self.modrm(3, 4, rm)); - Ok(()) - } - Operand::Memory(mem) => { - self.bytes.extend_from_slice(&[0x0F, 0x01]); - self.encode_modrm_mem(4, mem) - } - _ => Err("smsw requires register or memory operand".to_string()), - } - } - - /// Encode MOV to/from control register: 0F 20 /r (read) or 0F 22 /r (write) - pub(super) fn encode_mov_cr(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err("mov cr requires 2 operands".to_string()); - } - match (&ops[0], &ops[1]) { - (Operand::Register(cr), Operand::Register(gp)) if is_control_reg(&cr.name) => { - let cr_num = control_reg_num(&cr.name).ok_or("bad control register")?; - let gp_num = reg_num(&gp.name).ok_or("bad register")?; - 
self.bytes.extend_from_slice(&[0x0F, 0x20]); - self.bytes.push(self.modrm(3, cr_num, gp_num)); - Ok(()) - } - (Operand::Register(gp), Operand::Register(cr)) if is_control_reg(&cr.name) => { - let cr_num = control_reg_num(&cr.name).ok_or("bad control register")?; - let gp_num = reg_num(&gp.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&[0x0F, 0x22]); - self.bytes.push(self.modrm(3, cr_num, gp_num)); - Ok(()) - } - _ => Err("unsupported mov cr operands".to_string()), - } - } - - /// Encode MOV to/from segment register - pub(super) fn encode_mov_seg(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 2 { - return Err("mov seg requires 2 operands".to_string()); - } - - let seg_num = |name: &str| -> Option { - match name { - "es" => Some(0), - "cs" => Some(1), - "ss" => Some(2), - "ds" => Some(3), - "fs" => Some(4), - "gs" => Some(5), - _ => None, - } - }; - - match (&ops[0], &ops[1]) { - // mov %sreg, %reg32 - (Operand::Register(src), Operand::Register(dst)) if is_segment_reg(&src.name) => { - let sr = seg_num(&src.name).ok_or("bad segment register")?; - let gp = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.push(0x8C); - self.bytes.push(self.modrm(3, sr, gp)); - Ok(()) - } - // mov %reg32, %sreg - (Operand::Register(src), Operand::Register(dst)) if is_segment_reg(&dst.name) => { - let gp = reg_num(&src.name).ok_or("bad register")?; - let sr = seg_num(&dst.name).ok_or("bad segment register")?; - self.bytes.push(0x8E); - self.bytes.push(self.modrm(3, sr, gp)); - Ok(()) - } - // mov %sreg, mem - (Operand::Register(src), Operand::Memory(mem)) if is_segment_reg(&src.name) => { - let sr = seg_num(&src.name).ok_or("bad segment register")?; - self.bytes.push(0x8C); - self.encode_modrm_mem(sr, mem) - } - // mov mem, %sreg - (Operand::Memory(mem), Operand::Register(dst)) if is_segment_reg(&dst.name) => { - let sr = seg_num(&dst.name).ok_or("bad segment register")?; - self.bytes.push(0x8E); - self.encode_modrm_mem(sr, mem) - } - _ => 
Err("unsupported mov seg operands".to_string()), - } - } - - /// Encode popw (16-bit pop) - pub(super) fn encode_pop16(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("popw requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - if is_segment_reg(®.name) { - // Segment register pops don't use 0x66 prefix - match reg.name.as_str() { - "es" => { self.bytes.push(0x07); Ok(()) } - "ss" => { self.bytes.push(0x17); Ok(()) } - "ds" => { self.bytes.push(0x1F); Ok(()) } - "fs" => { self.bytes.extend_from_slice(&[0x0F, 0xA1]); Ok(()) } - "gs" => { self.bytes.extend_from_slice(&[0x0F, 0xA9]); Ok(()) } - _ => Err(format!("cannot pop to {}", reg.name)), - } - } else { - let num = reg_num(®.name).ok_or("bad register")?; - self.bytes.push(0x66); - self.bytes.push(0x58 + num); - Ok(()) - } - } - _ => Err("unsupported popw operand".to_string()), - } - } - - /// Encode 16-bit BSF/BSR: bsfw/bsrw - pub(super) fn encode_bsr_bsf_16(&mut self, ops: &[Operand], mnemonic: &str) -> Result<(), String> { - if ops.len() != 2 { - return Err(format!("{} requires 2 operands", mnemonic)); - } - let opcode = match mnemonic { - "bsrw" => [0x0F, 0xBD], - "bsfw" => [0x0F, 0xBC], - _ => return Err(format!("unknown bit scan: {}", mnemonic)), - }; - self.bytes.push(0x66); // 16-bit operand size prefix - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_num = reg_num(&src.name).ok_or("bad register")?; - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&opcode); - self.bytes.push(self.modrm(3, dst_num, src_num)); - Ok(()) - } - (Operand::Memory(mem), Operand::Register(dst)) => { - let dst_num = reg_num(&dst.name).ok_or("bad register")?; - self.bytes.extend_from_slice(&opcode); - self.encode_modrm_mem(dst_num, mem) - } - _ => Err(format!("unsupported {} operands", mnemonic)), - } - } -} diff --git a/src/backend/i686/assembler/encoder/x87.rs 
b/src/backend/i686/assembler/encoder/x87.rs deleted file mode 100644 index 2d88fb81b0..0000000000 --- a/src/backend/i686/assembler/encoder/x87.rs +++ /dev/null @@ -1,279 +0,0 @@ -//! x87 FPU instruction encoders for i686. -//! -//! Handles x87 floating-point load/store, arithmetic, comparison, -//! and control instructions. - -use super::*; - -impl super::InstructionEncoder { - // ---- x87 FPU encoding (identical to x86-64, no REX needed) ---- - - pub(super) fn encode_x87_mem(&mut self, ops: &[Operand], opcode: &[u8], ext: u8) -> Result<(), String> { - if ops.len() != 1 { - return Err("x87 mem op requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Memory(mem) => { - self.bytes.extend_from_slice(opcode); - self.encode_modrm_mem(ext, mem) - } - _ => Err("x87 mem op requires memory operand".to_string()), - } - } - - pub(super) fn encode_fcomip(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() == 2 { - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDF, 0xF0 + n]); - Ok(()) - } - _ => Err("fcomip requires st register".to_string()), - } - } else if ops.is_empty() { - self.bytes.extend_from_slice(&[0xDF, 0xF1]); - Ok(()) - } else { - Err("fcomip requires 0 or 2 operands".to_string()) - } - } - - pub(super) fn encode_fucomip(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() == 2 { - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDF, 0xE8 + n]); - Ok(()) - } - _ => Err("fucomip requires st register".to_string()), - } - } else if ops.is_empty() { - self.bytes.extend_from_slice(&[0xDF, 0xE9]); - Ok(()) - } else { - Err("fucomip requires 0 or 2 operands".to_string()) - } - } - - pub(super) fn encode_fld_st(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("fld requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let n = 
parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xD9, 0xC0 + n]); - Ok(()) - } - _ => Err("fld requires st register".to_string()), - } - } - - pub(super) fn encode_fstp_st(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() != 1 { - return Err("fstp requires 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDD, 0xD8 + n]); - Ok(()) - } - _ => Err("fstp requires st register".to_string()), - } - } - - pub(super) fn encode_fxch(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.is_empty() { - // fxch with no operand defaults to st(1) - self.bytes.extend_from_slice(&[0xD9, 0xC9]); - return Ok(()); - } - if ops.len() == 1 || ops.len() == 2 { - // With 1 operand: fxch %st(i) - // With 2 operands: fxch %st(i), %st (AT&T syntax) - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xD9, 0xC8 + n]); - Ok(()) - } - _ => Err("fxch requires st register".to_string()), - } - } else { - Err("fxch requires 0, 1 or 2 operands".to_string()) - } - } - - /// Encode fnstsw (store FPU status word). - pub(super) fn encode_fnstsw(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.is_empty() { - // fnstsw with no operand defaults to %ax - self.bytes.extend_from_slice(&[0xDF, 0xE0]); - return Ok(()); - } - if ops.len() != 1 { - return Err("fnstsw requires 0 or 1 operand".to_string()); - } - match &ops[0] { - Operand::Register(reg) if reg.name == "ax" => { - self.bytes.extend_from_slice(&[0xDF, 0xE0]); - Ok(()) - } - Operand::Memory(mem) => { - self.bytes.push(0xDD); - self.encode_modrm_mem(7, mem) - } - _ => Err("fnstsw requires %ax or memory operand".to_string()), - } - } - - /// Encode fucomi (unordered compare and set EFLAGS). 
- pub(super) fn encode_fucomi(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() == 2 { - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDB, 0xE8 + n]); - Ok(()) - } - _ => Err("fucomi requires st register".to_string()), - } - } else if ops.is_empty() { - self.bytes.extend_from_slice(&[0xDB, 0xE9]); - Ok(()) - } else { - Err("fucomi requires 0 or 2 operands".to_string()) - } - } - - /// Encode fucomp (unordered compare and pop, sets FPU status word). - pub(super) fn encode_fucomp(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() == 1 { - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDD, 0xE8 + n]); - Ok(()) - } - _ => Err("fucomp requires st register".to_string()), - } - } else if ops.len() == 2 { - // AT&T syntax: fucomp %st(1), %st — first operand is the source - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDD, 0xE8 + n]); - Ok(()) - } - _ => Err("fucomp requires st register".to_string()), - } - } else if ops.is_empty() { - // Default: fucomp %st(1) - self.bytes.extend_from_slice(&[0xDD, 0xE9]); - Ok(()) - } else { - Err("fucomp requires 0, 1 or 2 operands".to_string()) - } - } - - /// Encode fucom (unordered compare, sets FPU status word, no pop). 
- pub(super) fn encode_fucom(&mut self, ops: &[Operand]) -> Result<(), String> { - if ops.len() == 1 { - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDD, 0xE0 + n]); - Ok(()) - } - _ => Err("fucom requires st register".to_string()), - } - } else if ops.len() == 2 { - // AT&T syntax: fucom %st(1), %st — first operand is the source - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[0xDD, 0xE0 + n]); - Ok(()) - } - _ => Err("fucom requires st register".to_string()), - } - } else if ops.is_empty() { - // Default: fucom %st(1) - self.bytes.extend_from_slice(&[0xDD, 0xE1]); - Ok(()) - } else { - Err("fucom requires 0, 1 or 2 operands".to_string()) - } - } - - /// Map TLS modifier string to relocation type. - pub(super) fn tls_reloc_type(&self, modifier: &str) -> u32 { - match modifier { - "NTPOFF" => R_386_TLS_LE_32, - "TPOFF" => R_386_32S, - "TLSGD" => R_386_TLS_GD, - "TLSLDM" => R_386_TLS_LDM, - "DTPOFF" => R_386_TLS_LDO_32, - "GOT" => R_386_GOT32, - "GOTOFF" => R_386_GOTOFF, - "PLT" => R_386_PLT32, - "GOTPC" => R_386_GOTPC, - "GOTNTPOFF" | "INDNTPOFF" => R_386_TLS_IE, - _ => R_386_32, - } - } - - /// Encode x87 register-register arithmetic (fadd/fmul/fsub/fdiv with st(i) operands). 
- pub(super) fn encode_x87_arith_reg(&mut self, ops: &[Operand], opcode_st0: u8, opcode_sti: u8, base_modrm: u8) -> Result<(), String> { - match ops.len() { - 0 => { - // Default: fadd %st(1), %st (i.e., st(0) = st(0) op st(1)) - self.bytes.extend_from_slice(&[opcode_st0, base_modrm + 1]); - Ok(()) - } - 1 => { - // fadd %st(i) -> st(0) = st(0) op st(i) - match &ops[0] { - Operand::Register(reg) => { - let n = parse_st_num(®.name)?; - self.bytes.extend_from_slice(&[opcode_st0, base_modrm + n]); - Ok(()) - } - _ => Err("x87 arith requires st register operand".to_string()), - } - } - 2 => { - // Two operands: fadd %st(i), %st or fadd %st, %st(i) - match (&ops[0], &ops[1]) { - (Operand::Register(src), Operand::Register(dst)) => { - let src_n = parse_st_num(&src.name)?; - let dst_n = parse_st_num(&dst.name)?; - if dst_n == 0 { - // fadd %st(i), %st -> D8 (base + i) - self.bytes.extend_from_slice(&[opcode_st0, base_modrm + src_n]); - } else if src_n == 0 { - // fadd %st, %st(i) -> DC (base + i) - // Note: fsub/fdiv swap in DC encoding - let dc_modrm = match base_modrm { - 0xC0 => 0xC0, // fadd - 0xC8 => 0xC8, // fmul - 0xE0 => 0xE8, // fsub -> fsubr encoding in DC - 0xF0 => 0xF8, // fdiv -> fdivr encoding in DC - _ => base_modrm, - }; - self.bytes.extend_from_slice(&[opcode_sti, dc_modrm + dst_n]); - } else { - return Err("x87 arith: one operand must be st(0)".to_string()); - } - Ok(()) - } - _ => Err("x87 arith requires st register operands".to_string()), - } - } - _ => Err("x87 arith requires 0-2 operands".to_string()), - } - } -} diff --git a/src/backend/i686/assembler/mod.rs b/src/backend/i686/assembler/mod.rs deleted file mode 100644 index 6c6f1cb47f..0000000000 --- a/src/backend/i686/assembler/mod.rs +++ /dev/null @@ -1,31 +0,0 @@ -//! Native i686 (32-bit x86) assembler: parses AT&T syntax assembly and produces -//! 32-bit ELF .o files. -//! -//! Reuses the x86-64 parser since AT&T syntax is identical, but provides its own -//! 
instruction encoder (no REX prefixes, 32-bit default operand size) and 32-bit -//! ELF writer (ELFCLASS32, EM_386, Elf32_Sym/Elf32_Rel). -//! -//! Architecture: -//! - Parser: reused from `super::super::x86::assembler::parser` -//! - `encoder.rs` – Encode i686 instructions into machine code bytes (no REX) -//! - `elf_writer.rs` – Write 32-bit ELF object files - -pub mod encoder; -pub mod elf_writer; - -// Re-export the x86 parser – AT&T syntax is the same for both architectures -pub use crate::backend::x86::assembler::parser::parse_asm; - -use elf_writer::ElfWriter; - -/// Assemble AT&T syntax i686 assembly text into a 32-bit ELF object file. -/// -/// This is the default assembler (used when the `gcc_assembler` feature is disabled). -pub fn assemble(asm_text: &str, output_path: &str) -> Result<(), String> { - let items = parse_asm(asm_text)?; - let obj = ElfWriter::new(); - let elf_bytes = obj.build(&items)?; - std::fs::write(output_path, &elf_bytes) - .map_err(|e| format!("Failed to write object file: {}", e))?; - Ok(()) -} diff --git a/src/backend/i686/codegen/README.md b/src/backend/i686/codegen/README.md deleted file mode 100644 index 90f99cf942..0000000000 --- a/src/backend/i686/codegen/README.md +++ /dev/null @@ -1,707 +0,0 @@ -# i686 Backend -- 32-bit x86 Code Generator - -## Overview - -The i686 backend targets 32-bit x86 (IA-32) processors, emitting AT&T-syntax -assembly. It implements the `ArchCodegen` trait that the shared code -generation framework dispatches to, producing one `.s` file per translation -unit. The backend includes a builtin 32-bit assembler and linker that reuse -the x86-64 AT&T parser with a 32-bit encoder, producing ELFCLASS32 executables. - -The default calling convention is **cdecl** (System V i386 ABI): all arguments -are passed on the stack, pushed right-to-left, and the caller cleans up. -Return values are placed in `%eax` (32-bit scalars), `%eax:%edx` (64-bit -integers), or `st(0)` (float, double, long double). 
Two alternative calling -conventions are also supported: **`-mregparm=N`** (first 1--3 integer -arguments in `%eax`, `%edx`, `%ecx`) and **`__attribute__((fastcall))`** -(first two DWORD-or-smaller arguments in `%ecx`, `%edx`, callee cleans -the stack). - -The backend operates as an *accumulator machine*: intermediate results flow -through `%eax` (and `%edx` for the upper half of 64-bit values), with a -lightweight register allocator that promotes hot IR values into callee-saved -registers. A post-emission peephole optimizer cleans up the redundant -store/load traffic this style produces. - ---- - -## Table of Contents - -1. [ILP32 Type Model](#ilp32-type-model) -2. [File Inventory](#file-inventory) -3. [Calling Convention](#calling-convention) -4. [Register Allocation](#register-allocation) -5. [Stack Frame Layout](#stack-frame-layout) -6. [64-bit Operation Splitting](#64-bit-operation-splitting) -7. [F128 / Long Double via x87 FPU](#f128--long-double-via-x87-fpu) -8. [Inline Assembly Support](#inline-assembly-support) -9. [Intrinsics](#intrinsics) -10. [Peephole Optimizer](#peephole-optimizer) -11. [Codegen Options](#codegen-options) -12. [Key Design Decisions and Challenges](#key-design-decisions-and-challenges) - ---- - -## ILP32 Type Model - -The i686 target uses the ILP32 data model, which differs from LP64 (x86-64) -in several important ways: - -| Type | i686 (ILP32) | x86-64 (LP64) | -|------|:------------:|:--------------:| -| `char` | 1 | 1 | -| `short` | 2 | 2 | -| `int` | 4 | 4 | -| `long` | **4** | 8 | -| `long long` | 8 | 8 | -| pointer | **4** | 8 | -| `size_t` | **4** | 8 | -| `float` | 4 | 4 | -| `double` | 8 | 8 | -| `long double` | **12** (80-bit x87) | 16 (80-bit x87, padded) | - -The key consequences for code generation: - -- **Pointers are 4 bytes.** Address arithmetic, GEP offsets, and pointer - loads/stores all use `movl` and 32-bit registers. The assembler pointer - directive is `.long` (not `.quad`). 
-- **`long` is 4 bytes**, so `long` and `int` are identical in size. - This means `long` function parameters need no special treatment relative - to `int`. -- **`long long` (64-bit) does not fit in a single register** and must be - split across `%eax:%edx` pairs, creating the register-pair splitting - described below. -- **`long double` is native 80-bit x87 extended precision**, stored in - 12-byte stack slots (10 bytes of data, 2 bytes padding). It is loaded - and stored with `fldt`/`fstpt` directly, not via software emulation. - ---- - -## File Inventory - -All code generation logic lives under `src/backend/i686/codegen/`: - -| File | Responsibility | -|------|---------------| -| `emit.rs` | `I686Codegen` struct and `ArchCodegen` trait impl. Core accumulator helpers (`operand_to_eax`, `operand_to_ecx`, `store_eax_to`), x87 FPU load/store helpers (`emit_f128_load_to_x87`, `emit_f64_load_to_x87`, `emit_f64_store_from_x87`), wide (64-bit) atomic operations via `lock cmpxchg8b`, runtime stubs (`__x86.get_pc_thunk.bx`, `__divdi3`/`__udivdi3`/`__moddi3`/`__umoddi3`), fastcall call emission, segment-override load/store (`%fs:`/`%gs:`), 64-bit bit manipulation (clz, ctz, bswap, popcount), and utility functions. | -| `prologue.rs` | Stack frame setup: `calculate_stack_space`, `emit_prologue`/`emit_epilogue`/`emit_epilogue_and_ret`, parameter storage from stack/registers to slots, register allocator integration, frame pointer omission logic, `aligned_frame_size` computation, and `emit_param_ref` for parameter re-reads. | -| `calls.rs` | Call ABI: stack argument layout, `regparm` register argument emission (reverse-order to avoid clobbering `%eax`), call instruction emission (direct/indirect/PLT), result retrieval (`%eax`, `%eax:%edx`, `st(0)` for float/double/F128). 
| -| `memory.rs` | Load/store for all type widths, 64-bit and F128 split load/store via `%eax:%edx` and x87, constant-offset load/store with offset folding, GEP address computation (direct, indirect, over-aligned), dynamic alloca support, memcpy emission via `rep movsb`, and over-aligned alloca handling via runtime `leal`+`andl` alignment. | -| `alu.rs` | Integer ALU: `add`/`sub`/`mul`/`and`/`or`/`xor`/`shl`/`shr`/`sar`, signed and unsigned division (`idivl`/`divl`), LEA strength reduction for multiply by 3/5/9, immediate-operand fast paths, integer negation (`negl`), bitwise NOT (`notl`), CLZ (`lzcntl`), CTZ (`tzcntl`), bswap, popcount, and F32 negation (SSE `xorps` with sign-bit mask). (F64 negation uses x87 `fchs` in `emit.rs`; F128 negation is in `float_ops.rs`.) | -| `i128_ops.rs` | 64-bit register-pair operations (called "i128" in the shared trait): `add`/`adc`, `sub`/`sbb`, `mul` (schoolbook cross-product), `shld`/`shrd` shifts with 32-bit boundary handling, constant shift specializations, comparisons (`cmpl`+`sete`+`andb` for equality, high-first branching for ordered), `__divdi3`/`__udivdi3` calls for division, float conversions via x87 `fildq`/`fisttpq` with unsigned 2^63 correction. | -| `comparison.rs` | Float comparisons (SSE `ucomiss` for F32, x87 `fucomip` for F64/F128), integer comparisons (`cmpl` + `setCC` for all 10 comparison operators), fused compare-and-branch (`cmpl` + `jCC`), and `select` via conditional branching (test condition, branch to true/false label, copy appropriate value). | -| `casts.rs` | Type conversions: integer widening (`movsbl`/`movzbl`/`movswl`/`movzwl`) and narrowing, float-to-int and int-to-float via x87 (`fildl`/`fildq`/`fisttpl`/`fisttpq`), F128 conversions via `fldt`/`fstpt`, unsigned-to-float fixup for values with the sign bit set (2^64 / 2^63 correction paths), SSE scalar F32 casts (`cvtsi2ssl`/`cvttss2si`), and I64 widening/narrowing (sign-extension via `cltd` and half-word extraction). 
| -| `returns.rs` | Return value placement: 64-bit in `%eax:%edx` (loaded via `emit_load_acc_pair`), F32 returned in `st(0)` (pushed from `%eax` bit pattern via `flds`), F64 returned in `st(0)` (loaded from `%eax:%edx` 8-byte pair via `fldl`), F128 returned in `st(0)` (loaded via `fldt`), 32-bit scalars in `%eax` (no-op). Second return value accessors for F32/F64/F128 multi-register returns. | -| `float_ops.rs` | F128 negation: loads value onto x87 via `fldt`, applies `fchs`, stores back via `fstpt`. | -| `globals.rs` | Global/label address loading: absolute mode (`movl $name`), PIC mode (`@GOT(%ebx)`/`@GOTOFF(%ebx)` relative to GOT base), TLS access (`@GOTNTPOFF(%ebx)` in PIC, `@NTPOFF` in non-PIC, both reading `%gs:0` for the thread pointer base). | -| `variadic.rs` | `va_start` (compute stack pointer to first unnamed argument via `leal`), `va_arg` (load from `va_list` pointer, advance by argument size with 4-byte minimum; special handling for I64/U64/F64 8-byte types, F128 12-byte types via `fldt`, and I128 16-byte quad-word copy), `va_copy` (copy 4-byte pointer). On i686, `va_list` is a simple pointer into the stack frame. | -| `atomics.rs` | 32-bit atomic operations: `lock xadd` for add, `xchg` for exchange, `lock cmpxchg` loops for sub/and/or/xor/nand, `xchgb` for test-and-set, `lock cmpxchg` for CAS, atomic load/store via plain `mov` (with `mfence` for SeqCst), and `mfence` for fences. 64-bit atomics are in `emit.rs` via `lock cmpxchg8b`. | -| `intrinsics.rs` | SSE packed 128-bit operations (arithmetic, compare, logical, shuffle, shift, insert/extract), AES-NI (`aesenc`/`aesenclast`/`aesdec`/`aesdeclast`/`aesimc`/`aeskeygenassist`), CLMUL (`pclmulqdq`), CRC32 (`crc32b`/`crc32l`; 64-bit CRC32 emulated via two 32-bit ops), memory fences (`lfence`/`mfence`/`sfence`/`pause`/`clflush`), non-temporal stores (`movnti`/`movntdq`/`movntpd`), x87 FPU math (`fsqrt`/`fabs` for F32/F64), frame/return address intrinsics, and thread pointer (`%gs:0`). 
| -| `inline_asm.rs` | Inline assembly template substitution (delegates to shared x86 parser) and operand formatting with size modifiers (`%b` for 8-bit low, `%h` for 8-bit high, `%w` for 16-bit, `%k` for 32-bit). | -| `asm_emitter.rs` | `InlineAsmEmitter` trait impl: GCC constraint classification (`r`/`q` for GP, `a`/`b`/`c`/`d`/`S`/`D` for specific registers, `m` for memory, `i` for immediate, `t`/`u` for x87 `st(0)`/`st(1)`, `{regname}` for explicit registers, `=@cc` for condition codes, digit-tied operands), scratch register allocation from 6 GP registers (`ecx`/`edx`/`esi`/`edi`/`eax`/`ebx`) and 8 XMM registers (`xmm0`-`xmm7`), operand loading/storing for GP, XMM, x87 FPU stack, 64-bit register pairs, and condition code outputs (`=@cc` via `setCC`/`movzbl`), memory operand resolution (ebp-relative, over-aligned, indirect), and memory fallback when GP registers are exhausted. | -| `peephole.rs` | Post-emission assembly optimizer (see dedicated section below). | -| `mod.rs` | Module declarations and visibility. | - ---- - -## Calling Convention - -### cdecl (Default) - -The standard System V i386 ABI: - -``` - Caller's frame - ┌──────────────────────┐ higher addresses - │ arg N │ ← pushed first (right-to-left) - │ ... │ - │ arg 1 │ - │ arg 0 │ - │ return address │ ← pushed by CALL - ├──────────────────────┤ - │ saved %ebp │ ← pushed by prologue (unless -fomit-frame-pointer) - │ saved callee-saved │ ← %ebx, %esi, %edi (as needed) - │ local variables │ - │ spill slots │ - └──────────────────────┘ ← %esp (16-byte aligned at call sites) -``` - -- All arguments are on the stack. The caller adjusts `%esp` after the call - to remove them. -- The stack is aligned to 16 bytes at the `call` instruction (modern i386 ABI - requirement). -- `%eax`, `%ecx`, `%edx` are caller-saved (scratch). -- `%ebx`, `%esi`, `%edi`, `%ebp` are callee-saved. - -### `-mregparm=N` (N = 1, 2, or 3) - -Passes the first N integer/pointer arguments in registers instead of on the -stack. 
The register order is `%eax`, `%edx`, `%ecx`. This is used -extensively by the Linux kernel. The `CallAbiConfig` sets `max_int_regs` to N, -and `emit_call_reg_args` loads the arguments into the appropriate registers in -reverse order to avoid clobbering `%eax` (the accumulator) prematurely. - -### `__attribute__((fastcall))` - -Passes the first two DWORD-or-smaller integer/pointer arguments in `%ecx` and -`%edx`. The callee pops the *stack* arguments on return (callee-cleanup) via -`ret $N`. Implemented via `is_fastcall`, `fastcall_reg_param_count`, and -`fastcall_stack_cleanup` fields on the codegen struct. - -The prologue handles fastcall parameter storage by storing from `%ecx`/`%edx` -to the appropriate stack slots, with sub-integer types (I8, U8, I16, U16) -properly sign/zero-extended before storing. The epilogue emits `ret $N` -where N accounts for the stack bytes the callee must clean up. - -### ABI Configuration - -The `CallAbiConfig` for i686 is (from `calls.rs`): - -``` -max_int_regs: regparm (0-3) // 0 for cdecl, 1-3 for -mregparm=N -max_float_regs: 0 // all floats go on the stack -align_i128_pairs: false // no even-register alignment for i128 -f128_in_fp_regs: false // long double passed on the stack, not FP regs -f128_in_gp_pairs: false // F128 not split into GP register pairs -variadic_floats_in_gp: false // not needed (no FP reg args on i686) -large_struct_by_ref: false // large structs pushed by value on the stack -use_sysv_struct_classification: false // no per-eightbyte classification (x86-64 only) -use_riscv_float_struct_classification: false // not applicable -allow_struct_split_reg_stack: false // no partial reg/stack split -align_struct_pairs: false // no struct pair alignment -``` - -### Return Values - -| Type | Location | -|------|----------| -| `int`, `long`, pointer | `%eax` | -| `long long` / 64-bit | `%eax` (low), `%edx` (high) | -| `float` | `st(0)` (pushed from `%eax` bit pattern via `flds`) | -| `double` | `st(0)` (loaded from 
`%eax:%edx` 8-byte pair via `fldl`) | -| `long double` (F128) | `st(0)` (loaded via `fldt`) | - ---- - -## Register Allocation - -The i686 backend has only **6 usable general-purpose registers** in total -(excluding `%esp`), of which three are caller-saved scratch: - -| Register | Role | -|----------|------| -| `%eax` | Accumulator -- all intermediate results flow through here | -| `%ecx` | Secondary operand register (shift counts, RHS of binary ops) | -| `%edx` | Upper half of 64-bit results; `idivl`/`divl` remainder | -| `%ebx` | Callee-saved, allocatable (PhysReg 0); GOT base in PIC mode | -| `%esi` | Callee-saved, allocatable (PhysReg 1) | -| `%edi` | Callee-saved, allocatable (PhysReg 2) | -| `%ebp` | Frame pointer (callee-saved; allocatable as PhysReg 3 only with `-fomit-frame-pointer`) | - -No caller-saved registers are available for general allocation -(`I686_CALLER_SAVED` is empty), because `%eax`, `%ecx`, and `%edx` are -consumed by the accumulator-based codegen as implicit scratch registers. - -The register allocator runs before stack space computation and assigns -frequently-used IR values to the callee-saved registers `%ebx`, `%esi`, -`%edi` (and `%ebp` when available). Values assigned to physical registers are -loaded/stored with `movl %reg, ...` instead of going through stack slots, -eliminating memory traffic for the hottest values. - -In **PIC mode**, `%ebx` (PhysReg 0) is reserved as the GOT base pointer -(loaded via `__x86.get_pc_thunk.bx` + `_GLOBAL_OFFSET_TABLE_`) and is excluded -from the allocatable set. It is still saved/restored as a callee-saved -register. - -Inline assembly clobber lists are integrated into allocation: if an `asm` -block clobbers `%esi`, the allocator will not place values in `%esi` across -that block. Generic constraints (`r`, `q`, `g`) conservatively mark all -callee-saved registers as clobbered, since the scratch allocator might pick -any of them. 
- -### Accumulator Register Cache - -The codegen tracks the current contents of `%eax` with a small tag cache -(`reg_cache`). When `operand_to_eax` is called for a value that is already -in `%eax`, the load is skipped. The cache is invalidated on calls, inline -assembly, and any instruction that implicitly clobbers `%eax`. This simple -one-entry cache eliminates a significant fraction of redundant loads without -the complexity of a full register allocator. - ---- - -## Stack Frame Layout - -### With Frame Pointer (default) - -``` - higher addresses - ┌──────────────────────┐ - │ arg 1 │ 12(%ebp) - │ arg 0 │ 8(%ebp) - │ return address │ 4(%ebp) - ├──────────────────────┤ - │ saved %ebp │ 0(%ebp) ← %ebp points here - │ saved %ebx │ -4(%ebp) - │ saved %esi │ -8(%ebp) - │ ... │ - │ local slot 0 │ -N(%ebp) - │ local slot 1 │ -(N+4)(%ebp) - │ ... │ - └──────────────────────┘ ← %esp (16-byte aligned) -``` - -All local slots are referenced as negative offsets from `%ebp`. The total -frame size (the `subl $N, %esp` in the prologue) is rounded up so that -`%esp` is 16-byte aligned, accounting for the saved `%ebp`, return address, -and callee-saved register pushes. - -Stack slots are 4-byte granularity by default. 64-bit values get 8-byte -slots; F128 (long double) gets 12-byte slots; 128-bit integers get 16-byte -slots. Over-aligned allocas (e.g., `__attribute__((aligned(16)))`) get -extra space and are dynamically aligned at access time with -`leal`/`addl`/`andl` sequences. - -### Frame Alignment - -The `aligned_frame_size` function ensures `%esp` is 16-byte aligned after -the prologue completes. 
It accounts for the fixed overhead on the stack -(callee-saved register pushes + return address + saved `%ebp` if present): - -``` -fixed_overhead = callee_saved_bytes + 8 (with FP: saved ebp + return addr) -fixed_overhead = callee_saved_bytes + 4 (without FP: return addr only) -raw_locals = raw_space - callee_saved_bytes -needed = raw_locals + fixed_overhead -aligned = (needed + 15) & !15 -frame_size = aligned - fixed_overhead -``` - -This rounds up the total stack usage (locals + overhead) to the next -16-byte boundary, then subtracts the fixed overhead to get the `subl` -operand for the prologue. - -The alignment bias (8 with FP, 12 without FP) also appears in the -per-alloca slot allocation logic, where it ensures that allocas requesting -16-byte or greater alignment land on properly aligned addresses at runtime. - -### Without Frame Pointer (`-fomit-frame-pointer`) - -When the frame pointer is omitted, `%ebp` is freed as a fourth callee-saved -register (PhysReg 3). All stack references use `%esp`-relative addressing -instead. The `slot_ref` helper converts the EBP-relative offsets stored in -`StackSlot` values to ESP-relative offsets by adding `frame_base_offset + -esp_adjust`: - -- `frame_base_offset` = `callee_saved_bytes + frame_size` (set once in the - prologue) -- `esp_adjust` tracks temporary ESP changes during code generation (e.g., - `subl $N, %esp` for call arguments, `pushl` for temporaries) - -This bookkeeping is critical for correctness: every `subl`/`pushl` that -modifies `%esp` increments `esp_adjust`, and every `addl`/`popl` decrements -it, keeping slot references accurate throughout the function body. - -Parameter references require a small correction: without the pushed `%ebp`, -parameters are 4 bytes closer to the current stack frame. The `param_ref` -helper subtracts 4 from the EBP-relative offset before adding the -ESP-relative base. 
- -Dynamic allocas (`alloca` / VLAs) force the frame pointer to remain enabled, -since ESP changes by runtime-computed amounts that cannot be statically -tracked. - ---- - -## 64-bit Operation Splitting - -Because every general-purpose register is 32 bits wide, 64-bit values -(`long long`, `double` bit patterns, `uint64_t`) must be represented as -register pairs or 8-byte stack slots. - -### Register Pair Convention - -The canonical register pair is `%eax:%edx` (low:high). For 64-bit -arithmetic: - -| Operation | Instruction sequence | -|-----------|---------------------| -| Add | `addl` low, `adcl` high | -| Subtract | `subl` low, `sbbl` high | -| Multiply | Cross-multiply with `mull` + `imull`, accumulate partial products into `%edx` | -| Left shift | `shldl %cl, %eax, %edx` / `shll %cl, %eax` with branch on `%cl >= 32` | -| Logical right shift | `shrdl %cl, %edx, %eax` / `shrl %cl, %edx` with branch on `%cl >= 32` | -| Arithmetic right shift | `shrdl %cl, %edx, %eax` / `sarl %cl, %edx` with sign-extend fixup | -| Bitwise ops | Pair of `andl`/`orl`/`xorl` on both halves | -| Negate | `notl` both halves, `addl $1` low, `adcl $0` high | -| Bitwise NOT | `notl` both halves | -| Compare (eq/ne) | `cmpl` + `sete` on each half, `andb` the results (for ne: `xorb $1`) | -| Compare (ordered) | Compare high halves first; if equal, compare low halves (unsigned for low half, signed/unsigned for high depending on the comparison) | - -The right-hand operand is pushed onto the stack before the operation and -popped afterward, since all scratch registers are occupied by the result pair. - -Constant shifts are specialized inline without branches, using different -sequences for amounts < 32, == 32, and > 32. - -### 64-bit Division and Modulo - -Hardware `divl`/`idivl` only supports 32-bit divisors. 
For 64-bit -division, the backend calls runtime helper functions (`__divdi3`, -`__udivdi3`, `__moddi3`, `__umoddi3`) following the cdecl convention -- both -the dividend and divisor are pushed as 8-byte pairs. The compiler emits -`.weak` implementations of these helpers (based on compiler-rt's algorithms) -so that standalone builds without libgcc can link successfully, while builds -that do link libgcc naturally use its versions instead. - -The division helper stubs use normalized-divisor estimation and are only -emitted when 64-bit division is actually used (`needs_divdi3_helpers` flag on -the codegen state). - -### 64-bit Float Conversions - -Conversions between 64-bit integers and floating-point use the x87 FPU: - -- **Signed i64 to float**: Push the 64-bit value onto the stack, `fildq` - to load as a signed integer onto the x87 stack, then `fstps`/`fstpl` to - convert to the target float type. -- **Unsigned u64 to float**: Same as signed, but with a 2^63 correction - path: if the high bit is set, the value is halved (right shift by 1), - converted via `fildq`, then doubled via `fadd %st(0), %st(0)`. -- **Float to signed i64**: Load the float onto x87 (`flds`/`fldl`), then - `fisttpq` to truncate and store as a 64-bit integer. - ---- - -## F128 / Long Double via x87 FPU - -On i686, `long double` maps to the x87 80-bit extended precision format -(10 bytes of data, stored in 12-byte aligned slots). Unlike x86-64, where -F128 is often software-emulated via `__float128` library calls, the i686 -backend uses the x87 FPU natively: - -- **Load:** `fldt offset(%ebp)` pushes the 80-bit value onto `st(0)`. -- **Store:** `fstpt offset(%ebp)` pops `st(0)` and writes 10 bytes. -- **Arithmetic:** `faddp`, `fsubp`, `fmulp`, `fdivp` operate on the x87 - stack. -- **Negation:** `fchs` negates `st(0)`. 
-- **Comparison:** Two values are loaded onto the x87 stack; `fucomip` - compares `st(0)` with `st(1)` and sets EFLAGS directly (P6+ feature), - followed by `fstp %st(0)` to pop the remaining operand. -- **Conversions:** Integer-to-F128 uses `fildl`/`fildq` (load integer from - memory to x87); F128-to-integer uses `fisttpq` (truncate and store). - Float/double to F128 uses `flds`/`fldl`; F128 to float/double uses - `fstps`/`fstpl`. - -Constants are materialized by constructing the 80-bit x87 byte representation -on the stack with `movl`/`movw` and then loading with `fldt`. The -`f128_bytes_to_x87_bytes` helper converts from IEEE binary128 to x87 -extended format. - -Tracking which values are "directly" in F128 slots (vs. loaded through a -pointer) is maintained via the `f128_direct_slots` set in `CodegenState`. - ---- - -## Inline Assembly Support - -### Template Substitution - -The backend delegates to the shared x86 inline assembly parser for template -substitution. Positional (`%0`, `%1`), named (`%[name]`), and modified -operand references are all supported. 
- -### Register Size Modifiers - -| Modifier | Effect | Example (`eax`) | -|----------|--------|-----------------| -| (none) | Default based on operand type (I8/U8→`%al`, I16/U16→`%ax`, else→`%eax`) | depends on IR type | -| `%k0` | 32-bit | `%eax` | -| `%w0` | 16-bit | `%ax` | -| `%b0` | 8-bit low | `%al` | -| `%h0` | 8-bit high | `%ah` | -| `%c0` / `%P0` | Bare constant (no `$` prefix) | `42` | -| `%n0` | Negated constant | `-42` | -| `%a0` | Address reference | `symbol` (memory operand form) | - -### Constraint Classification - -The backend recognizes the GCC/Clang i386 constraint vocabulary: - -| Constraint | Meaning | -|------------|---------| -| `r`, `q`, `R`, `l` | General-purpose register | -| `Q` | Byte-addressable register (al/bl/cl/dl) | -| `x`, `v`, `Y` | SSE/XMM register | -| `m`, `o`, `V`, `p` | Memory operand | -| `i`, `I`, `n`, `N`, `e`, `E`, `K`, `M`, `G`, `H`, `J`, `L`, `O` | Immediate | -| `g` | General (register, memory, or immediate) | -| `a` / `b` / `c` / `d` / `S` / `D` | Specific registers (`eax`/`ebx`/`ecx`/`edx`/`esi`/`edi`) | -| `t` / `u` | x87 `st(0)` / `st(1)` | -| `{regname}` | Explicit register | -| `=@cc` | Condition code output (emits `set{cc}` + `movzbl`) | -| `0`-`9` | Tied to operand N | - -### Scratch Register Pools - -Inline assembly allocates scratch registers from: -- **GP**: `ecx`, `edx`, `esi`, `edi`, `eax`, `ebx` (6 registers) -- **XMM**: `xmm0` through `xmm7` (8 registers) - -When all GP registers are exhausted by operand assignments, the allocator -falls back to memory operands (stack slot references). - -### x87 FPU Stack Operands - -The `t` and `u` constraints map to `st(0)` and `st(1)` respectively. For -input operands, values are loaded onto the x87 stack via `fldt`/`fldl`/`flds`. -For output operands, the x87 stack top is stored back via `fstpt`/`fstpl`/ -`fstps`. This supports inline assembly that operates on x87 registers -directly (common in math library code and legacy FPU routines). 
- -### 64-bit Register Pairs - -For I64/U64 types, the inline assembly emitter handles register pairs -automatically. Each 64-bit operand receives two GP registers (a low and -high half), and template substitution produces the appropriate register -for the requested half. - ---- - -## Intrinsics - -The backend directly emits machine instructions for architecture-specific -intrinsics, avoiding function call overhead. - -### Memory Fences and Hints - -`lfence`, `mfence`, `sfence`, `pause`, `clflush` - -### Non-Temporal Stores - -`movnti` (32-bit), `movntdq` (128-bit integer), `movntpd` (128-bit double) - -### SSE/SSE2 Packed 128-bit Operations - -- **Arithmetic**: `paddw`, `psubw`, `paddd`, `psubd`, `pmulhw`, `pmaddwd` -- **Compare**: `pcmpeqb`, `pcmpeqd`, `pcmpgtw`, `pcmpgtb` -- **Logical**: `pand`, `por`, `pxor`, `psubusb`, `psubsb` -- **Shuffle / Pack**: `pshufd`, `pshuflw`, `pshufhw`, `packssdw`, - `packsswb`, `packuswb`, `punpcklbw`, `punpckhbw`, `punpcklwd`, - `punpckhwd` -- **Shift**: `pslldq`, `psrldq`, `psllq`, `psrlq`, `psllw`, `psrlw`, - `psraw`, `psrad`, `pslld`, `psrld` (with immediate) -- **Move mask**: `pmovmskb` -- **Broadcast**: `set_epi8` (byte splat), `set_epi16` (word splat), - `set_epi32` (dword splat) -- **Load / Store**: `loaddqu`, `storedqu`, `loadldi128` (low 64-bit load), - `storeldi128` (low 64-bit store) -- **Insert / Extract**: `pinsrw`, `pextrw`, `pinsrb`, `pextrb`, `pinsrd`, - `pextrd` (SSE4.1 byte/dword variants) - -### Scalar Float Math - -x87 `fsqrt` (square root for F32/F64), x87 `fabs` (absolute value for F32/F64) - -### AES-NI - -`aesenc`, `aesenclast`, `aesdec`, `aesdeclast`, `aesimc`, `aeskeygenassist` - -### CLMUL - -`pclmulqdq` (carry-less multiplication with immediate selector) - -### CRC32 - -`crc32b`, `crc32w`, `crc32l` (hardware CRC32C). The 64-bit `crc32q` variant -is emulated on i686 via two 32-bit CRC32 operations on each half. 
- -### Builtins - -- `__builtin_frame_address(0)` -- reads `%ebp` -- `__builtin_return_address(0)` -- reads `4(%ebp)` -- `__builtin_thread_pointer()` -- reads `%gs:0` - ---- - -## Peephole Optimizer - -After all assembly text is emitted, the entire function is processed by a -multi-pass peephole optimizer (`peephole.rs`) that eliminates redundancies -inherent in the accumulator-based code generation style. - -### Pass Structure - -1. **Local passes** (iterative, up to 8 rounds): - - **Store/load elimination:** A `movl %eax, -8(%ebp)` immediately - followed by `movl -8(%ebp), %eax` -- the load is removed. If the - load is into a different register, it is converted to a reg-reg move. - - **Self-move elimination:** `movl %eax, %eax` is deleted. - - **Strength reduction:** `addl $1` → `incl`, `subl $1` → `decl`, - `movl $0, %reg` → `xorl %reg, %reg`, with carry-flag safety checks - to avoid breaking sequences that depend on CF. - - **Redundant sign/zero-extension elimination:** A `movsbl ..., %eax` - followed by `movsbl %al, %eax` -- the second is removed. - - **Redundant jump elimination:** An unconditional `jmp` to the - immediately following label is removed. - - **Branch inversion:** A conditional jump over an unconditional jump is - inverted to eliminate the unconditional jump. - - **Reverse move elimination:** A `movl %ecx, %eax` followed by - `movl %eax, %ecx` -- the second is removed. - -2. **Global passes** (single pass): - - **Dead register move elimination:** A `movl %eax, %ecx` where `%ecx` - is never read before being overwritten is removed. - - **Dead store elimination:** A `movl %eax, -8(%ebp)` where the slot is - written again before being read is removed. - - **Compare+branch fusion:** Detects patterns where a comparison result - is stored, reloaded, and tested (`cmpl + setCC + movzbl + testl %eax - + jne/je`), fusing them into a single `cmpl` + `jCC`. 
- - **Memory operand folding:** Replaces a load-from-slot + ALU-with-register - sequence (`movl -N(%ebp), %ecx; addl %ecx, %eax`) with a single - ALU-with-memory-operand instruction (`addl -N(%ebp), %eax`). - -3. **Local cleanup** (up to 4 rounds): Re-runs local and global passes to - clean up opportunities exposed by the previous round. - -4. **Never-read store elimination:** A global analysis collects all loaded - stack offsets, then removes stores to offsets that are never loaded anywhere - in the function. Conservatively bails if any `leal` address-of or indirect - memory access exists (which could create aliased slot references). - -### Line Classification - -Every assembly line is classified into a `LineKind` enum (`StoreEbp`, -`LoadEbp`, `Move`, `SelfMove`, `Label`, `Jmp`, `JmpIndirect`, `CondJmp`, -`Call`, `Ret`, `Push`, `Pop`, `SetCC`, `Cmp`, `Directive`, `Nop`, `Empty`, -`Other`) for efficient pattern matching. Register operands are mapped to -family IDs (0--7 for `%eax` through `%edi`) so that sub-register aliases -(`%al`, `%ax`, `%eax`) are treated as the same physical register. - -Implicit register uses are tracked for instructions like `cltd` (reads `%eax`, -writes `%edx`), `idivl`/`divl` (reads `%eax:%edx`, writes `%eax:%edx`), -`rep movsb` (uses `%esi`/`%edi`/`%ecx`), and `mull`/`imull` (writes -`%eax:%edx`). This ensures dead-register elimination does not remove moves -that are consumed by implicit register operands. 
- ---- - -## Codegen Options - -These options are applied via `I686Codegen::apply_options()`: - -| Option | CLI Flag | Effect | -|--------|----------|--------| -| `pic` | `-fPIC` | Position-independent code: use `@GOT(%ebx)` for external globals, `@GOTOFF(%ebx)` for local globals, `@PLT` for external calls; reserves `%ebx` as GOT base | -| `regparm` | `-mregparm=N` | Pass first N (1--3) integer/pointer arguments in `%eax`, `%edx`, `%ecx` instead of on the stack | -| `omit_frame_pointer` | `-fomit-frame-pointer` | Skip `%ebp` frame pointer setup; use `%esp`-relative addressing; free `%ebp` as a 4th callee-saved register | -| `no_jump_tables` | `-fno-jump-tables` | Force all switch statements to use compare-and-branch chains instead of jump tables | -| `emit_cfi` | (internal) | Emit `.cfi_startproc`/`.cfi_endproc` CFI directives for DWARF unwinding | - -The `-m16` flag sets `code16gcc` mode, which prepends `.code16gcc` to the -assembly output. This GNU assembler directive causes all subsequent 32-bit -instructions to be emitted with operand-size and address-size override -prefixes, allowing the code to execute in 16-bit real mode while being -written in 32-bit syntax. Used by the Linux kernel's early boot code. -The `.code16gcc` directive is prepended in the backend dispatcher -(`src/backend/mod.rs`) after peephole optimization completes. - ---- - -## Key Design Decisions and Challenges - -### The Accumulator Bottleneck - -With only 6 GPRs total (3 scratch, 3 callee-saved), the i686 backend cannot -use a general-purpose register allocator the way x86-64 can with its 15 -GPRs. Instead, it uses `%eax` as a universal accumulator: every expression -evaluation flows through `%eax`, with `%ecx` as the secondary operand -register for binary operations and `%edx` as the implicit upper-half -register for multiply/divide/64-bit pairs. - -This design is simple and correct, but produces excessive memory traffic -(store to stack, reload from stack). 
The register allocator mitigates this -by assigning the most frequently used values to `%ebx`, `%esi`, `%edi` -(and `%ebp` when available), and the peephole optimizer eliminates the -remaining redundant store/load pairs. - -The accumulator register cache (`reg_cache`) tracks what IR value is currently -in `%eax`, allowing `operand_to_eax` to skip the load when the value is -already present. This simple one-entry cache eliminates a significant -fraction of redundant loads without the complexity of a full register -allocator. - -### 64-bit Values on a 32-bit Machine - -Every 64-bit operation requires careful orchestration of register pairs. -The difficulty is compounded by the scarcity of registers: with `%eax:%edx` -holding the result and `%ecx` needed for shift counts, there are no scratch -registers left for the second operand. The backend resolves this by pushing -the RHS onto the stack and operating against `(%esp)`. - -64-bit comparisons are particularly tricky: ordered comparisons must first -check the high halves, then branch to check the low halves only if the high -halves are equal. This requires careful label management and different -condition codes for the high (signed) and low (unsigned) halves. - -### ESP Tracking for Frame Pointer Omission - -Without `%ebp` as a stable reference point, every temporary ESP adjustment -(pushing call arguments, pushing temporaries for x87 conversions, etc.) -shifts all stack slot addresses. The `esp_adjust` field is meticulously -incremented and decremented around every `pushl`/`subl` and `popl`/`addl` -that modifies `%esp`, and `slot_ref` adds it to every stack access. A -single missed update would silently corrupt all subsequent memory references. - -### PIC Mode and `%ebx` Reservation - -Position-independent code on i686 requires a GOT base register. The -backend reserves `%ebx` for this purpose, loading it in the prologue via -`call __x86.get_pc_thunk.bx` / `addl $_GLOBAL_OFFSET_TABLE_, %ebx`. 
-Global address references use `@GOT(%ebx)` for external symbols and -`@GOTOFF(%ebx)` for local symbols. The `__x86.get_pc_thunk.bx` helper is -emitted as a COMDAT section so that the linker deduplicates it across -translation units. - -### Standalone 64-bit Division Runtime - -Programs that link without libgcc (e.g., musl libc) need compiler-provided -implementations of `__divdi3`, `__udivdi3`, `__moddi3`, and `__umoddi3`. -The backend emits these as `.weak` symbols in the `.text` section, based on -the compiler-rt i386 division algorithms using normalized-divisor estimation. -If libgcc is linked, its strong symbols take precedence. The stubs are only -emitted when 64-bit division is actually used (`needs_divdi3_helpers` flag). - -### 64-bit Atomic Operations - -The i686 ISA has no 64-bit atomic load/store instructions. The backend uses -`lock cmpxchg8b` loops for all 64-bit atomic operations (RMW, cmpxchg, -load, store). This requires `%ebx` and `%ecx` for the desired value and -`%eax:%edx` for the expected/old value, consuming all scratch registers -and `%ebx`. The backend saves `%ebx` and `%esi` on the stack and uses -`%esi` as the pointer register for the duration of the atomic operation. - -F64 values are treated as "atomic wide" alongside I64/U64, since they -require the same 8-byte atomic semantics. - -### Segment-Override Load/Store - -The backend supports `%fs:` and `%gs:` segment-override prefixed memory -accesses, used for thread-local storage and kernel per-CPU data. Both -pointer-based (`%gs:(%ecx)`) and symbol-based (`%gs:symbol`) addressing -forms are supported, with proper type-width register selection (`%al`/`%ax`/ -`%eax` for byte/word/dword operands). - -### Division-by-Constant Optimization (Disabled) - -The IR-level `div_by_const` pass, which replaces integer division by -compile-time constants with multiply-and-shift sequences, is **disabled for -the i686 target**. 
The replacement sequences use `MulHigh` (upper-half -multiply) operations that the IR expresses as 64-bit arithmetic. The i686 -backend truncates 64-bit operations to 32 bits in its accumulator, producing -incorrect results for these sequences. - -Until a 32-bit-aware variant is implemented (using single-operand `imull` -for the upper-half multiply), the backend falls back to hardware -`idivl`/`divl` instructions for all division and modulo operations. The -guard is `!target.is_32bit()` in the optimization pipeline. diff --git a/src/backend/i686/codegen/alu.rs b/src/backend/i686/codegen/alu.rs deleted file mode 100644 index ea0d58a3dd..0000000000 --- a/src/backend/i686/codegen/alu.rs +++ /dev/null @@ -1,135 +0,0 @@ -//! I686Codegen: ALU operations (integer arithmetic, bitwise, shifts). - -use crate::ir::reexports::{IrBinOp, Operand, Value}; -use crate::common::types::IrType; -use crate::emit; -use super::emit::{I686Codegen, alu_mnemonic, shift_mnemonic}; - -impl I686Codegen { - pub(super) fn emit_float_neg_impl(&mut self, ty: IrType) { - if ty == IrType::F32 { - self.state.emit(" movd %eax, %xmm0"); - self.state.emit(" movl $0x80000000, %ecx"); - self.state.emit(" movd %ecx, %xmm1"); - self.state.emit(" xorps %xmm1, %xmm0"); - self.state.emit(" movd %xmm0, %eax"); - } else { - self.state.emit(" xorl $0x80000000, %eax"); - } - } - - pub(super) fn emit_int_neg_impl(&mut self, _ty: IrType) { - self.state.emit(" negl %eax"); - } - - pub(super) fn emit_int_not_impl(&mut self, _ty: IrType) { - self.state.emit(" notl %eax"); - } - - pub(super) fn emit_int_clz_impl(&mut self, ty: IrType) { - if matches!(ty, IrType::I32 | IrType::U32 | IrType::Ptr) { - self.state.emit(" lzcntl %eax, %eax"); - } else if matches!(ty, IrType::I16 | IrType::U16) { - self.state.emit(" lzcntw %ax, %ax"); - } else { - self.state.emit(" lzcntl %eax, %eax"); - } - } - - pub(super) fn emit_int_ctz_impl(&mut self, _ty: IrType) { - // tzcntl works for all integer widths on i686: the value is in %eax - 
// and trailing zero count is the same regardless of nominal width. - self.state.emit(" tzcntl %eax, %eax"); - } - - pub(super) fn emit_int_bswap_impl(&mut self, ty: IrType) { - match ty { - IrType::I16 | IrType::U16 => self.state.emit(" rolw $8, %ax"), - IrType::I32 | IrType::U32 | IrType::Ptr => self.state.emit(" bswapl %eax"), - _ => self.state.emit(" bswapl %eax"), - } - } - - pub(super) fn emit_int_popcount_impl(&mut self, _ty: IrType) { - self.state.emit(" popcntl %eax, %eax"); - } - - pub(super) fn emit_int_binop_impl(&mut self, dest: &Value, op: IrBinOp, lhs: &Operand, rhs: &Operand, _ty: IrType) { - // Immediate optimization for ALU ops - if matches!(op, IrBinOp::Add | IrBinOp::Sub | IrBinOp::And | IrBinOp::Or | IrBinOp::Xor) { - if let Some(imm) = Self::const_as_imm32(rhs) { - self.operand_to_eax(lhs); - let mnem = alu_mnemonic(op); - emit!(self.state, " {}l ${}, %eax", mnem, imm); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - return; - } - } - - // Immediate multiply - if op == IrBinOp::Mul { - if let Some(imm) = Self::const_as_imm32(rhs) { - self.operand_to_eax(lhs); - match imm { - 3 => emit!(self.state, " leal (%eax, %eax, 2), %eax"), - 5 => emit!(self.state, " leal (%eax, %eax, 4), %eax"), - 9 => emit!(self.state, " leal (%eax, %eax, 8), %eax"), - _ => emit!(self.state, " imull ${}, %eax, %eax", imm), - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - return; - } - } - - // Immediate shift - if matches!(op, IrBinOp::Shl | IrBinOp::AShr | IrBinOp::LShr) { - if let Some(imm) = Self::const_as_imm32(rhs) { - self.operand_to_eax(lhs); - let mnem = shift_mnemonic(op); - let shift_amount = (imm as u32) & 31; - emit!(self.state, " {} ${}, %eax", mnem, shift_amount); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - return; - } - } - - // General case: load lhs to eax, rhs to ecx - self.operand_to_eax(lhs); - self.operand_to_ecx(rhs); - - match op { - IrBinOp::Add => self.state.emit(" addl 
%ecx, %eax"), - IrBinOp::Sub => self.state.emit(" subl %ecx, %eax"), - IrBinOp::Mul => self.state.emit(" imull %ecx, %eax"), - IrBinOp::And => self.state.emit(" andl %ecx, %eax"), - IrBinOp::Or => self.state.emit(" orl %ecx, %eax"), - IrBinOp::Xor => self.state.emit(" xorl %ecx, %eax"), - IrBinOp::Shl => self.state.emit(" shll %cl, %eax"), - IrBinOp::AShr => self.state.emit(" sarl %cl, %eax"), - IrBinOp::LShr => self.state.emit(" shrl %cl, %eax"), - IrBinOp::SDiv => { - self.state.emit(" cltd"); - self.state.emit(" idivl %ecx"); - } - IrBinOp::UDiv => { - self.state.emit(" xorl %edx, %edx"); - self.state.emit(" divl %ecx"); - } - IrBinOp::SRem => { - self.state.emit(" cltd"); - self.state.emit(" idivl %ecx"); - self.state.emit(" movl %edx, %eax"); - } - IrBinOp::URem => { - self.state.emit(" xorl %edx, %edx"); - self.state.emit(" divl %ecx"); - self.state.emit(" movl %edx, %eax"); - } - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } -} diff --git a/src/backend/i686/codegen/asm_emitter.rs b/src/backend/i686/codegen/asm_emitter.rs deleted file mode 100644 index 5d0e7f057e..0000000000 --- a/src/backend/i686/codegen/asm_emitter.rs +++ /dev/null @@ -1,672 +0,0 @@ -//! i686 InlineAsmEmitter implementation: constraint classification, register -//! allocation, operand loading/storing, and template substitution for inline asm. -//! -//! Handles 32-bit x86 registers (eax, ebx, ecx, edx, esi, edi) and i686 -//! calling conventions (cdecl, ILP32). - -use std::borrow::Cow; -use crate::ir::reexports::{ - BlockId, - IrConst, - Operand, - Value, -}; -use crate::common::types::IrType; -use crate::backend::state::CodegenState; -use crate::backend::inline_asm::{InlineAsmEmitter, AsmOperandKind, AsmOperand}; -use crate::emit; -use super::emit::I686Codegen; - -/// i686 scratch XMM registers (SSE available on most i686 targets). 
-const I686_XMM_SCRATCH: &[&str] = &["xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"]; - -impl InlineAsmEmitter for I686Codegen { - fn asm_state(&mut self) -> &mut CodegenState { &mut self.state } - - fn classify_constraint(&self, constraint: &str) -> AsmOperandKind { - let c = constraint.trim_start_matches(['=', '+', '&', '%']); - // Explicit register constraint: {regname} - if c.starts_with('{') && c.ends_with('}') { - let reg_name = &c[1..c.len()-1]; - return AsmOperandKind::Specific(reg_name.to_string()); - } - // GCC condition code output: =@cc - if let Some(cond) = c.strip_prefix("@cc") { - return AsmOperandKind::ConditionCode(cond.to_string()); - } - // Tied operand (all digits) - if !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit()) { - if let Ok(n) = c.parse::() { - return AsmOperandKind::Tied(n); - } - } - // x87 FPU stack constraints: "t" = st(0), "u" = st(1) - if c == "t" { - return AsmOperandKind::X87St0; - } - if c == "u" { - return AsmOperandKind::X87St1; - } - - // Multi-alternative constraint parsing (same logic as x86-64 but with 32-bit registers) - let mut has_gp = false; - let mut has_fp = false; - let mut has_mem = false; - let mut has_imm = false; - let mut specific: Option = None; - - for ch in c.chars() { - match ch { - 'r' | 'q' | 'R' | 'Q' | 'l' => has_gp = true, - 'g' => { has_gp = true; has_mem = true; has_imm = true; } - 'x' | 'v' | 'Y' => has_fp = true, - 'm' | 'o' | 'V' | 'p' => has_mem = true, - 'i' | 'I' | 'n' | 'N' | 'e' | 'E' | 'K' | 'M' | 'G' | 'H' | 'J' | 'L' | 'O' => has_imm = true, - 'a' if specific.is_none() => specific = Some("eax".to_string()), - 'b' if specific.is_none() => specific = Some("ebx".to_string()), - 'c' if specific.is_none() => specific = Some("ecx".to_string()), - 'd' if specific.is_none() => specific = Some("edx".to_string()), - 'S' if specific.is_none() => specific = Some("esi".to_string()), - 'D' if specific.is_none() => specific = Some("edi".to_string()), - _ => {} - } - } - - if let 
Some(reg) = specific { - AsmOperandKind::Specific(reg) - } else if has_gp { - AsmOperandKind::GpReg - } else if has_fp { - AsmOperandKind::FpReg - } else if has_mem { - AsmOperandKind::Memory - } else if has_imm { - AsmOperandKind::Immediate - } else { - AsmOperandKind::GpReg - } - } - - fn setup_operand_metadata(&self, op: &mut AsmOperand, val: &Operand, _is_output: bool) { - if matches!(op.kind, AsmOperandKind::Memory) { - if let Operand::Value(v) = val { - if let Some(slot) = self.state.get_slot(v.0) { - if self.state.is_alloca(v.0) { - if self.state.alloca_over_align(v.0).is_some() { - op.mem_addr = String::new(); - } else { - op.mem_addr = self.slot_ref(slot); - } - } else { - op.mem_addr = String::new(); - } - } - } - } - if matches!(op.kind, AsmOperandKind::Immediate) { - if let Operand::Const(c) = val { - op.imm_value = c.to_i64(); - } - } - } - - fn resolve_memory_operand(&mut self, op: &mut AsmOperand, val: &Operand, excluded: &[String]) -> bool { - if !op.mem_addr.is_empty() { - return false; - } - if let Some(ref sym) = op.imm_symbol { - op.mem_addr = sym.clone(); - return false; - } - match val { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let tmp_reg = self.assign_scratch_reg(&AsmOperandKind::GpReg, excluded); - if self.state.is_alloca(v.0) { - let sr = self.slot_ref(slot); - if let Some(align) = self.state.alloca_over_align(v.0) { - emit!(self.state, " leal {}, %{}", sr, tmp_reg); - emit!(self.state, " addl ${}, %{}", align - 1, tmp_reg); - emit!(self.state, " andl ${}, %{}", -(align as i32), tmp_reg); - } else { - emit!(self.state, " leal {}, %{}", sr, tmp_reg); - } - } else { - let sr = self.slot_ref(slot); - emit!(self.state, " movl {}, %{}", sr, tmp_reg); - } - op.mem_addr = format!("(%{})", tmp_reg); - return true; - } - } - Operand::Const(c) => { - if let Some(addr) = c.to_i64() { - let tmp_reg = self.assign_scratch_reg(&AsmOperandKind::GpReg, excluded); - emit!(self.state, " movl ${}, %{}", addr as i32, 
tmp_reg); - op.mem_addr = format!("(%{})", tmp_reg); - return true; - } - } - } - false - } - - fn assign_scratch_reg(&mut self, kind: &AsmOperandKind, excluded: &[String]) -> String { - if matches!(kind, AsmOperandKind::FpReg) { - // i686 only has xmm0-xmm7 (no xmm8-xmm15 without 64-bit mode). - // Skip excluded registers but cap at 8. - loop { - let idx = self.asm_xmm_scratch_idx; - self.asm_xmm_scratch_idx += 1; - if idx >= I686_XMM_SCRATCH.len() { - // All 8 XMM registers exhausted; wrap around and pick - // the first non-excluded register for reuse. - for r in I686_XMM_SCRATCH { - if !excluded.iter().any(|e| e == *r) { - return r.to_string(); - } - } - return "xmm0".to_string(); - } - let reg = I686_XMM_SCRATCH[idx].to_string(); - if !excluded.iter().any(|e| e == ®) { - return reg; - } - } - } else { - // All GP registers on i686 (including caller-saved) - const ALL_GP: &[&str] = &["ecx", "edx", "esi", "edi", "eax", "ebx"]; - for _ in 0..ALL_GP.len() { - let idx = self.asm_scratch_idx; - self.asm_scratch_idx += 1; - let reg = if idx < ALL_GP.len() { - ALL_GP[idx].to_string() - } else { - // All registers exhausted — return empty to signal spill-to-memory. - // The caller (emit_inline_asm_common_impl) will detect the empty - // string and fall back to a memory operand for constraints like "g" - // that have a memory alternative. - return String::new(); - }; - if !excluded.iter().any(|e| e == ®) { - return reg; - } - } - // All registers excluded — return empty to signal spill-to-memory. 
- String::new() - } - } - - fn load_input_to_reg(&mut self, op: &AsmOperand, val: &Operand, _constraint: &str) { - let reg = &op.reg; - let ty = op.operand_type; - - // x87 FPU stack: load value from memory onto the x87 stack with fld - if matches!(op.kind, AsmOperandKind::X87St0 | AsmOperandKind::X87St1) { - match val { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let fld_instr = match ty { - IrType::F32 => "flds", - IrType::F128 => "fldt", - _ => "fldl", // F64 and default - }; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" {} {}", fld_instr, sr)); - } - } - Operand::Const(c) => { - // Materialize float constant via stack scratch space - let bits = match ty { - IrType::F32 => { - let f = c.to_f64().unwrap_or(0.0) as f32; - f.to_bits() as u64 - } - _ => { - let f = c.to_f64().unwrap_or(0.0); - f.to_bits() - } - }; - if ty == IrType::F32 { - self.state.emit(" subl $4, %esp"); - self.state.emit_fmt(format_args!(" movl ${}, (%esp)", bits as u32)); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } else { - let lo = bits as u32; - let hi = (bits >> 32) as u32; - self.state.emit(" subl $8, %esp"); - self.state.emit_fmt(format_args!(" movl ${}, (%esp)", lo)); - self.state.emit_fmt(format_args!(" movl ${}, 4(%esp)", hi)); - self.state.emit(" fldl (%esp)"); - self.state.emit(" addl $8, %esp"); - } - } - } - return; - } - - let is_xmm = reg.starts_with("xmm"); - - if is_xmm { - match val { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - let load_instr = match ty { - IrType::F32 => "movss", - IrType::F64 => "movsd", - _ => "movdqu", - }; - self.state.emit_fmt(format_args!(" {} {}, %{}", load_instr, sr, reg)); - } - } - Operand::Const(_) => { - self.state.emit_fmt(format_args!(" xorpd %{}, %{}", reg, reg)); - } - } - return; - } - - // GP register - check for 64-bit register pair - let reg_hi = &op.reg_hi; - let is_pair = 
!reg_hi.is_empty() && matches!(ty, IrType::I64 | IrType::U64); - - match val { - Operand::Const(c) => { - let imm = match c { - IrConst::F32(v) => v.to_bits() as i64, - IrConst::F64(v) => v.to_bits() as i64, - _ => c.to_i64().unwrap_or(0), - }; - if is_pair { - let lo = imm as u32; - let hi = (imm as u64 >> 32) as u32; - if lo == 0 { - self.state.emit_fmt(format_args!(" xorl %{}, %{}", reg, reg)); - } else { - self.state.emit_fmt(format_args!(" movl ${}, %{}", lo as i32, reg)); - } - if hi == 0 { - self.state.emit_fmt(format_args!(" xorl %{}, %{}", reg_hi, reg_hi)); - } else { - self.state.emit_fmt(format_args!(" movl ${}, %{}", hi as i32, reg_hi)); - } - } else { - let imm32 = imm as i32; - if imm32 == 0 { - self.state.emit_fmt(format_args!(" xorl %{}, %{}", Self::reg_to_32(reg), Self::reg_to_32(reg))); - } else { - self.state.emit_fmt(format_args!(" movl ${}, %{}", imm32, Self::reg_to_32(reg))); - } - } - } - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - if self.state.is_alloca(v.0) { - if let Some(align) = self.state.alloca_over_align(v.0) { - self.state.emit_fmt(format_args!(" leal {}, %{}", sr, reg)); - self.state.emit_fmt(format_args!(" addl ${}, %{}", align - 1, reg)); - self.state.emit_fmt(format_args!(" andl ${}, %{}", -(align as i32), reg)); - } else { - self.state.emit_fmt(format_args!(" leal {}, %{}", sr, reg)); - } - } else if is_pair { - let sr4 = self.slot_ref_offset(slot, 4); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, reg)); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr4, reg_hi)); - } else { - let load_instr = Self::i686_mov_load_for_type(ty); - let dest = if Self::is_extending_load(load_instr) { - Self::reg_to_32(reg) - } else { - Self::dest_reg_for_type(reg, ty) - }; - self.state.emit_fmt(format_args!(" {} {}, %{}", load_instr, sr, dest)); - } - } - } - } - } - - fn preload_readwrite_output(&mut self, op: &AsmOperand, ptr: &Value) { - // x87 FPU stack - if 
matches!(op.kind, AsmOperandKind::X87St0 | AsmOperandKind::X87St1) { - let ty = op.operand_type; - if let Some(slot) = self.state.get_slot(ptr.0) { - let fld_instr = match ty { - IrType::F32 => "flds", - IrType::F128 => "fldt", - _ => "fldl", - }; - if self.state.is_alloca(ptr.0) { - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" {} {}", fld_instr, sr)); - } else { - self.state.emit(" pushl %ecx"); - self.esp_adjust += 4; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" movl {}, %ecx", sr)); - self.state.emit_fmt(format_args!(" {} (%ecx)", fld_instr)); - self.state.emit(" popl %ecx"); - self.esp_adjust -= 4; - } - } - return; - } - let reg = &op.reg; - let ty = op.operand_type; - let reg_hi = &op.reg_hi; - let is_pair = !reg_hi.is_empty() && matches!(ty, IrType::I64 | IrType::U64); - let is_xmm = reg.starts_with("xmm"); - if let Some(slot) = self.state.get_slot(ptr.0) { - let sr = self.slot_ref(slot); - if self.state.is_alloca(ptr.0) { - if is_xmm { - let load_instr = match ty { - IrType::F32 => "movss", - IrType::F64 => "movsd", - _ => "movdqu", - }; - self.state.emit_fmt(format_args!(" {} {}, %{}", load_instr, sr, reg)); - } else if is_pair { - let sr4 = self.slot_ref_offset(slot, 4); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, reg)); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr4, reg_hi)); - } else { - let load_instr = Self::i686_mov_load_for_type(ty); - let dest = if Self::is_extending_load(load_instr) { - Self::reg_to_32(reg) - } else { - Self::dest_reg_for_type(reg, ty) - }; - self.state.emit_fmt(format_args!(" {} {}, %{}", load_instr, sr, dest)); - } - } else { - // Non-alloca: slot holds a pointer — do indirect load - if is_pair { - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, reg)); - self.state.emit_fmt(format_args!(" movl 4(%{}), %{}", reg, reg_hi)); - self.state.emit_fmt(format_args!(" movl (%{}), %{}", reg, reg)); - } else { - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, 
reg)); - if is_xmm { - let load_instr = match ty { - IrType::F32 => "movss", - IrType::F64 => "movsd", - _ => "movdqu", - }; - self.state.emit_fmt(format_args!(" {} (%{}), %{}", load_instr, reg, reg)); - } else { - let load_instr = Self::i686_mov_load_for_type(ty); - // For zero/sign-extending loads, destination must be 32-bit - let dest = if Self::is_extending_load(load_instr) { - Self::reg_to_32(reg) - } else { - Self::dest_reg_for_type(reg, ty) - }; - self.state.emit_fmt(format_args!(" {} (%{}), %{}", load_instr, reg, dest)); - } - } - } - } - } - - fn substitute_template_line(&self, line: &str, operands: &[AsmOperand], gcc_to_internal: &[usize], operand_types: &[IrType], goto_labels: &[(String, BlockId)]) -> String { - let op_regs: Vec = operands.iter().map(|o| o.reg.clone()).collect(); - let op_names: Vec> = operands.iter().map(|o| o.name.clone()).collect(); - let op_is_memory: Vec = operands.iter().map(|o| matches!(o.kind, AsmOperandKind::Memory)).collect(); - let op_mem_addrs: Vec = operands.iter().map(|o| { - if o.seg_prefix.is_empty() { - o.mem_addr.clone() - } else { - format!("{}{}", o.seg_prefix, o.mem_addr) - } - }).collect(); - let op_imm_values: Vec> = operands.iter().map(|o| o.imm_value).collect(); - let op_imm_symbols: Vec> = operands.iter().map(|o| o.imm_symbol.clone()).collect(); - - let total = operands.len(); - let mut op_types: Vec = vec![IrType::I32; total]; - for (i, ty) in operand_types.iter().enumerate() { - if i < total { op_types[i] = *ty; } - } - for (i, op) in operands.iter().enumerate() { - if let AsmOperandKind::Tied(tied_to) = &op.kind { - if *tied_to < op_types.len() && i < op_types.len() { - op_types[i] = op_types[*tied_to]; - } - } - } - - Self::substitute_i686_asm_operands(line, &op_regs, &op_names, &op_is_memory, &op_mem_addrs, &op_types, gcc_to_internal, goto_labels, &op_imm_values, &op_imm_symbols) - } - - fn store_output_from_reg(&mut self, op: &AsmOperand, ptr: &Value, _constraint: &str, _all_output_regs: &[&str]) { - if 
matches!(op.kind, AsmOperandKind::Memory) { - return; - } - // x87 FPU stack outputs - if matches!(op.kind, AsmOperandKind::X87St0 | AsmOperandKind::X87St1) { - if let Some(slot) = self.state.get_slot(ptr.0) { - let ty = op.operand_type; - let fstp_instr = match ty { - IrType::F32 => "fstps", - IrType::F128 => "fstpt", - _ => "fstpl", - }; - if self.state.is_direct_slot(ptr.0) { - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" {} {}", fstp_instr, sr)); - } else { - self.state.emit(" pushl %ecx"); - self.esp_adjust += 4; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" movl {}, %ecx", sr)); - self.state.emit_fmt(format_args!(" {} (%ecx)", fstp_instr)); - self.state.emit(" popl %ecx"); - self.esp_adjust -= 4; - } - } - return; - } - // =@cc condition code outputs - if let AsmOperandKind::ConditionCode(ref cond) = op.kind { - let reg = &op.reg; - let reg8 = Self::reg_to_8l(reg); - let x86_cond = Self::gcc_cc_to_x86(cond); - self.state.emit_fmt(format_args!(" set{} %{}", x86_cond, reg8)); - self.state.emit_fmt(format_args!(" movzbl %{}, %{}", reg8, Self::reg_to_32(reg))); - if let Some(slot) = self.state.get_slot(ptr.0) { - let ty = op.operand_type; - if self.state.is_direct_slot(ptr.0) { - let sr = self.slot_ref(slot); - let store_instr = Self::i686_mov_store_for_type(ty); - let src = Self::src_reg_for_type(reg, ty); - self.state.emit_fmt(format_args!(" {} %{}, {}", store_instr, src, sr)); - } else { - let scratch = if reg != "ecx" { "ecx" } else { "edx" }; - self.state.emit_fmt(format_args!(" pushl %{}", scratch)); - self.esp_adjust += 4; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, scratch)); - let store_instr = Self::i686_mov_store_for_type(ty); - let src = Self::src_reg_for_type(reg, ty); - self.state.emit_fmt(format_args!(" {} %{}, (%{})", store_instr, src, scratch)); - self.state.emit_fmt(format_args!(" popl %{}", scratch)); - self.esp_adjust -= 4; - } - } - return; - } - - let 
reg = &op.reg; - let ty = op.operand_type; - let reg_hi = &op.reg_hi; - let is_pair = !reg_hi.is_empty() && matches!(ty, IrType::I64 | IrType::U64); - let is_xmm = reg.starts_with("xmm"); - if let Some(slot) = self.state.get_slot(ptr.0) { - if is_xmm { - if self.state.is_direct_slot(ptr.0) { - let sr = self.slot_ref(slot); - let store_instr = match ty { - IrType::F32 => "movss", - IrType::F64 => "movsd", - _ => "movdqu", - }; - self.state.emit_fmt(format_args!(" {} %{}, {}", store_instr, reg, sr)); - } else { - let store_instr = match ty { - IrType::F32 => "movss", - IrType::F64 => "movsd", - _ => "movdqu", - }; - self.state.emit(" pushl %ecx"); - self.esp_adjust += 4; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" movl {}, %ecx", sr)); - self.state.emit_fmt(format_args!(" {} %{}, (%ecx)", store_instr, reg)); - self.state.emit(" popl %ecx"); - self.esp_adjust -= 4; - } - } else if is_pair { - if self.state.is_direct_slot(ptr.0) { - let sr = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - self.state.emit_fmt(format_args!(" movl %{}, {}", reg, sr)); - self.state.emit_fmt(format_args!(" movl %{}, {}", reg_hi, sr4)); - } else { - let scratch = if reg != "ecx" && reg_hi != "ecx" { "ecx" } - else if reg != "edx" && reg_hi != "edx" { "edx" } - else { "esi" }; - self.state.emit_fmt(format_args!(" pushl %{}", scratch)); - self.esp_adjust += 4; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, scratch)); - self.state.emit_fmt(format_args!(" movl %{}, (%{})", reg, scratch)); - self.state.emit_fmt(format_args!(" movl %{}, 4(%{})", reg_hi, scratch)); - self.state.emit_fmt(format_args!(" popl %{}", scratch)); - self.esp_adjust -= 4; - } - } else if self.state.is_direct_slot(ptr.0) { - let sr = self.slot_ref(slot); - let store_instr = Self::i686_mov_store_for_type(ty); - let src = Self::src_reg_for_type(reg, ty); - self.state.emit_fmt(format_args!(" {} %{}, {}", store_instr, src, sr)); - } else { - let 
scratch = if reg != "ecx" { "ecx" } else { "edx" }; - self.state.emit_fmt(format_args!(" pushl %{}", scratch)); - self.esp_adjust += 4; - let sr = self.slot_ref(slot); - self.state.emit_fmt(format_args!(" movl {}, %{}", sr, scratch)); - let store_instr = Self::i686_mov_store_for_type(ty); - let src = Self::src_reg_for_type(reg, ty); - self.state.emit_fmt(format_args!(" {} %{}, (%{})", store_instr, src, scratch)); - self.state.emit_fmt(format_args!(" popl %{}", scratch)); - self.esp_adjust -= 4; - } - } - } - - fn setup_memory_fallback(&self, op: &mut AsmOperand, val: &Operand) { - // When all GP registers are exhausted (only 6 on i686) and the constraint - // allows memory (e.g., "g"), fall back to referencing the value's stack slot - // directly. Unlike setup_operand_metadata for Memory (which handles "m" - // constraints where the value is an address), here we want the VALUE itself, - // which lives directly in the stack slot at offset(%ebp) or offset(%esp). - match val { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - op.mem_addr = self.slot_ref(slot); - } - } - Operand::Const(c) => { - // Constant value — promote to immediate instead of memory. - op.kind = AsmOperandKind::Immediate; - op.imm_value = c.to_i64(); - } - } - } - - fn needs_register_pair(&self, ty: IrType) -> bool { - // On i686, 64-bit integer types need two 32-bit GP registers (a register pair). - matches!(ty, IrType::I64 | IrType::U64) - } - - fn reset_scratch_state(&mut self) { - self.asm_scratch_idx = 0; - self.asm_xmm_scratch_idx = 0; - } -} - -// Helper methods for i686 inline asm register formatting. -// Register conversion and condition code mapping delegate to the shared -// `x86_common` module to avoid duplication with the x86-64 backend. -impl I686Codegen { - /// Return the i686 store mnemonic for a given IR type. - /// Uses movb/movw for sub-32-bit types, movl for everything else. 
- fn i686_mov_store_for_type(ty: IrType) -> &'static str { - match ty { - IrType::I8 | IrType::U8 => "movb", - IrType::I16 | IrType::U16 => "movw", - _ => "movl", - } - } - - /// Return the i686 load mnemonic for a given IR type. - /// Uses sign/zero-extending loads for sub-32-bit types, movl for everything else. - fn i686_mov_load_for_type(ty: IrType) -> &'static str { - match ty { - IrType::I8 => "movsbl", - IrType::U8 => "movzbl", - IrType::I16 => "movswl", - IrType::U16 => "movzwl", - _ => "movl", - } - } - - /// Convert register to 32-bit variant. Delegates to shared x86_common. - pub(super) fn reg_to_32<'a>(reg: &'a str) -> Cow<'a, str> { - crate::backend::x86_common::reg_to_32(reg) - } - - /// Convert register to 16-bit variant. Delegates to shared x86_common. - pub(super) fn reg_to_16<'a>(reg: &'a str) -> Cow<'a, str> { - crate::backend::x86_common::reg_to_16(reg) - } - - /// Convert register to 8-bit low variant. Delegates to shared x86_common. - pub(super) fn reg_to_8l<'a>(reg: &'a str) -> Cow<'a, str> { - crate::backend::x86_common::reg_to_8l(reg) - } - - /// Map GCC condition code suffix to x86 SETcc suffix. Delegates to shared x86_common. - pub(super) fn gcc_cc_to_x86(cond: &str) -> &'static str { - crate::backend::x86_common::gcc_cc_to_x86(cond) - } - - /// Get the appropriately-sized source register name for a type. - fn src_reg_for_type<'a>(reg: &'a str, ty: IrType) -> Cow<'a, str> { - match ty { - IrType::I8 | IrType::U8 => Self::reg_to_8l(reg), - IrType::I16 | IrType::U16 => Self::reg_to_16(reg), - _ => Self::reg_to_32(reg), - } - } - - /// Check if a load instruction is a zero/sign-extending load. - /// These instructions always require a 32-bit destination register. - fn is_extending_load(instr: &str) -> bool { - matches!(instr, "movzbl" | "movzwl" | "movsbl" | "movswl") - } - - /// Get the appropriately-sized destination register name for a type. 
- fn dest_reg_for_type<'a>(reg: &'a str, ty: IrType) -> Cow<'a, str> { - match ty { - IrType::I8 | IrType::U8 => Self::reg_to_8l(reg), - IrType::I16 | IrType::U16 => Self::reg_to_16(reg), - _ => Self::reg_to_32(reg), - } - } -} diff --git a/src/backend/i686/codegen/atomics.rs b/src/backend/i686/codegen/atomics.rs deleted file mode 100644 index 0db3bd0939..0000000000 --- a/src/backend/i686/codegen/atomics.rs +++ /dev/null @@ -1,163 +0,0 @@ -//! I686Codegen: atomic operations (RMW, cmpxchg, load, store, fence). - -use crate::ir::reexports::{AtomicOrdering, AtomicRmwOp, Operand, Value}; -use crate::common::types::IrType; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_atomic_rmw_impl(&mut self, dest: &Value, op: AtomicRmwOp, ptr: &Operand, val: &Operand, - ty: IrType, _ordering: AtomicOrdering) { - if self.is_atomic_wide(ty) { - self.emit_atomic_rmw_wide(dest, op, ptr, val); - return; - } - - self.operand_to_eax(val); - self.state.emit(" movl %eax, %edx"); - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %ecx"); - - match op { - AtomicRmwOp::Xchg => { - let suffix = self.type_suffix(ty); - let reg = self.eax_for_type(ty); - self.state.emit(" movl %edx, %eax"); - emit!(self.state, " xchg{} {}, (%ecx)", suffix, reg); - } - AtomicRmwOp::TestAndSet => { - self.state.emit(" movb $1, %al"); - self.state.emit(" xchgb %al, (%ecx)"); - // Zero-extend %al to %eax: xchgb only sets the low byte, - // leaving upper bytes with garbage from prior register usage. 
- self.state.emit(" movzbl %al, %eax"); - } - AtomicRmwOp::Add => { - let suffix = self.type_suffix(ty); - let reg = match ty { - IrType::I8 | IrType::U8 => "%dl", - IrType::I16 | IrType::U16 => "%dx", - _ => "%edx", - }; - emit!(self.state, " lock xadd{} {}, (%ecx)", suffix, reg); - self.state.emit(" movl %edx, %eax"); - } - _ => { - let suffix = self.type_suffix(ty); - let edx_reg = match ty { - IrType::I8 | IrType::U8 => "%dl", - IrType::I16 | IrType::U16 => "%dx", - _ => "%edx", - }; - self.state.emit(" pushl %edx"); - let load_instr = self.mov_load_for_type(ty); - emit!(self.state, " {} (%ecx), %eax", load_instr); - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %edx"); - match op { - AtomicRmwOp::Sub => { - emit!(self.state, " sub{} (%esp), {}", suffix, edx_reg); - } - AtomicRmwOp::And => { - emit!(self.state, " and{} (%esp), {}", suffix, edx_reg); - } - AtomicRmwOp::Or => { - emit!(self.state, " or{} (%esp), {}", suffix, edx_reg); - } - AtomicRmwOp::Xor => { - emit!(self.state, " xor{} (%esp), {}", suffix, edx_reg); - } - AtomicRmwOp::Nand => { - emit!(self.state, " and{} (%esp), {}", suffix, edx_reg); - emit!(self.state, " not{} {}", suffix, edx_reg); - } - _ => {} - } - emit!(self.state, " lock cmpxchg{} {}, (%ecx)", suffix, edx_reg); - emit!(self.state, " jne {}", loop_label); - self.state.emit(" addl $4, %esp"); - } - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_atomic_cmpxchg_impl(&mut self, dest: &Value, ptr: &Operand, expected: &Operand, - desired: &Operand, ty: IrType, _success: AtomicOrdering, - _failure: AtomicOrdering, returns_bool: bool) { - if self.is_atomic_wide(ty) { - self.emit_atomic_cmpxchg_wide(dest, ptr, expected, desired, returns_bool); - return; - } - - self.operand_to_eax(expected); - self.state.emit(" movl %eax, %edx"); - self.operand_to_eax(desired); - self.state.emit(" pushl %eax"); - 
self.esp_adjust += 4; - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %ecx"); - self.state.emit(" movl %edx, %eax"); - self.state.emit(" popl %edx"); - self.esp_adjust -= 4; - let suffix = self.type_suffix(ty); - let reg = match ty { - IrType::I8 | IrType::U8 => "%dl", - IrType::I16 | IrType::U16 => "%dx", - _ => "%edx", - }; - emit!(self.state, " lock cmpxchg{} {}, (%ecx)", suffix, reg); - if returns_bool { - self.state.emit(" sete %al"); - self.state.emit(" movzbl %al, %eax"); - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_atomic_load_impl(&mut self, dest: &Value, ptr: &Operand, ty: IrType, _ordering: AtomicOrdering) { - if self.is_atomic_wide(ty) { - self.emit_atomic_load_wide(dest, ptr); - return; - } - - self.operand_to_eax(ptr); - let load_instr = self.mov_load_for_type(ty); - emit!(self.state, " {} (%eax), %eax", load_instr); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_atomic_store_impl(&mut self, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) { - if self.is_atomic_wide(ty) { - self.emit_atomic_store_wide(ptr, val); - if matches!(ordering, AtomicOrdering::SeqCst) { - self.state.emit(" mfence"); - } - return; - } - - self.operand_to_eax(val); - self.state.emit(" movl %eax, %edx"); - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %ecx"); - let store_instr = self.mov_store_for_type(ty); - let reg = match ty { - IrType::I8 | IrType::U8 => "%dl", - IrType::I16 | IrType::U16 => "%dx", - _ => "%edx", - }; - emit!(self.state, " {} {}, (%ecx)", store_instr, reg); - if matches!(ordering, AtomicOrdering::SeqCst) { - self.state.emit(" mfence"); - } - } - - pub(super) fn emit_fence_impl(&mut self, ordering: AtomicOrdering) { - match ordering { - AtomicOrdering::Relaxed => {} - _ => self.state.emit(" mfence"), - } - } -} diff --git a/src/backend/i686/codegen/calls.rs b/src/backend/i686/codegen/calls.rs deleted file mode 100644 
index 861632ccfb..0000000000 --- a/src/backend/i686/codegen/calls.rs +++ /dev/null @@ -1,216 +0,0 @@ -//! I686Codegen: function call operations (cdecl calling convention). - -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::backend::call_abi; -use crate::emit; -use crate::backend::traits::ArchCodegen; -use super::emit::I686Codegen; -use crate::backend::generation::is_i128_type; - -impl I686Codegen { - pub(super) fn call_abi_config_impl(&self) -> call_abi::CallAbiConfig { - call_abi::CallAbiConfig { - max_int_regs: self.regparm as usize, - max_float_regs: 0, - align_i128_pairs: false, - f128_in_fp_regs: false, - f128_in_gp_pairs: false, - variadic_floats_in_gp: false, - large_struct_by_ref: false, - use_sysv_struct_classification: false, - use_riscv_float_struct_classification: false, - allow_struct_split_reg_stack: false, - align_struct_pairs: false, - sret_uses_dedicated_reg: false, - } - } - - pub(super) fn emit_call_compute_stack_space_impl(&self, arg_classes: &[call_abi::CallArgClass], arg_types: &[IrType]) -> usize { - let mut total = 0; - for (i, ac) in arg_classes.iter().enumerate() { - let ty = if i < arg_types.len() { arg_types[i] } else { IrType::I32 }; - match ac { - call_abi::CallArgClass::Stack => { - match ty { - IrType::F64 | IrType::I64 | IrType::U64 => total += 8, - _ => total += 4, - } - } - call_abi::CallArgClass::F128Stack => total += 12, - call_abi::CallArgClass::I128Stack => total += 16, - call_abi::CallArgClass::StructByValStack { size } => total += (*size + 3) & !3, - call_abi::CallArgClass::LargeStructStack { size } => total += (*size + 3) & !3, - call_abi::CallArgClass::ZeroSizeSkip => {} - call_abi::CallArgClass::IntReg { .. 
} => {} // regparm: in register, no stack space - _ => total += 4, - } - } - (total + 15) & !15 - } - - pub(super) fn emit_call_f128_pre_convert_impl(&mut self, _args: &[Operand], _arg_classes: &[call_abi::CallArgClass], _arg_types: &[IrType], _stack_arg_space: usize) -> usize { - 0 // No F128 pre-conversion needed on i686 - } - - pub(super) fn emit_call_stack_args_impl(&mut self, args: &[Operand], arg_classes: &[call_abi::CallArgClass], - arg_types: &[IrType], stack_arg_space: usize, - _fptr_spill: usize, _f128_temp_space: usize) -> i64 { - if stack_arg_space > 0 { - emit!(self.state, " subl ${}, %esp", stack_arg_space); - self.esp_adjust += stack_arg_space as i64; - } - - let mut stack_offset: usize = 0; - for (i, ac) in arg_classes.iter().enumerate() { - match ac { - call_abi::CallArgClass::I128Stack => { - self.emit_call_i128_stack_arg(&args[i], stack_offset); - stack_offset += 16; - } - call_abi::CallArgClass::F128Stack => { - self.emit_call_f128_stack_arg(&args[i], stack_offset); - stack_offset += 12; - } - call_abi::CallArgClass::StructByValStack { size } | - call_abi::CallArgClass::LargeStructStack { size } => { - let sz = *size; - self.emit_call_struct_stack_arg(&args[i], stack_offset, sz); - stack_offset += (sz + 3) & !3; - } - call_abi::CallArgClass::Stack => { - let ty = arg_types[i]; - if ty == IrType::F64 || ty == IrType::I64 || ty == IrType::U64 { - self.emit_call_8byte_stack_arg(&args[i], ty, stack_offset); - stack_offset += 8; - } else { - self.operand_to_eax(&args[i]); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - stack_offset += 4; - } - } - call_abi::CallArgClass::ZeroSizeSkip => {} - call_abi::CallArgClass::IntReg { .. 
} => {} // regparm: handled in emit_call_reg_args - _ => { - self.operand_to_eax(&args[i]); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - stack_offset += 4; - } - } - } - - stack_arg_space as i64 - } - - pub(super) fn emit_call_reg_args_impl(&mut self, args: &[Operand], arg_classes: &[call_abi::CallArgClass], - _arg_types: &[IrType], _total_sp_adjust: i64, - _f128_temp_space: usize, _stack_arg_space: usize, - _struct_arg_riscv_float_classes: &[Option]) { - if self.regparm == 0 { - return; // cdecl: no register args - } - // regparm register order: EAX (reg_idx 0), EDX (reg_idx 1), ECX (reg_idx 2). - // We must load args into registers in reverse order to avoid clobbering - // EAX (the accumulator) before we're done using it to load other values. - // Collect register args first, then emit in reverse order. - let regparm_regs: &[&str] = &["%eax", "%edx", "%ecx"]; - let mut reg_args: Vec<(usize, usize)> = Vec::new(); // (arg_idx, reg_idx) - for (i, ac) in arg_classes.iter().enumerate() { - if let call_abi::CallArgClass::IntReg { reg_idx } = ac { - reg_args.push((i, *reg_idx)); - } - } - // Emit in reverse order so we load into edx/ecx before eax - // (since operand_to_eax uses eax as accumulator). 
- for &(arg_i, reg_idx) in reg_args.iter().rev() { - if reg_idx < regparm_regs.len() { - let dest_reg = regparm_regs[reg_idx]; - if dest_reg == "%eax" { - self.operand_to_eax(&args[arg_i]); - } else { - self.operand_to_eax(&args[arg_i]); - emit!(self.state, " movl %eax, {}", dest_reg); - self.state.reg_cache.invalidate_acc(); - } - } - } - } - - pub(super) fn emit_call_instruction_impl(&mut self, direct_name: Option<&str>, func_ptr: Option<&Operand>, - indirect: bool, _stack_arg_space: usize) { - if let Some(name) = direct_name { - if self.state.needs_plt(name) { - emit!(self.state, " call {}@PLT", name); - } else { - emit!(self.state, " call {}", name); - } - } else if indirect { - if let Some(fptr) = func_ptr { - self.operand_to_eax(fptr); - } - self.state.emit(" call *%eax"); - } - } - - pub(super) fn emit_call_cleanup_impl(&mut self, stack_arg_space: usize, _f128_temp_space: usize, _indirect: bool) { - if stack_arg_space > 0 { - emit!(self.state, " addl ${}, %esp", stack_arg_space); - self.esp_adjust -= stack_arg_space as i64; - } - } - - pub(super) fn emit_call_store_result_impl(&mut self, dest: &Value, return_type: IrType) { - if return_type == IrType::I64 || return_type == IrType::U64 { - if let Some(slot) = self.state.get_slot(dest.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl %eax, {}", sr0); - emit!(self.state, " movl %edx, {}", sr4); - } - self.state.reg_cache.invalidate_acc(); - } else if is_i128_type(return_type) { - self.emit_call_store_i128_result(dest); - } else if return_type.is_long_double() { - self.emit_call_store_f128_result(dest); - } else if return_type == IrType::F32 { - self.emit_call_move_f32_to_acc(); - self.emit_store_result(dest); - } else if return_type == IrType::F64 { - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - } else { - self.emit_store_result(dest); - } - } - - pub(super) fn emit_call_store_i128_result_impl(&mut self, dest: &Value) { 
- if let Some(slot) = self.state.get_slot(dest.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl %eax, {}", sr0); - emit!(self.state, " movl %edx, {}", sr4); - } - } - - pub(super) fn emit_call_store_f128_result_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - } - - pub(super) fn emit_call_move_f32_to_acc_impl(&mut self) { - self.state.emit(" subl $4, %esp"); - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $4, %esp"); - } - - pub(super) fn emit_call_move_f64_to_acc_impl(&mut self) { - self.state.emit(" subl $8, %esp"); - self.state.emit(" fstpl (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $8, %esp"); - } -} diff --git a/src/backend/i686/codegen/casts.rs b/src/backend/i686/codegen/casts.rs deleted file mode 100644 index 3bca374d7a..0000000000 --- a/src/backend/i686/codegen/casts.rs +++ /dev/null @@ -1,601 +0,0 @@ -//! i686 type cast emission. -//! -//! Handles `emit_cast` and `emit_cast_instrs` for the i686 backend. -//! On i686, casts involving F64/F128 or 64-bit integers require special -//! handling because: -//! - F64 values are 8 bytes but the accumulator (eax) is only 32 bits -//! - F128 (long double) is native x87 80-bit extended precision (12 bytes) -//! - 64-bit integers use the eax:edx register pair -//! -//! All F64/F128 conversions go through the x87 FPU, bypassing the default -//! emit_load_operand path that assumes values fit in a single register. - -use crate::backend::traits::ArchCodegen; -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - /// Override emit_cast to handle F64 source/destination specially on i686. 
- /// F64 values are 8 bytes but the accumulator is only 32 bits, so we use - /// x87 FPU for all F64 conversions, bypassing the default emit_load_operand path. - pub(super) fn emit_cast_impl(&mut self, dest: &Value, src: &Operand, from_ty: IrType, to_ty: IrType) { - use crate::backend::cast::{CastKind, classify_cast_with_f128}; - - // Let the default handle i128 conversions - if crate::backend::generation::is_i128_type(from_ty) || crate::backend::generation::is_i128_type(to_ty) { - crate::backend::traits::emit_cast_default(self, dest, src, from_ty, to_ty); - return; - } - - // On i686, F128 (long double) is native x87 80-bit extended precision, - // stored as 12 bytes. We must use f128_is_native=true so that F128 casts - // go through the dedicated SignedToF128/UnsignedToF128/F128ToSigned/etc. - // paths that use fstpt (12-byte store), not through the F64 paths that - // use fstpl (8-byte store) which would corrupt F128 values. - match classify_cast_with_f128(from_ty, to_ty, true) { - // --- Casts where F64 is the destination (result needs 8-byte slot) --- - CastKind::SignedToFloat { to_f64: true, from_ty: src_ty } => { - self.emit_signed_to_f64(src, src_ty, dest); - } - CastKind::UnsignedToFloat { to_f64: true, from_ty } => { - self.emit_unsigned_to_f64(src, from_ty, dest); - } - CastKind::FloatToFloat { widen: true } => { - // F32 -> F64: load F32 from eax, x87 will auto-extend - self.operand_to_eax(src); - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - // st(0) is now the F64 value, store to 8-byte slot - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - } - - // --- Casts where F64 is the source (need to load 8-byte value) --- - CastKind::FloatToSigned { from_f64: true } => { - self.emit_f64_to_signed(src, to_ty, dest); - } - CastKind::FloatToUnsigned { from_f64: true, to_u64 } => { - self.emit_f64_to_unsigned(src, to_u64, to_ty, dest); - } - CastKind::FloatToFloat { widen: 
false } => { - // F64 -> F32: load full 8-byte F64, convert to F32 on x87 - self.emit_f64_load_to_x87(src); - self.state.emit(" subl $4, %esp"); - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $4, %esp"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - // --- F128 <-> F64/F32 conversions --- - CastKind::FloatToF128 { from_f32 } => { - self.emit_float_to_f128(src, from_f32, dest); - } - CastKind::F128ToFloat { to_f32 } => { - self.emit_f128_to_float(src, to_f32, dest); - } - - // --- F128 <-> int conversions --- - CastKind::SignedToF128 { from_ty: src_ty } => { - self.emit_signed_to_f128(src, src_ty, dest); - } - CastKind::UnsignedToF128 { from_ty: src_ty } => { - self.emit_unsigned_to_f128(src, src_ty, dest); - } - CastKind::F128ToSigned { to_ty: dest_ty } => { - self.emit_f128_to_signed(src, dest_ty, dest); - } - CastKind::F128ToUnsigned { to_ty: dest_ty } => { - self.emit_f128_to_unsigned(src, dest_ty, dest); - } - - // --- I64 -> F32: use x87 fildq for full 64-bit precision --- - CastKind::SignedToFloat { to_f64: false, from_ty: IrType::I64 } => { - self.emit_load_acc_pair(src); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $8, %esp"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - // --- U64 -> F32: use x87 with unsigned handling --- - CastKind::UnsignedToFloat { to_f64: false, from_ty: IrType::U64 } => { - self.emit_u64_to_f32(src, dest); - } - // --- F32 -> I64: use x87 fisttpq --- - CastKind::FloatToSigned { from_f64: false } if to_ty == IrType::I64 => { - self.emit_f32_to_i64(src, dest); - } - // --- F32 -> U64: use x87 fisttpq --- - CastKind::FloatToUnsigned { from_f64: false, to_u64: true } => { - self.emit_f32_to_i64(src, dest); // same implementation as F32->I64 - } - - // --- Same-size 
cast between I64 and U64: copy all 8 bytes --- - CastKind::SignedToUnsignedSameSize { to_ty: IrType::U64 } - | CastKind::UnsignedToSignedSameSize { to_ty: IrType::I64 } - | CastKind::Noop if matches!((from_ty, to_ty), (IrType::I64, IrType::U64) | (IrType::U64, IrType::I64) | (IrType::I64, IrType::I64) | (IrType::U64, IrType::U64)) => { - self.emit_load_acc_pair(src); - self.emit_store_acc_pair(dest); - self.state.reg_cache.invalidate_all(); - } - - // --- Widening casts to I64/U64 need full 8-byte store --- - CastKind::IntWiden { .. } if matches!(to_ty, IrType::I64 | IrType::U64) => { - self.operand_to_eax(src); - self.emit_cast_instrs_impl(from_ty, to_ty); - // Set high half: sign-extend for signed sources, zero-extend for unsigned - if from_ty.is_signed() { - self.state.emit(" cltd"); // sign-extend eax into edx:eax - } else { - self.state.emit(" xorl %edx, %edx"); - } - self.emit_store_acc_pair(dest); - self.state.reg_cache.invalidate_all(); - } - // --- I64/U64 narrowing to smaller types --- - CastKind::IntNarrow { .. } if matches!(from_ty, IrType::I64 | IrType::U64) => { - // Load only the low 32 bits (truncation) - self.operand_to_eax(src); - self.emit_cast_instrs_impl(from_ty, to_ty); - self.store_eax_to(dest); - } - // --- All other casts use the default path (emit_load_operand -> eax -> cast -> store) --- - _ => { - self.operand_to_eax(src); - self.emit_cast_instrs_impl(from_ty, to_ty); - self.store_eax_to(dest); - } - } - } - - // --- Helper methods for cast families --- - - /// Signed integer -> F64 via x87 FPU. 
- fn emit_signed_to_f64(&mut self, src: &Operand, src_ty: IrType, dest: &Value) { - if src_ty == IrType::I64 { - // I64 -> F64: load full 64-bit value, use fildq - self.emit_load_acc_pair(src); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - } else { - self.operand_to_eax(src); - match src_ty { - IrType::I8 => self.state.emit(" movsbl %al, %eax"), - IrType::I16 => self.state.emit(" movswl %ax, %eax"), - _ => {} - } - self.state.emit(" pushl %eax"); - self.state.emit(" fildl (%esp)"); - self.state.emit(" addl $4, %esp"); - } - // st(0) = F64 result, store to dest's 8-byte slot - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - } - - /// Unsigned integer -> F64 via x87 FPU. - fn emit_unsigned_to_f64(&mut self, src: &Operand, from_ty: IrType, dest: &Value) { - if from_ty == IrType::U64 { - // U64 -> F64: fildq treats the value as signed, so values - // >= 2^63 need correction by adding float constant 2^64. 
- self.emit_load_acc_pair(src); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - self.state.emit(" testl %edx, %edx"); - let done_label = self.state.fresh_label("u64_f64_done"); - self.state.out.emit_jcc_label(" jns", &done_label); - // High bit set: add 2^64 (float 0x5F800000) to fix sign - self.state.emit(" pushl $0x5F800000"); - self.state.emit(" fadds (%esp)"); - self.state.emit(" addl $4, %esp"); - self.state.out.emit_named_label(&done_label); - } else { - // U8/U16/U32 -> F64: handle high-bit-set U32 values - self.operand_to_eax(src); - let big_label = self.state.fresh_label("u2f_big"); - let done_label = self.state.fresh_label("u2f_done"); - self.state.emit(" testl %eax, %eax"); - self.state.out.emit_jcc_label(" js", &big_label); - // Positive (< 2^31): fildl works directly - self.state.emit(" pushl %eax"); - self.state.emit(" fildl (%esp)"); - self.state.emit(" addl $4, %esp"); - self.state.out.emit_jmp_label(&done_label); - self.state.out.emit_named_label(&big_label); - // Bit 31 set: push as u64 (zero-extend), use fildq - self.state.emit(" pushl $0"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - self.state.out.emit_named_label(&done_label); - } - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - } - - /// F64 -> signed integer via x87 FPU. 
- fn emit_f64_to_signed(&mut self, src: &Operand, to_ty: IrType, dest: &Value) { - self.emit_f64_load_to_x87(src); - if to_ty == IrType::I64 { - // F64 -> I64: use fisttpq for full 64-bit conversion - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.emit_store_acc_pair(dest); - } else { - // F64 -> I32/I16/I8: use fisttpl for 32-bit conversion - self.state.emit(" subl $4, %esp"); - self.state.emit(" fisttpl (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $4, %esp"); - // Truncate to target width for sub-32-bit signed types - match to_ty { - IrType::I8 => self.state.emit(" movsbl %al, %eax"), - IrType::I16 => self.state.emit(" movswl %ax, %eax"), - _ => {} - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - } - - /// F64 -> unsigned integer via x87 FPU. - fn emit_f64_to_unsigned(&mut self, src: &Operand, to_u64: bool, to_ty: IrType, dest: &Value) { - self.emit_f64_load_to_x87(src); - if to_u64 { - // F64 -> U64: use fisttpq for full 64-bit conversion - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.emit_store_acc_pair(dest); - } else { - // F64 -> unsigned sub-64-bit: use fisttpq then take low 32 bits - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $8, %esp"); - // Truncate to target width for sub-32-bit unsigned types - match to_ty { - IrType::U8 => self.state.emit(" movzbl %al, %eax"), - IrType::U16 => self.state.emit(" movzwl %ax, %eax"), - _ => {} - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - } - - /// F32/F64 -> F128 (x87 80-bit long double). 
- fn emit_float_to_f128(&mut self, src: &Operand, from_f32: bool, dest: &Value) { - if from_f32 { - // F32 -> F128: load F32 onto x87, store as F128 - self.operand_to_eax(src); - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } else { - // F64 -> F128: load F64 onto x87 - self.emit_f64_load_to_x87(src); - } - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - self.state.reg_cache.invalidate_acc(); - } - - /// F128 (x87 80-bit) -> F32/F64. - fn emit_f128_to_float(&mut self, src: &Operand, to_f32: bool, dest: &Value) { - self.emit_f128_load_to_x87(src); - if to_f32 { - // F128 -> F32 - self.state.emit(" subl $4, %esp"); - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $4, %esp"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } else { - // F128 -> F64 - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - } - } - - /// Signed integer -> F128 (x87 80-bit long double). 
- fn emit_signed_to_f128(&mut self, src: &Operand, src_ty: IrType, dest: &Value) { - if src_ty == IrType::I64 { - // I64 -> F128: load full 64-bit value via register pair, use fildq - self.emit_load_acc_pair(src); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - } else { - self.operand_to_eax(src); - match src_ty { - IrType::I8 => self.state.emit(" movsbl %al, %eax"), - IrType::I16 => self.state.emit(" movswl %ax, %eax"), - _ => {} - } - self.state.emit(" pushl %eax"); - self.state.emit(" fildl (%esp)"); - self.state.emit(" addl $4, %esp"); - } - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - self.state.reg_cache.invalidate_acc(); - } - - /// Unsigned integer -> F128 (x87 80-bit long double). - fn emit_unsigned_to_f128(&mut self, src: &Operand, src_ty: IrType, dest: &Value) { - if src_ty == IrType::U64 { - // U64 -> F128 (x87 80-bit long double): - // fildq treats the value as signed. For values >= 2^63 - // (high bit set), fildq gives a negative result. We fix - // this by adding 2^64 (as a float constant 0x5F800000). - self.emit_load_acc_pair(src); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - self.state.emit(" testl %edx, %edx"); - let done_label = self.state.fresh_label("u64_f128_done"); - self.state.out.emit_jcc_label(" jns", &done_label); - // High bit set: add 2^64 to compensate for signed interpretation. 
- // Float constant 0x5F800000 = 2^64 = 18446744073709551616.0f - self.state.emit(" pushl $0x5F800000"); - self.state.emit(" fadds (%esp)"); - self.state.emit(" addl $4, %esp"); - self.state.out.emit_named_label(&done_label); - } else { - // U8/U16/U32 -> F128: handle high-bit-set U32 values - self.operand_to_eax(src); - let big_label = self.state.fresh_label("u2f128_big"); - let done_label = self.state.fresh_label("u2f128_done"); - self.state.emit(" testl %eax, %eax"); - self.state.out.emit_jcc_label(" js", &big_label); - self.state.emit(" pushl %eax"); - self.state.emit(" fildl (%esp)"); - self.state.emit(" addl $4, %esp"); - self.state.out.emit_jmp_label(&done_label); - self.state.out.emit_named_label(&big_label); - // Bit 31 set: zero-extend to 64-bit and use fildq - self.state.emit(" pushl $0"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - self.state.out.emit_named_label(&done_label); - } - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - self.state.reg_cache.invalidate_acc(); - } - - /// F128 (x87 80-bit) -> signed integer. 
- fn emit_f128_to_signed(&mut self, src: &Operand, dest_ty: IrType, dest: &Value) { - self.emit_f128_load_to_x87(src); - if dest_ty == IrType::I64 { - // F128 -> I64: use fisttpq for full 64-bit conversion - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.emit_store_acc_pair(dest); - } else { - // F128 -> I32/I16/I8: use fisttpl for 32-bit conversion - self.state.emit(" subl $4, %esp"); - self.state.emit(" fisttpl (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $4, %esp"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - } - - /// F128 (x87 80-bit) -> unsigned integer. - fn emit_f128_to_unsigned(&mut self, src: &Operand, dest_ty: IrType, dest: &Value) { - self.emit_f128_load_to_x87(src); - if dest_ty == IrType::U64 { - // F128 -> U64: use fisttpq for full 64-bit conversion - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.emit_store_acc_pair(dest); - } else { - // F128 -> U32/U16/U8: use fisttpq then take low 32 bits - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $8, %esp"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - } - - /// U64 -> F32 via x87 with unsigned correction. 
- fn emit_u64_to_f32(&mut self, src: &Operand, dest: &Value) { - self.emit_load_acc_pair(src); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" addl $8, %esp"); - // If high bit was set, fildq gave a negative result; add 2^64 - self.state.emit(" testl %edx, %edx"); - let done_label = self.state.fresh_label("u64_f32_done"); - self.state.out.emit_jcc_label(" jns", &done_label); - // Float constant 0x5F800000 = 2^64 - self.state.emit(" pushl $0x5F800000"); - self.state.emit(" fadds (%esp)"); - self.state.emit(" addl $4, %esp"); - self.state.out.emit_named_label(&done_label); - self.state.emit(" subl $4, %esp"); - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" addl $4, %esp"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - /// F32 -> I64/U64 via x87 fisttpq. - fn emit_f32_to_i64(&mut self, src: &Operand, dest: &Value) { - self.operand_to_eax(src); - self.state.emit(" subl $8, %esp"); - self.state.emit(" movl %eax, (%esp)"); - self.state.emit(" flds (%esp)"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.emit_store_acc_pair(dest); - self.state.reg_cache.invalidate_all(); - } - - /// Emit scalar cast instructions (non-F64/F128, non-64-bit integer). - /// Operates on value already in eax, result left in eax. - pub(super) fn emit_cast_instrs_impl(&mut self, from_ty: IrType, to_ty: IrType) { - use crate::backend::cast::{CastKind, classify_cast}; - - match classify_cast(from_ty, to_ty) { - CastKind::Noop | CastKind::UnsignedToSignedSameSize { .. } => {} - - CastKind::IntNarrow { to_ty } => { - // Truncation to a narrower type: sign-extend or zero-extend - // the sub-register to fill all of %eax. 
Without this, the - // upper bits of %eax retain stale data from the wider - // computation, which corrupts truthiness checks (testl %eax) - // and other 32-bit operations on the narrowed value. - if to_ty.is_signed() { - match to_ty { - IrType::I8 => self.state.emit(" movsbl %al, %eax"), - IrType::I16 => self.state.emit(" movswl %ax, %eax"), - _ => {} // I32: no-op (already 32-bit) - } - } else { - match to_ty { - IrType::U8 => self.state.emit(" movzbl %al, %eax"), - IrType::U16 => self.state.emit(" movzwl %ax, %eax"), - _ => {} // U32: no-op - } - } - } - - CastKind::IntWiden { from_ty, .. } => { - if from_ty.is_unsigned() { - match from_ty { - IrType::U8 => self.state.emit(" movzbl %al, %eax"), - IrType::U16 => self.state.emit(" movzwl %ax, %eax"), - // U32 -> I64/U64: no-op on i686 (eax already has 32 bits) - _ => {} - } - } else { - match from_ty { - IrType::I8 => self.state.emit(" movsbl %al, %eax"), - IrType::I16 => self.state.emit(" movswl %ax, %eax"), - // I32 -> I64/U64: no-op on i686 (eax already has 32 bits) - _ => {} - } - } - } - - CastKind::SignedToUnsignedSameSize { to_ty } => { - // On i686, same-size signed->unsigned: mask for sub-32-bit types - match to_ty { - IrType::U8 => self.state.emit(" movzbl %al, %eax"), - IrType::U16 => self.state.emit(" movzwl %ax, %eax"), - _ => {} // U32, U64: no-op - } - } - - CastKind::SignedToFloat { to_f64: false, .. } => { - // Signed int -> F32 via SSE - self.state.emit(" cvtsi2ssl %eax, %xmm0"); - self.state.emit(" movd %xmm0, %eax"); - } - - CastKind::UnsignedToFloat { to_f64: false, .. 
} => { - // U8/U16/U32 -> F32 - let big_label = self.state.fresh_label("u2f_big"); - let done_label = self.state.fresh_label("u2f_done"); - self.state.emit(" testl %eax, %eax"); - self.state.out.emit_jcc_label(" js", &big_label); - self.state.emit(" cvtsi2ssl %eax, %xmm0"); - self.state.emit(" movd %xmm0, %eax"); - self.state.out.emit_jmp_label(&done_label); - self.state.out.emit_named_label(&big_label); - self.state.emit(" pushl $0"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" fstps (%esp)"); - self.state.emit(" popl %eax"); - self.state.emit(" addl $4, %esp"); - self.state.out.emit_named_label(&done_label); - } - - CastKind::FloatToSigned { from_f64: false } => { - // F32 -> signed int via SSE - self.state.emit(" movd %eax, %xmm0"); - self.state.emit(" cvttss2si %xmm0, %eax"); - // Truncate to target width for sub-32-bit signed types - match to_ty { - IrType::I8 => self.state.emit(" movsbl %al, %eax"), - IrType::I16 => self.state.emit(" movswl %ax, %eax"), - _ => {} - } - } - - CastKind::FloatToUnsigned { from_f64: false, to_u64 } => { - if to_u64 { - // F32 -> U64: use x87 - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - self.state.emit(" subl $8, %esp"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - } else { - // F32 -> unsigned int: cvttss2si treats result as signed - self.state.emit(" movd %eax, %xmm0"); - self.state.emit(" cvttss2si %xmm0, %eax"); - // Truncate to target width for sub-32-bit unsigned types - match to_ty { - IrType::U8 => self.state.emit(" movzbl %al, %eax"), - IrType::U16 => self.state.emit(" movzwl %ax, %eax"), - _ => {} - } - } - } - - // F64/F128 casts are handled by emit_cast_impl above - _ => {} - } - } -} diff --git a/src/backend/i686/codegen/comparison.rs b/src/backend/i686/codegen/comparison.rs deleted file mode 
100644 index 430ca8cf5b..0000000000 --- a/src/backend/i686/codegen/comparison.rs +++ /dev/null @@ -1,216 +0,0 @@ -//! I686Codegen: comparison operations (float, int, fused branches, select). - -use crate::ir::reexports::{IrCmpOp, Operand, Value}; -use crate::common::types::IrType; -use crate::emit; -use crate::backend::traits::ArchCodegen; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_float_cmp_impl(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - if ty == IrType::F64 { - let swap = matches!(op, IrCmpOp::Slt | IrCmpOp::Ult | IrCmpOp::Sle | IrCmpOp::Ule); - let (first, second) = if swap { (lhs, rhs) } else { (rhs, lhs) }; - self.emit_f64_load_to_x87(first); - self.emit_f64_load_to_x87(second); - self.state.emit(" fucomip %st(1), %st"); - self.state.emit(" fstp %st(0)"); - - match op { - IrCmpOp::Eq => { - self.state.emit(" setnp %al"); - self.state.emit(" sete %cl"); - self.state.emit(" andb %cl, %al"); - } - IrCmpOp::Ne => { - self.state.emit(" setp %al"); - self.state.emit(" setne %cl"); - self.state.emit(" orb %cl, %al"); - } - IrCmpOp::Slt | IrCmpOp::Ult | IrCmpOp::Sgt | IrCmpOp::Ugt => { - self.state.emit(" seta %al"); - } - IrCmpOp::Sle | IrCmpOp::Ule | IrCmpOp::Sge | IrCmpOp::Uge => { - self.state.emit(" setae %al"); - } - } - self.state.emit(" movzbl %al, %eax"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - return; - } - // F32: Use SSE for float comparisons - let swap_operands = matches!(op, IrCmpOp::Slt | IrCmpOp::Ult | IrCmpOp::Sle | IrCmpOp::Ule); - let (first, second) = if swap_operands { (rhs, lhs) } else { (lhs, rhs) }; - - self.operand_to_eax(first); - self.state.emit(" movd %eax, %xmm0"); - self.operand_to_ecx(second); - self.state.emit(" movd %ecx, %xmm1"); - self.state.emit(" ucomiss %xmm1, %xmm0"); - - match op { - IrCmpOp::Eq => { - self.state.emit(" setnp %al"); - self.state.emit(" sete %cl"); - self.state.emit(" andb %cl, %al"); - } - IrCmpOp::Ne => { - 
self.state.emit(" setp %al"); - self.state.emit(" setne %cl"); - self.state.emit(" orb %cl, %al"); - } - IrCmpOp::Slt | IrCmpOp::Ult | IrCmpOp::Sgt | IrCmpOp::Ugt => { - self.state.emit(" seta %al"); - } - IrCmpOp::Sle | IrCmpOp::Ule | IrCmpOp::Sge | IrCmpOp::Uge => { - self.state.emit(" setae %al"); - } - } - self.state.emit(" movzbl %al, %eax"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_f128_cmp_impl(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand) { - let swap = matches!(op, IrCmpOp::Slt | IrCmpOp::Ult | IrCmpOp::Sle | IrCmpOp::Ule); - let (first, second) = if swap { (lhs, rhs) } else { (rhs, lhs) }; - self.emit_f128_load_to_x87(first); - self.emit_f128_load_to_x87(second); - self.state.emit(" fucomip %st(1), %st"); - self.state.emit(" fstp %st(0)"); - - match op { - IrCmpOp::Eq => { - self.state.emit(" setnp %al"); - self.state.emit(" sete %cl"); - self.state.emit(" andb %cl, %al"); - } - IrCmpOp::Ne => { - self.state.emit(" setp %al"); - self.state.emit(" setne %cl"); - self.state.emit(" orb %cl, %al"); - } - IrCmpOp::Slt | IrCmpOp::Ult | IrCmpOp::Sgt | IrCmpOp::Ugt => { - self.state.emit(" seta %al"); - } - IrCmpOp::Sle | IrCmpOp::Ule | IrCmpOp::Sge | IrCmpOp::Uge => { - self.state.emit(" setae %al"); - } - } - self.state.emit(" movzbl %al, %eax"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_int_cmp_impl(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, _ty: IrType) { - self.operand_to_eax(lhs); - self.operand_to_ecx(rhs); - self.state.emit(" cmpl %ecx, %eax"); - - let set_instr = match op { - IrCmpOp::Eq => "sete", - IrCmpOp::Ne => "setne", - IrCmpOp::Slt => "setl", - IrCmpOp::Sle => "setle", - IrCmpOp::Sgt => "setg", - IrCmpOp::Sge => "setge", - IrCmpOp::Ult => "setb", - IrCmpOp::Ule => "setbe", - IrCmpOp::Ugt => "seta", - IrCmpOp::Uge => "setae", - }; - emit!(self.state, " {} %al", set_instr); - self.state.emit(" 
movzbl %al, %eax"); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_fused_cmp_branch_impl( - &mut self, - op: IrCmpOp, - lhs: &Operand, - rhs: &Operand, - _ty: IrType, - true_label: &str, - false_label: &str, - ) { - self.operand_to_eax(lhs); - self.operand_to_ecx(rhs); - self.state.emit(" cmpl %ecx, %eax"); - - let jcc = match op { - IrCmpOp::Eq => "je", - IrCmpOp::Ne => "jne", - IrCmpOp::Slt => "jl", - IrCmpOp::Sle => "jle", - IrCmpOp::Sgt => "jg", - IrCmpOp::Sge => "jge", - IrCmpOp::Ult => "jb", - IrCmpOp::Ule => "jbe", - IrCmpOp::Ugt => "ja", - IrCmpOp::Uge => "jae", - }; - emit!(self.state, " {} {}", jcc, true_label); - emit!(self.state, " jmp {}", false_label); - self.state.reg_cache.invalidate_all(); - } - - pub(super) fn emit_select_impl(&mut self, dest: &Value, cond: &Operand, true_val: &Operand, false_val: &Operand, ty: IrType) { - use crate::ir::reexports::IrConst; - // Constant-fold wide conditions at compile time - match cond { - Operand::Const(IrConst::I64(v)) => { - self.emit_copy_value(dest, if *v != 0 { true_val } else { false_val }); - return; - } - Operand::Const(IrConst::F64(fval)) => { - self.emit_copy_value(dest, if *fval != 0.0 { true_val } else { false_val }); - return; - } - _ => {} - } - - let cond_is_wide = matches!(cond, Operand::Value(v) if self.state.is_wide_value(v.0)); - let result_is_wide = matches!(ty, IrType::F64 | IrType::I64 | IrType::U64); - - if !cond_is_wide && !result_is_wide { - let label_id = self.state.next_label_id(); - let true_label = format!(".Lsel_true_{}", label_id); - let end_label = format!(".Lsel_end_{}", label_id); - self.emit_load_operand(cond); - self.emit_branch_nonzero(&true_label); - self.emit_load_operand(false_val); - self.emit_store_result(dest); - self.emit_branch(&end_label); - self.state.emit_fmt(format_args!("{}:", true_label)); - self.emit_load_operand(true_val); - self.emit_store_result(dest); - self.state.emit_fmt(format_args!("{}:", end_label)); - 
return; - } - - let label_id = self.state.next_label_id(); - let true_label = format!(".Lsel_true_{}", label_id); - let end_label = format!(".Lsel_end_{}", label_id); - - if cond_is_wide { - if let Operand::Value(v) = cond { - self.emit_wide_value_to_eax_ored(v.0); - self.state.reg_cache.invalidate_acc(); - } - } else { - self.operand_to_eax(cond); - } - - self.emit_branch_nonzero(&true_label); - - self.emit_copy_value(dest, false_val); - self.emit_branch(&end_label); - - self.state.emit_fmt(format_args!("{}:", true_label)); - self.emit_copy_value(dest, true_val); - - self.state.emit_fmt(format_args!("{}:", end_label)); - } -} diff --git a/src/backend/i686/codegen/emit.rs b/src/backend/i686/codegen/emit.rs deleted file mode 100644 index f5f1947e06..0000000000 --- a/src/backend/i686/codegen/emit.rs +++ /dev/null @@ -1,2274 +0,0 @@ -//! i686 (32-bit x86) code generator. Implements the ArchCodegen trait. -//! -//! Uses the cdecl calling convention (System V i386 ABI): -//! - All arguments passed on the stack, pushed right-to-left -//! - Return values: eax (32-bit), eax:edx (64-bit), st(0) for float/double/long double -//! - Callee-saved: ebx, esi, edi, ebp -//! - Caller-saved: eax, ecx, edx -//! - No register-based argument passing (unlike x86-64 SysV ABI) -//! - Stack aligned to 16 bytes at call sites (modern i386 ABI) - -use crate::delegate_to_impl; -use crate::backend::traits::ArchCodegen; -use crate::backend::common::PtrDirective; -use crate::backend::state::{CodegenState, StackSlot}; -use crate::backend::regalloc::PhysReg; -use crate::backend::generation::is_i128_type; -use crate::backend::call_abi; -use crate::ir::reexports::{ - AtomicOrdering, - AtomicRmwOp, - BlockId, - IntrinsicOp, - IrBinOp, - IrCmpOp, - IrConst, - IrFunction, - IrUnaryOp, - Operand, - Value, -}; -use crate::common::types::{AddressSpace, IrType}; -use crate::common::fx_hash::FxHashMap; -use crate::{emit}; - -/// i686 code generator. Implements the ArchCodegen trait for the shared framework. 
-/// Uses cdecl calling convention with no register allocation (accumulator-based). -pub struct I686Codegen { - pub(crate) state: CodegenState, - pub(super) current_return_type: IrType, - /// Whether the current function is variadic - pub(super) is_variadic: bool, - /// Register allocation results (callee-saved registers: ebx, esi, edi) - pub(super) reg_assignments: FxHashMap, - /// Which callee-saved registers are used and need save/restore - pub(super) used_callee_saved: Vec, - /// Total stack bytes consumed by named parameters (for va_start computation). - pub(super) va_named_stack_bytes: usize, - /// Scratch register allocation index for inline asm GP registers. - pub(super) asm_scratch_idx: usize, - /// Scratch register allocation index for inline asm XMM registers. - pub(super) asm_xmm_scratch_idx: usize, - /// Whether the current function uses the fastcall calling convention. - pub(super) is_fastcall: bool, - /// For fastcall functions, the number of bytes of stack args the callee must pop on return. - pub(super) fastcall_stack_cleanup: usize, - /// For fastcall functions, how many leading params are passed in registers (0, 1, or 2). - pub(super) fastcall_reg_param_count: usize, - /// Whether the __x86.get_pc_thunk.bx helper needs to be emitted. - pub(super) needs_pc_thunk_bx: bool, - /// Number of integer arguments to pass in registers (-mregparm=N). - /// 0 = standard cdecl, 1-3 = pass first N int args in EAX, EDX, ECX. - pub(super) regparm: u8, - /// Whether to omit the frame pointer (-fomit-frame-pointer). - pub(super) omit_frame_pointer: bool, - /// When omit_frame_pointer is true, this holds the offset from ESP (at its - /// base position after prologue) to where EBP would have pointed. - /// slot_ref() adds this to convert EBP-relative slot offsets to ESP-relative. - /// Value: frame_size + callee_saved_bytes (without the pushed EBP). 
- pub(super) frame_base_offset: i64, - /// Tracks temporary ESP adjustments (e.g., subl $N,%esp for call args, - /// subl $4,%esp for f32 conversion). Incremented on subl, decremented on addl. - /// Added to frame_base_offset in slot_ref() to get the correct ESP offset. - pub(super) esp_adjust: i64, -} - -// Callee-saved physical register indices for i686 -// PhysReg(0) = ebx, PhysReg(1) = esi, PhysReg(2) = edi, PhysReg(3) = ebp -pub(super) const I686_CALLEE_SAVED: &[PhysReg] = &[PhysReg(0), PhysReg(1), PhysReg(2)]; -// Extended callee-saved list including ebp (used when -fomit-frame-pointer) -pub(super) const I686_CALLEE_SAVED_WITH_EBP: &[PhysReg] = &[PhysReg(0), PhysReg(1), PhysReg(2), PhysReg(3)]; -// No caller-saved registers available for allocation (eax/ecx/edx are scratch) -pub(super) const I686_CALLER_SAVED: &[PhysReg] = &[]; - -pub(super) fn phys_reg_name(reg: PhysReg) -> &'static str { - match reg.0 { - 0 => "ebx", - 1 => "esi", - 2 => "edi", - 3 => "ebp", - _ => panic!("invalid i686 phys reg: {:?}", reg), - } -} - -/// Map inline asm constraint register names to callee-saved PhysReg indices. -pub(super) fn i686_constraint_to_phys(constraint: &str) -> Option { - match constraint { - "b" | "{ebx}" | "ebx" => Some(PhysReg(0)), - "S" | "{esi}" | "esi" => Some(PhysReg(1)), - "D" | "{edi}" | "edi" => Some(PhysReg(2)), - _ => None, - } -} - -/// Map inline asm clobber register names to callee-saved PhysReg indices. 
-pub(super) fn i686_clobber_to_phys(clobber: &str) -> Option { - match clobber { - "ebx" | "~{ebx}" => Some(PhysReg(0)), - "esi" | "~{esi}" => Some(PhysReg(1)), - "edi" | "~{edi}" => Some(PhysReg(2)), - _ => None, - } -} - -impl I686Codegen { - pub fn new() -> Self { - Self { - state: CodegenState::new(), - current_return_type: IrType::I32, - is_variadic: false, - reg_assignments: FxHashMap::default(), - used_callee_saved: Vec::new(), - va_named_stack_bytes: 0, - asm_scratch_idx: 0, - asm_xmm_scratch_idx: 0, - is_fastcall: false, - fastcall_stack_cleanup: 0, - fastcall_reg_param_count: 0, - needs_pc_thunk_bx: false, - regparm: 0, - omit_frame_pointer: false, - frame_base_offset: 0, - esp_adjust: 0, - } - } - - pub fn set_pic(&mut self, pic: bool) { - self.state.pic_mode = pic; - } - - pub fn set_no_jump_tables(&mut self, enabled: bool) { - self.state.no_jump_tables = enabled; - } - - /// Apply all relevant options from a `CodegenOptions` struct. - pub fn apply_options(&mut self, opts: &crate::backend::CodegenOptions) { - self.set_pic(opts.pic); - self.set_no_jump_tables(opts.no_jump_tables); - self.regparm = opts.regparm; - self.omit_frame_pointer = opts.omit_frame_pointer; - self.state.emit_cfi = opts.emit_cfi; - } - - // --- i686 helper methods --- - - /// Format a stack slot reference as either `offset(%ebp)` or `offset(%esp)`. - /// When frame pointer is omitted, converts EBP-relative offsets to ESP-relative - /// by adding frame_base_offset + esp_adjust. - pub(super) fn slot_ref(&self, slot: StackSlot) -> String { - if self.omit_frame_pointer { - let esp_off = slot.0 + self.frame_base_offset + self.esp_adjust; - format!("{}(%esp)", esp_off) - } else { - format!("{}(%ebp)", slot.0) - } - } - - /// Format a stack slot reference with an additional byte offset. - /// Used for accessing sub-fields of multi-byte slots (e.g., upper 4 bytes of i64). 
- pub(super) fn slot_ref_offset(&self, slot: StackSlot, extra: i64) -> String { - if self.omit_frame_pointer { - let esp_off = slot.0 + extra + self.frame_base_offset + self.esp_adjust; - format!("{}(%esp)", esp_off) - } else { - format!("{}(%ebp)", slot.0 + extra) - } - } - - /// Format a parameter reference from the caller's stack frame. - /// With frame pointer: offset(%ebp) where offset is positive (above EBP). - /// Without frame pointer: params are at ESP + frame_base_offset + esp_adjust + offset, - /// but we need to subtract 4 because there's no pushed EBP taking up space. - /// The param_offset is the EBP-relative offset (e.g., 8 for first param). - pub(super) fn param_ref(&self, ebp_offset: i64) -> String { - if self.omit_frame_pointer { - // Without pushed EBP, params are 4 bytes closer to the frame. - // EBP would have been at original_esp - 4 (after push ebp). - // Without push ebp, the return address is at original_esp. - // So param at ebp_offset(%ebp) = (ebp_offset - 4) relative to original_esp. - // And relative to current ESP: ebp_offset - 4 + frame_base_offset + esp_adjust. - let esp_off = ebp_offset - 4 + self.frame_base_offset + self.esp_adjust; - format!("{}(%esp)", esp_off) - } else { - format!("{}(%ebp)", ebp_offset) - } - } - - pub(super) fn dest_reg(&self, dest: &Value) -> Option { - self.reg_assignments.get(&dest.0).copied() - } - - /// Load the address of va_list storage into %edx. - /// - /// va_list_ptr is an IR value that holds a pointer to the va_list storage. - /// - If va_list_ptr is an alloca (local va_list variable), we LEA the slot - /// address into %edx (the alloca IS the va_list storage). - /// - If va_list_ptr is a regular value (e.g., loaded pointer from va_list*), - /// we load its value into %edx (the value IS the address of va_list storage). 
- pub(super) fn load_va_list_addr_to_edx(&mut self, va_list_ptr: &Value) { - let is_alloca = self.state.is_alloca(va_list_ptr.0); - if let Some(phys) = self.reg_assignments.get(&va_list_ptr.0).copied() { - // Value is in a callee-saved register (non-alloca pointer value) - let reg = phys_reg_name(phys); - emit!(self.state, " movl %{}, %edx", reg); - } else if let Some(slot) = self.state.get_slot(va_list_ptr.0) { - let sr = self.slot_ref(slot); - if is_alloca { - // Alloca: the slot IS the va_list; get the address of the slot - emit!(self.state, " leal {}, %edx", sr); - } else { - // Regular value: the slot holds a pointer to the va_list storage - emit!(self.state, " movl {}, %edx", sr); - } - } - } - - /// Load an operand into %eax. - pub(super) fn operand_to_eax(&mut self, op: &Operand) { - // Check register cache - skip load if value is already in eax - if let Operand::Value(v) = op { - let is_alloca = self.state.is_alloca(v.0); - if self.state.reg_cache.acc_has(v.0, is_alloca) { - return; - } - } - - match op { - Operand::Const(c) => { - match c { - IrConst::I8(v) => emit!(self.state, " movl ${}, %eax", *v as i32), - IrConst::I16(v) => emit!(self.state, " movl ${}, %eax", *v as i32), - IrConst::I32(v) => { - if *v == 0 { - self.state.emit(" xorl %eax, %eax"); - } else { - emit!(self.state, " movl ${}, %eax", v); - } - } - IrConst::I64(v) => { - // On i686, we can only hold 32 bits in eax - // Truncate to low 32 bits - let low = *v as i32; - if low == 0 { - self.state.emit(" xorl %eax, %eax"); - } else { - emit!(self.state, " movl ${}, %eax", low); - } - } - IrConst::I128(v) => { - let low = *v as i32; - emit!(self.state, " movl ${}, %eax", low); - } - IrConst::F32(fval) => emit!(self.state, " movl ${}, %eax", fval.to_bits() as i32), - IrConst::F64(fval) => { - // Store low 32 bits of the f64 bit pattern - let low = fval.to_bits() as i32; - emit!(self.state, " movl ${}, %eax", low); - } - IrConst::LongDouble(_, bytes) => { - // Load first 4 bytes of long double - 
let low = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); - emit!(self.state, " movl ${}, %eax", low); - } - IrConst::Zero => { - self.state.emit(" xorl %eax, %eax"); - } - } - self.state.reg_cache.invalidate_acc(); - } - Operand::Value(v) => { - let is_alloca = self.state.is_alloca(v.0); - // Check if value is in a callee-saved register (allocas are never register-allocated) - if let Some(phys) = self.reg_assignments.get(&v.0).copied() { - let reg = phys_reg_name(phys); - emit!(self.state, " movl %{}, %eax", reg); - self.state.reg_cache.set_acc(v.0, false); - } else if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - if is_alloca { - // Alloca: the slot IS the data; load the address of the slot - if let Some(align) = self.state.alloca_over_align(v.0) { - // Over-aligned alloca: compute aligned address - emit!(self.state, " leal {}, %eax", sr); - emit!(self.state, " addl ${}, %eax", align - 1); - emit!(self.state, " andl ${}, %eax", -(align as i32)); - } else { - emit!(self.state, " leal {}, %eax", sr); - } - } else { - // Regular value: load the value from the slot - emit!(self.state, " movl {}, %eax", sr); - } - self.state.reg_cache.set_acc(v.0, is_alloca); - } - } - } - } - - /// Load a 64-bit value's slot into %eax by OR'ing both 32-bit halves. - /// Used for truthiness testing of I64/U64/F64 values on i686, where a value - /// is nonzero iff either half is nonzero. - pub(super) fn emit_wide_value_to_eax_ored(&mut self, value_id: u32) { - if let Some(slot) = self.state.get_slot(value_id) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - emit!(self.state, " orl {}, %eax", sr4); - } else { - // Wide values (I64/F64) on i686 should always have stack slots since - // they can't fit in a single 32-bit register. Fall back to loading - // the low 32 bits only as a last resort. 
- self.operand_to_eax(&Operand::Value(Value(value_id))); - } - } - - /// Load an operand into %ecx. - pub(super) fn operand_to_ecx(&mut self, op: &Operand) { - match op { - Operand::Const(c) => { - match c { - IrConst::I8(v) => emit!(self.state, " movl ${}, %ecx", *v as i32), - IrConst::I16(v) => emit!(self.state, " movl ${}, %ecx", *v as i32), - IrConst::I32(v) => { - if *v == 0 { - self.state.emit(" xorl %ecx, %ecx"); - } else { - emit!(self.state, " movl ${}, %ecx", v); - } - } - IrConst::I64(v) => { - let low = *v as i32; - if low == 0 { - self.state.emit(" xorl %ecx, %ecx"); - } else { - emit!(self.state, " movl ${}, %ecx", low); - } - } - IrConst::I128(v) => { - let low = *v as i32; - emit!(self.state, " movl ${}, %ecx", low); - } - IrConst::F32(fval) => emit!(self.state, " movl ${}, %ecx", fval.to_bits() as i32), - IrConst::F64(fval) => { - let low = fval.to_bits() as i32; - emit!(self.state, " movl ${}, %ecx", low); - } - IrConst::LongDouble(_, bytes) => { - let low = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); - emit!(self.state, " movl ${}, %ecx", low); - } - IrConst::Zero => { - self.state.emit(" xorl %ecx, %ecx"); - } - } - } - Operand::Value(v) => { - let is_alloca = self.state.is_alloca(v.0); - if let Some(phys) = self.reg_assignments.get(&v.0).copied() { - let reg = phys_reg_name(phys); - emit!(self.state, " movl %{}, %ecx", reg); - } else if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - if is_alloca { - // Alloca: load the address of the slot - if let Some(align) = self.state.alloca_over_align(v.0) { - emit!(self.state, " leal {}, %ecx", sr); - emit!(self.state, " addl ${}, %ecx", align - 1); - emit!(self.state, " andl ${}, %ecx", -(align as i32)); - } else { - emit!(self.state, " leal {}, %ecx", sr); - } - } else { - emit!(self.state, " movl {}, %ecx", sr); - } - } else if self.state.reg_cache.acc_has(v.0, false) || self.state.reg_cache.acc_has(v.0, true) { - // Value is in accumulator (no stack 
slot) — move eax to ecx. - self.state.emit(" movl %eax, %ecx"); - } else { - self.state.emit(" xorl %ecx, %ecx"); - } - } - } - } - - /// Store %eax to a value's destination (callee-saved register or stack slot). - pub(super) fn store_eax_to(&mut self, dest: &Value) { - if let Some(phys) = self.dest_reg(dest) { - let reg = phys_reg_name(phys); - emit!(self.state, " movl %eax, %{}", reg); - self.state.reg_cache.invalidate_acc(); - } else if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " movl %eax, {}", sr); - // If this dest is a wide value (I64/U64/F64), zero the upper 4 bytes. - // Wide values occupy 8-byte slots, and other paths (e.g. Copy from - // IrConst::I64) may write all 8 bytes. If we only write the low 4, - // the upper half retains stack garbage, which corrupts truthiness - // checks that OR both halves (emit_wide_value_to_eax_ored). - if self.state.wide_values.contains(&dest.0) { - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl $0, {}", sr4); - } - self.state.reg_cache.set_acc(dest.0, false); - } - } - - /// Return the store mnemonic for a given type. - pub(super) fn mov_store_for_type(&self, ty: IrType) -> &'static str { - match ty { - IrType::I8 | IrType::U8 => "movb", - IrType::I16 | IrType::U16 => "movw", - // On i686, pointer-sized types use movl (32-bit) - _ => "movl", - } - } - - /// Return the load mnemonic for a given type. - pub(super) fn mov_load_for_type(&self, ty: IrType) -> &'static str { - match ty { - IrType::I8 => "movsbl", // sign-extend byte to 32-bit - IrType::U8 => "movzbl", // zero-extend byte to 32-bit - IrType::I16 => "movswl", // sign-extend word to 32-bit - IrType::U16 => "movzwl", // zero-extend word to 32-bit - // Everything 32-bit or larger uses movl - _ => "movl", - } - } - - /// Return the type suffix for an operation. 
- pub(super) fn type_suffix(&self, ty: IrType) -> &'static str { - match ty { - IrType::I8 | IrType::U8 => "b", - IrType::I16 | IrType::U16 => "w", - // On i686, the default (pointer-sized) is "l" (32-bit) - _ => "l", - } - } - - /// Return the register name for eax sub-register based on type size. - pub(super) fn eax_for_type(&self, ty: IrType) -> &'static str { - match ty { - IrType::I8 | IrType::U8 => "%al", - IrType::I16 | IrType::U16 => "%ax", - _ => "%eax", - } - } - - /// Check if a param type is eligible for fastcall register passing. - /// Only DWORD-sized or smaller integer/pointer types qualify. - pub(super) fn is_fastcall_reg_eligible(&self, ty: IrType) -> bool { - matches!(ty, IrType::I8 | IrType::U8 | IrType::I16 | IrType::U16 | - IrType::I32 | IrType::U32 | IrType::Ptr) - } - - /// Count how many leading params are passed in registers for fastcall (max 2). - pub(super) fn count_fastcall_reg_params(&self, func: &IrFunction) -> usize { - let mut count = 0; - for param in &func.params { - if count >= 2 { break; } - let ty = param.ty; - if self.is_fastcall_reg_eligible(ty) { - count += 1; - } else { - break; // non-eligible param stops register assignment - } - } - count - } - - /// Check if an operand is a constant that fits in an i32 immediate. - pub(super) fn const_as_imm32(op: &Operand) -> Option { - match op { - Operand::Const(c) => { - match c { - IrConst::I8(v) => Some(*v as i64), - IrConst::I16(v) => Some(*v as i64), - IrConst::I32(v) => Some(*v as i64), - IrConst::I64(v) => { - // On i686, check if the value fits in 32 bits - if *v >= i32::MIN as i64 && *v <= i32::MAX as i64 { - Some(*v) - } else { - None - } - } - _ => None, - } - } - _ => None, - } - } - - /// Extract an immediate integer value from an operand. - /// Used for SSE/AES instructions that require compile-time immediate operands. 
- pub(super) fn operand_to_imm_i64(op: &Operand) -> i64 { - match op { - Operand::Const(c) => match c { - IrConst::I8(v) => *v as i64, - IrConst::I16(v) => *v as i64, - IrConst::I32(v) => *v as i64, - IrConst::I64(v) => *v, - _ => 0, - }, - Operand::Value(_) => 0, - } - } - - /// Load an F128 (long double) operand onto the x87 FPU stack. - pub(super) fn emit_f128_load_to_x87(&mut self, op: &Operand) { - match op { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fldt {}", sr); - } - } - Operand::Const(IrConst::LongDouble(_, bytes)) => { - // Convert f128 (IEEE binary128) bytes to x87 80-bit format for fldt - let x87 = crate::common::long_double::f128_bytes_to_x87_bytes(bytes); - let dword0 = i32::from_le_bytes([x87[0], x87[1], x87[2], x87[3]]); - let dword1 = i32::from_le_bytes([x87[4], x87[5], x87[6], x87[7]]); - let word2 = i16::from_le_bytes([x87[8], x87[9]]) as i32; - self.state.emit(" subl $12, %esp"); - emit!(self.state, " movl ${}, (%esp)", dword0); - emit!(self.state, " movl ${}, 4(%esp)", dword1); - emit!(self.state, " movw ${}, 8(%esp)", word2); - self.state.emit(" fldt (%esp)"); - self.state.emit(" addl $12, %esp"); - } - Operand::Const(IrConst::F64(fval)) => { - // Convert f64 to x87: push to stack as f64, fld, convert - let bits = fval.to_bits(); - let low = (bits & 0xFFFFFFFF) as i32; - let high = ((bits >> 32) & 0xFFFFFFFF) as i32; - self.state.emit(" subl $8, %esp"); - emit!(self.state, " movl ${}, (%esp)", low); - emit!(self.state, " movl ${}, 4(%esp)", high); - self.state.emit(" fldl (%esp)"); - self.state.emit(" addl $8, %esp"); - } - Operand::Const(IrConst::F32(fval)) => { - emit!(self.state, " movl ${}, %eax", fval.to_bits() as i32); - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } - _ => { - self.operand_to_eax(op); - // Fallback: treat as integer, push to stack - self.state.emit(" pushl %eax"); - 
self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } - } - } - - /// Load an F64 (double) operand onto the x87 FPU stack. - /// F64 values occupy 8-byte stack slots on i686, so we use fldl to load - /// them directly from memory rather than going through the 32-bit accumulator. - pub(super) fn emit_f64_load_to_x87(&mut self, op: &Operand) { - match op { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fldl {}", sr); - } - } - Operand::Const(IrConst::F64(fval)) => { - let bits = fval.to_bits(); - let low = (bits & 0xFFFFFFFF) as i32; - let high = ((bits >> 32) & 0xFFFFFFFF) as i32; - self.state.emit(" subl $8, %esp"); - emit!(self.state, " movl ${}, (%esp)", low); - emit!(self.state, " movl ${}, 4(%esp)", high); - self.state.emit(" fldl (%esp)"); - self.state.emit(" addl $8, %esp"); - } - Operand::Const(IrConst::F32(fval)) => { - emit!(self.state, " movl ${}, %eax", fval.to_bits() as i32); - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } - Operand::Const(IrConst::Zero) => { - self.state.emit(" fldz"); - } - _ => { - // Fallback: load integer bits and convert - self.operand_to_eax(op); - self.state.emit(" pushl %eax"); - self.state.emit(" fildl (%esp)"); - self.state.emit(" addl $4, %esp"); - } - } - } - - /// Store the x87 st(0) value as F64 into a destination stack slot. - /// Pops st(0). - pub(super) fn emit_f64_store_from_x87(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpl {}", sr); - } else { - // No slot available, pop x87 stack to discard - self.state.emit(" fstp %st(0)"); - } - } - - // --- 64-bit atomic helpers using cmpxchg8b (for I64/U64/F64 on i686) --- - - /// Check if a type requires 64-bit atomic handling on i686 (needs cmpxchg8b). 
- pub(super) fn is_atomic_wide(&self, ty: IrType) -> bool { - matches!(ty, IrType::I64 | IrType::U64 | IrType::F64) - } - - /// 64-bit atomic RMW using lock cmpxchg8b loop. - /// - /// cmpxchg8b compares edx:eax with 8 bytes at memory location. - /// If equal, stores ecx:ebx to memory. If not, loads memory into edx:eax. - /// We use a loop: load old value, compute new value, try cmpxchg8b. - /// - /// Register plan: - /// esi = pointer to atomic variable (saved/restored) - /// edx:eax = old (expected) value - /// ecx:ebx = new (desired) value - /// Stack: saved operand value (8 bytes) - pub(super) fn emit_atomic_rmw_wide(&mut self, dest: &Value, op: AtomicRmwOp, ptr: &Operand, - val: &Operand) { - // Save callee-saved registers we need to clobber - self.state.emit(" pushl %ebx"); - self.esp_adjust += 4; - self.state.emit(" pushl %esi"); - self.esp_adjust += 4; - - // Load pointer into esi - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %esi"); - - // Load 64-bit operand value onto stack (8 bytes) - self.emit_load_acc_pair(val); - self.state.emit(" pushl %edx"); // high word at 4(%esp) - self.state.emit(" pushl %eax"); // low word at (%esp) - - // Load current value from memory into edx:eax - self.state.emit(" movl (%esi), %eax"); - self.state.emit(" movl 4(%esi), %edx"); - - match op { - AtomicRmwOp::Xchg => { - // For exchange, the desired value is the operand (constant across retries) - self.state.emit(" movl (%esp), %ebx"); - self.state.emit(" movl 4(%esp), %ecx"); - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::Add => { - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" addl (%esp), %ebx"); - self.state.emit(" adcl 4(%esp), 
%ecx"); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::Sub => { - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" subl (%esp), %ebx"); - self.state.emit(" sbbl 4(%esp), %ecx"); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::And => { - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" andl (%esp), %ebx"); - self.state.emit(" andl 4(%esp), %ecx"); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::Or => { - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" orl (%esp), %ebx"); - self.state.emit(" orl 4(%esp), %ecx"); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::Xor => { - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" xorl (%esp), %ebx"); - self.state.emit(" xorl 4(%esp), %ecx"); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::Nand => { - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" andl (%esp), %ebx"); - self.state.emit(" andl 4(%esp), %ecx"); - self.state.emit(" notl %ebx"); - 
self.state.emit(" notl %ecx"); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - AtomicRmwOp::TestAndSet => { - // For 64-bit test-and-set, set the low byte to 1, rest to 0 - self.state.emit(" movl $1, %ebx"); - self.state.emit(" xorl %ecx, %ecx"); - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - } - } - - // Clean up stack (remove 8-byte operand value) - self.state.emit(" addl $8, %esp"); - // Restore callee-saved registers - self.state.emit(" popl %esi"); - self.esp_adjust -= 4; - self.state.emit(" popl %ebx"); - self.esp_adjust -= 4; - - // Result (old value) is in edx:eax — store to dest's 64-bit stack slot - self.state.reg_cache.invalidate_acc(); - self.emit_store_acc_pair(dest); - } - - /// 64-bit atomic compare-exchange using lock cmpxchg8b. - /// - /// cmpxchg8b: compares edx:eax with 8 bytes at memory. - /// If equal, stores ecx:ebx to memory and sets ZF. - /// If not equal, loads memory into edx:eax and clears ZF. 
- pub(super) fn emit_atomic_cmpxchg_wide(&mut self, dest: &Value, ptr: &Operand, expected: &Operand, - desired: &Operand, returns_bool: bool) { - // Save callee-saved registers - self.state.emit(" pushl %ebx"); - self.esp_adjust += 4; - self.state.emit(" pushl %esi"); - self.esp_adjust += 4; - - // Load pointer into esi - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %esi"); - - // Load expected into edx:eax, save on stack temporarily - self.emit_load_acc_pair(expected); - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - - // Load desired into ecx:ebx - self.emit_load_acc_pair(desired); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - - // Restore expected into edx:eax - self.state.emit(" popl %eax"); - self.esp_adjust -= 4; - self.state.emit(" popl %edx"); - self.esp_adjust -= 4; - - // Execute cmpxchg8b - self.state.emit(" lock cmpxchg8b (%esi)"); - - if returns_bool { - self.state.emit(" sete %al"); - self.state.emit(" movzbl %al, %eax"); - // Restore callee-saved registers - self.state.emit(" popl %esi"); - self.esp_adjust -= 4; - self.state.emit(" popl %ebx"); - self.esp_adjust -= 4; - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } else { - // Result (old value) is in edx:eax - // Restore callee-saved registers - self.state.emit(" popl %esi"); - self.esp_adjust -= 4; - self.state.emit(" popl %ebx"); - self.esp_adjust -= 4; - self.state.reg_cache.invalidate_acc(); - self.emit_store_acc_pair(dest); - } - } - - /// 64-bit atomic load using cmpxchg8b with expected == desired == 0. - /// - /// cmpxchg8b always loads the current memory value into edx:eax on failure, - /// so we set edx:eax = ecx:ebx = 0 and execute cmpxchg8b. If the memory - /// happens to be 0, the exchange writes 0 (no change). If non-zero, - /// we get the current value in edx:eax without modifying memory. 
- pub(super) fn emit_atomic_load_wide(&mut self, dest: &Value, ptr: &Operand) { - self.state.emit(" pushl %ebx"); - self.esp_adjust += 4; - self.state.emit(" pushl %esi"); - self.esp_adjust += 4; - - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %esi"); - - // Set all registers to zero: edx:eax = ecx:ebx = 0 - self.state.emit(" xorl %eax, %eax"); - self.state.emit(" xorl %edx, %edx"); - self.state.emit(" xorl %ebx, %ebx"); - self.state.emit(" xorl %ecx, %ecx"); - // lock cmpxchg8b: if (%esi) == 0 -> store 0 (no change), else load into edx:eax - self.state.emit(" lock cmpxchg8b (%esi)"); - - self.state.emit(" popl %esi"); - self.esp_adjust -= 4; - self.state.emit(" popl %ebx"); - self.esp_adjust -= 4; - - self.state.reg_cache.invalidate_acc(); - self.emit_store_acc_pair(dest); - } - - /// 64-bit atomic store using a cmpxchg8b loop. - /// - /// There is no single instruction for atomic 64-bit stores on i686, so we - /// use a cmpxchg8b loop: read current value, try to replace with desired. 
- pub(super) fn emit_atomic_store_wide(&mut self, ptr: &Operand, val: &Operand) { - self.state.emit(" pushl %ebx"); - self.esp_adjust += 4; - self.state.emit(" pushl %esi"); - self.esp_adjust += 4; - - // Load pointer into esi - self.operand_to_eax(ptr); - self.state.emit(" movl %eax, %esi"); - - // Load desired value into ecx:ebx - self.emit_load_acc_pair(val); - self.state.emit(" movl %eax, %ebx"); - self.state.emit(" movl %edx, %ecx"); - - // Load current value from memory into edx:eax (initial guess for cmpxchg8b) - self.state.emit(" movl (%esi), %eax"); - self.state.emit(" movl 4(%esi), %edx"); - - let loop_label = format!(".Latomic_{}", self.state.next_label_id()); - emit!(self.state, "{}:", loop_label); - self.state.emit(" lock cmpxchg8b (%esi)"); - emit!(self.state, " jne {}", loop_label); - - self.state.emit(" popl %esi"); - self.esp_adjust -= 4; - self.state.emit(" popl %ebx"); - self.esp_adjust -= 4; - self.state.reg_cache.invalidate_acc(); - } - - /// Emit a fastcall function call on i686. - /// First two DWORD (int/ptr) args go in ECX, EDX. - /// Remaining args go on the stack (right-to-left push order). - /// The callee pops stack args, so caller does NOT adjust ESP after call. - pub(super) fn emit_fastcall(&mut self, args: &[Operand], arg_types: &[IrType], - direct_name: Option<&str>, func_ptr: Option<&Operand>, - dest: Option, return_type: IrType) { - let indirect = func_ptr.is_some() && direct_name.is_none(); - - // Determine which args go in registers vs stack. - let mut reg_count = 0usize; - for ty in arg_types.iter() { - if reg_count >= 2 { break; } - if self.is_fastcall_reg_eligible(*ty) { - reg_count += 1; - } else { - break; - } - } - - // Compute stack space for overflow args (args beyond the register ones). 
- let mut stack_bytes = 0usize; - for i in reg_count..args.len() { - let ty = if i < arg_types.len() { arg_types[i] } else { IrType::I32 }; - match ty { - IrType::F64 | IrType::I64 | IrType::U64 => stack_bytes += 8, - IrType::F128 => stack_bytes += 12, - _ if is_i128_type(ty) => stack_bytes += 16, - _ => stack_bytes += 4, - } - } - // Align to 16 bytes - let stack_arg_space = (stack_bytes + 15) & !15; - - // Spill indirect function pointer before stack manipulation. - if indirect { - self.emit_call_spill_fptr(func_ptr.expect("indirect call requires func_ptr")); - } - - // Phase 1: Allocate stack space and write stack args. - if stack_arg_space > 0 { - emit!(self.state, " subl ${}, %esp", stack_arg_space); - self.esp_adjust += stack_arg_space as i64; - } - - // Write stack args (skipping register args). - let mut offset = 0i64; - for i in reg_count..args.len() { - let ty = if i < arg_types.len() { arg_types[i] } else { IrType::I32 }; - let arg = &args[i]; - - match ty { - IrType::I64 | IrType::U64 | IrType::F64 => { - self.emit_load_acc_pair(arg); - emit!(self.state, " movl %eax, {}(%esp)", offset); - emit!(self.state, " movl %edx, {}(%esp)", offset + 4); - offset += 8; - } - IrType::F128 => { - // Load F128 value to x87 and store to stack - self.emit_f128_load_to_x87(arg); - emit!(self.state, " fstpt {}(%esp)", offset); - offset += 12; - } - _ if is_i128_type(ty) => { - // Copy 16 bytes - if let Operand::Value(v) = arg { - if let Some(slot) = self.state.get_slot(v.0) { - for j in (0..16).step_by(4) { - let sr = self.slot_ref_offset(slot, j as i64); - emit!(self.state, " movl {}, %eax", sr); - emit!(self.state, " movl %eax, {}(%esp)", offset + j as i64); - } - } - } - offset += 16; - } - _ => { - self.emit_load_operand(arg); - emit!(self.state, " movl %eax, {}(%esp)", offset); - offset += 4; - } - } - } - - // Phase 2: Load register args into ECX and EDX. 
- // Load EDX first (arg 1) then ECX (arg 0), because loading arg 0 - // may clobber EDX if it involves function calls. - if reg_count >= 2 { - self.emit_load_operand(&args[1]); - self.state.emit(" movl %eax, %edx"); - } - if reg_count >= 1 { - self.emit_load_operand(&args[0]); - self.state.emit(" movl %eax, %ecx"); - } - - // Phase 3: Emit the call. - if indirect { - // Reload function pointer from spill slot - let fptr_offset = stack_arg_space as i64; - emit!(self.state, " movl {}(%esp), %eax", fptr_offset); - self.state.emit(" call *%eax"); - } else if let Some(name) = direct_name { - emit!(self.state, " call {}", name); - } - - // Phase 4: For indirect calls, pop the spilled function pointer. - // Note: callee already cleaned up the stack args, so we only need - // to handle the fptr spill and alignment padding. - // After call: callee popped stack_bytes, so esp_adjust drops by that amount. - self.esp_adjust -= stack_bytes as i64; - if indirect { - self.state.emit(" addl $4, %esp"); // pop fptr spill - self.esp_adjust -= 4; - } - // Clean up alignment padding (the difference between actual stack bytes and aligned) - let padding = stack_arg_space - stack_bytes; - if padding > 0 { - emit!(self.state, " addl ${}, %esp", padding); - self.esp_adjust -= padding as i64; - } - - // Phase 5: Store return value. 
- if let Some(dest) = dest { - self.emit_call_store_result(&dest, return_type); - } - - self.state.reg_cache.invalidate_acc(); - } -} - -// Helper functions for ALU mnemonics -pub(super) fn alu_mnemonic(op: IrBinOp) -> &'static str { - match op { - IrBinOp::Add => "add", - IrBinOp::Sub => "sub", - IrBinOp::And => "and", - IrBinOp::Or => "or", - IrBinOp::Xor => "xor", - _ => panic!("not a simple ALU op: {:?}", op), - } -} - -pub(super) fn shift_mnemonic(op: IrBinOp) -> &'static str { - match op { - IrBinOp::Shl => "shll", - IrBinOp::AShr => "sarl", - IrBinOp::LShr => "shrl", - _ => panic!("not a shift op: {:?}", op), - } -} - -// --- 64-bit bit-manipulation helpers --- -// On i686, 64-bit values are in the eax:edx register pair (eax=low, edx=high). -// The result of clz/ctz/popcount is a small integer (0-64) that fits in eax, -// so we zero edx to produce a proper I64 result. -impl I686Codegen { - /// clzll(x): Count leading zeros of 64-bit value in eax:edx. - /// If high half (edx) != 0, result = lzcnt(edx). - /// Otherwise, result = 32 + lzcnt(eax). - pub(super) fn emit_i64_clz(&mut self) { - let done = self.state.fresh_label("clz64_done"); - let hi_zero = self.state.fresh_label("clz64_hi_zero"); - // Test high half - self.state.emit(" testl %edx, %edx"); - emit!(self.state, " je {}", hi_zero); - // High half is non-zero: result = lzcnt(edx) - self.state.emit(" lzcntl %edx, %eax"); - self.state.emit(" xorl %edx, %edx"); - emit!(self.state, " jmp {}", done); - // High half is zero: result = 32 + lzcnt(eax) - emit!(self.state, "{}:", hi_zero); - self.state.emit(" lzcntl %eax, %eax"); - self.state.emit(" addl $32, %eax"); - self.state.emit(" xorl %edx, %edx"); - emit!(self.state, "{}:", done); - } - - /// ctzll(x): Count trailing zeros of 64-bit value in eax:edx. - /// If low half (eax) != 0, result = tzcnt(eax). - /// Otherwise, result = 32 + tzcnt(edx). 
- pub(super) fn emit_i64_ctz(&mut self) { - let done = self.state.fresh_label("ctz64_done"); - let lo_zero = self.state.fresh_label("ctz64_lo_zero"); - // Test low half - self.state.emit(" testl %eax, %eax"); - emit!(self.state, " je {}", lo_zero); - // Low half is non-zero: result = tzcnt(eax) - self.state.emit(" tzcntl %eax, %eax"); - self.state.emit(" xorl %edx, %edx"); - emit!(self.state, " jmp {}", done); - // Low half is zero: result = 32 + tzcnt(edx) - emit!(self.state, "{}:", lo_zero); - self.state.emit(" tzcntl %edx, %eax"); - self.state.emit(" addl $32, %eax"); - self.state.emit(" xorl %edx, %edx"); - emit!(self.state, "{}:", done); - } - - /// popcountll(x): Population count of 64-bit value in eax:edx. - /// result = popcount(eax) + popcount(edx) - pub(super) fn emit_i64_popcount(&mut self) { - self.state.emit(" popcntl %edx, %ecx"); - self.state.emit(" popcntl %eax, %eax"); - self.state.emit(" addl %ecx, %eax"); - self.state.emit(" xorl %edx, %edx"); - } - - /// bswap64(x): Byte-swap 64-bit value in eax:edx. - /// result_lo = bswap(high), result_hi = bswap(low) - pub(super) fn emit_i64_bswap(&mut self) { - // eax=low, edx=high - // bswap each half, then swap: new_eax = bswap(edx), new_edx = bswap(eax) - self.state.emit(" bswapl %eax"); - self.state.emit(" bswapl %edx"); - self.state.emit(" xchgl %eax, %edx"); - } - - /// Copy `n_bytes` from stack slot to call stack area, 4 bytes at a time. 
- pub(super) fn emit_copy_slot_to_stack(&mut self, slot: StackSlot, stack_offset: usize, n_bytes: usize) { - let mut copied = 0usize; - while copied + 4 <= n_bytes { - let sr = self.slot_ref_offset(slot, copied as i64); - emit!(self.state, " movl {}, %eax", sr); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset + copied); - copied += 4; - } - while copied < n_bytes { - let sr = self.slot_ref_offset(slot, copied as i64); - emit!(self.state, " movb {}, %al", sr); - emit!(self.state, " movb %al, {}(%esp)", stack_offset + copied); - copied += 1; - } - self.state.reg_cache.invalidate_acc(); - } - - /// Fallback: store eax to stack, zero-fill remaining bytes. - pub(super) fn emit_eax_to_stack_zeroed(&mut self, arg: &Operand, stack_offset: usize, total_bytes: usize) { - self.operand_to_eax(arg); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - for j in (4..total_bytes).step_by(4) { - emit!(self.state, " movl $0, {}(%esp)", stack_offset + j); - } - } - - /// Emit I128 argument to call stack (16 bytes). - pub(super) fn emit_call_i128_stack_arg(&mut self, arg: &Operand, stack_offset: usize) { - if let Operand::Value(v) = arg { - if let Some(slot) = self.state.get_slot(v.0) { - self.emit_copy_slot_to_stack(slot, stack_offset, 16); - } else { - self.emit_eax_to_stack_zeroed(arg, stack_offset, 16); - } - } - } - - /// Emit F128 (long double) argument to call stack (12 bytes). 
- pub(super) fn emit_call_f128_stack_arg(&mut self, arg: &Operand, stack_offset: usize) { - match arg { - Operand::Value(v) => { - if self.state.f128_direct_slots.contains(&v.0) { - if let Some(slot) = self.state.get_slot(v.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fldt {}", sr); - emit!(self.state, " fstpt {}(%esp)", stack_offset); - } - } else if let Some(slot) = self.state.get_slot(v.0) { - self.emit_copy_slot_to_stack(slot, stack_offset, 12); - } else { - self.emit_eax_to_stack_zeroed(arg, stack_offset, 12); - } - } - Operand::Const(IrConst::LongDouble(_, bytes)) => { - let x87 = crate::common::long_double::f128_bytes_to_x87_bytes(bytes); - let dword0 = i32::from_le_bytes([x87[0], x87[1], x87[2], x87[3]]); - let dword1 = i32::from_le_bytes([x87[4], x87[5], x87[6], x87[7]]); - let word2 = i16::from_le_bytes([x87[8], x87[9]]) as i32; - emit!(self.state, " movl ${}, {}(%esp)", dword0, stack_offset); - emit!(self.state, " movl ${}, {}(%esp)", dword1, stack_offset + 4); - emit!(self.state, " movw ${}, {}(%esp)", word2, stack_offset + 8); - } - Operand::Const(IrConst::F64(fval)) => { - let bits = fval.to_bits(); - let low = (bits & 0xFFFFFFFF) as i32; - let high = ((bits >> 32) & 0xFFFFFFFF) as i32; - self.state.emit(" subl $8, %esp"); - emit!(self.state, " movl ${}, (%esp)", low); - emit!(self.state, " movl ${}, 4(%esp)", high); - self.state.emit(" fldl (%esp)"); - self.state.emit(" addl $8, %esp"); - emit!(self.state, " fstpt {}(%esp)", stack_offset); - } - _ => { - self.emit_eax_to_stack_zeroed(arg, stack_offset, 12); - } - } - } - - /// Emit struct-by-value argument to call stack. - pub(super) fn emit_call_struct_stack_arg(&mut self, arg: &Operand, stack_offset: usize, size: usize) { - if let Operand::Value(v) = arg { - if self.state.is_alloca(v.0) { - if let Some(slot) = self.state.get_slot(v.0) { - self.emit_copy_slot_to_stack(slot, stack_offset, size); - } - } else { - // Non-alloca: value is a pointer to struct data. 
- self.operand_to_eax(arg); - self.state.emit(" movl %eax, %ecx"); - let mut copied = 0usize; - while copied + 4 <= size { - emit!(self.state, " movl {}(%ecx), %eax", copied); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset + copied); - copied += 4; - } - while copied < size { - emit!(self.state, " movb {}(%ecx), %al", copied); - emit!(self.state, " movb %al, {}(%esp)", stack_offset + copied); - copied += 1; - } - self.state.reg_cache.invalidate_acc(); - } - } - } - - /// Emit 8-byte scalar (F64/I64/U64) to call stack. - pub(super) fn emit_call_8byte_stack_arg(&mut self, arg: &Operand, ty: IrType, stack_offset: usize) { - if let Operand::Value(v) = arg { - if let Some(slot) = self.state.get_slot(v.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - emit!(self.state, " movl {}, %eax", sr4); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset + 4); - self.state.reg_cache.invalidate_acc(); - } else { - self.operand_to_eax(arg); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - emit!(self.state, " movl $0, {}(%esp)", stack_offset + 4); - } - } else if ty == IrType::F64 { - if let Operand::Const(IrConst::F64(f)) = arg { - let bits = f.to_bits(); - let lo = (bits & 0xFFFF_FFFF) as u32; - let hi = (bits >> 32) as u32; - emit!(self.state, " movl ${}, {}(%esp)", lo as i32, stack_offset); - emit!(self.state, " movl ${}, {}(%esp)", hi as i32, stack_offset + 4); - } else { - self.operand_to_eax(arg); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - emit!(self.state, " movl $0, {}(%esp)", stack_offset + 4); - } - } else { - // I64/U64 constant - self.operand_to_eax(arg); - emit!(self.state, " movl %eax, {}(%esp)", stack_offset); - if let Operand::Const(IrConst::I64(v)) = arg { - let hi = ((*v as u64) >> 32) as i32; - emit!(self.state, " movl ${}, {}(%esp)", hi, stack_offset + 4); - } else { - emit!(self.state, 
" movl $0, {}(%esp)", stack_offset + 4); - } - } - } -} - - -// ─── ArchCodegen trait implementation ──────────────────────────────────────── - -impl ArchCodegen for I686Codegen { - fn state(&mut self) -> &mut CodegenState { &mut self.state } - fn state_ref(&self) -> &CodegenState { &self.state } - - fn ptr_directive(&self) -> PtrDirective { PtrDirective::Long } - - fn get_phys_reg_for_value(&self, val_id: u32) -> Option { - self.reg_assignments.get(&val_id).copied() - } - - fn emit_reg_to_reg_move(&mut self, src: PhysReg, dest: PhysReg) { - let src_name = phys_reg_name(src); - let dest_name = phys_reg_name(dest); - emit!(self.state, " movl %{}, %{}", src_name, dest_name); - } - - fn emit_acc_to_phys_reg(&mut self, dest: PhysReg) { - let dest_name = phys_reg_name(dest); - emit!(self.state, " movl %eax, %{}", dest_name); - } - - // ---- Standard trait methods (kept inline - arch-specific) ---- - fn emit_load_operand(&mut self, op: &Operand) { self.operand_to_eax(op); } - fn emit_store_result(&mut self, dest: &Value) { self.store_eax_to(dest); } - fn emit_save_acc(&mut self) { self.state.emit(" movl %eax, %edx"); } - fn emit_add_secondary_to_acc(&mut self) { self.state.emit(" addl %ecx, %eax"); } - fn emit_acc_to_secondary(&mut self) { self.state.emit(" movl %eax, %ecx"); } - fn emit_memcpy_store_dest_from_acc(&mut self) { self.state.emit(" movl %eax, %edi"); } - fn emit_memcpy_store_src_from_acc(&mut self) { self.state.emit(" movl %eax, %esi"); } - fn current_return_type(&self) -> IrType { self.current_return_type } - fn emit_gep_add_const_to_acc(&mut self, offset: i64) { - if offset != 0 { - emit!(self.state, " addl ${}, %eax", offset as i32); - } - } - - /// Override emit_memcpy for i686: uses rep movsb with esi/edi. - fn emit_memcpy(&mut self, dest: &Value, src: &Value, size: usize) { - use crate::backend::state::SlotAddr; - // Always save esi and edi around rep movsb. 
- // These are callee-saved registers in the System V i386 ABI, so we must - // preserve them even if the register allocator didn't assign any values - // to them in this function. A caller may be relying on their preservation - // across a call to this function. - self.state.emit(" pushl %esi"); - self.esp_adjust += 4; - self.state.emit(" pushl %edi"); - self.esp_adjust += 4; - - // Load dest address into edi - if let Some(addr) = self.state.resolve_slot_addr(dest.0) { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr_to_acc(slot, id); - self.state.emit(" movl %eax, %edi"); - } - SlotAddr::Direct(slot) => self.emit_memcpy_load_dest_addr(slot, true, dest.0), - SlotAddr::Indirect(slot) => self.emit_memcpy_load_dest_addr(slot, false, dest.0), - } - } - // Load src address into esi - if let Some(addr) = self.state.resolve_slot_addr(src.0) { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr_to_acc(slot, id); - self.state.emit(" movl %eax, %esi"); - } - SlotAddr::Direct(slot) => self.emit_memcpy_load_src_addr(slot, true, src.0), - SlotAddr::Indirect(slot) => self.emit_memcpy_load_src_addr(slot, false, src.0), - } - } - // Perform the copy - self.emit_memcpy_impl(size); - - // Restore edi and esi (reverse order of push) - self.state.emit(" popl %edi"); - self.esp_adjust -= 4; - self.state.emit(" popl %esi"); - self.esp_adjust -= 4; - } - - /// Override emit_binop to route I64/U64 through register-pair (eax:edx) arithmetic. 
- fn emit_binop(&mut self, dest: &Value, op: IrBinOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - if matches!(ty, IrType::I64 | IrType::U64) { - self.emit_i128_binop(dest, op, lhs, rhs); - self.state.reg_cache.invalidate_all(); - return; - } - if crate::backend::generation::is_i128_type(ty) { - self.emit_i128_binop(dest, op, lhs, rhs); - return; - } - if ty.is_float() { - let float_op = crate::backend::cast::classify_float_binop(op) - .unwrap_or_else(|| panic!("unsupported float binop: {:?} on type {:?}", op, ty)); - self.emit_float_binop(dest, float_op, lhs, rhs, ty); - return; - } - self.emit_int_binop(dest, op, lhs, rhs, ty); - } - - /// Override emit_cmp to route F64 and I64/U64 comparisons correctly on i686. - fn emit_cmp(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - if ty == IrType::F128 { - self.emit_f128_cmp(dest, op, lhs, rhs); - return; - } - if ty == IrType::F64 || ty == IrType::F32 { - self.emit_float_cmp(dest, op, lhs, rhs, ty); - return; - } - if matches!(ty, IrType::I64 | IrType::U64) || crate::backend::generation::is_i128_type(ty) { - self.emit_i128_cmp(dest, op, lhs, rhs); - return; - } - self.emit_int_cmp(dest, op, lhs, rhs, ty); - } - - fn emit_unaryop(&mut self, dest: &Value, op: IrUnaryOp, src: &Operand, ty: IrType) { - if op == IrUnaryOp::IsConstant { - self.emit_load_operand(&Operand::Const(IrConst::I32(0))); - self.emit_store_result(dest); - return; - } - if ty == IrType::F128 && matches!(op, IrUnaryOp::Neg) { - self.emit_f128_neg(dest, src); - return; - } - if ty == IrType::F64 && op == IrUnaryOp::Neg { - self.emit_f64_load_to_x87(src); - self.state.emit(" fchs"); - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - return; - } - if matches!(ty, IrType::I64 | IrType::U64) || crate::backend::generation::is_i128_type(ty) { - self.emit_load_acc_pair(src); - match op { - IrUnaryOp::Neg => self.emit_i128_neg(), - IrUnaryOp::Not => self.emit_i128_not(), - IrUnaryOp::Clz => 
self.emit_i64_clz(), - IrUnaryOp::Ctz => self.emit_i64_ctz(), - IrUnaryOp::Popcount => self.emit_i64_popcount(), - IrUnaryOp::Bswap => self.emit_i64_bswap(), - IrUnaryOp::IsConstant => unreachable!("handled above"), - } - self.emit_store_acc_pair(dest); - self.state.reg_cache.invalidate_all(); - return; - } - self.operand_to_eax(src); - match op { - IrUnaryOp::Neg => { - if ty.is_float() { self.emit_float_neg(ty); } else { self.emit_int_neg(ty); } - } - IrUnaryOp::Not => self.emit_int_not(ty), - IrUnaryOp::Clz => self.emit_int_clz(ty), - IrUnaryOp::Ctz => self.emit_int_ctz(ty), - IrUnaryOp::Popcount => self.emit_int_popcount(ty), - IrUnaryOp::Bswap => self.emit_int_bswap(ty), - IrUnaryOp::IsConstant => unreachable!("handled above"), - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - /// Override emit_call to handle fastcall calling convention. - fn emit_call(&mut self, args: &[Operand], arg_types: &[IrType], direct_name: Option<&str>, - func_ptr: Option<&Operand>, dest: Option, return_type: IrType, - is_variadic: bool, _num_fixed_args: usize, struct_arg_sizes: &[Option], - struct_arg_aligns: &[Option], - struct_arg_classes: &[Vec], - struct_arg_riscv_float_classes: &[Option], - is_sret: bool, - is_fastcall: bool, - ret_eightbyte_classes: &[crate::common::types::EightbyteClass]) { - if is_fastcall { - self.emit_fastcall(args, arg_types, direct_name, func_ptr, dest, return_type); - return; - } - use crate::backend::call_abi::*; - let config = self.call_abi_config(); - let arg_classes_vec = classify_call_args(args, arg_types, struct_arg_sizes, struct_arg_aligns, struct_arg_classes, struct_arg_riscv_float_classes, is_variadic, &config); - let indirect = func_ptr.is_some() && direct_name.is_none(); - if indirect { - self.emit_call_spill_fptr(func_ptr.expect("indirect call requires func_ptr")); - } - let stack_arg_space = self.emit_call_compute_stack_space(&arg_classes_vec, arg_types); - let f128_temp_space = 
self.emit_call_f128_pre_convert(args, &arg_classes_vec, arg_types, stack_arg_space); - self.state().reg_cache.invalidate_acc(); - let total_sp_adjust = self.emit_call_stack_args(args, &arg_classes_vec, arg_types, stack_arg_space, - if indirect { self.emit_call_fptr_spill_size() } else { 0 }, - f128_temp_space); - self.state().reg_cache.invalidate_acc(); - self.emit_call_reg_args(args, &arg_classes_vec, arg_types, total_sp_adjust, f128_temp_space, stack_arg_space, &[]); - self.emit_call_instruction(direct_name, func_ptr, indirect, stack_arg_space); - let callee_pops = self.callee_pops_bytes_for_sret(is_sret); - // Account for bytes the callee pops via `ret $N` (sret pointer on i686). - if callee_pops > 0 { - self.esp_adjust -= callee_pops as i64; - } - let effective_stack_cleanup = stack_arg_space.saturating_sub(callee_pops); - self.emit_call_cleanup(effective_stack_cleanup, f128_temp_space, indirect); - if let Some(dest) = dest { - self.set_call_ret_eightbyte_classes(ret_eightbyte_classes); - self.emit_call_store_result(&dest, return_type); - } - } - - fn callee_pops_bytes_for_sret(&self, is_sret: bool) -> usize { - if is_sret { 4 } else { 0 } - } - - // ---- Control flow ---- - - fn jump_mnemonic(&self) -> &'static str { "jmp" } - fn trap_instruction(&self) -> &'static str { "ud2" } - - fn emit_branch_nonzero(&mut self, label: &str) { - self.state.emit(" testl %eax, %eax"); - emit!(self.state, " jne {}", label); - } - - /// On i686, 64-bit conditions need both 32-bit halves tested. 
- fn emit_cond_branch_blocks(&mut self, cond: &Operand, true_block: BlockId, false_block: BlockId) { - match cond { - Operand::Const(IrConst::I64(v)) => { - if *v != 0 { - self.emit_branch_to_block(true_block); - } else { - self.emit_branch_to_block(false_block); - } - return; - } - Operand::Const(IrConst::F64(fval)) => { - if *fval != 0.0 { - self.emit_branch_to_block(true_block); - } else { - self.emit_branch_to_block(false_block); - } - return; - } - _ => {} - } - if let Operand::Value(v) = cond { - if self.state.is_wide_value(v.0) { - self.emit_wide_value_to_eax_ored(v.0); - self.state.reg_cache.invalidate_acc(); - let true_label = true_block.as_label(); - self.emit_branch_nonzero(&true_label); - self.emit_branch_to_block(false_block); - return; - } - } - self.operand_to_eax(cond); - let true_label = true_block.as_label(); - self.emit_branch_nonzero(&true_label); - self.emit_branch_to_block(false_block); - } - - fn emit_jump_indirect(&mut self) { - self.state.emit(" jmp *%eax"); - } - - /// Override emit_switch to handle 64-bit switch values on i686. 
- fn emit_switch(&mut self, val: &Operand, cases: &[(i64, BlockId)], default: &BlockId, ty: IrType) { - let is_wide = match val { - Operand::Value(v) => self.state.is_wide_value(v.0), - Operand::Const(IrConst::I64(_)) => true, - _ => false, - }; - - if !is_wide { - use crate::backend::traits::{MIN_JUMP_TABLE_CASES, MAX_JUMP_TABLE_RANGE, MIN_JUMP_TABLE_DENSITY_PERCENT}; - let use_jump_table = if self.state.no_jump_tables { - false - } else if cases.len() >= MIN_JUMP_TABLE_CASES { - let min_val = cases.iter().map(|&(v, _)| v).min().expect("switch must have cases"); - let max_val = cases.iter().map(|&(v, _)| v).max().expect("switch must have cases"); - let range = (max_val - min_val + 1) as usize; - range <= MAX_JUMP_TABLE_RANGE && cases.len() * 100 / range >= MIN_JUMP_TABLE_DENSITY_PERCENT - } else { - false - }; - if use_jump_table { - self.emit_switch_jump_table(val, cases, default, ty); - } else { - self.emit_load_operand(val); - for &(case_val, target) in cases { - let label = target.as_label(); - self.emit_switch_case_branch(case_val, &label, ty); - } - self.emit_branch_to_block(*default); - } - return; - } - - // 64-bit switch: compare both 32-bit halves - self.emit_load_acc_pair(val); - for &(case_val, target) in cases { - let case_low = case_val as i32; - let case_high = (case_val >> 32) as i32; - let label = target.as_label(); - let skip_label = format!(".Lswskip_{}", self.state.next_label_id()); - - if case_low == 0 { - self.state.emit(" testl %eax, %eax"); - } else { - emit!(self.state, " cmpl ${}, %eax", case_low); - } - emit!(self.state, " jne {}", skip_label); - - if case_high == 0 { - self.state.emit(" testl %edx, %edx"); - } else { - emit!(self.state, " cmpl ${}, %edx", case_high); - } - emit!(self.state, " je {}", label); - - emit!(self.state, "{}:", skip_label); - } - self.emit_branch_to_block(*default); - self.state.reg_cache.invalidate_all(); - } - - fn emit_switch_case_branch(&mut self, case_val: i64, label: &str, _ty: IrType) { - let val = 
case_val as i32; - if val == 0 { - self.state.emit(" testl %eax, %eax"); - } else { - emit!(self.state, " cmpl ${}, %eax", val); - } - emit!(self.state, " je {}", label); - } - - fn emit_switch_jump_table(&mut self, val: &Operand, cases: &[(i64, BlockId)], default: &BlockId, _ty: IrType) { - use crate::backend::traits::build_jump_table; - let (table, min_val, range) = build_jump_table(cases, default); - let table_label = self.state.fresh_label("jt"); - let default_label = default.as_label(); - - self.operand_to_eax(val); - if min_val != 0 { - emit!(self.state, " subl ${}, %eax", min_val as i32); - } - emit!(self.state, " cmpl ${}, %eax", range); - emit!(self.state, " jae {}", default_label); - - if self.state.pic_mode { - emit!(self.state, " leal {}@GOTOFF(%ebx), %ecx", table_label); - self.state.emit(" movl (%ecx, %eax, 4), %eax"); - self.state.emit(" addl %ecx, %eax"); - self.state.emit(" jmp *%eax"); - } else { - emit!(self.state, " jmp *{}(, %eax, 4)", table_label); - } - - self.state.emit(".section .rodata"); - self.state.emit(".align 4"); - self.state.emit_fmt(format_args!("{}:", table_label)); - for target in &table { - let target_label = target.as_label(); - if self.state.pic_mode { - self.state.emit_fmt(format_args!(" .long {} - {}", target_label, table_label)); - } else { - self.state.emit_fmt(format_args!(" .long {}", target_label)); - } - } - let sect = self.state.current_text_section.clone(); - self.state.emit_fmt(format_args!(".section {},\"ax\",@progbits", sect)); - self.state.reg_cache.invalidate_all(); - } - - fn emit_float_binop(&mut self, dest: &Value, op: crate::backend::cast::FloatOp, lhs: &Operand, rhs: &Operand, ty: IrType) { - if ty == IrType::F64 { - let mnemonic = self.emit_float_binop_mnemonic(op); - self.emit_f64_load_to_x87(lhs); - self.emit_f64_load_to_x87(rhs); - emit!(self.state, " f{}p %st, %st(1)", mnemonic); - self.emit_f64_store_from_x87(dest); - self.state.reg_cache.invalidate_acc(); - return; - } - if ty == IrType::F128 { - let 
mnemonic = self.emit_float_binop_mnemonic(op); - self.emit_f128_load_to_x87(lhs); - self.emit_f128_load_to_x87(rhs); - emit!(self.state, " f{}p %st, %st(1)", mnemonic); - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - self.state.reg_cache.invalidate_acc(); - return; - } - let mnemonic = match op { - crate::backend::cast::FloatOp::Add => "add", - crate::backend::cast::FloatOp::Sub => "sub", - crate::backend::cast::FloatOp::Mul => "mul", - crate::backend::cast::FloatOp::Div => "div", - }; - self.emit_load_operand(lhs); - self.emit_acc_to_secondary(); - self.emit_load_operand(rhs); - self.emit_float_binop_impl(mnemonic, ty); - self.emit_store_result(dest); - } - - fn emit_float_binop_mnemonic(&self, op: crate::backend::cast::FloatOp) -> &'static str { - match op { - crate::backend::cast::FloatOp::Add => "add", - crate::backend::cast::FloatOp::Sub => "subr", - crate::backend::cast::FloatOp::Mul => "mul", - crate::backend::cast::FloatOp::Div => "divr", - } - } - - fn emit_float_binop_impl(&mut self, mnemonic: &str, ty: IrType) { - if ty == IrType::F32 { - self.state.emit(" movd %ecx, %xmm0"); - self.state.emit(" movd %eax, %xmm1"); - emit!(self.state, " {}ss %xmm1, %xmm0", mnemonic); - self.state.emit(" movd %xmm0, %eax"); - } - self.state.reg_cache.invalidate_acc(); - } - - /// emit_copy_value: handles F128, wide (F64/I64/U64), and 32-bit copies. 
- fn emit_copy_value(&mut self, dest: &Value, src: &Operand) { - if let Operand::Const(IrConst::LongDouble(..)) = src { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - self.emit_f128_load_to_x87(src); - let sr = self.slot_ref(dest_slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - return; - } - } - - if let Operand::Value(v) = src { - if self.state.f128_direct_slots.contains(&v.0) { - if let (Some(src_slot), Some(dest_slot)) = (self.state.get_slot(v.0), self.state.get_slot(dest.0)) { - let ssr = self.slot_ref(src_slot); - let dsr = self.slot_ref(dest_slot); - emit!(self.state, " fldt {}", ssr); - emit!(self.state, " fstpt {}", dsr); - self.state.f128_direct_slots.insert(dest.0); - return; - } - } - if let Some(&alloca_ty) = self.state.alloca_types.get(&v.0) { - if alloca_ty == IrType::F128 { - if let (Some(src_slot), Some(dest_slot)) = (self.state.get_slot(v.0), self.state.get_slot(dest.0)) { - let ssr = self.slot_ref(src_slot); - let dsr = self.slot_ref(dest_slot); - emit!(self.state, " fldt {}", ssr); - emit!(self.state, " fstpt {}", dsr); - self.state.f128_direct_slots.insert(dest.0); - return; - } - } - } - } - - let is_wide = match src { - Operand::Value(v) => self.state.is_wide_value(v.0), - Operand::Const(IrConst::F64(_)) => true, - Operand::Const(IrConst::I64(_)) => true, - _ => false, - }; - if is_wide { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - match src { - Operand::Value(v) => { - if let Some(src_slot) = self.state.get_slot(v.0) { - let ssr0 = self.slot_ref(src_slot); - let dsr0 = self.slot_ref(dest_slot); - let ssr4 = self.slot_ref_offset(src_slot, 4); - let dsr4 = self.slot_ref_offset(dest_slot, 4); - emit!(self.state, " movl {}, %eax", ssr0); - emit!(self.state, " movl %eax, {}", dsr0); - emit!(self.state, " movl {}, %eax", ssr4); - emit!(self.state, " movl %eax, {}", dsr4); - } - } - Operand::Const(IrConst::F64(val)) => { - let bits = val.to_bits(); - let lo = bits as u32; - let 
hi = (bits >> 32) as u32; - let dsr0 = self.slot_ref(dest_slot); - let dsr4 = self.slot_ref_offset(dest_slot, 4); - emit!(self.state, " movl ${}, {}", lo as i32, dsr0); - emit!(self.state, " movl ${}, {}", hi as i32, dsr4); - } - Operand::Const(IrConst::I64(val)) => { - let lo = *val as u32; - let hi = (*val >> 32) as u32; - let dsr0 = self.slot_ref(dest_slot); - let dsr4 = self.slot_ref_offset(dest_slot, 4); - emit!(self.state, " movl ${}, {}", lo as i32, dsr0); - emit!(self.state, " movl ${}, {}", hi as i32, dsr4); - } - _ => unreachable!("unexpected wide constant type in i686 emit_copy"), - } - self.state.reg_cache.invalidate_all(); - return; - } - } - - self.emit_load_operand(src); - self.emit_store_result(dest); - } - - fn emit_inline_asm(&mut self, template: &str, outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], clobbers: &[String], - operand_types: &[IrType], goto_labels: &[(String, BlockId)], - input_symbols: &[Option]) { - crate::backend::inline_asm::emit_inline_asm_common(self, template, outputs, inputs, clobbers, operand_types, goto_labels, input_symbols); - } - - fn emit_intrinsic(&mut self, dest: &Option, op: &IntrinsicOp, dest_ptr: &Option, args: &[Operand]) { - self.emit_intrinsic_impl(dest, op, dest_ptr, args); - } - - // ---- Delegated methods (via macro) ---- - - delegate_to_impl! 
{ - // prologue - fn calculate_stack_space(&mut self, func: &IrFunction) -> i64 => calculate_stack_space_impl; - fn aligned_frame_size(&self, raw_space: i64) -> i64 => aligned_frame_size_impl; - fn emit_prologue(&mut self, func: &IrFunction, frame_size: i64) => emit_prologue_impl; - fn emit_epilogue(&mut self, frame_size: i64) => emit_epilogue_impl; - fn emit_store_params(&mut self, func: &IrFunction) => emit_store_params_impl; - fn emit_param_ref(&mut self, dest: &Value, param_idx: usize, ty: IrType) => emit_param_ref_impl; - fn emit_epilogue_and_ret(&mut self, frame_size: i64) => emit_epilogue_and_ret_impl; - fn store_instr_for_type(&self, ty: IrType) -> &'static str => store_instr_for_type_impl; - fn load_instr_for_type(&self, ty: IrType) -> &'static str => load_instr_for_type_impl; - // memory - fn emit_store(&mut self, val: &Operand, ptr: &Value, ty: IrType) => emit_store_impl; - fn emit_load(&mut self, dest: &Value, ptr: &Value, ty: IrType) => emit_load_impl; - fn emit_store_with_const_offset(&mut self, val: &Operand, base: &Value, offset: i64, ty: IrType) => emit_store_with_const_offset_impl; - fn emit_load_with_const_offset(&mut self, dest: &Value, base: &Value, offset: i64, ty: IrType) => emit_load_with_const_offset_impl; - fn emit_typed_store_to_slot(&mut self, instr: &'static str, ty: IrType, slot: StackSlot) => emit_typed_store_to_slot_impl; - fn emit_typed_load_from_slot(&mut self, instr: &'static str, slot: StackSlot) => emit_typed_load_from_slot_impl; - fn emit_load_ptr_from_slot(&mut self, slot: StackSlot, val_id: u32) => emit_load_ptr_from_slot_impl; - fn emit_typed_store_indirect(&mut self, instr: &'static str, ty: IrType) => emit_typed_store_indirect_impl; - fn emit_typed_load_indirect(&mut self, instr: &'static str) => emit_typed_load_indirect_impl; - fn emit_add_offset_to_addr_reg(&mut self, offset: i64) => emit_add_offset_to_addr_reg_impl; - fn emit_slot_addr_to_secondary(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) => 
emit_slot_addr_to_secondary_impl; - fn emit_gep_direct_const(&mut self, slot: StackSlot, offset: i64) => emit_gep_direct_const_impl; - fn emit_gep_indirect_const(&mut self, slot: StackSlot, offset: i64, val_id: u32) => emit_gep_indirect_const_impl; - fn emit_add_imm_to_acc(&mut self, imm: i64) => emit_add_imm_to_acc_impl; - fn emit_round_up_acc_to_16(&mut self) => emit_round_up_acc_to_16_impl; - fn emit_sub_sp_by_acc(&mut self) => emit_sub_sp_by_acc_impl; - fn emit_mov_sp_to_acc(&mut self) => emit_mov_sp_to_acc_impl; - fn emit_mov_acc_to_sp(&mut self) => emit_mov_acc_to_sp_impl; - fn emit_align_acc(&mut self, align: usize) => emit_align_acc_impl; - fn emit_alloca_aligned_addr(&mut self, slot: StackSlot, val_id: u32) => emit_alloca_aligned_addr_impl; - fn emit_alloca_aligned_addr_to_acc(&mut self, slot: StackSlot, val_id: u32) => emit_alloca_aligned_addr_to_acc_impl; - fn emit_memcpy_load_dest_addr(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) => emit_memcpy_load_dest_addr_impl; - fn emit_memcpy_load_src_addr(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) => emit_memcpy_load_src_addr_impl; - fn emit_memcpy_impl(&mut self, size: usize) => emit_memcpy_impl_impl; - // alu - fn emit_float_neg(&mut self, ty: IrType) => emit_float_neg_impl; - fn emit_int_neg(&mut self, ty: IrType) => emit_int_neg_impl; - fn emit_int_not(&mut self, ty: IrType) => emit_int_not_impl; - fn emit_int_clz(&mut self, ty: IrType) => emit_int_clz_impl; - fn emit_int_ctz(&mut self, ty: IrType) => emit_int_ctz_impl; - fn emit_int_bswap(&mut self, ty: IrType) => emit_int_bswap_impl; - fn emit_int_popcount(&mut self, ty: IrType) => emit_int_popcount_impl; - fn emit_int_binop(&mut self, dest: &Value, op: IrBinOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_int_binop_impl; - // comparison - fn emit_float_cmp(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_float_cmp_impl; - fn emit_f128_cmp(&mut self, dest: &Value, op: IrCmpOp, lhs: 
&Operand, rhs: &Operand) => emit_f128_cmp_impl; - fn emit_int_cmp(&mut self, dest: &Value, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType) => emit_int_cmp_impl; - fn emit_fused_cmp_branch(&mut self, op: IrCmpOp, lhs: &Operand, rhs: &Operand, ty: IrType, true_label: &str, false_label: &str) => emit_fused_cmp_branch_impl; - fn emit_select(&mut self, dest: &Value, cond: &Operand, true_val: &Operand, false_val: &Operand, ty: IrType) => emit_select_impl; - fn emit_f128_neg(&mut self, dest: &Value, src: &Operand) => emit_f128_neg_impl; - // calls - fn call_abi_config(&self) -> call_abi::CallAbiConfig => call_abi_config_impl; - fn emit_call_f128_pre_convert(&mut self, args: &[Operand], arg_classes: &[call_abi::CallArgClass], arg_types: &[IrType], stack_arg_space: usize) -> usize => emit_call_f128_pre_convert_impl; - fn emit_call_compute_stack_space(&self, arg_classes: &[call_abi::CallArgClass], arg_types: &[IrType]) -> usize => emit_call_compute_stack_space_impl; - fn emit_call_stack_args(&mut self, args: &[Operand], arg_classes: &[call_abi::CallArgClass], arg_types: &[IrType], stack_arg_space: usize, fptr_spill: usize, f128_temp_space: usize) -> i64 => emit_call_stack_args_impl; - fn emit_call_reg_args(&mut self, args: &[Operand], arg_classes: &[call_abi::CallArgClass], arg_types: &[IrType], total_sp_adjust: i64, f128_temp_space: usize, stack_arg_space: usize, struct_arg_riscv_float_classes: &[Option]) => emit_call_reg_args_impl; - fn emit_call_instruction(&mut self, direct_name: Option<&str>, func_ptr: Option<&Operand>, indirect: bool, stack_arg_space: usize) => emit_call_instruction_impl; - fn emit_call_cleanup(&mut self, stack_arg_space: usize, f128_temp_space: usize, indirect: bool) => emit_call_cleanup_impl; - fn emit_call_store_result(&mut self, dest: &Value, return_type: IrType) => emit_call_store_result_impl; - fn emit_call_store_i128_result(&mut self, dest: &Value) => emit_call_store_i128_result_impl; - fn emit_call_store_f128_result(&mut self, dest: 
&Value) => emit_call_store_f128_result_impl; - fn emit_call_move_f32_to_acc(&mut self) => emit_call_move_f32_to_acc_impl; - fn emit_call_move_f64_to_acc(&mut self) => emit_call_move_f64_to_acc_impl; - // globals - fn emit_global_addr(&mut self, dest: &Value, name: &str) => emit_global_addr_impl; - fn emit_label_addr(&mut self, dest: &Value, label: &str) => emit_label_addr_impl; - fn emit_tls_global_addr(&mut self, dest: &Value, name: &str) => emit_tls_global_addr_impl; - // cast - fn emit_cast(&mut self, dest: &Value, src: &Operand, from_ty: IrType, to_ty: IrType) => emit_cast_impl; - fn emit_cast_instrs(&mut self, from_ty: IrType, to_ty: IrType) => emit_cast_instrs_impl; - // variadic - fn emit_va_arg(&mut self, dest: &Value, va_list_ptr: &Value, result_ty: IrType) => emit_va_arg_impl; - fn emit_va_start(&mut self, va_list_ptr: &Value) => emit_va_start_impl; - fn emit_va_copy(&mut self, dest_ptr: &Value, src_ptr: &Value) => emit_va_copy_impl; - fn emit_va_arg_struct(&mut self, dest_ptr: &Value, va_list_ptr: &Value, size: usize) => emit_va_arg_struct_impl; - // returns - fn emit_return(&mut self, val: Option<&Operand>, frame_size: i64) => emit_return_impl; - fn emit_return_i128_to_regs(&mut self) => emit_return_i128_to_regs_impl; - fn emit_return_f128_to_reg(&mut self) => emit_return_f128_to_reg_impl; - fn emit_return_f32_to_reg(&mut self) => emit_return_f32_to_reg_impl; - fn emit_return_f64_to_reg(&mut self) => emit_return_f64_to_reg_impl; - fn emit_return_int_to_reg(&mut self) => emit_return_int_to_reg_impl; - fn emit_get_return_f64_second(&mut self, dest: &Value) => emit_get_return_f64_second_impl; - fn emit_set_return_f64_second(&mut self, src: &Operand) => emit_set_return_f64_second_impl; - fn emit_get_return_f32_second(&mut self, dest: &Value) => emit_get_return_f32_second_impl; - fn emit_set_return_f32_second(&mut self, src: &Operand) => emit_set_return_f32_second_impl; - fn emit_get_return_f128_second(&mut self, dest: &Value) => 
emit_get_return_f128_second_impl; - fn emit_set_return_f128_second(&mut self, src: &Operand) => emit_set_return_f128_second_impl; - // atomics - fn emit_atomic_rmw(&mut self, dest: &Value, op: AtomicRmwOp, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) => emit_atomic_rmw_impl; - fn emit_atomic_cmpxchg(&mut self, dest: &Value, ptr: &Operand, expected: &Operand, desired: &Operand, ty: IrType, success_ordering: AtomicOrdering, failure_ordering: AtomicOrdering, returns_bool: bool) => emit_atomic_cmpxchg_impl; - fn emit_atomic_load(&mut self, dest: &Value, ptr: &Operand, ty: IrType, ordering: AtomicOrdering) => emit_atomic_load_impl; - fn emit_atomic_store(&mut self, ptr: &Operand, val: &Operand, ty: IrType, ordering: AtomicOrdering) => emit_atomic_store_impl; - fn emit_fence(&mut self, ordering: AtomicOrdering) => emit_fence_impl; - // i128 ops - fn emit_sign_extend_acc_high(&mut self) => emit_sign_extend_acc_high_impl; - fn emit_zero_acc_high(&mut self) => emit_zero_acc_high_impl; - fn emit_load_acc_pair(&mut self, op: &Operand) => emit_load_acc_pair_impl; - fn emit_store_acc_pair(&mut self, dest: &Value) => emit_store_acc_pair_impl; - fn emit_store_pair_to_slot(&mut self, slot: StackSlot) => emit_store_pair_to_slot_impl; - fn emit_load_pair_from_slot(&mut self, slot: StackSlot) => emit_load_pair_from_slot_impl; - fn emit_save_acc_pair(&mut self) => emit_save_acc_pair_impl; - fn emit_store_pair_indirect(&mut self) => emit_store_pair_indirect_impl; - fn emit_load_pair_indirect(&mut self) => emit_load_pair_indirect_impl; - fn emit_i128_neg(&mut self) => emit_i128_neg_impl; - fn emit_i128_not(&mut self) => emit_i128_not_impl; - fn emit_i128_to_float_call(&mut self, src: &Operand, from_signed: bool, to_ty: IrType) => emit_i128_to_float_call_impl; - fn emit_float_to_i128_call(&mut self, src: &Operand, to_signed: bool, from_ty: IrType) => emit_float_to_i128_call_impl; - fn emit_i128_prep_binop(&mut self, lhs: &Operand, rhs: &Operand) => 
emit_i128_prep_binop_impl; - fn emit_i128_prep_shift_lhs(&mut self, lhs: &Operand) => emit_i128_prep_shift_lhs_impl; - fn emit_i128_add(&mut self) => emit_i128_add_impl; - fn emit_i128_sub(&mut self) => emit_i128_sub_impl; - fn emit_i128_mul(&mut self) => emit_i128_mul_impl; - fn emit_i128_and(&mut self) => emit_i128_and_impl; - fn emit_i128_or(&mut self) => emit_i128_or_impl; - fn emit_i128_xor(&mut self) => emit_i128_xor_impl; - fn emit_i128_shl(&mut self) => emit_i128_shl_impl; - fn emit_i128_lshr(&mut self) => emit_i128_lshr_impl; - fn emit_i128_ashr(&mut self) => emit_i128_ashr_impl; - fn emit_i128_shl_const(&mut self, amount: u32) => emit_i128_shl_const_impl; - fn emit_i128_lshr_const(&mut self, amount: u32) => emit_i128_lshr_const_impl; - fn emit_i128_ashr_const(&mut self, amount: u32) => emit_i128_ashr_const_impl; - fn emit_i128_divrem_call(&mut self, func_name: &str, lhs: &Operand, rhs: &Operand) => emit_i128_divrem_call_impl; - fn emit_i128_store_result(&mut self, dest: &Value) => emit_i128_store_result_impl; - fn emit_i128_cmp_eq(&mut self, is_ne: bool) => emit_i128_cmp_eq_impl; - fn emit_i128_cmp_ordered(&mut self, op: IrCmpOp) => emit_i128_cmp_ordered_impl; - fn emit_i128_cmp_store_result(&mut self, dest: &Value) => emit_i128_cmp_store_result_impl; - } - - // ---- Segment overrides (x86-specific) ---- - - fn emit_seg_load(&mut self, dest: &Value, ptr: &Value, ty: IrType, seg: AddressSpace) { - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movl %eax, %ecx"); - let seg_prefix = match seg { - AddressSpace::SegGs => "%gs:", - AddressSpace::SegFs => "%fs:", - AddressSpace::Default => unreachable!("segment-prefixed op called with default address space"), - }; - let load_instr = self.mov_load_for_type(ty); - emit!(self.state, " {} {}(%ecx), %eax", load_instr, seg_prefix); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - fn emit_seg_store(&mut self, val: &Operand, ptr: &Value, ty: IrType, seg: AddressSpace) { - 
self.operand_to_eax(val); - self.state.emit(" movl %eax, %edx"); - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movl %eax, %ecx"); - let seg_prefix = match seg { - AddressSpace::SegGs => "%gs:", - AddressSpace::SegFs => "%fs:", - AddressSpace::Default => unreachable!("segment-prefixed op called with default address space"), - }; - let store_instr = self.mov_store_for_type(ty); - let reg = match ty { - IrType::I8 | IrType::U8 => "%dl", - IrType::I16 | IrType::U16 => "%dx", - _ => "%edx", - }; - emit!(self.state, " {} {}, {}(%ecx)", store_instr, reg, seg_prefix); - } - - fn emit_seg_load_symbol(&mut self, dest: &Value, sym: &str, ty: IrType, seg: AddressSpace) { - let seg_prefix = match seg { - AddressSpace::SegGs => "%gs:", - AddressSpace::SegFs => "%fs:", - AddressSpace::Default => unreachable!("segment-prefixed op called with default address space"), - }; - let load_instr = self.mov_load_for_type(ty); - // i686 uses absolute addressing (no RIP-relative) - emit!(self.state, " {} {}{}, %eax", load_instr, seg_prefix, sym); - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - fn emit_seg_store_symbol(&mut self, val: &Operand, sym: &str, ty: IrType, seg: AddressSpace) { - self.operand_to_eax(val); - let seg_prefix = match seg { - AddressSpace::SegGs => "%gs:", - AddressSpace::SegFs => "%fs:", - AddressSpace::Default => unreachable!("segment-prefixed op called with default address space"), - }; - let store_instr = self.mov_store_for_type(ty); - let reg = match ty { - IrType::I8 | IrType::U8 => "%al", - IrType::I16 | IrType::U16 => "%ax", - _ => "%eax", - }; - emit!(self.state, " {} {}, {}{}", store_instr, reg, seg_prefix, sym); - } - - fn emit_runtime_stubs(&mut self) { - if self.needs_pc_thunk_bx { - let s = &mut self.state; - s.emit(""); - s.emit(".section .text.__x86.get_pc_thunk.bx,\"axG\",@progbits,__x86.get_pc_thunk.bx,comdat"); - s.emit(".globl __x86.get_pc_thunk.bx"); - s.emit(".hidden __x86.get_pc_thunk.bx"); - 
s.emit(".type __x86.get_pc_thunk.bx, @function"); - s.emit("__x86.get_pc_thunk.bx:"); - s.emit(" movl (%esp), %ebx"); - s.emit(" ret"); - s.emit(".size __x86.get_pc_thunk.bx, .-__x86.get_pc_thunk.bx"); - s.emit(".text"); - } - - if !self.state.needs_divdi3_helpers { - return; - } - - // Switch to .text explicitly: the last function in the module may have been - // in a custom section (e.g. .init.text), and without this directive the - // helper stubs would inherit that section. The Linux kernel's modpost - // check rejects .text -> .init.text cross-references, so these must live - // in .text. - self.state.emit(".text"); - - self.emit_udivdi3_stub(); - self.emit_umoddi3_stub(); - self.emit_divdi3_stub(); - self.emit_moddi3_stub(); - } -} - -// ─── 64-bit division runtime stubs for i686 ────────────────────────────────── -// -// On 32-bit x86, 64-bit division/modulo requires runtime helpers normally -// provided by libgcc (__divdi3, __udivdi3, __moddi3, __umoddi3). Standalone -// builds (e.g. musl libc) that don't link libgcc need the compiler to provide -// these. We emit them as .weak symbols so that if libgcc IS linked, its -// versions take precedence. -// -// Calling convention (cdecl, stack-based): -// 4(%esp) = dividend low (A_lo) -// 8(%esp) = dividend high (A_hi) -// 12(%esp) = divisor low (B_lo) -// 16(%esp) = divisor high (B_hi) -// Return: edx:eax = 64-bit result - -impl I686Codegen { - /// Emit __udivdi3: unsigned 64-bit division, returns quotient in edx:eax. - /// Algorithm based on compiler-rt's i386/udivdi3.S (Stephen Canon, 2008). - /// Uses normalized-divisor estimation with remainder-based adjustment. 
- fn emit_udivdi3_stub(&mut self) { - let s = &mut self.state; - s.emit(""); - s.emit(".weak __udivdi3"); - s.emit(".type __udivdi3, @function"); - s.emit("__udivdi3:"); - // Stack: ret(0), A_lo(4), A_hi(8), B_lo(12), B_hi(16) - s.emit(" pushl %ebx"); - // Stack: ebx(0), ret(4), A_lo(8), A_hi(12), B_lo(16), B_hi(20) - s.emit(" movl 20(%esp), %ebx"); // B_hi - s.emit(" bsrl %ebx, %ecx"); // ecx = i = index of leading bit of B_hi - s.emit(" jz .Ludiv_b_hi_zero"); // B_hi == 0 -> special case - - // --- B_hi != 0: quotient fits in 32 bits --- - // Construct bhi = bits [1+i : 32+i] of B (top 32 bits of B, normalized). - // bhi = (B_lo >> (1+i)) | (B_hi << (31-i)) - s.emit(" movl 16(%esp), %eax"); // B_lo - s.emit(" shrl %cl, %eax"); // B_lo >> i - s.emit(" shrl %eax"); // B_lo >> (1+i) - s.emit(" notl %ecx"); // cl = 31-i (mod 32) - s.emit(" shll %cl, %ebx"); // B_hi << (31-i) - s.emit(" orl %eax, %ebx"); // ebx = bhi - s.emit(" movl 12(%esp), %edx"); // A_hi - s.emit(" movl 8(%esp), %eax"); // A_lo - s.emit(" cmpl %ebx, %edx"); // if A_hi >= bhi, need overflow path - s.emit(" jae .Ludiv_big_overflow"); - - // A_hi < bhi: divide edx:eax by bhi directly (no overflow) - s.emit(" divl %ebx"); // eax = qs - s.emit(" pushl %edi"); - // Stack: edi(0), ebx(4), ret(8), A_lo(12), A_hi(16), B_lo(20), B_hi(24) - s.emit(" notl %ecx"); // cl = i again - s.emit(" shrl %eax"); - s.emit(" shrl %cl, %eax"); // q = qs >> (1+i) - s.emit(" movl %eax, %edi"); // edi = q - // Verify: compute a - q*b, adjust if negative - s.emit(" mull 20(%esp)"); // edx:eax = q * B_lo - s.emit(" movl 12(%esp), %ebx"); - s.emit(" movl 16(%esp), %ecx"); // ecx:ebx = a - s.emit(" subl %eax, %ebx"); - s.emit(" sbbl %edx, %ecx"); // ecx:ebx = a - q*B_lo - s.emit(" movl 24(%esp), %eax"); // B_hi - s.emit(" imull %edi, %eax"); // q * B_hi (low 32 bits) - s.emit(" subl %eax, %ecx"); // ecx:ebx = a - q*b - s.emit(" sbbl $0, %edi"); // if remainder was negative, decrement q - s.emit(" xorl %edx, %edx"); - s.emit(" 
movl %edi, %eax"); - s.emit(" popl %edi"); - s.emit(" popl %ebx"); - s.emit(" ret"); - - // A_hi >= bhi: subtract bhi first to avoid divl overflow - s.emit(".Ludiv_big_overflow:"); - s.emit(" subl %ebx, %edx"); // edx = A_hi - bhi - s.emit(" divl %ebx"); // eax = qs (for quotient 1:qs) - s.emit(" pushl %edi"); - s.emit(" notl %ecx"); // cl = i - s.emit(" shrl %eax"); - s.emit(" orl $0x80000000, %eax"); // set high bit (the '1' prefix) - s.emit(" shrl %cl, %eax"); // q = (1:qs) >> (1+i) - s.emit(" movl %eax, %edi"); - s.emit(" mull 20(%esp)"); // q * B_lo - s.emit(" movl 12(%esp), %ebx"); - s.emit(" movl 16(%esp), %ecx"); - s.emit(" subl %eax, %ebx"); - s.emit(" sbbl %edx, %ecx"); - s.emit(" movl 24(%esp), %eax"); - s.emit(" imull %edi, %eax"); - s.emit(" subl %eax, %ecx"); - s.emit(" sbbl $0, %edi"); - s.emit(" xorl %edx, %edx"); - s.emit(" movl %edi, %eax"); - s.emit(" popl %edi"); - s.emit(" popl %ebx"); - s.emit(" ret"); - - // --- B_hi == 0: two-step divide --- - s.emit(".Ludiv_b_hi_zero:"); - s.emit(" movl 12(%esp), %eax"); // A_hi - s.emit(" movl 16(%esp), %ecx"); // B_lo - s.emit(" xorl %edx, %edx"); - s.emit(" divl %ecx"); // eax = Q_hi, edx = rem - s.emit(" movl %eax, %ebx"); // save Q_hi - s.emit(" movl 8(%esp), %eax"); // A_lo - s.emit(" divl %ecx"); // eax = Q_lo - s.emit(" movl %ebx, %edx"); // edx = Q_hi - s.emit(" popl %ebx"); - s.emit(" ret"); - s.emit(".size __udivdi3, .-__udivdi3"); - } - - /// Emit __umoddi3: unsigned 64-bit modulo, returns remainder in edx:eax. - /// Computes a % b = a - (a / b) * b, delegating division to __udivdi3. 
- fn emit_umoddi3_stub(&mut self) { - let s = &mut self.state; - s.emit(""); - s.emit(".weak __umoddi3"); - s.emit(".type __umoddi3, @function"); - s.emit("__umoddi3:"); - // Stack: ret(0), A_lo(4), A_hi(8), B_lo(12), B_hi(16) - s.emit(" pushl %ebx"); - s.emit(" pushl %esi"); - s.emit(" pushl %edi"); - s.emit(" pushl %ebp"); - // Stack: ebp(0), edi(4), esi(8), ebx(12), ret(16), A_lo(20), A_hi(24), B_lo(28), B_hi(32) - - // Call __udivdi3(A, B) to get quotient - s.emit(" pushl 32(%esp)"); // B_hi - s.emit(" pushl 32(%esp)"); // B_lo (28+4=32 after push) - s.emit(" pushl 32(%esp)"); // A_hi (24+8=32 after two pushes) - s.emit(" pushl 32(%esp)"); // A_lo (20+12=32 after three pushes) - s.emit(" call __udivdi3"); - s.emit(" addl $16, %esp"); - // edx:eax = quotient (Q_hi:Q_lo) - - // Compute q * B (64-bit), result in ecx:ebx - // q * B = Q_lo * B_lo + (Q_lo * B_hi + Q_hi * B_lo) << 32 - // We only need the low 64 bits. - s.emit(" movl %eax, %ebx"); // save Q_lo - s.emit(" movl %edx, %ecx"); // save Q_hi - s.emit(" imull 28(%esp), %ecx"); // ecx = Q_hi * B_lo (low 32) - s.emit(" movl 32(%esp), %ebp"); // B_hi - s.emit(" imull %ebx, %ebp"); // ebp = Q_lo * B_hi (low 32) - s.emit(" addl %ebp, %ecx"); // ecx = cross terms sum - s.emit(" movl %ebx, %eax"); // eax = Q_lo - s.emit(" mull 28(%esp)"); // edx:eax = Q_lo * B_lo - s.emit(" addl %ecx, %edx"); // edx:eax = q * B (low 64 bits) - - // remainder = A - q*B - s.emit(" movl 20(%esp), %ebx"); // A_lo - s.emit(" movl 24(%esp), %ecx"); // A_hi - s.emit(" subl %eax, %ebx"); - s.emit(" sbbl %edx, %ecx"); - s.emit(" movl %ebx, %eax"); - s.emit(" movl %ecx, %edx"); - s.emit(" popl %ebp"); - s.emit(" popl %edi"); - s.emit(" popl %esi"); - s.emit(" popl %ebx"); - s.emit(" ret"); - s.emit(".size __umoddi3, .-__umoddi3"); - } - - /// Emit __divdi3: signed 64-bit division. - /// Negates operands to unsigned, calls __udivdi3, negates result if needed. 
- fn emit_divdi3_stub(&mut self) { - let s = &mut self.state; - s.emit(""); - s.emit(".weak __divdi3"); - s.emit(".type __divdi3, @function"); - s.emit("__divdi3:"); - // Stack: ret, A_lo(4), A_hi(8), B_lo(12), B_hi(16) - s.emit(" pushl %ebx"); - s.emit(" pushl %esi"); - s.emit(" pushl %edi"); - // Stack: edi, esi, ebx, ret, A_lo(16), A_hi(20), B_lo(24), B_hi(28) - s.emit(" movl 20(%esp), %edx"); // A_hi - s.emit(" movl 28(%esp), %ecx"); // B_hi - s.emit(" movl %edx, %edi"); // save A_hi for sign - s.emit(" xorl %ecx, %edi"); // edi = sign of result (bit 31) - s.emit(" movl 16(%esp), %eax"); // A_lo - s.emit(" movl 24(%esp), %ebx"); // B_lo - // Negate A if negative - s.emit(" testl %edx, %edx"); - s.emit(" jns .Ldiv_a_pos"); - s.emit(" negl %eax"); - s.emit(" adcl $0, %edx"); - s.emit(" negl %edx"); - s.emit(".Ldiv_a_pos:"); - // Negate B if negative - s.emit(" testl %ecx, %ecx"); - s.emit(" jns .Ldiv_b_pos"); - s.emit(" negl %ebx"); - s.emit(" adcl $0, %ecx"); - s.emit(" negl %ecx"); - s.emit(".Ldiv_b_pos:"); - // Push unsigned args and call __udivdi3 - s.emit(" pushl %ecx"); // B_hi (unsigned) - s.emit(" pushl %ebx"); // B_lo (unsigned) - s.emit(" pushl %edx"); // A_hi (unsigned) - s.emit(" pushl %eax"); // A_lo (unsigned) - s.emit(" call __udivdi3"); - s.emit(" addl $16, %esp"); - // Result in edx:eax. Negate if sign differs. - s.emit(" testl %edi, %edi"); - s.emit(" jns .Ldiv_done"); - s.emit(" negl %eax"); - s.emit(" adcl $0, %edx"); - s.emit(" negl %edx"); - s.emit(".Ldiv_done:"); - s.emit(" popl %edi"); - s.emit(" popl %esi"); - s.emit(" popl %ebx"); - s.emit(" ret"); - s.emit(".size __divdi3, .-__divdi3"); - } - - /// Emit __moddi3: signed 64-bit modulo. - /// Negates operands to unsigned, calls __umoddi3, negates result if dividend was negative. 
- fn emit_moddi3_stub(&mut self) { - let s = &mut self.state; - s.emit(""); - s.emit(".weak __moddi3"); - s.emit(".type __moddi3, @function"); - s.emit("__moddi3:"); - // Stack: ret, A_lo(4), A_hi(8), B_lo(12), B_hi(16) - s.emit(" pushl %ebx"); - s.emit(" pushl %esi"); - s.emit(" pushl %edi"); - // Stack: edi, esi, ebx, ret, A_lo(16), A_hi(20), B_lo(24), B_hi(28) - s.emit(" movl 20(%esp), %edx"); // A_hi - s.emit(" movl 28(%esp), %ecx"); // B_hi - s.emit(" movl %edx, %edi"); // save A_hi sign (remainder sign = dividend sign) - s.emit(" movl 16(%esp), %eax"); // A_lo - s.emit(" movl 24(%esp), %ebx"); // B_lo - // Negate A if negative - s.emit(" testl %edx, %edx"); - s.emit(" jns .Lmod_a_pos"); - s.emit(" negl %eax"); - s.emit(" adcl $0, %edx"); - s.emit(" negl %edx"); - s.emit(".Lmod_a_pos:"); - // Negate B if negative - s.emit(" testl %ecx, %ecx"); - s.emit(" jns .Lmod_b_pos"); - s.emit(" negl %ebx"); - s.emit(" adcl $0, %ecx"); - s.emit(" negl %ecx"); - s.emit(".Lmod_b_pos:"); - // Push unsigned args and call __umoddi3 - s.emit(" pushl %ecx"); - s.emit(" pushl %ebx"); - s.emit(" pushl %edx"); - s.emit(" pushl %eax"); - s.emit(" call __umoddi3"); - s.emit(" addl $16, %esp"); - // Negate result if dividend was negative - s.emit(" testl %edi, %edi"); - s.emit(" jns .Lmod_done"); - s.emit(" negl %eax"); - s.emit(" adcl $0, %edx"); - s.emit(" negl %edx"); - s.emit(".Lmod_done:"); - s.emit(" popl %edi"); - s.emit(" popl %esi"); - s.emit(" popl %ebx"); - s.emit(" ret"); - s.emit(".size __moddi3, .-__moddi3"); - } -} diff --git a/src/backend/i686/codegen/float_ops.rs b/src/backend/i686/codegen/float_ops.rs deleted file mode 100644 index 7cff549996..0000000000 --- a/src/backend/i686/codegen/float_ops.rs +++ /dev/null @@ -1,18 +0,0 @@ -//! I686Codegen: F128 negation. 
- -use crate::ir::reexports::{Operand, Value}; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_f128_neg_impl(&mut self, dest: &Value, src: &Operand) { - self.emit_f128_load_to_x87(src); - self.state.emit(" fchs"); - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - self.state.reg_cache.invalidate_acc(); - } -} diff --git a/src/backend/i686/codegen/globals.rs b/src/backend/i686/codegen/globals.rs deleted file mode 100644 index 010118771a..0000000000 --- a/src/backend/i686/codegen/globals.rs +++ /dev/null @@ -1,43 +0,0 @@ -//! I686Codegen: global address operations (global, label, TLS). - -use crate::ir::reexports::Value; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_global_addr_impl(&mut self, dest: &Value, name: &str) { - if self.state.pic_mode { - if self.state.needs_got(name) { - emit!(self.state, " movl {}@GOT(%ebx), %eax", name); - } else { - emit!(self.state, " leal {}@GOTOFF(%ebx), %eax", name); - } - } else { - emit!(self.state, " movl ${}, %eax", name); - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_label_addr_impl(&mut self, dest: &Value, label: &str) { - if self.state.pic_mode { - emit!(self.state, " leal {}@GOTOFF(%ebx), %eax", label); - } else { - emit!(self.state, " movl ${}, %eax", label); - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - - pub(super) fn emit_tls_global_addr_impl(&mut self, dest: &Value, name: &str) { - if self.state.pic_mode { - emit!(self.state, " movl {}@GOTNTPOFF(%ebx), %eax", name); - self.state.emit(" addl %gs:0, %eax"); - } else { - self.state.emit(" movl %gs:0, %eax"); - emit!(self.state, " addl ${}@NTPOFF, %eax", name); - } - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } -} diff --git 
a/src/backend/i686/codegen/i128_ops.rs b/src/backend/i686/codegen/i128_ops.rs deleted file mode 100644 index 0bd1a682b4..0000000000 --- a/src/backend/i686/codegen/i128_ops.rs +++ /dev/null @@ -1,450 +0,0 @@ -//! I686Codegen: 128-bit (i128) and 64-bit pair operations. -//! -//! On i686, "i128" operations actually operate on 64-bit values using eax:edx pairs. -//! This module also contains the i64 bit-manipulation helpers. - -use crate::ir::reexports::{IrConst, IrCmpOp, Operand, Value}; -use crate::common::types::IrType; -use crate::backend::state::StackSlot; -use crate::backend::traits::ArchCodegen; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_sign_extend_acc_high_impl(&mut self) { - self.state.emit(" cltd"); - } - - pub(super) fn emit_zero_acc_high_impl(&mut self) { - self.state.emit(" xorl %edx, %edx"); - } - - pub(super) fn emit_load_acc_pair_impl(&mut self, op: &Operand) { - match op { - Operand::Value(v) => { - if let Some(slot) = self.state.get_slot(v.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - emit!(self.state, " movl {}, %edx", sr4); - } else if let Some(phys) = self.reg_assignments.get(&v.0).copied() { - let reg = super::emit::phys_reg_name(phys); - emit!(self.state, " movl %{}, %eax", reg); - self.state.emit(" xorl %edx, %edx"); - } - } - Operand::Const(IrConst::I128(v)) => { - let low = (*v & 0xFFFFFFFF) as i32; - let high = ((*v >> 32) & 0xFFFFFFFF) as i32; - emit!(self.state, " movl ${}, %eax", low); - emit!(self.state, " movl ${}, %edx", high); - } - Operand::Const(IrConst::I64(v)) => { - let low = (*v & 0xFFFFFFFF) as i32; - let high = ((*v >> 32) & 0xFFFFFFFF) as i32; - emit!(self.state, " movl ${}, %eax", low); - emit!(self.state, " movl ${}, %edx", high); - } - Operand::Const(IrConst::F64(f)) => { - let bits = f.to_bits(); - let low = (bits & 0xFFFFFFFF) as i32; - let high = (bits >> 32) as i32; - emit!(self.state, " 
movl ${}, %eax", low); - emit!(self.state, " movl ${}, %edx", high); - } - Operand::Const(IrConst::Zero) => { - self.state.emit(" xorl %eax, %eax"); - self.state.emit(" xorl %edx, %edx"); - } - Operand::Const(c) if matches!(c, IrConst::I8(_) | IrConst::I16(_) | IrConst::I32(_)) => { - if let Some(ext) = c.to_i64() { - let low = (ext & 0xFFFFFFFF) as i32; - let high = ((ext >> 32) & 0xFFFFFFFF) as i32; - emit!(self.state, " movl ${}, %eax", low); - emit!(self.state, " movl ${}, %edx", high); - } - } - _ => { - self.operand_to_eax(op); - self.state.emit(" xorl %edx, %edx"); - } - } - } - - pub(super) fn emit_store_acc_pair_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl %eax, {}", sr0); - emit!(self.state, " movl %edx, {}", sr4); - } - } - - pub(super) fn emit_store_pair_to_slot_impl(&mut self, slot: StackSlot) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl %eax, {}", sr0); - emit!(self.state, " movl %edx, {}", sr4); - } - - pub(super) fn emit_load_pair_from_slot_impl(&mut self, slot: StackSlot) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - emit!(self.state, " movl {}, %edx", sr4); - } - - pub(super) fn emit_save_acc_pair_impl(&mut self) { - self.state.emit(" movl %eax, %esi"); - self.state.emit(" movl %edx, %edi"); - } - - pub(super) fn emit_store_pair_indirect_impl(&mut self) { - self.state.emit(" movl %esi, (%ecx)"); - self.state.emit(" movl %edi, 4(%ecx)"); - } - - pub(super) fn emit_load_pair_indirect_impl(&mut self) { - self.state.emit(" movl (%ecx), %eax"); - self.state.emit(" movl 4(%ecx), %edx"); - } - - pub(super) fn emit_i128_neg_impl(&mut self) { - self.state.emit(" notl %eax"); - self.state.emit(" notl %edx"); - self.state.emit(" addl $1, %eax"); - self.state.emit(" adcl $0, %edx"); - 
} - - pub(super) fn emit_i128_not_impl(&mut self) { - self.state.emit(" notl %eax"); - self.state.emit(" notl %edx"); - } - - pub(super) fn emit_i128_to_float_call_impl(&mut self, src: &Operand, from_signed: bool, to_ty: IrType) { - self.emit_load_acc_pair(src); - if from_signed { - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fildq (%esp)"); - if to_ty == IrType::F32 { - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - } else { - self.state.emit(" fstpl (%esp)"); - self.state.emit(" movl (%esp), %eax"); - } - self.state.emit(" addl $8, %esp"); - } else { - let label_id = self.state.next_label_id(); - let big_label = format!(".Lu64_to_f_big_{}", label_id); - let done_label = format!(".Lu64_to_f_done_{}", label_id); - - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" testl %edx, %edx"); - emit!(self.state, " js {}", big_label); - self.state.emit(" fildq (%esp)"); - emit!(self.state, " jmp {}", done_label); - emit!(self.state, "{}:", big_label); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" shrl $1, %eax"); - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" shrl $1, %edx"); - self.state.emit(" andl $1, %ecx"); - self.state.emit(" shll $31, %ecx"); - self.state.emit(" orl %ecx, %eax"); - self.state.emit(" movl %eax, (%esp)"); - self.state.emit(" movl %edx, 4(%esp)"); - self.state.emit(" fildq (%esp)"); - self.state.emit(" fadd %st(0), %st(0)"); - emit!(self.state, "{}:", done_label); - if to_ty == IrType::F32 { - self.state.emit(" fstps (%esp)"); - self.state.emit(" movl (%esp), %eax"); - } else { - self.state.emit(" fstpl (%esp)"); - self.state.emit(" movl (%esp), %eax"); - } - self.state.emit(" addl $8, %esp"); - } - } - - pub(super) fn emit_float_to_i128_call_impl(&mut self, src: &Operand, _to_signed: bool, _from_ty: IrType) { - // TODO: F64 should use fldl instead of flds, and unsigned 
conversion - // may need different handling for values exceeding i64 range. - self.operand_to_eax(src); - self.state.emit(" subl $8, %esp"); - self.state.emit(" movl %eax, (%esp)"); - self.state.emit(" flds (%esp)"); - self.state.emit(" fisttpq (%esp)"); - self.state.emit(" movl (%esp), %eax"); - self.state.emit(" movl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - } - - pub(super) fn emit_i128_prep_binop_impl(&mut self, lhs: &Operand, rhs: &Operand) { - self.emit_load_acc_pair(rhs); - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - self.emit_load_acc_pair(lhs); - } - - pub(super) fn emit_i128_prep_shift_lhs_impl(&mut self, lhs: &Operand) { - self.emit_load_acc_pair(lhs); - } - - pub(super) fn emit_i128_add_impl(&mut self) { - self.state.emit(" addl (%esp), %eax"); - self.state.emit(" adcl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_sub_impl(&mut self) { - self.state.emit(" subl (%esp), %eax"); - self.state.emit(" sbbl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_mul_impl(&mut self) { - self.state.emit(" movl %edx, %ecx"); - self.state.emit(" imull (%esp), %ecx"); - self.state.emit(" movl %eax, %edx"); - self.state.emit(" imull 4(%esp), %edx"); - self.state.emit(" addl %edx, %ecx"); - self.state.emit(" mull (%esp)"); - self.state.emit(" addl %ecx, %edx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_and_impl(&mut self) { - self.state.emit(" andl (%esp), %eax"); - self.state.emit(" andl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_or_impl(&mut self) { - self.state.emit(" orl (%esp), %eax"); - self.state.emit(" orl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_xor_impl(&mut self) { - 
self.state.emit(" xorl (%esp), %eax"); - self.state.emit(" xorl 4(%esp), %edx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_shl_impl(&mut self) { - let label_id = self.state.next_label_id(); - let done_label = format!(".Lshl64_done_{}", label_id); - self.state.emit(" movl (%esp), %ecx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - self.state.emit(" shldl %cl, %eax, %edx"); - self.state.emit(" shll %cl, %eax"); - self.state.emit(" testb $32, %cl"); - emit!(self.state, " je {}", done_label); - self.state.emit(" movl %eax, %edx"); - self.state.emit(" xorl %eax, %eax"); - emit!(self.state, "{}:", done_label); - } - - pub(super) fn emit_i128_lshr_impl(&mut self) { - let label_id = self.state.next_label_id(); - let done_label = format!(".Llshr64_done_{}", label_id); - self.state.emit(" movl (%esp), %ecx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - self.state.emit(" shrdl %cl, %edx, %eax"); - self.state.emit(" shrl %cl, %edx"); - self.state.emit(" testb $32, %cl"); - emit!(self.state, " je {}", done_label); - self.state.emit(" movl %edx, %eax"); - self.state.emit(" xorl %edx, %edx"); - emit!(self.state, "{}:", done_label); - } - - pub(super) fn emit_i128_ashr_impl(&mut self) { - let label_id = self.state.next_label_id(); - let done_label = format!(".Lashr64_done_{}", label_id); - self.state.emit(" movl (%esp), %ecx"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - self.state.emit(" shrdl %cl, %edx, %eax"); - self.state.emit(" sarl %cl, %edx"); - self.state.emit(" testb $32, %cl"); - emit!(self.state, " je {}", done_label); - self.state.emit(" movl %edx, %eax"); - self.state.emit(" sarl $31, %edx"); - emit!(self.state, "{}:", done_label); - } - - pub(super) fn emit_i128_divrem_call_impl(&mut self, func_name: &str, lhs: &Operand, rhs: &Operand) { - let di_func = match func_name { - "__divti3" => "__divdi3", - "__udivti3" => "__udivdi3", - "__modti3" => "__moddi3", - 
"__umodti3" => "__umoddi3", - _ => func_name, - }; - - self.state.needs_divdi3_helpers = true; - - self.emit_load_acc_pair(rhs); - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - self.emit_load_acc_pair(lhs); - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - if self.state.needs_plt(di_func) { - emit!(self.state, " call {}@PLT", di_func); - } else { - emit!(self.state, " call {}", di_func); - } - self.state.emit(" addl $16, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_store_result_impl(&mut self, dest: &Value) { - self.emit_store_acc_pair(dest); - } - - pub(super) fn emit_i128_shl_const_impl(&mut self, amount: u32) { - if amount == 0 { return; } - if amount >= 64 { - self.state.emit(" xorl %eax, %eax"); - self.state.emit(" xorl %edx, %edx"); - } else if amount >= 32 { - self.state.emit(" movl %eax, %edx"); - self.state.emit(" xorl %eax, %eax"); - if amount > 32 { - emit!(self.state, " shll ${}, %edx", amount - 32); - } - } else { - emit!(self.state, " shldl ${}, %eax, %edx", amount); - emit!(self.state, " shll ${}, %eax", amount); - } - } - - pub(super) fn emit_i128_lshr_const_impl(&mut self, amount: u32) { - if amount == 0 { return; } - if amount >= 64 { - self.state.emit(" xorl %eax, %eax"); - self.state.emit(" xorl %edx, %edx"); - } else if amount >= 32 { - self.state.emit(" movl %edx, %eax"); - self.state.emit(" xorl %edx, %edx"); - if amount > 32 { - emit!(self.state, " shrl ${}, %eax", amount - 32); - } - } else { - emit!(self.state, " shrdl ${}, %edx, %eax", amount); - emit!(self.state, " shrl ${}, %edx", amount); - } - } - - pub(super) fn emit_i128_ashr_const_impl(&mut self, amount: u32) { - if amount == 0 { return; } - if amount >= 64 { - self.state.emit(" sarl $31, %edx"); - self.state.emit(" movl %edx, %eax"); - } else if amount >= 32 { - self.state.emit(" movl %edx, %eax"); - self.state.emit(" sarl $31, %edx"); - if amount > 32 { - emit!(self.state, 
" sarl ${}, %eax", amount - 32); - } - } else { - emit!(self.state, " shrdl ${}, %edx, %eax", amount); - emit!(self.state, " sarl ${}, %edx", amount); - } - } - - pub(super) fn emit_i128_cmp_eq_impl(&mut self, is_ne: bool) { - self.state.emit(" cmpl (%esp), %eax"); - self.state.emit(" sete %al"); - self.state.emit(" cmpl 4(%esp), %edx"); - self.state.emit(" sete %cl"); - self.state.emit(" andb %cl, %al"); - if is_ne { - self.state.emit(" xorb $1, %al"); - } - self.state.emit(" movzbl %al, %eax"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_cmp_ordered_impl(&mut self, op: IrCmpOp) { - let is_signed = matches!(op, IrCmpOp::Slt | IrCmpOp::Sle | IrCmpOp::Sgt | IrCmpOp::Sge); - - if is_signed { - let label_id = self.state.next_label_id(); - let label_hi_decided = format!(".Li128_hidec_{}", label_id); - let label_done = format!(".Li128_done_{}", label_id); - - self.state.emit(" cmpl 4(%esp), %edx"); - emit!(self.state, " jne {}", label_hi_decided); - - self.state.emit(" cmpl (%esp), %eax"); - let low_set = match op { - IrCmpOp::Slt => "setb", - IrCmpOp::Sle => "setbe", - IrCmpOp::Sgt => "seta", - IrCmpOp::Sge => "setae", - _ => unreachable!("signed i64 low-word cmp got non-signed op: {:?}", op), - }; - emit!(self.state, " {} %al", low_set); - emit!(self.state, " jmp {}", label_done); - - emit!(self.state, "{}:", label_hi_decided); - let high_set = match op { - IrCmpOp::Slt => "setl", - IrCmpOp::Sle => "setl", - IrCmpOp::Sgt => "setg", - IrCmpOp::Sge => "setg", - _ => unreachable!("signed i64 high-word cmp got non-signed op: {:?}", op), - }; - emit!(self.state, " {} %al", high_set); - - emit!(self.state, "{}:", label_done); - } else { - let label_id = self.state.next_label_id(); - let high_decided = format!(".Li128_high_{}", label_id); - - self.state.emit(" cmpl 4(%esp), %edx"); - emit!(self.state, " jne {}", high_decided); - self.state.emit(" cmpl (%esp), %eax"); - emit!(self.state, "{}:", high_decided); - - let set_instr 
= match op { - IrCmpOp::Ult => "setb", - IrCmpOp::Ule => "setbe", - IrCmpOp::Ugt => "seta", - IrCmpOp::Uge => "setae", - _ => unreachable!("unsigned i64 cmp got non-unsigned op: {:?}", op), - }; - emit!(self.state, " {} %al", set_instr); - } - self.state.emit(" movzbl %al, %eax"); - self.state.emit(" addl $8, %esp"); - self.esp_adjust -= 8; - } - - pub(super) fn emit_i128_cmp_store_result_impl(&mut self, dest: &Value) { - self.state.reg_cache.invalidate_acc(); - self.store_eax_to(dest); - } - -} diff --git a/src/backend/i686/codegen/inline_asm.rs b/src/backend/i686/codegen/inline_asm.rs deleted file mode 100644 index 3d868cc021..0000000000 --- a/src/backend/i686/codegen/inline_asm.rs +++ /dev/null @@ -1,113 +0,0 @@ -//! i686 inline assembly template substitution and register formatting. -//! -//! The default register size is 32-bit (eax, etc.). Supports GCC-style modifiers -//! for size variants (w=16-bit, b=8-bit low, h=8-bit high) and special operand -//! forms (c=raw constant, P=raw symbol, a=address, n=negated immediate). -//! -//! Template parsing delegates to the shared `x86_common` module. Only operand -//! emission differs from x86-64 (no RIP-relative addressing, 32-bit default). - -use std::borrow::Cow; -use std::fmt::Write; -use crate::common::types::IrType; -use crate::ir::reexports::BlockId; -use crate::backend::x86_common; -use super::emit::I686Codegen; - -impl I686Codegen { - /// Substitute %0, %1, %[name], %k0, %b1, %w2, %h3, %c0, %P0, %a0, %n0, %l[name] etc. - /// in i686 asm template. - /// - /// Delegates to the shared x86 template parser with an i686-specific - /// operand emission callback. 
- pub(super) fn substitute_i686_asm_operands( - line: &str, - op_regs: &[String], - op_names: &[Option], - op_is_memory: &[bool], - op_mem_addrs: &[String], - op_types: &[IrType], - gcc_to_internal: &[usize], - goto_labels: &[(String, BlockId)], - op_imm_values: &[Option], - op_imm_symbols: &[Option], - ) -> String { - x86_common::substitute_x86_asm_operands( - line, op_regs, op_names, op_is_memory, op_mem_addrs, op_types, - gcc_to_internal, goto_labels, op_imm_values, op_imm_symbols, - Self::emit_i686_operand, - ) - } - - /// Emit a single operand with the given modifier into the result string. - /// - /// Handles i686-specific behavior: - /// - `%a` with symbol emits just the symbol name (no RIP-relative) - /// - Default register width is 32-bit - fn emit_i686_operand( - result: &mut String, - idx: usize, - modifier: Option, - op_regs: &[String], - op_is_memory: &[bool], - op_mem_addrs: &[String], - op_types: &[IrType], - op_imm_values: &[Option], - op_imm_symbols: &[Option], - ) { - // Try shared logic first (handles %n, %c/%P, memory, $symbol, $imm) - if x86_common::emit_operand_common( - result, idx, modifier, op_regs, op_is_memory, op_mem_addrs, - op_imm_values, op_imm_symbols, - ) { - return; - } - - let has_symbol = op_imm_symbols.get(idx).and_then(|s| s.as_ref()); - let has_imm = op_imm_values.get(idx).and_then(|v| v.as_ref()); - - if modifier == Some('a') { - // %a: emit as address reference (i686 uses absolute, no RIP-relative) - if let Some(sym) = has_symbol { - result.push_str(sym); - } else if let Some(imm) = has_imm { - result.push_str(&imm.to_string()); - } else if op_is_memory[idx] { - result.push_str(&op_mem_addrs[idx]); - } else { - let _ = write!(result, "(%{})", op_regs[idx]); - } - } else { - // Register operand — apply size modifier, default is 32-bit - let effective_mod = modifier.or_else(|| Self::i686_default_modifier_for_type(op_types.get(idx).copied())); - result.push('%'); - result.push_str(&Self::format_i686_reg(&op_regs[idx], 
effective_mod)); - } - } - - /// Determine the default register size modifier based on the operand's IR type. - /// On i686, the default is 32-bit, so only smaller types get a modifier. - fn i686_default_modifier_for_type(ty: Option) -> Option { - match ty { - Some(IrType::I8) | Some(IrType::U8) => Some('b'), - Some(IrType::I16) | Some(IrType::U16) => Some('w'), - // 32-bit is the default on i686 - _ => None, - } - } - - /// Format i686 register with size modifier. - /// On i686, default (no modifier or 'k') is 32-bit. - fn format_i686_reg<'a>(reg: &'a str, modifier: Option) -> Cow<'a, str> { - if reg.starts_with("xmm") || reg.starts_with("st(") || reg == "st" { - return Cow::Borrowed(reg); - } - match modifier { - Some('w') => x86_common::reg_to_16(reg), - Some('b') => x86_common::reg_to_8l(reg), - Some('h') => x86_common::reg_to_8h(reg), - // 'k', 'l', 'q', or no modifier => 32-bit (no 64-bit on i686) - _ => x86_common::reg_to_32(reg), - } - } -} diff --git a/src/backend/i686/codegen/intrinsics.rs b/src/backend/i686/codegen/intrinsics.rs deleted file mode 100644 index 01e7e66840..0000000000 --- a/src/backend/i686/codegen/intrinsics.rs +++ /dev/null @@ -1,606 +0,0 @@ -//! i686 SSE/AES/CRC intrinsic emission and x87 FPU math intrinsics. -//! -//! Handles the `emit_intrinsic` trait method for the i686 backend, covering: -//! - Memory fences (lfence, mfence, sfence, pause) -//! - Non-temporal stores (movnti, movntdq, movntpd) -//! - SSE/SSE2 128-bit packed operations -//! - AES-NI encryption/decryption -//! - CRC32 instructions -//! - Frame/return address intrinsics -//! 
- x87 FPU math (sqrt, fabs) for F32/F64 - -use crate::ir::reexports::{ - IntrinsicOp, - IrConst, - Operand, - Value, -}; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_intrinsic_impl(&mut self, dest: &Option, op: &IntrinsicOp, dest_ptr: &Option, args: &[Operand]) { - match op { - // --- Memory fences (same x86 instructions as x86-64) --- - IntrinsicOp::Lfence => { self.state.emit(" lfence"); } - IntrinsicOp::Mfence => { self.state.emit(" mfence"); } - IntrinsicOp::Sfence => { self.state.emit(" sfence"); } - IntrinsicOp::Pause => { self.state.emit(" pause"); } - IntrinsicOp::Clflush => { - self.operand_to_eax(&args[0]); - self.state.emit(" clflush (%eax)"); - } - - // --- Non-temporal stores --- - IntrinsicOp::Movnti | IntrinsicOp::Movnti64 - | IntrinsicOp::Movntdq | IntrinsicOp::Movntpd => { - self.emit_nontemporal_store(op, dest_ptr, args); - } - - // --- SSE 128-bit load/store --- - IntrinsicOp::Loaddqu => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Storedqu => { - if let Some(ptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - - // SSE 128-bit binary operations - IntrinsicOp::Pcmpeqb128 | IntrinsicOp::Pcmpeqd128 - | IntrinsicOp::Psubusb128 | IntrinsicOp::Psubsb128 - | IntrinsicOp::Por128 - | IntrinsicOp::Pand128 | IntrinsicOp::Pxor128 => { - if let Some(dptr) = dest_ptr { - let inst = match op { - IntrinsicOp::Pcmpeqb128 => "pcmpeqb", - IntrinsicOp::Pcmpeqd128 => "pcmpeqd", - IntrinsicOp::Psubusb128 => "psubusb", - IntrinsicOp::Psubsb128 => "psubsb", - IntrinsicOp::Por128 => "por", - IntrinsicOp::Pand128 => "pand", - IntrinsicOp::Pxor128 => "pxor", - _ => unreachable!("unexpected SSE binary op: 
{:?}", op), - }; - self.emit_sse_binary_128(dptr, args, inst); - } - } - IntrinsicOp::Pmovmskb128 => { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.state.emit(" pmovmskb %xmm0, %eax"); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::SetEpi8 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movd %eax, %xmm0"); - self.state.emit(" punpcklbw %xmm0, %xmm0"); - self.state.emit(" punpcklwd %xmm0, %xmm0"); - self.state.emit(" pshufd $0, %xmm0, %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::SetEpi32 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movd %eax, %xmm0"); - self.state.emit(" pshufd $0, %xmm0, %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - - // --- CRC32 --- - IntrinsicOp::Crc32_8 | IntrinsicOp::Crc32_16 - | IntrinsicOp::Crc32_32 | IntrinsicOp::Crc32_64 => { - self.emit_crc32_intrinsic(op, dest, args); - } - - // --- Frame and return address --- - IntrinsicOp::FrameAddress => { - self.state.emit(" movl %ebp, %eax"); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::ReturnAddress => { - // On i686, return address is at 4(%ebp) (32-bit stack frame) - // With FP omission: param_ref(4) computes the correct ESP-relative offset - let ra = self.param_ref(4); - emit!(self.state, " movl {}, %eax", ra); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::ThreadPointer => { - // __builtin_thread_pointer(): read TLS base from %gs:0 on i686 - self.state.emit(" movl %gs:0, %eax"); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - - // --- Floating-point intrinsics via x87 FPU --- - 
IntrinsicOp::SqrtF64 => { - self.emit_f64_unary_x87(&args[0], "fsqrt", dest); - } - IntrinsicOp::SqrtF32 => { - self.emit_f32_load_to_x87(&args[0]); - self.state.emit(" fsqrt"); - self.emit_f32_store_from_x87(dest); - } - IntrinsicOp::FabsF64 => { - self.emit_f64_unary_x87(&args[0], "fabs", dest); - } - IntrinsicOp::FabsF32 => { - self.emit_f32_load_to_x87(&args[0]); - self.state.emit(" fabs"); - self.emit_f32_store_from_x87(dest); - } - - // --- AES-NI --- - IntrinsicOp::Aesenc128 | IntrinsicOp::Aesenclast128 - | IntrinsicOp::Aesdec128 | IntrinsicOp::Aesdeclast128 => { - if let Some(dptr) = dest_ptr { - let inst = match op { - IntrinsicOp::Aesenc128 => "aesenc", - IntrinsicOp::Aesenclast128 => "aesenclast", - IntrinsicOp::Aesdec128 => "aesdec", - IntrinsicOp::Aesdeclast128 => "aesdeclast", - _ => unreachable!("AES-NI dispatch matched non-AES op: {:?}", op), - }; - self.emit_sse_binary_128(dptr, args, inst); - } - } - IntrinsicOp::Aesimc128 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.state.emit(" aesimc %xmm0, %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Aeskeygenassist128 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - let imm = Self::operand_to_imm_i64(&args[1]); - self.state.emit_fmt(format_args!(" aeskeygenassist ${}, %xmm0, %xmm0", imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Pclmulqdq128 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&args[1]); - self.state.emit(" movdqu (%eax), %xmm1"); - let imm = Self::operand_to_imm_i64(&args[2]); - self.state.emit_fmt(format_args!(" pclmulqdq ${}, %xmm1, %xmm0", imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" 
movdqu %xmm0, (%eax)"); - } - } - - // SSE2 shift-by-immediate operations - IntrinsicOp::Pslldqi128 | IntrinsicOp::Psrldqi128 - | IntrinsicOp::Psllqi128 | IntrinsicOp::Psrlqi128 => { - if let Some(dptr) = dest_ptr { - let inst = match op { - IntrinsicOp::Pslldqi128 => "pslldq", - IntrinsicOp::Psrldqi128 => "psrldq", - IntrinsicOp::Psllqi128 => "psllq", - IntrinsicOp::Psrlqi128 => "psrlq", - _ => unreachable!("unexpected SSE shift-by-immediate op: {:?}", op), - }; - self.emit_sse_unary_imm_128(dptr, args, inst); - } - } - // SSE2 shuffle with immediate - IntrinsicOp::Pshufd128 => { - if let Some(dptr) = dest_ptr { - self.emit_sse_shuffle_imm_128(dptr, args, "pshufd"); - } - } - IntrinsicOp::Loadldi128 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movq (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - - // SSE2 binary 128-bit operations - IntrinsicOp::Paddw128 | IntrinsicOp::Psubw128 | IntrinsicOp::Pmulhw128 - | IntrinsicOp::Pmaddwd128 | IntrinsicOp::Pcmpgtw128 | IntrinsicOp::Pcmpgtb128 - | IntrinsicOp::Paddd128 | IntrinsicOp::Psubd128 - | IntrinsicOp::Packssdw128 | IntrinsicOp::Packsswb128 | IntrinsicOp::Packuswb128 - | IntrinsicOp::Punpcklbw128 | IntrinsicOp::Punpckhbw128 - | IntrinsicOp::Punpcklwd128 | IntrinsicOp::Punpckhwd128 => { - if let Some(dptr) = dest_ptr { - let inst = match op { - IntrinsicOp::Paddw128 => "paddw", - IntrinsicOp::Psubw128 => "psubw", - IntrinsicOp::Pmulhw128 => "pmulhw", - IntrinsicOp::Pmaddwd128 => "pmaddwd", - IntrinsicOp::Pcmpgtw128 => "pcmpgtw", - IntrinsicOp::Pcmpgtb128 => "pcmpgtb", - IntrinsicOp::Paddd128 => "paddd", - IntrinsicOp::Psubd128 => "psubd", - IntrinsicOp::Packssdw128 => "packssdw", - IntrinsicOp::Packsswb128 => "packsswb", - IntrinsicOp::Packuswb128 => "packuswb", - IntrinsicOp::Punpcklbw128 => "punpcklbw", - IntrinsicOp::Punpckhbw128 => "punpckhbw", - IntrinsicOp::Punpcklwd128 => "punpcklwd", - 
IntrinsicOp::Punpckhwd128 => "punpckhwd", - _ => unreachable!("unexpected SSE binary op: {:?}", op), - }; - self.emit_sse_binary_128(dptr, args, inst); - } - } - - // SSE2 element shift-by-immediate operations - IntrinsicOp::Psllwi128 | IntrinsicOp::Psrlwi128 | IntrinsicOp::Psrawi128 - | IntrinsicOp::Psradi128 | IntrinsicOp::Pslldi128 | IntrinsicOp::Psrldi128 => { - if let Some(dptr) = dest_ptr { - let inst = match op { - IntrinsicOp::Psllwi128 => "psllw", - IntrinsicOp::Psrlwi128 => "psrlw", - IntrinsicOp::Psrawi128 => "psraw", - IntrinsicOp::Psradi128 => "psrad", - IntrinsicOp::Pslldi128 => "pslld", - IntrinsicOp::Psrldi128 => "psrld", - _ => unreachable!("unexpected SSE element shift op: {:?}", op), - }; - self.emit_sse_unary_imm_128(dptr, args, inst); - } - } - - // --- SSE2 set/insert/extract/convert --- - IntrinsicOp::SetEpi16 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movd %eax, %xmm0"); - self.state.emit(" punpcklwd %xmm0, %xmm0"); - self.state.emit(" pshufd $0, %xmm0, %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Pinsrw128 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_ecx(&args[1]); - let imm = Self::operand_to_imm_i64(&args[2]); - self.state.emit_fmt(format_args!(" pinsrw ${}, %ecx, %xmm0", imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Pextrw128 => { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - let imm = Self::operand_to_imm_i64(&args[1]); - self.state.emit_fmt(format_args!(" pextrw ${}, %xmm0, %eax", imm)); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::Pinsrd128 => { - // Insert 32-bit value at lane: pinsrd $imm, %eax, %xmm0 (SSE4.1) - if let Some(dptr) = dest_ptr { - 
self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_ecx(&args[1]); - let imm = Self::operand_to_imm_i64(&args[2]); - self.state.emit_fmt(format_args!(" pinsrd ${}, %ecx, %xmm0", imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Pextrd128 => { - // Extract 32-bit value at lane: pextrd $imm, %xmm0, %eax (SSE4.1) - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - let imm = Self::operand_to_imm_i64(&args[1]); - self.state.emit_fmt(format_args!(" pextrd ${}, %xmm0, %eax", imm)); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::Pinsrb128 => { - // Insert 8-bit value at lane: pinsrb $imm, %eax, %xmm0 (SSE4.1) - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_ecx(&args[1]); - let imm = Self::operand_to_imm_i64(&args[2]); - self.state.emit_fmt(format_args!(" pinsrb ${}, %ecx, %xmm0", imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Pextrb128 => { - // Extract 8-bit value at lane: pextrb $imm, %xmm0, %eax (SSE4.1) - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - let imm = Self::operand_to_imm_i64(&args[1]); - self.state.emit_fmt(format_args!(" pextrb ${}, %xmm0, %eax", imm)); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::Pinsrq128 => { - // TODO: PINSRQ is not available on i686 - could emulate with two PINSRD - // Currently just copies input unchanged (no-op) - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Pextrq128 => { - // TODO: PEXTRQ is not available on 
i686 - could emulate with MOVQ or two PEXTRD - // Currently only extracts low 32 bits as fallback - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.state.emit(" movd %xmm0, %eax"); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::Storeldi128 => { - if let Some(ptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movq %xmm0, (%eax)"); - } - } - IntrinsicOp::Cvtsi128Si32 => { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.state.emit(" movd %xmm0, %eax"); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::Cvtsi32Si128 => { - if let Some(dptr) = dest_ptr { - self.operand_to_eax(&args[0]); - self.state.emit(" movd %eax, %xmm0"); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - } - IntrinsicOp::Cvtsi128Si64 => { - // On i686, only extracts the low 32 bits - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.state.emit(" movd %xmm0, %eax"); - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - IntrinsicOp::Pshuflw128 | IntrinsicOp::Pshufhw128 => { - if let Some(dptr) = dest_ptr { - let inst = match op { - IntrinsicOp::Pshuflw128 => "pshuflw", - IntrinsicOp::Pshufhw128 => "pshufhw", - _ => unreachable!("unexpected SSE shuffle op: {:?}", op), - }; - self.emit_sse_shuffle_imm_128(dptr, args, inst); - } - } - } - self.state.reg_cache.invalidate_acc(); - } - - fn emit_nontemporal_store(&mut self, op: &IntrinsicOp, dest_ptr: &Option, args: &[Operand]) { - let Some(ptr) = dest_ptr else { return }; - match op { - IntrinsicOp::Movnti => { - self.operand_to_eax(&args[0]); - self.state.emit(" movl %eax, %ecx"); - self.operand_to_eax(&Operand::Value(*ptr)); - 
self.state.emit(" movnti %ecx, (%eax)"); - } - IntrinsicOp::Movnti64 => { - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movl %eax, %ecx"); - if let Operand::Value(v) = &args[0] { - if let Some(slot) = self.state.get_slot(v.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - self.state.emit(" movnti %eax, (%ecx)"); - emit!(self.state, " movl {}, %eax", sr4); - self.state.emit(" movnti %eax, 4(%ecx)"); - } else { - self.operand_to_eax(&args[0]); - self.state.emit(" movnti %eax, (%ecx)"); - self.state.emit(" xorl %eax, %eax"); - self.state.emit(" movnti %eax, 4(%ecx)"); - } - } else { - self.operand_to_eax(&args[0]); - self.state.emit(" movnti %eax, (%ecx)"); - self.state.emit(" xorl %eax, %eax"); - self.state.emit(" movnti %eax, 4(%ecx)"); - } - } - IntrinsicOp::Movntdq => { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movntdq %xmm0, (%eax)"); - } - IntrinsicOp::Movntpd => { - self.operand_to_eax(&args[0]); - self.state.emit(" movupd (%eax), %xmm0"); - self.operand_to_eax(&Operand::Value(*ptr)); - self.state.emit(" movntpd %xmm0, (%eax)"); - } - _ => {} - } - } - - fn emit_crc32_intrinsic(&mut self, op: &IntrinsicOp, dest: &Option, args: &[Operand]) { - if *op == IntrinsicOp::Crc32_64 { - // On i686, no 64-bit CRC32; do two 32-bit CRC32s - self.operand_to_eax(&args[0]); - self.state.emit(" movl %eax, %edx"); - if let Operand::Value(v) = &args[1] { - if let Some(slot) = self.state.get_slot(v.0) { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %ecx", sr0); - self.state.emit(" movl %edx, %eax"); - self.state.emit(" crc32l %ecx, %eax"); - emit!(self.state, " movl {}, %ecx", sr4); - self.state.emit(" crc32l %ecx, %eax"); - } else { - self.operand_to_ecx(&args[1]); - self.state.emit(" movl %edx, %eax"); - self.state.emit(" 
crc32l %ecx, %eax"); - } - } else { - self.operand_to_ecx(&args[1]); - self.state.emit(" movl %edx, %eax"); - self.state.emit(" crc32l %ecx, %eax"); - } - } else { - self.operand_to_eax(&args[0]); - self.state.emit(" movl %eax, %ecx"); - self.operand_to_eax(&args[1]); - self.state.emit(" xchgl %eax, %ecx"); - let inst = match op { - IntrinsicOp::Crc32_8 => "crc32b %cl, %eax", - IntrinsicOp::Crc32_16 => "crc32w %cx, %eax", - IntrinsicOp::Crc32_32 => "crc32l %ecx, %eax", - _ => unreachable!("unexpected CRC32 op: {:?}", op), - }; - self.state.emit_fmt(format_args!(" {}", inst)); - } - self.state.reg_cache.invalidate_acc(); - if let Some(d) = dest { - self.store_eax_to(d); - } - } - - /// Apply an x87 unary FPU op on an f64 operand and store the result. - fn emit_f64_unary_x87(&mut self, arg: &Operand, x87_op: &str, dest: &Option) { - self.emit_f64_load_to_x87(arg); - self.state.emit_fmt(format_args!(" {}", x87_op)); - if let Some(d) = dest { - if let Some(slot) = self.state.get_slot(d.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpl {}", sr); - } else { - self.state.emit(" fstp %st(0)"); - } - } else { - self.state.emit(" fstp %st(0)"); - } - } - - /// Emit a binary SSE 128-bit operation: load two 128-bit operands from - /// pointers, apply the operation, and store the result to dest_ptr. - fn emit_sse_binary_128(&mut self, dptr: &Value, args: &[Operand], sse_inst: &str) { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - self.operand_to_eax(&args[1]); - self.state.emit(" movdqu (%eax), %xmm1"); - self.state.emit_fmt(format_args!(" {} %xmm1, %xmm0", sse_inst)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - - /// Emit SSE unary 128-bit op with immediate: load xmm0 from arg0 ptr, - /// apply `inst $imm, %xmm0`, store result xmm0 to dest_ptr. 
- fn emit_sse_unary_imm_128(&mut self, dptr: &Value, args: &[Operand], sse_inst: &str) { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - let imm = Self::operand_to_imm_i64(&args[1]); - self.state.emit_fmt(format_args!(" {} ${}, %xmm0", sse_inst, imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - - /// Emit SSE shuffle with immediate: load xmm0, apply `inst $imm, %xmm0, %xmm0`, - /// store result. Used for pshufd/pshuflw/pshufhw. - fn emit_sse_shuffle_imm_128(&mut self, dptr: &Value, args: &[Operand], sse_inst: &str) { - self.operand_to_eax(&args[0]); - self.state.emit(" movdqu (%eax), %xmm0"); - let imm = Self::operand_to_imm_i64(&args[1]); - self.state.emit_fmt(format_args!(" {} ${}, %xmm0, %xmm0", sse_inst, imm)); - self.operand_to_eax(&Operand::Value(*dptr)); - self.state.emit(" movdqu %xmm0, (%eax)"); - } - - /// Load an F32 operand onto the x87 FPU stack. - fn emit_f32_load_to_x87(&mut self, op: &Operand) { - match op { - Operand::Value(v) if self.state.get_slot(v.0).is_some() => { - let slot = self.state.get_slot(v.0).expect("slot exists (guarded by is_some)"); - let sr = self.slot_ref(slot); - emit!(self.state, " flds {}", sr); - } - Operand::Const(IrConst::F32(fval)) => { - emit!(self.state, " movl ${}, %eax", fval.to_bits() as i32); - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } - _ => { - self.operand_to_eax(op); - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } - } - } - - /// Store an x87 FPU result as F32 to a destination value. 
- fn emit_f32_store_from_x87(&mut self, dest: &Option) { - if let Some(d) = dest { - if let Some(slot) = self.state.get_slot(d.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstps {}", sr); - } else { - self.state.emit(" fstp %st(0)"); - } - } else { - self.state.emit(" fstp %st(0)"); - } - } -} diff --git a/src/backend/i686/codegen/memory.rs b/src/backend/i686/codegen/memory.rs deleted file mode 100644 index f2bd3f20aa..0000000000 --- a/src/backend/i686/codegen/memory.rs +++ /dev/null @@ -1,506 +0,0 @@ -//! I686Codegen: memory operations (load, store, memcpy, GEP, stack). - -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::backend::state::{StackSlot, SlotAddr}; -use crate::backend::traits::ArchCodegen; -use crate::emit; -use super::emit::{I686Codegen, phys_reg_name}; - -impl I686Codegen { - // ---- Store/Load overrides ---- - - pub(super) fn emit_store_impl(&mut self, val: &Operand, ptr: &Value, ty: IrType) { - if ty == IrType::F128 { - self.emit_f128_load_to_x87(val); - let addr = self.state.resolve_slot_addr(ptr.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - self.state.emit(" fstpt (%ecx)"); - } - SlotAddr::Direct(slot) => { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, ptr.0); - self.state.emit(" fstpt (%ecx)"); - } - } - } - self.state.reg_cache.invalidate_acc(); - return; - } - if ty == IrType::I64 || ty == IrType::U64 || ty == IrType::F64 { - let addr = self.state.resolve_slot_addr(ptr.0); - self.emit_load_acc_pair(val); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - self.emit_alloca_aligned_addr(slot, id); - self.state.emit(" popl %eax"); - self.esp_adjust -= 4; - 
self.state.emit(" movl %eax, (%ecx)"); - self.state.emit(" popl %edx"); - self.esp_adjust -= 4; - self.state.emit(" movl %edx, 4(%ecx)"); - } - SlotAddr::Direct(slot) => { - let sr0 = self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl %eax, {}", sr0); - emit!(self.state, " movl %edx, {}", sr4); - } - SlotAddr::Indirect(slot) => { - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - self.emit_load_ptr_from_slot(slot, ptr.0); - self.state.emit(" popl %eax"); - self.esp_adjust -= 4; - self.state.emit(" movl %eax, (%ecx)"); - self.state.emit(" popl %edx"); - self.esp_adjust -= 4; - self.state.emit(" movl %edx, 4(%ecx)"); - } - } - } - self.state.reg_cache.invalidate_acc(); - return; - } - crate::backend::traits::emit_store_default(self, val, ptr, ty); - } - - pub(super) fn emit_load_impl(&mut self, dest: &Value, ptr: &Value, ty: IrType) { - if ty == IrType::F128 { - let addr = self.state.resolve_slot_addr(ptr.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - self.state.emit(" fldt (%ecx)"); - } - SlotAddr::Direct(slot) => { - let sr = self.slot_ref(slot); - emit!(self.state, " fldt {}", sr); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, ptr.0); - self.state.emit(" fldt (%ecx)"); - } - } - if let Some(dest_slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(dest_slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - } - return; - } - if ty == IrType::I64 || ty == IrType::U64 || ty == IrType::F64 { - let addr = self.state.resolve_slot_addr(ptr.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - self.state.emit(" movl (%ecx), %eax"); - self.state.emit(" movl 4(%ecx), %edx"); - } - SlotAddr::Direct(slot) => { - let sr0 = 
self.slot_ref(slot); - let sr4 = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - emit!(self.state, " movl {}, %edx", sr4); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, ptr.0); - self.state.emit(" movl (%ecx), %eax"); - self.state.emit(" movl 4(%ecx), %edx"); - } - } - self.emit_store_acc_pair(dest); - } - self.state.reg_cache.invalidate_acc(); - return; - } - crate::backend::traits::emit_load_default(self, dest, ptr, ty); - } - - pub(super) fn emit_store_with_const_offset_impl(&mut self, val: &Operand, base: &Value, offset: i64, ty: IrType) { - if ty == IrType::F128 { - self.emit_f128_load_to_x87(val); - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" fstpt (%ecx)"); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - let sr = self.slot_ref(folded_slot); - emit!(self.state, " fstpt {}", sr); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" fstpt (%ecx)"); - } - } - } - self.state.reg_cache.invalidate_acc(); - return; - } - if ty == IrType::I64 || ty == IrType::U64 || ty == IrType::F64 { - self.emit_load_acc_pair(val); - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - self.emit_alloca_aligned_addr(slot, id); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" popl %eax"); - self.esp_adjust -= 4; - self.state.emit(" movl %eax, (%ecx)"); - self.state.emit(" popl %edx"); - self.esp_adjust -= 4; - self.state.emit(" movl 
%edx, 4(%ecx)"); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - let sr0 = self.slot_ref(folded_slot); - let sr4 = self.slot_ref_offset(folded_slot, 4); - emit!(self.state, " movl %eax, {}", sr0); - emit!(self.state, " movl %edx, {}", sr4); - } - SlotAddr::Indirect(slot) => { - self.state.emit(" pushl %edx"); - self.esp_adjust += 4; - self.state.emit(" pushl %eax"); - self.esp_adjust += 4; - self.emit_load_ptr_from_slot(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" popl %eax"); - self.esp_adjust -= 4; - self.state.emit(" movl %eax, (%ecx)"); - self.state.emit(" popl %edx"); - self.esp_adjust -= 4; - self.state.emit(" movl %edx, 4(%ecx)"); - } - } - } - self.state.reg_cache.invalidate_acc(); - return; - } - // Delegate to default for other types - self.operand_to_eax(val); - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - let store_instr = self.store_instr_for_type(ty); - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_save_acc(); - self.emit_alloca_aligned_addr(slot, id); - self.emit_add_offset_to_addr_reg(offset); - self.emit_typed_store_indirect(store_instr, ty); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - self.emit_typed_store_to_slot(store_instr, ty, folded_slot); - } - SlotAddr::Indirect(slot) => { - self.emit_save_acc(); - self.emit_load_ptr_from_slot(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.emit_typed_store_indirect(store_instr, ty); - } - } - } - } - - pub(super) fn emit_load_with_const_offset_impl(&mut self, dest: &Value, base: &Value, offset: i64, ty: IrType) { - if ty == IrType::F128 { - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - 
self.state.emit(" fldt (%ecx)"); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - let sr = self.slot_ref(folded_slot); - emit!(self.state, " fldt {}", sr); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" fldt (%ecx)"); - } - } - if let Some(dest_slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(dest_slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - } - return; - } - if ty == IrType::I64 || ty == IrType::U64 || ty == IrType::F64 { - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" movl (%ecx), %eax"); - self.state.emit(" movl 4(%ecx), %edx"); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + offset); - let sr0 = self.slot_ref(folded_slot); - let sr4 = self.slot_ref_offset(folded_slot, 4); - emit!(self.state, " movl {}, %eax", sr0); - emit!(self.state, " movl {}, %edx", sr4); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.state.emit(" movl (%ecx), %eax"); - self.state.emit(" movl 4(%ecx), %edx"); - } - } - self.emit_store_acc_pair(dest); - } - self.state.reg_cache.invalidate_acc(); - return; - } - // Delegate to default for other types - let addr = self.state.resolve_slot_addr(base.0); - if let Some(addr) = addr { - let load_instr = self.load_instr_for_type(ty); - match addr { - SlotAddr::OverAligned(slot, id) => { - self.emit_alloca_aligned_addr(slot, id); - self.emit_add_offset_to_addr_reg(offset); - self.emit_typed_load_indirect(load_instr); - } - SlotAddr::Direct(slot) => { - let folded_slot = StackSlot(slot.0 + 
offset); - self.emit_typed_load_from_slot(load_instr, folded_slot); - } - SlotAddr::Indirect(slot) => { - self.emit_load_ptr_from_slot(slot, base.0); - if offset != 0 { - self.emit_add_offset_to_addr_reg(offset); - } - self.emit_typed_load_indirect(load_instr); - } - } - self.emit_store_result(dest); - } - } - - // ---- Typed store/load helpers ---- - - pub(super) fn emit_typed_store_to_slot_impl(&mut self, instr: &'static str, ty: IrType, slot: StackSlot) { - let reg = self.eax_for_type(ty); - let sr = self.slot_ref(slot); - emit!(self.state, " {} {}, {}", instr, reg, sr); - } - - pub(super) fn emit_typed_load_from_slot_impl(&mut self, instr: &'static str, slot: StackSlot) { - let sr = self.slot_ref(slot); - emit!(self.state, " {} {}, %eax", instr, sr); - } - - pub(super) fn emit_load_ptr_from_slot_impl(&mut self, slot: StackSlot, val_id: u32) { - if let Some(phys) = self.reg_assignments.get(&val_id).copied() { - let reg = phys_reg_name(phys); - emit!(self.state, " movl %{}, %ecx", reg); - } else { - let sr = self.slot_ref(slot); - emit!(self.state, " movl {}, %ecx", sr); - } - } - - pub(super) fn emit_typed_store_indirect_impl(&mut self, instr: &'static str, ty: IrType) { - let reg = match ty { - IrType::I8 | IrType::U8 => "%dl", - IrType::I16 | IrType::U16 => "%dx", - _ => "%edx", - }; - emit!(self.state, " {} {}, (%ecx)", instr, reg); - } - - pub(super) fn emit_typed_load_indirect_impl(&mut self, instr: &'static str) { - emit!(self.state, " {} (%ecx), %eax", instr); - } - - pub(super) fn emit_add_offset_to_addr_reg_impl(&mut self, offset: i64) { - if offset != 0 { - emit!(self.state, " addl ${}, %ecx", offset as i32); - } - } - - // ---- GEP primitives ---- - - /// Compute the address of an alloca into `reg`, handling over-aligned allocas. 
- pub(super) fn emit_alloca_addr_to(&mut self, reg: &str, val_id: u32, slot: StackSlot) { - let sr = self.slot_ref(slot); - if let Some(align) = self.state.alloca_over_align(val_id) { - emit!(self.state, " leal {}, %{}", sr, reg); - emit!(self.state, " addl ${}, %{}", align - 1, reg); - emit!(self.state, " andl ${}, %{}", -(align as i32), reg); - } else { - emit!(self.state, " leal {}, %{}", sr, reg); - } - } - - pub(super) fn emit_slot_addr_to_secondary_impl(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) { - if is_alloca { - self.emit_alloca_addr_to("ecx", val_id, slot); - } else if let Some(phys) = self.reg_assignments.get(&val_id).copied() { - let reg = phys_reg_name(phys); - emit!(self.state, " movl %{}, %ecx", reg); - } else { - let sr = self.slot_ref(slot); - emit!(self.state, " movl {}, %ecx", sr); - } - } - - pub(super) fn emit_gep_direct_const_impl(&mut self, slot: StackSlot, offset: i64) { - let folded_slot = StackSlot(slot.0 + offset); - let sr = self.slot_ref(folded_slot); - emit!(self.state, " leal {}, %eax", sr); - } - - pub(super) fn emit_gep_indirect_const_impl(&mut self, slot: StackSlot, offset: i64, val_id: u32) { - if let Some(phys) = self.reg_assignments.get(&val_id).copied() { - let reg = phys_reg_name(phys); - if offset == 0 { - emit!(self.state, " movl %{}, %eax", reg); - } else { - emit!(self.state, " leal {}(%{}), %eax", offset, reg); - } - } else { - let sr = self.slot_ref(slot); - emit!(self.state, " movl {}, %eax", sr); - if offset != 0 { - emit!(self.state, " addl ${}, %eax", offset as i32); - } - } - } - - // ---- Dynamic alloca ---- - - pub(super) fn emit_add_imm_to_acc_impl(&mut self, imm: i64) { - emit!(self.state, " addl ${}, %eax", imm as i32); - } - - pub(super) fn emit_round_up_acc_to_16_impl(&mut self) { - self.state.emit(" addl $15, %eax"); - self.state.emit(" andl $-16, %eax"); - } - - pub(super) fn emit_sub_sp_by_acc_impl(&mut self) { - self.state.emit(" subl %eax, %esp"); - } - - pub(super) fn 
emit_mov_sp_to_acc_impl(&mut self) { - self.state.emit(" movl %esp, %eax"); - self.state.reg_cache.invalidate_acc(); - } - - pub(super) fn emit_mov_acc_to_sp_impl(&mut self) { - self.state.emit(" movl %eax, %esp"); - } - - pub(super) fn emit_align_acc_impl(&mut self, align: usize) { - emit!(self.state, " addl ${}, %eax", align - 1); - emit!(self.state, " andl ${}, %eax", -(align as i32)); - } - - // ---- Alloca aligned addr ---- - - pub(super) fn emit_alloca_aligned_addr_impl(&mut self, slot: StackSlot, val_id: u32) { - let align = self.state.alloca_over_align(val_id) - .expect("alloca must have over-alignment for aligned addr emission"); - let sr = self.slot_ref(slot); - emit!(self.state, " leal {}, %ecx", sr); - emit!(self.state, " addl ${}, %ecx", align - 1); - emit!(self.state, " andl ${}, %ecx", -(align as i32)); - } - - pub(super) fn emit_alloca_aligned_addr_to_acc_impl(&mut self, slot: StackSlot, val_id: u32) { - let align = self.state.alloca_over_align(val_id) - .expect("alloca must have over-alignment for aligned addr emission"); - let sr = self.slot_ref(slot); - emit!(self.state, " leal {}, %eax", sr); - emit!(self.state, " addl ${}, %eax", align - 1); - emit!(self.state, " andl ${}, %eax", -(align as i32)); - self.state.reg_cache.invalidate_acc(); - } - - // ---- Memcpy ---- - - pub(super) fn emit_memcpy_load_dest_addr_impl(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) { - if is_alloca { - self.emit_alloca_addr_to("edi", val_id, slot); - } else if let Some(phys) = self.reg_assignments.get(&val_id).copied() { - let reg = phys_reg_name(phys); - emit!(self.state, " movl %{}, %edi", reg); - } else { - let sr = self.slot_ref(slot); - emit!(self.state, " movl {}, %edi", sr); - } - } - - pub(super) fn emit_memcpy_load_src_addr_impl(&mut self, slot: StackSlot, is_alloca: bool, val_id: u32) { - if is_alloca { - self.emit_alloca_addr_to("esi", val_id, slot); - } else if let Some(phys) = self.reg_assignments.get(&val_id).copied() { - let reg = 
phys_reg_name(phys); - emit!(self.state, " movl %{}, %esi", reg); - } else { - let sr = self.slot_ref(slot); - emit!(self.state, " movl {}, %esi", sr); - } - } - - pub(super) fn emit_memcpy_impl_impl(&mut self, size: usize) { - emit!(self.state, " movl ${}, %ecx", size); - self.state.emit(" rep movsb"); - } -} diff --git a/src/backend/i686/codegen/mod.rs b/src/backend/i686/codegen/mod.rs deleted file mode 100644 index c76cf5a224..0000000000 --- a/src/backend/i686/codegen/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -pub(crate) mod emit; -mod asm_emitter; -mod casts; -mod inline_asm; -mod intrinsics; -mod prologue; -mod memory; -mod alu; -mod comparison; -mod calls; -mod globals; -mod variadic; -mod returns; -mod atomics; -mod i128_ops; -mod float_ops; -pub(crate) mod peephole; diff --git a/src/backend/i686/codegen/peephole.rs b/src/backend/i686/codegen/peephole.rs deleted file mode 100644 index 59a78071ac..0000000000 --- a/src/backend/i686/codegen/peephole.rs +++ /dev/null @@ -1,1947 +0,0 @@ -//! i686 peephole optimizer for assembly text. -//! -//! Operates on generated assembly text to eliminate redundant patterns from the -//! stack-based codegen. Adapted from the x86-64 peephole optimizer for 32-bit -//! i686 assembly (uses %ebp instead of %rbp, %eax instead of %rax, etc.). -//! -//! ## Pass structure -//! -//! 1. **Local passes** (iterative, up to 8 rounds): adjacent store/load elimination, -//! self-move elimination, redundant jump elimination, branch inversion, reverse -//! move elimination. -//! -//! 2. **Global passes** (once): dead register move elimination, dead store elimination, -//! compare+branch fusion, memory operand folding. -//! -//! 3. **Local cleanup** (up to 4 rounds): re-run local and global passes to clean up -//! opportunities exposed by the first round. -//! -//! 4. **Never-read store elimination**: global analysis to remove stores to -//! stack slots that are never read anywhere in the function. 
- -// ── Constants ──────────────────────────────────────────────────────────────── - -const MAX_LOCAL_PASS_ITERATIONS: usize = 8; -const MAX_POST_GLOBAL_ITERATIONS: usize = 4; - -// Register IDs (i686 has fewer registers) -type RegId = u8; -const REG_NONE: RegId = 255; -const REG_EAX: RegId = 0; -const REG_ECX: RegId = 1; -const REG_EDX: RegId = 2; -const REG_EBX: RegId = 3; -const REG_ESP: RegId = 4; -const REG_EBP: RegId = 5; -const REG_ESI: RegId = 6; -const REG_EDI: RegId = 7; -const REG_GP_MAX: RegId = 7; - -/// Sentinel value for ebp_offset meaning "no %ebp reference" or "complex reference". -const EBP_OFFSET_NONE: i32 = i32::MIN; - -// ── Line classification ────────────────────────────────────────────────────── - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum LineKind { - Nop, - Empty, - StoreEbp { reg: RegId, offset: i32, size: MoveSize }, - LoadEbp { reg: RegId, offset: i32, size: MoveSize }, - Move { dst: RegId, src: RegId }, - SelfMove, - Label, - Jmp, - JmpIndirect, - CondJmp, - Call, - Ret, - Push { reg: RegId }, - Pop { reg: RegId }, - SetCC { reg: RegId }, - Cmp, - Directive, - Other { dest_reg: RegId }, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum MoveSize { - L, // movl (32-bit) - W, // movw (16-bit) - B, // movb (8-bit) -} - -impl MoveSize { - fn mnemonic(self) -> &'static str { - match self { - MoveSize::L => "movl", - MoveSize::W => "movw", - MoveSize::B => "movb", - } - } - fn byte_size(self) -> i32 { - match self { - MoveSize::L => 4, - MoveSize::W => 2, - MoveSize::B => 1, - } - } -} - -/// Check if two byte ranges `[a, a+a_size)` and `[b, b+b_size)` overlap. 
-#[inline] -fn ranges_overlap(a_off: i32, a_size: i32, b_off: i32, b_size: i32) -> bool { - a_off < b_off + b_size && b_off < a_off + a_size -} - -#[derive(Clone, Copy)] -struct LineInfo { - kind: LineKind, - trim_start: u16, - has_indirect_mem: bool, - ebp_offset: i32, -} - -impl LineInfo { - #[inline] - fn is_nop(self) -> bool { self.kind == LineKind::Nop } - #[inline] - fn is_barrier(self) -> bool { - matches!(self.kind, - LineKind::Label | LineKind::Call | LineKind::Jmp | LineKind::JmpIndirect | - LineKind::CondJmp | LineKind::Ret | LineKind::Directive) - } -} - -#[inline] -fn line_info(kind: LineKind, ts: u16) -> LineInfo { - LineInfo { kind, trim_start: ts, has_indirect_mem: false, ebp_offset: EBP_OFFSET_NONE } -} - -// ── Register parsing ───────────────────────────────────────────────────────── - -/// Map i686 register name to family ID. -fn register_family(name: &str) -> RegId { - let name = name.trim_start_matches('%'); - match name { - "eax" | "ax" | "al" | "ah" => REG_EAX, - "ecx" | "cx" | "cl" | "ch" => REG_ECX, - "edx" | "dx" | "dl" | "dh" => REG_EDX, - "ebx" | "bx" | "bl" | "bh" => REG_EBX, - "esp" | "sp" => REG_ESP, - "ebp" | "bp" => REG_EBP, - "esi" | "si" => REG_ESI, - "edi" | "di" => REG_EDI, - _ => REG_NONE, - } -} - -/// Get the 32-bit register name for a family ID. -fn reg32_name(id: RegId) -> &'static str { - match id { - REG_EAX => "%eax", - REG_ECX => "%ecx", - REG_EDX => "%edx", - REG_EBX => "%ebx", - REG_ESP => "%esp", - REG_EBP => "%ebp", - REG_ESI => "%esi", - REG_EDI => "%edi", - _ => "%???", - } -} - -/// Check if a register is caller-saved (clobbered by calls). 
-fn is_caller_saved(reg: RegId) -> bool { - matches!(reg, REG_EAX | REG_ECX | REG_EDX) -} - -// ── Store/Load parsing ─────────────────────────────────────────────────────── - -/// Parse `movX %reg, offset(%ebp)` → (reg_name, offset_str, MoveSize) -fn parse_store_to_ebp(s: &str) -> Option<(&str, &str, MoveSize)> { - let (rest, size) = if let Some(r) = s.strip_prefix("movl ") { - (r, MoveSize::L) - } else if let Some(r) = s.strip_prefix("movw ") { - (r, MoveSize::W) - } else if let Some(r) = s.strip_prefix("movb ") { - (r, MoveSize::B) - } else { - return None; - }; - // rest = "%eax, -8(%ebp)" - let rest = rest.trim(); - if !rest.starts_with('%') { return None; } - let comma = rest.find(',')?; - let reg = &rest[..comma]; - let mem = rest[comma + 1..].trim(); - if !mem.ends_with("(%ebp)") { return None; } - // Reject indirect memory (pointer dereference, not stack slot) - if mem.contains("(%e") && !mem.ends_with("(%ebp)") { return None; } - let offset_str = &mem[..mem.len() - 6]; // strip "(%ebp)" - Some((reg.trim(), offset_str, size)) -} - -/// Parse `movX offset(%ebp), %reg` → (offset_str, reg_name, MoveSize) -fn parse_load_from_ebp(s: &str) -> Option<(&str, &str, MoveSize)> { - let (rest, size) = if let Some(r) = s.strip_prefix("movl ") { - (r, MoveSize::L) - } else if let Some(r) = s.strip_prefix("movw ") { - (r, MoveSize::W) - } else if let Some(r) = s.strip_prefix("movb ") { - (r, MoveSize::B) - } else if let Some(r) = s.strip_prefix("movzbl ") { - (r, MoveSize::L) // movzbl from stack, dest is 32-bit - } else if let Some(r) = s.strip_prefix("movzwl ") { - (r, MoveSize::L) - } else if let Some(r) = s.strip_prefix("movsbl ") { - (r, MoveSize::L) - } else if let Some(r) = s.strip_prefix("movswl ") { - (r, MoveSize::L) - } else { - return None; - }; - let rest = rest.trim(); - // Must start with an offset or directly with (%ebp) - if !rest.contains("(%ebp)") { return None; } - let paren_start = rest.find("(%ebp)")?; - let offset_str = &rest[..paren_start]; - let 
after = rest[paren_start + 6..].trim(); - if !after.starts_with(',') { return None; } - let reg = after[1..].trim(); - if !reg.starts_with('%') { return None; } - Some((offset_str, reg, size)) -} - -/// Parse `movl %src, %dst` (register-to-register move). -fn parse_reg_to_reg_move(s: &str) -> Option<(RegId, RegId)> { - let rest = s.strip_prefix("movl ")?.trim(); - if !rest.starts_with('%') { return None; } - let comma = rest.find(',')?; - let src_name = rest[..comma].trim(); - let dst_name = rest[comma + 1..].trim(); - if !dst_name.starts_with('%') { return None; } - // Must not be memory operands - if src_name.contains('(') || dst_name.contains('(') { return None; } - let src = register_family(src_name); - let dst = register_family(dst_name); - if src <= REG_GP_MAX && dst <= REG_GP_MAX { - Some((src, dst)) - } else { - None - } -} - -/// Parse integer offset from string. -fn parse_offset(s: &str) -> i32 { - if s.is_empty() { return 0; } - s.parse::().unwrap_or(EBP_OFFSET_NONE) -} - -/// Check if a line has indirect memory access (pointer dereference through a register). -fn has_indirect_memory_access(s: &str) -> bool { - // Pattern: offset(%eXX) where XX is not bp or sp - // or (%eXX) where XX is not bp or sp - // or (%eXX, %eYY, N) - let bytes = s.as_bytes(); - for i in 0..bytes.len() { - if bytes[i] == b'(' && i + 4 < bytes.len() && bytes[i + 1] == b'%' { - // Check if it's (%ebp) or (%esp) - those are stack accesses, not indirect - if i + 5 < bytes.len() && (&bytes[i + 1..i + 5] == b"%ebp" || &bytes[i + 1..i + 5] == b"%esp") { - continue; - } - return true; - } - } - false -} - -/// Parse the %ebp offset from a line, or return EBP_OFFSET_NONE. 
-fn parse_ebp_offset_in_line(s: &str) -> i32 { - if let Some(pos) = s.find("(%ebp)") { - let before = &s[..pos]; - // Find the start of the offset number - let offset_start = before.rfind(|c: char| !c.is_ascii_digit() && c != '-').map(|p| p + 1).unwrap_or(0); - let offset_str = &before[offset_start..]; - if offset_str.is_empty() { - 0 - } else { - offset_str.parse::().unwrap_or(EBP_OFFSET_NONE) - } - } else { - EBP_OFFSET_NONE - } -} - -/// Parse the destination register of a generic instruction. -/// For two-operand instructions (AT&T syntax), the destination is the last operand. -fn parse_dest_reg(s: &str) -> RegId { - // Find the last %reg - if let Some(comma) = s.rfind(',') { - let after = s[comma + 1..].trim(); - if after.starts_with('%') && !after.contains('(') { - return register_family(after); - } - } - REG_NONE -} - -/// Check if a line references a specific register family. -/// This includes both explicit register operands and implicit register uses -/// by instructions like cltd, idivl, rep movsb, etc. -fn line_references_reg(s: &str, reg: RegId) -> bool { - // Check explicit register operands - let names: &[&str] = match reg { - REG_EAX => &["%eax", "%ax", "%al", "%ah"], - REG_ECX => &["%ecx", "%cx", "%cl", "%ch"], - REG_EDX => &["%edx", "%dx", "%dl", "%dh"], - REG_EBX => &["%ebx", "%bx", "%bl", "%bh"], - REG_ESP => &["%esp", "%sp"], - REG_EBP => &["%ebp", "%bp"], - REG_ESI => &["%esi", "%si"], - REG_EDI => &["%edi", "%di"], - _ => return false, - }; - for name in names { - if s.contains(name) { return true; } - } - // Check implicit register uses by specific instructions - if implicit_reg_use(s, reg) { return true; } - false -} - -/// Check if an instruction implicitly uses a register (not mentioned in text). 
-fn implicit_reg_use(s: &str, reg: RegId) -> bool { - let bytes = s.as_bytes(); - if bytes.is_empty() { return false; } - match bytes[0] { - b'c' => { - // cmpxchg8b (without lock prefix): reads/writes eax, edx, ecx, ebx - if s.starts_with("cmpxchg8b") { - return reg == REG_EAX || reg == REG_EDX || reg == REG_ECX || reg == REG_EBX; - } - // cmpxchg{l,w,b} (without lock prefix): implicitly reads eax - if s.starts_with("cmpxchg") { - return reg == REG_EAX; - } - // cltd/cdq: reads eax, writes edx - if s == "cltd" || s == "cdq" { - return reg == REG_EAX || reg == REG_EDX; - } - // cbw/cwde: reads/writes eax - if s == "cbw" || s == "cwde" || s == "cwtl" { - return reg == REG_EAX; - } - } - b'i' => { - // idivl/idivw: implicitly reads edx:eax, writes eax and edx - if s.starts_with("idivl") || s.starts_with("idivw") || s.starts_with("idivb") { - return reg == REG_EAX || reg == REG_EDX; - } - // imull with 1 operand: reads eax, writes edx:eax - // imull with 2 or 3 operands has explicit regs - if s.starts_with("imull ") && !s.contains(',') { - return reg == REG_EAX || reg == REG_EDX; - } - } - b'd' => { - // divl/divw: implicitly reads edx:eax, writes eax and edx - if s.starts_with("divl") || s.starts_with("divw") || s.starts_with("divb") { - return reg == REG_EAX || reg == REG_EDX; - } - } - b'm' => { - // mul: reads eax, writes edx:eax - if s.starts_with("mull ") || s.starts_with("mulw ") || s.starts_with("mulb ") { - return reg == REG_EAX || reg == REG_EDX; - } - } - b'r' => { - // rep movsb/movsl: uses esi, edi, ecx - // rep stosb/stosl: uses edi, ecx, eax - if s.starts_with("rep") { - if s.contains("movs") { - return reg == REG_ESI || reg == REG_EDI || reg == REG_ECX; - } - if s.contains("stos") { - return reg == REG_EAX || reg == REG_EDI || reg == REG_ECX; - } - if s.contains("scas") || s.contains("cmps") { - return reg == REG_ESI || reg == REG_EDI || reg == REG_ECX || reg == REG_EAX; - } - // Unknown rep instruction - assume all regs used - return true; - } - } - 
b'l' => { - // lock cmpxchg8b: implicitly reads/writes eax, edx, ecx, ebx - // cmpxchg8b compares edx:eax with memory, stores ecx:ebx on match - if s.starts_with("lock cmpxchg8b") { - return reg == REG_EAX || reg == REG_EDX || reg == REG_ECX || reg == REG_EBX; - } - // lock cmpxchg{l,w,b}: implicitly reads eax (compared with memory) - if s.starts_with("lock cmpxchg") { - return reg == REG_EAX; - } - // loop/loope/loopne: reads ecx - if s.starts_with("loop") { - return reg == REG_ECX; - } - } - _ => {} - } - false -} - -// ── Line classifier ────────────────────────────────────────────────────────── - -fn classify_line(raw: &str) -> LineInfo { - let trim_start = raw.len() - raw.trim_start().len(); - let s = &raw[trim_start..]; - - if s.is_empty() { - return line_info(LineKind::Empty, trim_start as u16); - } - - let bytes = s.as_bytes(); - let first = bytes[0]; - let last = bytes[bytes.len() - 1]; - let ts = trim_start as u16; - - // Label - if last == b':' { - return line_info(LineKind::Label, ts); - } - - // Directive - if first == b'.' 
{ - return line_info(LineKind::Directive, ts); - } - - // Comment - if first == b'#' { - return line_info(LineKind::Directive, ts); - } - - // mov instructions - check store/load/self-move/reg-reg - if first == b'm' && bytes.len() >= 4 && bytes[1] == b'o' && bytes[2] == b'v' { - if let Some((reg_str, offset_str, size)) = parse_store_to_ebp(s) { - let reg = register_family(reg_str); - if reg <= REG_GP_MAX { - let offset = parse_offset(offset_str); - return line_info(LineKind::StoreEbp { reg, offset, size }, ts); - } - } - if let Some((offset_str, reg_str, size)) = parse_load_from_ebp(s) { - let reg = register_family(reg_str); - if reg <= REG_GP_MAX { - let offset = parse_offset(offset_str); - return line_info(LineKind::LoadEbp { reg, offset, size }, ts); - } - } - if let Some((src, dst)) = parse_reg_to_reg_move(s) { - if src == dst { - return line_info(LineKind::SelfMove, ts); - } - return line_info(LineKind::Move { dst, src }, ts); - } - } - - // Control flow - if first == b'j' { - if bytes.len() >= 4 && bytes[1] == b'm' && bytes[2] == b'p' { - if bytes.len() > 4 && bytes[4] == b'*' { - return line_info(LineKind::JmpIndirect, ts); - } - if bytes[3] == b' ' { - if s.contains("indirect_thunk") || s.contains("*%") { - return line_info(LineKind::JmpIndirect, ts); - } - return line_info(LineKind::Jmp, ts); - } - } - if is_conditional_jump(s) { - return line_info(LineKind::CondJmp, ts); - } - } - - if first == b'c' { - if bytes.len() >= 4 && bytes[1] == b'a' && bytes[2] == b'l' && bytes[3] == b'l' { - return line_info(LineKind::Call, ts); - } - if bytes.len() >= 4 && bytes[1] == b'm' && bytes[2] == b'p' { - return line_info(LineKind::Cmp, ts); - } - } - - if first == b'r' && s == "ret" { - return line_info(LineKind::Ret, ts); - } - - // test instructions - if first == b't' && bytes.len() >= 5 && bytes[1] == b'e' && bytes[2] == b's' && bytes[3] == b't' { - return line_info(LineKind::Cmp, ts); - } - - // push/pop - if first == b'p' { - if let Some(rest) = 
s.strip_prefix("pushl ") { - let reg = register_family(rest.trim()); - return line_info(LineKind::Push { reg }, ts); - } - if let Some(rest) = s.strip_prefix("popl ") { - let reg = register_family(rest.trim()); - return line_info(LineKind::Pop { reg }, ts); - } - } - - // setCC - if first == b's' && bytes.len() >= 4 && bytes[1] == b'e' && bytes[2] == b't' && parse_setcc(s).is_some() { - let setcc_reg = if let Some(space_pos) = s.rfind(' ') { - register_family(s[space_pos + 1..].trim()) - } else { - REG_EAX - }; - return line_info(LineKind::SetCC { reg: setcc_reg }, ts); - } - - // Other instruction - let dest_reg = parse_dest_reg(s); - let has_indirect = has_indirect_memory_access(s); - let ebp_off = if has_indirect { EBP_OFFSET_NONE } else { parse_ebp_offset_in_line(s) }; - LineInfo { - kind: LineKind::Other { dest_reg }, - trim_start: ts, - has_indirect_mem: has_indirect, - ebp_offset: ebp_off, - } -} - -// ── Conditional jump helpers ───────────────────────────────────────────────── - -fn is_conditional_jump(s: &str) -> bool { - let b = s.as_bytes(); - if b.len() < 3 || b[0] != b'j' { return false; } - // jCC where CC is one of: e, ne, l, le, g, ge, b, be, a, ae, s, ns, o, no, p, np, z, nz - matches!(&s[1..2], "e" | "a" | "b" | "g" | "l" | "s" | "o" | "p" | "z" | "n") - && s.contains(' ') -} - -/// Invert a condition code. -fn invert_cc(cc: &str) -> Option<&'static str> { - match cc { - "e" | "z" => Some("ne"), - "ne" | "nz" => Some("e"), - "l" => Some("ge"), - "ge" => Some("l"), - "le" => Some("g"), - "g" => Some("le"), - "b" => Some("ae"), - "ae" => Some("b"), - "be" => Some("a"), - "a" => Some("be"), - "s" => Some("ns"), - "ns" => Some("s"), - "o" => Some("no"), - "no" => Some("o"), - "p" => Some("np"), - "np" => Some("p"), - _ => None, - } -} - -/// Extract condition code and target from a conditional jump. 
-fn parse_condjmp(s: &str) -> Option<(&str, &str)> { - if !s.starts_with('j') { return None; } - let space = s.find(' ')?; - let cc = &s[1..space]; - let target = s[space + 1..].trim(); - Some((cc, target)) -} - -/// Parse setCC instruction → condition code. -fn parse_setcc(s: &str) -> Option<&str> { - if !s.starts_with("set") { return None; } - let rest = &s[3..]; - let space = rest.find(' ')?; - let cc = &rest[..space]; - // Validate it's a real condition code - match cc { - "e" | "ne" | "z" | "nz" | "l" | "le" | "g" | "ge" | - "b" | "be" | "a" | "ae" | "s" | "ns" | "o" | "no" | - "p" | "np" => Some(cc), - _ => None, - } -} - -/// Extract the jump target from a jmp instruction. -fn parse_jmp_target(s: &str) -> Option<&str> { - s.strip_prefix("jmp ") -} - -// ── Line store ─────────────────────────────────────────────────────────────── - -/// Efficient line storage that avoids reallocating strings. -/// Lines are stored as byte offsets into the original assembly string. -/// Replaced lines are stored in a side buffer. -// Re-export the shared LineStore from peephole_common. -// See backend/peephole_common.rs for the implementation. -use crate::backend::peephole_common::LineStore; - -// ── Trimmed line helper ────────────────────────────────────────────────────── - -#[inline] -fn trimmed<'a>(store: &'a LineStore, info: &LineInfo, idx: usize) -> &'a str { - &store.get(idx)[info.trim_start as usize..] -} - -/// Check if the next instruction reads the carry flag (CF). -/// Instructions like `adcl`, `sbbl`, `rcl`, `rcr` depend on CF. -/// `incl`/`decl` do NOT set CF (unlike `addl`/`subl`), so converting -/// `addl $1` → `incl` or `subl $1` → `decl` is invalid when the next -/// instruction reads CF. 
-fn next_reads_carry_flag(store: &LineStore, infos: &[LineInfo], start: usize) -> bool { - let len = infos.len(); - for j in (start + 1)..len { - let s = store.get(j).trim(); - if s.is_empty() || s.starts_with('#') || s.starts_with("//") || s.ends_with(':') { - continue; - } - // Check if the instruction reads CF - return s.starts_with("adcl ") - || s.starts_with("adcb ") - || s.starts_with("adcw ") - || s.starts_with("sbbl ") - || s.starts_with("sbbb ") - || s.starts_with("sbbw ") - || s.starts_with("rcl ") - || s.starts_with("rcr ") - || s.starts_with("setc ") - || s.starts_with("setb ") - || s.starts_with("jc ") - || s.starts_with("jb ") - || s.starts_with("jnc ") - || s.starts_with("jnb ") - || s.starts_with("jae ") - || s.starts_with("cmc"); - } - false -} - -// ── Pass 1: Local patterns ─────────────────────────────────────────────────── - -/// Combined local pass: scan once, apply multiple patterns. -fn combined_local_pass(store: &mut LineStore, infos: &mut [LineInfo]) -> bool { - let len = infos.len(); - let mut changed = false; - - let mut i = 0; - while i < len { - if infos[i].is_nop() { i += 1; continue; } - - // Pattern 1: Self-move elimination - if infos[i].kind == LineKind::SelfMove { - infos[i].kind = LineKind::Nop; - changed = true; - i += 1; - continue; - } - - // Pattern 1b: Strength reduction for code size - // - addl $1, %reg → incl %reg (saves 2 bytes, critical for 16-bit boot code) - // - subl $1, %reg → decl %reg (saves 2 bytes) - // - movl $0, %reg → xorl %reg, %reg (saves 3 bytes) - // - addl $-1, %reg → decl %reg (saves 2 bytes) - // - subl $-1, %reg → incl %reg (saves 2 bytes) - if let LineKind::Other { dest_reg } = infos[i].kind { - if dest_reg != REG_NONE && dest_reg <= REG_GP_MAX && dest_reg != REG_ESP && dest_reg != REG_EBP { - let s = trimmed(store, &infos[i], i); - let rn = reg32_name(dest_reg); - // addl $1, %reg → incl %reg - // SAFETY: incl does NOT set the carry flag (CF), so this - // conversion is invalid if the next 
instruction reads CF - // (e.g., adcl used in 64-bit add-with-carry chains). - if s.starts_with("addl $1, ") && s.ends_with(rn) - && !next_reads_carry_flag(store, infos, i) - { - store.replace(i, format!(" incl {}", rn)); - infos[i] = LineInfo { - kind: LineKind::Other { dest_reg }, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - changed = true; - i += 1; - continue; - } - // subl $1, %reg → decl %reg - // SAFETY: decl does NOT set CF, skip if next reads CF. - if s.starts_with("subl $1, ") && s.ends_with(rn) - && !next_reads_carry_flag(store, infos, i) - { - store.replace(i, format!(" decl {}", rn)); - infos[i] = LineInfo { - kind: LineKind::Other { dest_reg }, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - changed = true; - i += 1; - continue; - } - // addl $-1, %reg → decl %reg - // SAFETY: decl does NOT set CF, skip if next reads CF. - if s.starts_with("addl $-1, ") && s.ends_with(rn) - && !next_reads_carry_flag(store, infos, i) - { - store.replace(i, format!(" decl {}", rn)); - infos[i] = LineInfo { - kind: LineKind::Other { dest_reg }, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - changed = true; - i += 1; - continue; - } - // subl $-1, %reg → incl %reg - // SAFETY: incl does NOT set CF, skip if next reads CF. 
- if s.starts_with("subl $-1, ") && s.ends_with(rn) - && !next_reads_carry_flag(store, infos, i) - { - store.replace(i, format!(" incl {}", rn)); - infos[i] = LineInfo { - kind: LineKind::Other { dest_reg }, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - changed = true; - i += 1; - continue; - } - } - } - // movl $0, %reg → xorl %reg, %reg (saves 3 bytes, clears flags) - if let LineKind::Other { dest_reg } = infos[i].kind { - if dest_reg != REG_NONE && dest_reg <= REG_GP_MAX && dest_reg != REG_ESP && dest_reg != REG_EBP { - let s = trimmed(store, &infos[i], i); - let rn = reg32_name(dest_reg); - if s == format!("movl $0, {}", rn) { - store.replace(i, format!(" xorl {}, {}", rn, rn)); - infos[i] = LineInfo { - kind: LineKind::Other { dest_reg }, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - changed = true; - i += 1; - continue; - } - } - } - - // Find next non-nop line - let mut j = i + 1; - while j < len && infos[j].is_nop() { j += 1; } - if j >= len { i += 1; continue; } - - // Pattern 2: Adjacent store/load with same offset - if let LineKind::StoreEbp { reg: store_reg, offset: store_off, size: store_size } = infos[i].kind { - if let LineKind::LoadEbp { reg: load_reg, offset: load_off, size: load_size } = infos[j].kind { - if store_off == load_off && store_size == load_size { - if store_reg == load_reg { - // movl %eax, -8(%ebp); movl -8(%ebp), %eax → keep store only - infos[j].kind = LineKind::Nop; - changed = true; - i += 1; - continue; - } else { - // movl %eax, -8(%ebp); movl -8(%ebp), %ecx → movl %eax, -8(%ebp); movl %eax, %ecx - let new_line = format!(" {} {}, {}", store_size.mnemonic(), reg32_name(store_reg), reg32_name(load_reg)); - store.replace(j, new_line); - infos[j] = LineInfo { - kind: LineKind::Move { dst: load_reg, src: store_reg }, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - changed = true; - i += 1; - continue; - } - } - } - } - - // 
Pattern 3: Redundant jump to next label - if infos[i].kind == LineKind::Jmp && infos[j].kind == LineKind::Label { - let jmp_s = trimmed(store, &infos[i], i); - let label_s = trimmed(store, &infos[j], j); - if let Some(target) = parse_jmp_target(jmp_s) { - if let Some(label_name) = label_s.strip_suffix(':') { - if target.trim() == label_name { - infos[i].kind = LineKind::Nop; - changed = true; - i += 1; - continue; - } - } - } - } - - // Pattern 4: Branch inversion: jCC .L1; jmp .L2; .L1: → j!CC .L2; .L1: - if infos[i].kind == LineKind::CondJmp { - let mut k = j + 1; - while k < len && infos[k].is_nop() { k += 1; } - if k < len && infos[j].kind == LineKind::Jmp && infos[k].kind == LineKind::Label { - let cond_s = trimmed(store, &infos[i], i); - let jmp_s = trimmed(store, &infos[j], j); - let label_s = trimmed(store, &infos[k], k); - if let (Some((cc, cond_target)), Some(jmp_target)) = - (parse_condjmp(cond_s), parse_jmp_target(jmp_s)) - { - if let Some(label_name) = label_s.strip_suffix(':') { - if cond_target == label_name { - if let Some(inv_cc) = invert_cc(cc) { - let new_line = format!(" j{} {}", inv_cc, jmp_target.trim()); - store.replace(i, new_line); - infos[i].kind = LineKind::CondJmp; - infos[j].kind = LineKind::Nop; - changed = true; - i += 1; - continue; - } - } - } - } - } - } - - // Pattern 5b: Redundant movsbl %al, %eax after movsbl (...), %eax - // The first sign-extension already produces a properly sign-extended 32-bit result, - // so the second `movsbl %al, %eax` is a no-op. 
- if let LineKind::Other { dest_reg: REG_EAX } = infos[i].kind { - let si = trimmed(store, &infos[i], i); - if si.starts_with("movsbl ") && si.ends_with(", %eax") { - if let LineKind::Other { dest_reg: REG_EAX } = infos[j].kind { - let sj = trimmed(store, &infos[j], j); - if sj == "movsbl %al, %eax" { - infos[j].kind = LineKind::Nop; - changed = true; - i += 1; - continue; - } - } - } - } - - // Pattern 5: Reverse move elimination: movl %A, %B; movl %B, %A → keep first only - if let LineKind::Move { dst: dst1, src: src1 } = infos[i].kind { - if let LineKind::Move { dst: dst2, src: src2 } = infos[j].kind { - if dst1 == src2 && src1 == dst2 { - infos[j].kind = LineKind::Nop; - changed = true; - i += 1; - continue; - } - } - } - - i += 1; - } - - changed -} - -// ── Pass 2: Global store forwarding ────────────────────────────────────────── - -/// Track which register value is stored at each stack slot. -/// When we see `movl %eax, -8(%ebp)`, record that slot -8 contains eax. -/// When we see `movl -8(%ebp), %ecx`, forward to `movl %eax, %ecx` or eliminate if same reg. -// TODO: Disabled - causes 21 regressions in FP computation tests (matrix/FP operations -// produce wrong numerical results). Needs investigation into FP load/store forwarding patterns. 
#[allow(dead_code)]
fn global_store_forwarding(store: &mut LineStore, infos: &mut [LineInfo]) -> bool {
    let len = infos.len();
    let mut changed = false;

    // Mapping: offset → (reg, line_idx)
    // Small flat array for common offsets (-256..0)
    const SLOT_COUNT: usize = 256;
    let mut slots: [(RegId, MoveSize); SLOT_COUNT] = [(REG_NONE, MoveSize::L); SLOT_COUNT];

    // Collect jump targets so we can invalidate at them
    let mut jump_targets = std::collections::HashSet::new();
    for i in 0..len {
        if infos[i].is_nop() { continue; }
        let s = trimmed(store, &infos[i], i);
        match infos[i].kind {
            LineKind::Jmp | LineKind::JmpIndirect => {
                if let Some(target) = parse_jmp_target(s) {
                    jump_targets.insert(target.trim().to_string());
                }
            }
            LineKind::CondJmp => {
                if let Some((_, target)) = parse_condjmp(s) {
                    jump_targets.insert(target.to_string());
                }
            }
            _ => {}
        }
    }

    for i in 0..len {
        if infos[i].is_nop() { continue; }

        match infos[i].kind {
            LineKind::Label => {
                // Check if this label is a jump target (invalidate all)
                let s = trimmed(store, &infos[i], i);
                if let Some(name) = s.strip_suffix(':') {
                    if jump_targets.contains(name) {
                        // This label is a jump target - invalidate all mappings
                        slots = [(REG_NONE, MoveSize::L); SLOT_COUNT];
                    }
                    // If it's just a fallthrough label, keep mappings
                }
            }
            LineKind::StoreEbp { reg, offset, size } => {
                // Record that this slot now contains this register's value
                if offset < 0 && (-offset as usize) <= SLOT_COUNT {
                    slots[(-offset - 1) as usize] = (reg, size);
                }
            }
            LineKind::LoadEbp { reg: load_reg, offset, size: load_size } => {
                // Check if we know what register value is in this slot
                let mut forwarded = false;
                if offset < 0 && (-offset as usize) <= SLOT_COUNT {
                    let (stored_reg, stored_size) = slots[(-offset - 1) as usize];
                    if stored_reg != REG_NONE && stored_size == load_size {
                        if stored_reg == load_reg {
                            // Same register - just eliminate the load
                            infos[i].kind = LineKind::Nop;
                            changed = true;
                            forwarded = true;
                        } else {
                            // Different register - forward as reg-reg move
                            // NOTE(review): pairs load_size.mnemonic() with
                            // reg32_name(); for byte/word slots this would emit
                            // e.g. `movb %eax, %ecx` (invalid) — may relate to
                            // the regressions that keep this pass disabled.
                            let new_line = format!("    {} {}, {}", load_size.mnemonic(), reg32_name(stored_reg), reg32_name(load_reg));
                            store.replace(i, new_line);
                            infos[i] = LineInfo {
                                kind: LineKind::Move { dst: load_reg, src: stored_reg },
                                trim_start: 4,
                                has_indirect_mem: false,
                                ebp_offset: EBP_OFFSET_NONE,
                            };
                            changed = true;
                            forwarded = true;
                        }
                    }
                }
                // The load writes to load_reg, so invalidate any slot
                // that maps to load_reg (its value has changed).
                // This must happen even if we forwarded, because the
                // destination register now has a new value.
                for slot in slots.iter_mut() {
                    if slot.0 == load_reg {
                        *slot = (REG_NONE, MoveSize::L);
                    }
                }
                if forwarded { continue; }
            }
            LineKind::Call => {
                // Calls clobber caller-saved registers (eax, ecx, edx)
                // Invalidate all mappings involving these registers
                for slot in slots.iter_mut() {
                    if is_caller_saved(slot.0) {
                        *slot = (REG_NONE, MoveSize::L);
                    }
                }
            }
            LineKind::Jmp | LineKind::JmpIndirect | LineKind::Ret => {
                // Control flow change - invalidate all
                slots = [(REG_NONE, MoveSize::L); SLOT_COUNT];
            }
            LineKind::Move { dst, .. } => {
                // Invalidate any slot that was mapped to the overwritten register
                for slot in slots.iter_mut() {
                    if slot.0 == dst {
                        *slot = (REG_NONE, MoveSize::L);
                    }
                }
            }
            LineKind::SetCC { reg } => {
                // setCC modifies a byte register, invalidate its family
                for slot in slots.iter_mut() {
                    if slot.0 == reg {
                        *slot = (REG_NONE, MoveSize::L);
                    }
                }
            }
            LineKind::Other { dest_reg } => {
                // Invalidate any slot mapped to the destination register
                // NOTE(review): instructions with implicit destinations
                // (mull/divl/cltd write %eax/%edx without naming them) are
                // not modeled here — confirm parse_dest_reg coverage.
                if dest_reg != REG_NONE {
                    for slot in slots.iter_mut() {
                        if slot.0 == dest_reg {
                            *slot = (REG_NONE, MoveSize::L);
                        }
                    }
                }
                // If line has indirect memory access or might clobber stack,
                // invalidate all (conservative)
                let s = trimmed(store, &infos[i], i);
                if infos[i].has_indirect_mem || s.contains("(%ebp)") {
                    // Only invalidate the specific slot if we can parse it
                    let off = infos[i].ebp_offset;
                    if off != EBP_OFFSET_NONE && off < 0 && (-off as usize) <= SLOT_COUNT {
                        slots[(-off - 1) as usize] = (REG_NONE, MoveSize::L);
                    } else if infos[i].has_indirect_mem {
                        // Indirect memory - could write anywhere, invalidate all
                        slots = [(REG_NONE, MoveSize::L); SLOT_COUNT];
                    }
                }
                // Check for inline asm or instructions that clobber multiple regs
                if s.contains(';') || s.starts_with("rdmsr") || s.starts_with("cpuid")
                    || s.starts_with("syscall") || s.starts_with("int ") || s.starts_with("int$")
                    || s.starts_with("rep") || s.starts_with("cld") {
                    slots = [(REG_NONE, MoveSize::L); SLOT_COUNT];
                }
            }
            LineKind::Push { .. } | LineKind::Pop { .. } => {
                // Push/pop modify esp but don't affect ebp-relative slots
                if let LineKind::Pop { reg } = infos[i].kind {
                    // Pop writes to a register, invalidate mappings
                    for slot in slots.iter_mut() {
                        if slot.0 == reg {
                            *slot = (REG_NONE, MoveSize::L);
                        }
                    }
                }
            }
            _ => {}
        }
    }

    changed
}

// ── Pass: Dead store elimination ─────────────────────────────────────────────

/// Remove stores to stack slots that are immediately overwritten.
/// A store is dead when another store to the exact same slot (same offset and
/// size) occurs within a small lookahead window with no intervening read,
/// barrier, or modification of the stored register.
fn eliminate_dead_stores(store: &LineStore, infos: &mut [LineInfo]) -> bool {
    let len = infos.len();
    let mut changed = false;
    const WINDOW: usize = 16;

    for i in 0..len {
        if infos[i].is_nop() { continue; }
        if let LineKind::StoreEbp { offset: store_off, size: store_size, reg: store_reg } = infos[i].kind {
            // Look ahead for another store to the same slot (meaning this one is dead)
            // or a load from the same slot (meaning this one is alive)
            let mut j = i + 1;
            let mut count = 0;
            while j < len && count < WINDOW {
                if infos[j].is_nop() { j += 1; continue; }

                let store_bytes = store_size.byte_size();
                match infos[j].kind {
                    LineKind::StoreEbp { offset, size, .. }
                        if offset == store_off && size == store_size =>
                    {
                        // Another store to the exact same slot - this store is dead
                        infos[i].kind = LineKind::Nop;
                        changed = true;
                        break;
                    }
                    LineKind::StoreEbp { offset, size, .. }
                        if ranges_overlap(store_off, store_bytes, offset, size.byte_size()) =>
                    {
                        // Overlapping store but not identical - conservatively keep alive
                        break;
                    }
                    LineKind::LoadEbp { offset, size, .. }
                        if ranges_overlap(store_off, store_bytes, offset, size.byte_size()) =>
                    {
                        // Load overlaps this store's byte range - this store is alive
                        break;
                    }
                    _ => {}
                }

                // Stop at barriers
                if infos[j].is_barrier() { break; }
                // Stop if the stored register is modified (value may have changed)
                match infos[j].kind {
                    LineKind::Other { dest_reg } if dest_reg == store_reg => break,
                    LineKind::Move { dst, .. } if dst == store_reg => break,
                    LineKind::SetCC { reg } if reg == store_reg => break,
                    _ => {}
                }
                // Stop at indirect memory access (could read the slot)
                let s = trimmed(store, &infos[j], j);
                if infos[j].has_indirect_mem { break; }
                // If line references ebp with same offset, it's alive
                if infos[j].ebp_offset == store_off { break; }
                // leaq N(%ebp) takes address of slot
                if s.contains("(%ebp)") && !matches!(infos[j].kind, LineKind::StoreEbp { .. } | LineKind::LoadEbp { .. }) {
                    break;
                }

                j += 1;
                count += 1;
            }
        }
    }

    changed
}

// ── Pass: Dead register move elimination ─────────────────────────────────────

/// Remove register moves where the destination is overwritten before being read.
fn eliminate_dead_reg_moves(store: &LineStore, infos: &mut [LineInfo]) -> bool {
    let len = infos.len();
    let mut changed = false;
    const WINDOW: usize = 16;

    for i in 0..len {
        if infos[i].is_nop() { continue; }
        let dst_reg = match infos[i].kind {
            LineKind::Move { dst, .. } => dst,
            _ => continue,
        };
        // Don't eliminate moves to esp/ebp
        if dst_reg == REG_ESP || dst_reg == REG_EBP { continue; }

        // Look ahead: if dst is overwritten before being read, this move is dead
        let mut j = i + 1;
        let mut count = 0;
        while j < len && count < WINDOW {
            if infos[j].is_nop() { j += 1; continue; }
            if infos[j].is_barrier() { break; }

            // Check if dst is read by this instruction
            let s = trimmed(store, &infos[j], j);
            match infos[j].kind {
                LineKind::StoreEbp { reg, .. } if reg == dst_reg => {
                    // dst is read (stored to stack) - move is alive
                    break;
                }
                LineKind::Move { src, dst } => {
                    if src == dst_reg {
                        // dst is read - move is alive
                        break;
                    }
                    if dst == dst_reg {
                        // dst is overwritten - this move is dead
                        infos[i].kind = LineKind::Nop;
                        changed = true;
                        break;
                    }
                }
                LineKind::Other { dest_reg } => {
                    // NOTE(review): relies on line_references_reg seeing every
                    // textual use; instructions that read a register
                    // implicitly (mull reads %eax, divl reads %edx:%eax)
                    // without naming it would not be caught — confirm the
                    // backend never emits such forms after a Move.
                    if dest_reg == dst_reg && !line_references_reg(s, dst_reg) {
                        // Destination only writes to dst, doesn't read it: dead
                        // But we need to make sure it doesn't ALSO read it
                        // Actually, just check if the line references the reg at all
                        // For "movl $5, %eax", dest is eax and it doesn't read eax
                        // For "addl $5, %eax", dest is eax and it reads eax
                        // parse_dest_reg returns the last operand. If the instruction writes
                        // to dst_reg but the source doesn't reference it, then this move is dead.
                        infos[i].kind = LineKind::Nop;
                        changed = true;
                        break;
                    }
                    if line_references_reg(s, dst_reg) {
                        break; // dst is read
                    }
                }
                _ => {
                    if line_references_reg(s, dst_reg) {
                        break; // dst is read
                    }
                }
            }

            j += 1;
            count += 1;
        }
    }

    changed
}

// ── Pass: Compare and branch fusion ──────────────────────────────────────────

/// Maximum number of store/load offsets tracked during compare-and-branch fusion.
const MAX_TRACKED_STORE_LOAD_OFFSETS: usize = 4;

/// Size of the instruction lookahead window for compare-and-branch fusion.
const CMP_FUSION_LOOKAHEAD: usize = 8;

/// Collect up to N non-NOP line indices following `start_idx` (exclusive).
/// Returns the number of indices collected.
-fn collect_non_nop_indices( - infos: &[LineInfo], start_idx: usize, len: usize, out: &mut [usize; N], -) -> usize { - let mut count = 0; - let mut j = start_idx + 1; - while j < len && count < N { - if !infos[j].is_nop() { - out[count] = j; - count += 1; - } - j += 1; - } - count -} - -/// Fuse `cmpl/testl + setCC %al + movzbl %al, %eax + [store/load] + testl %eax, %eax + jne/je` -/// into a single `jCC`/`j!CC` directly. -/// -/// This enhanced version can skip over store/load pairs between the movzbl and -/// testl, allowing fusion even when the boolean is temporarily spilled to the -/// stack. It tracks stored offsets and verifies each has a matching load nearby, -/// ensuring the stored boolean is only consumed locally. -fn fuse_compare_and_branch(store: &mut LineStore, infos: &mut [LineInfo]) -> bool { - let len = infos.len(); - let mut changed = false; - - let mut i = 0; - while i < len { - if infos[i].is_nop() || infos[i].kind != LineKind::Cmp { - i += 1; - continue; - } - - // Collect next non-NOP lines: cmp itself + (CMP_FUSION_LOOKAHEAD-1) following - let mut seq_indices = [0usize; CMP_FUSION_LOOKAHEAD]; - seq_indices[0] = i; - let mut rest = [0usize; CMP_FUSION_LOOKAHEAD - 1]; - let rest_count = collect_non_nop_indices::<{ CMP_FUSION_LOOKAHEAD - 1 }>(infos, i, len, &mut rest); - seq_indices[1..(rest_count + 1)].copy_from_slice(&rest[..rest_count]); - let seq_count = 1 + rest_count; - - if seq_count < 4 { - i += 1; - continue; - } - - // Second must be setCC %al - let setcc_cc = if let LineKind::SetCC { reg: REG_EAX } = infos[seq_indices[1]].kind { - let s = trimmed(store, &infos[seq_indices[1]], seq_indices[1]); - parse_setcc(s) - } else { - None - }; - if setcc_cc.is_none() { - i += 1; - continue; - } - let setcc_cc = setcc_cc.unwrap(); - - // Scan for testl %eax, %eax pattern. - // Track StoreEbp offsets so we can bail out if any store's slot is - // potentially read by another basic block (no matching load nearby). 
- let mut test_idx = None; - let mut store_offsets: [i32; MAX_TRACKED_STORE_LOAD_OFFSETS] = [0; MAX_TRACKED_STORE_LOAD_OFFSETS]; - let mut store_count = 0usize; - let mut scan = 2; - while scan < seq_count { - let si = seq_indices[scan]; - let line = trimmed(store, &infos[si], si); - - // Skip zero-extend of setcc result - if line == "movzbl %al, %eax" { - scan += 1; - continue; - } - // Skip store/load to ebp (pre-parsed fast check). - if let LineKind::StoreEbp { offset, .. } = infos[si].kind { - if store_count < MAX_TRACKED_STORE_LOAD_OFFSETS { - store_offsets[store_count] = offset; - store_count += 1; - } else { - store_count = usize::MAX; - break; - } - scan += 1; - continue; - } - if matches!(infos[si].kind, LineKind::LoadEbp { .. }) { - scan += 1; - continue; - } - // Skip cwtl (sign-extend ax->eax, i686 equivalent of cltq) - if line == "cwtl" || line.starts_with("movswl ") || line.starts_with("movsbl ") { - scan += 1; - continue; - } - // Check for test - if line == "testl %eax, %eax" { - test_idx = Some(scan); - break; - } - break; - } - - let test_scan = match test_idx { - Some(t) => t, - None => { i += 1; continue; } - }; - - // If there are stores in the sequence, verify each has a matching load nearby. - if store_count == usize::MAX { - i += 1; - continue; - } - if store_count > 0 { - let range_start = seq_indices[1]; - let range_end = seq_indices[test_scan]; - let mut load_offsets: [i32; MAX_TRACKED_STORE_LOAD_OFFSETS] = [0; MAX_TRACKED_STORE_LOAD_OFFSETS]; - let mut load_count = 0usize; - for ri in range_start..=range_end { - let off = match infos[ri].kind { - LineKind::LoadEbp { offset, .. } => Some(offset), - // Check NOP'd lines too - earlier passes (store/load forwarding) - // may have NOP'd a load that originally matched a store. - LineKind::Nop => { - let orig = classify_line(store.get(ri)); - match orig.kind { - LineKind::LoadEbp { offset, .. 
} => Some(offset), - _ => None, - } - } - _ => None, - }; - if let Some(o) = off { - if load_count < MAX_TRACKED_STORE_LOAD_OFFSETS { - load_offsets[load_count] = o; - load_count += 1; - } - } - } - let has_unmatched_store = (0..store_count).any(|si| { - !(0..load_count).any(|li| load_offsets[li] == store_offsets[si]) - }); - if has_unmatched_store { - i += 1; - continue; - } - } - - if test_scan + 1 >= seq_count { - i += 1; - continue; - } - - // Find jne/je after test - let jmp_line = trimmed(store, &infos[seq_indices[test_scan + 1]], seq_indices[test_scan + 1]); - let (is_jne, branch_target) = if let Some(target) = jmp_line.strip_prefix("jne ") { - (true, target.trim()) - } else if let Some(target) = jmp_line.strip_prefix("je ") { - (false, target.trim()) - } else { - i += 1; - continue; - }; - - let fused_cc = if is_jne { - setcc_cc - } else { - match invert_cc(setcc_cc) { - Some(inv) => inv, - None => { i += 1; continue; } - } - }; - - let fused_jcc = format!(" j{} {}", fused_cc, branch_target); - - // NOP out everything from setCC through testl - for s in 1..=test_scan { - infos[seq_indices[s]].kind = LineKind::Nop; - } - // Replace the jne/je with the fused conditional jump - let idx = seq_indices[test_scan + 1]; - store.replace(idx, fused_jcc); - infos[idx] = LineInfo { - kind: LineKind::CondJmp, - trim_start: 4, - has_indirect_mem: false, - ebp_offset: EBP_OFFSET_NONE, - }; - - changed = true; - i = idx + 1; - } - - changed -} - -// ── Pass: Memory operand folding ───────────────────────────────────────────── - -/// Fold `movl -N(%ebp), %ecx; addl %ecx, %eax` into `addl -N(%ebp), %eax`. 
fn fold_memory_operands(store: &mut LineStore, infos: &mut [LineInfo]) -> bool {
    let len = infos.len();
    let mut changed = false;

    let mut i = 0;
    while i < len {
        if infos[i].is_nop() { i += 1; continue; }

        // Look for load from stack slot
        if let LineKind::LoadEbp { reg: load_reg, offset, size } = infos[i].kind {
            // Only fold scratch registers (eax, ecx, edx)
            // NOTE(review): `load_reg != REG_EAX` looks redundant if
            // is_caller_saved() already includes %eax — confirm.
            if !is_caller_saved(load_reg) && load_reg != REG_EAX {
                i += 1; continue;
            }

            // Find next non-nop instruction
            let j = next_non_nop(infos, i + 1);
            if j >= len { i += 1; continue; }

            // Check if next instruction uses this register as a source operand
            // Pattern: load into %ecx, then `addl %ecx, %eax` etc.
            // NOTE(review): the load is deleted without checking whether
            // load_reg is read again further down the block; this is only
            // sound if codegen guarantees scratch registers are reloaded
            // before each use — confirm that invariant holds.
            let s = trimmed(store, &infos[j], j);
            if let Some(folded) = try_fold_memory_operand(s, load_reg, offset, size) {
                store.replace(j, format!("    {}", folded));
                let dest_reg = parse_dest_reg(&folded);
                // The folded line now carries the %ebp-relative read, so
                // record its offset for later alias/liveness checks.
                infos[j] = LineInfo {
                    kind: LineKind::Other { dest_reg },
                    trim_start: 4,
                    has_indirect_mem: false,
                    ebp_offset: offset,
                };
                infos[i].kind = LineKind::Nop; // Remove the load
                changed = true;
            }
        }
        i += 1;
    }

    changed
}

/// Try to fold a stack slot into an ALU instruction.
/// Returns the folded instruction string if successful.
-fn try_fold_memory_operand(s: &str, load_reg: RegId, offset: i32, _size: MoveSize) -> Option { - let reg_name = reg32_name(load_reg); - - // Try patterns: `OPCODE %load_reg, %other_reg` - for op in &["addl", "subl", "andl", "orl", "xorl", "cmpl", "testl", "imull"] { - if let Some(rest) = s.strip_prefix(op) { - let rest = rest.trim(); - // Pattern: `%load_reg, %dst` → `OPCODE offset(%ebp), %dst` - if let Some(after) = rest.strip_prefix(reg_name) { - let after = after.trim(); - if let Some(after_comma) = after.strip_prefix(',') { - let dst = after_comma.trim(); - if dst.starts_with('%') && !dst.contains('(') { - // Don't fold if dst is the same as load_reg (would be read after free) - if register_family(dst) != load_reg { - return Some(format!("{} {}(%ebp), {}", op, offset, dst)); - } - } - } - } - } - } - - None -} - -// ── Pass: Never-read store elimination ─────────────────────────────────────── - -/// Global pass: find stack slots that are never loaded and remove all stores to them. -fn eliminate_never_read_stores(store: &LineStore, infos: &mut [LineInfo]) { - let len = infos.len(); - - // Collect all loaded byte ranges (offset, size) - let mut read_ranges: Vec<(i32, i32)> = Vec::new(); - let mut addr_taken = false; - - for i in 0..len { - if infos[i].is_nop() { continue; } - match infos[i].kind { - LineKind::LoadEbp { offset, size, .. } => { - read_ranges.push((offset, size.byte_size())); - } - _ => { - let s = trimmed(store, &infos[i], i); - // Check for address-of-slot patterns (leal N(%ebp), %reg or leal N(%esp), %reg) - if s.starts_with("leal ") && (s.contains("(%ebp)") || s.contains("(%esp)")) { - addr_taken = true; - } - // Indirect memory access means we can't know what's read - if infos[i].has_indirect_mem { - addr_taken = true; - } - // Track %ebp-relative reads from non-Load/Store instructions - // (e.g. 
folded memory operands like "cmpl -44(%ebp), %eax") - let ebp_off = infos[i].ebp_offset; - if ebp_off != EBP_OFFSET_NONE { - // Conservatively treat as a 4-byte read (max store size on i686) - read_ranges.push((ebp_off, 4)); - } else if !matches!(infos[i].kind, LineKind::StoreEbp { .. }) && s.contains("(%ebp)") { - // Unknown %ebp reference - bail out - addr_taken = true; - } - } - } - } - - if addr_taken { return; } - - // Remove stores to slots whose byte range is never overlapped by any load - for i in 0..len { - if infos[i].is_nop() { continue; } - if let LineKind::StoreEbp { offset, size, .. } = infos[i].kind { - let store_bytes = size.byte_size(); - let is_read = read_ranges.iter().any(|&(r_off, r_sz)| { - ranges_overlap(offset, store_bytes, r_off, r_sz) - }); - if !is_read { - infos[i].kind = LineKind::Nop; - } - } - } -} - -// ── Pass: Unused callee-saved register elimination ─────────────────────────── - -/// Remove pushl/popl of callee-saved registers that are never referenced in the function body. -// TODO: Disabled - buggy leal -N(%ebp),%esp adjustment causes stack misalignment and 97+ -// segfault regressions. Needs proper understanding of frame layout before re-enabling. 
#[allow(dead_code)]
fn eliminate_unused_callee_saves(store: &LineStore, infos: &mut [LineInfo]) {
    let len = infos.len();

    // Find function boundaries
    // NOTE(review): the Label arm keeps reassigning func_start without
    // breaking, so with multiple non-dot labels before the first
    // `pushl %ebp` the LAST one wins — confirm that is intended.
    let mut func_start = 0;
    for i in 0..len {
        if infos[i].is_nop() { continue; }
        // Look for the prologue pattern: pushl %ebp; movl %esp, %ebp
        if let LineKind::Push { reg: REG_EBP } = infos[i].kind {
            func_start = i;
            break;
        }
        if infos[i].kind == LineKind::Label {
            let s = trimmed(store, &infos[i], i);
            if s.ends_with(':') && !s.starts_with('.') {
                func_start = i;
            }
        }
    }

    // Identify callee-saved registers that are pushed in the prologue
    // and check if they're used in the function body
    for reg in [REG_EBX, REG_ESI, REG_EDI] {
        // Find the push of this register
        let mut push_idx = None;   // first push of `reg` (assumed prologue)
        let mut pop_idx = None;    // LAST pop of `reg` (assumed epilogue)
        let mut used = false;

        for i in func_start..len {
            if infos[i].is_nop() { continue; }
            match infos[i].kind {
                LineKind::Push { reg: r } if r == reg && push_idx.is_none() => {
                    push_idx = Some(i);
                }
                LineKind::Pop { reg: r } if r == reg => {
                    pop_idx = Some(i);
                }
                _ => {
                    // Only lines between the push and its (eventual) pop
                    // count as body references.
                    if push_idx.is_some() && pop_idx.is_none() {
                        // Check if the register is referenced in the body
                        let s = trimmed(store, &infos[i], i);
                        if line_references_reg(s, reg) {
                            used = true;
                        }
                    }
                }
            }
        }

        if !used {
            if let Some(pi) = push_idx {
                infos[pi].kind = LineKind::Nop;
                if let Some(qi) = pop_idx {
                    infos[qi].kind = LineKind::Nop;
                }
                // For noreturn functions (no pop), still eliminate the push
            }
        }
    }
}

// ── Pass: Push/pop elimination ───────────────────────────────────────────────

/// Eliminate push/pop pairs where the register is not modified between them.
// TODO: Disabled - removes function-level callee-save push/pops which breaks the
// leal -12(%ebp),%esp epilogue pattern. Needs awareness of function boundaries.
-#[allow(dead_code)] -fn eliminate_push_pop_pairs(store: &LineStore, infos: &mut [LineInfo]) -> bool { - let len = infos.len(); - let mut changed = false; - - for i in 0..len { - if infos[i].is_nop() { continue; } - let push_reg = match infos[i].kind { - LineKind::Push { reg } if reg <= REG_GP_MAX => reg, - _ => continue, - }; - - // Scan forward for matching pop, checking reg is unmodified - let mut j = i + 1; - let mut depth = 0; // Track nested push/pops - let mut safe = true; - while j < len { - if infos[j].is_nop() { j += 1; continue; } - - match infos[j].kind { - LineKind::Push { .. } => { depth += 1; } - LineKind::Pop { reg } if depth > 0 => { depth -= 1; } - LineKind::Pop { reg } if reg == push_reg && depth == 0 => { - // Found matching pop - if safe { - infos[i].kind = LineKind::Nop; - infos[j].kind = LineKind::Nop; - changed = true; - } - break; - } - LineKind::Pop { .. } if depth == 0 => { break; } // Different register popped - _ => {} - } - - // Check if the register is modified - match infos[j].kind { - LineKind::Move { dst, .. } if dst == push_reg => { safe = false; } - LineKind::LoadEbp { reg, .. } if reg == push_reg => { safe = false; } - LineKind::Other { dest_reg } if dest_reg == push_reg => { safe = false; } - LineKind::Other { .. } => { - // For unknown instructions, check if the raw text references the register - // or if it's an instruction that implicitly clobbers registers (rep movsb, etc.) - let raw = store.get(j).trim(); - if raw.starts_with("rep") || raw.starts_with("cld") { - // rep movsb/movsl/stosb etc. 
clobber esi, edi, ecx - if push_reg == REG_ESI || push_reg == REG_EDI || push_reg == REG_ECX { - safe = false; - } - } - if line_references_reg(raw, push_reg) { - safe = false; - } - } - LineKind::Call => { if is_caller_saved(push_reg) { safe = false; } } - LineKind::SetCC { reg } if reg == push_reg => { safe = false; } - _ => {} - } - - // Stop at barriers that change control flow - if matches!(infos[j].kind, LineKind::Jmp | LineKind::JmpIndirect | LineKind::Ret | LineKind::Label) { - break; - } - - j += 1; - } - } - - changed -} - -// ── Utility ────────────────────────────────────────────────────────────────── - -/// Find the next non-nop line after index `start`. -fn next_non_nop(infos: &[LineInfo], start: usize) -> usize { - let mut i = start; - while i < infos.len() && (infos[i].is_nop() || infos[i].kind == LineKind::Empty) { - i += 1; - } - i -} - -// ── Main entry point ───────────────────────────────────────────────────────── - -/// Run peephole optimization on i686 assembly text. -/// Returns the optimized assembly string. 
-pub fn peephole_optimize(asm: String) -> String { - let mut store = LineStore::new(asm); - let line_count = store.len(); - let mut infos: Vec = (0..line_count).map(|i| classify_line(store.get(i))).collect(); - - // Phase 1: Iterative local passes - let mut changed = true; - let mut pass_count = 0; - while changed && pass_count < MAX_LOCAL_PASS_ITERATIONS { - changed = false; - changed |= combined_local_pass(&mut store, &mut infos); - pass_count += 1; - } - - // Phase 2: Global passes (run once) - let global_changed = eliminate_dead_reg_moves(&store, &mut infos); - let global_changed = global_changed | eliminate_dead_stores(&store, &mut infos); - let global_changed = global_changed | fuse_compare_and_branch(&mut store, &mut infos); - let global_changed = global_changed | fold_memory_operands(&mut store, &mut infos); - - // Phase 3: Local cleanup after global passes - if global_changed { - let mut changed2 = true; - let mut pass_count2 = 0; - while changed2 && pass_count2 < MAX_POST_GLOBAL_ITERATIONS { - changed2 = false; - changed2 |= combined_local_pass(&mut store, &mut infos); - changed2 |= eliminate_dead_reg_moves(&store, &mut infos); - changed2 |= eliminate_dead_stores(&store, &mut infos); - changed2 |= fold_memory_operands(&mut store, &mut infos); - pass_count2 += 1; - } - } - - // Phase 4: Never-read store elimination - eliminate_never_read_stores(&store, &mut infos); - - store.build_result(|i| infos[i].is_nop()) -} - -// ── Tests ──────────────────────────────────────────────────────────────────── - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_redundant_store_load() { - let asm = " movl %eax, -8(%ebp)\n movl -8(%ebp), %eax\n".to_string(); - let result = peephole_optimize(asm); - // After store/load elimination, the load is removed. Then never-read - // store elimination removes the now-unread store too. Both gone. 
- assert_eq!(result.trim(), ""); - } - - #[test] - fn test_store_load_different_reg() { - let asm = " movl %eax, -8(%ebp)\n movl -8(%ebp), %ecx\n".to_string(); - let result = peephole_optimize(asm); - assert!(result.contains("movl %eax, %ecx"), "should forward: {}", result); - assert!(!result.contains("-8(%ebp), %ecx"), "should eliminate load: {}", result); - } - - #[test] - fn test_self_move() { - let asm = " movl %eax, %eax\n".to_string(); - let result = peephole_optimize(asm); - assert_eq!(result.trim(), ""); - } - - #[test] - fn test_redundant_jump() { - let asm = " jmp .Lfoo\n.Lfoo:\n".to_string(); - let result = peephole_optimize(asm); - assert!(!result.contains("jmp"), "should eliminate redundant jmp: {}", result); - assert!(result.contains(".Lfoo:"), "should keep label: {}", result); - } - - #[test] - fn test_branch_inversion() { - let asm = [ - " jl .LBB2", - " jmp .LBB4", - ".LBB2:", - " movl %eax, %ecx", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("jge .LBB4"), "should invert to jge: {}", result); - assert!(!result.contains("jmp .LBB4"), "should remove jmp: {}", result); - } - - #[test] - fn test_compare_branch_fusion() { - let asm = [ - " cmpl %ecx, %eax", - " setl %al", - " movzbl %al, %eax", - " testl %eax, %eax", - " jne .LBB2", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("jl .LBB2"), "should fuse to jl: {}", result); - assert!(!result.contains("setl"), "should eliminate setl: {}", result); - } - - #[test] - fn test_compare_branch_fusion_with_store_load() { - // Pattern: cmp + setCC + movzbl + store + load + test + jne - // The store/load pair should be skipped, allowing fusion. 
- let asm = [ - " cmpl %ecx, %eax", - " setge %al", - " movzbl %al, %eax", - " movl %eax, -16(%ebp)", - " movl -16(%ebp), %eax", - " testl %eax, %eax", - " jne .LBB5", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("jge .LBB5"), "should fuse to jge: {}", result); - assert!(!result.contains("setge"), "should eliminate setge: {}", result); - assert!(!result.contains("movzbl"), "should eliminate movzbl: {}", result); - assert!(!result.contains("testl"), "should eliminate testl: {}", result); - } - - #[test] - fn test_compare_branch_fusion_unmatched_store_bails() { - // If the store has no matching load, we should NOT fuse (the boolean - // escapes to another basic block). - let asm = [ - " cmpl %ecx, %eax", - " setge %al", - " movzbl %al, %eax", - " movl %eax, -16(%ebp)", - " testl %eax, %eax", - " jne .LBB5", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - // Should NOT fuse because the store has no matching load - assert!(result.contains("setge"), "should keep setge (unmatched store): {}", result); - } - - #[test] - fn test_compare_branch_fusion_inverted() { - // Test je (inverted condition) - let asm = [ - " cmpl %ecx, %eax", - " setl %al", - " movzbl %al, %eax", - " testl %eax, %eax", - " je .LBB3", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("jge .LBB3"), "should fuse to jge (inverted): {}", result); - assert!(!result.contains("setl"), "should eliminate setl: {}", result); - } - - #[test] - fn test_dead_store() { - // Two consecutive stores to the same slot: first is dead, second survives. - // But never-read store elimination also removes the second store if no - // loads exist. Use a load after the second store to keep it alive. 
- let asm = [ - " movl %eax, -8(%ebp)", - " movl %ecx, -8(%ebp)", - " movl -8(%ebp), %edx", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(!result.contains("%eax, -8(%ebp)"), "first store dead: {}", result); - assert!(result.contains("%ecx"), "second store alive: {}", result); - } - - #[test] - fn test_memory_fold() { - let asm = [ - " movl -48(%ebp), %ecx", - " addl %ecx, %eax", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("addl -48(%ebp), %eax"), "should fold: {}", result); - } - - // Note: store forwarding tests removed - global_store_forwarding is disabled - // due to FP computation regressions. - - #[test] - fn test_reverse_move_elimination() { - let asm = [ - " movl %eax, %ecx", - " movl %ecx, %eax", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert_eq!(result.matches("movl").count(), 1, "should eliminate reverse: {}", result); - } - - // Note: push/pop elimination test removed - eliminate_push_pop_pairs is disabled - // due to callee-save/leal epilogue interactions. 
- - #[test] - fn test_addl_1_to_incl() { - let asm = " addl $1, %eax\n".to_string(); - let result = peephole_optimize(asm); - assert!(result.contains("incl %eax"), "should convert to incl: {}", result); - assert!(!result.contains("addl"), "should eliminate addl: {}", result); - } - - #[test] - fn test_subl_1_to_decl() { - let asm = " subl $1, %ecx\n".to_string(); - let result = peephole_optimize(asm); - assert!(result.contains("decl %ecx"), "should convert to decl: {}", result); - assert!(!result.contains("subl"), "should eliminate subl: {}", result); - } - - #[test] - fn test_movl_0_to_xorl() { - let asm = " movl $0, %ebx\n".to_string(); - let result = peephole_optimize(asm); - assert!(result.contains("xorl %ebx, %ebx"), "should convert to xorl: {}", result); - assert!(!result.contains("movl"), "should eliminate movl: {}", result); - } - - #[test] - fn test_redundant_movsbl() { - let asm = [ - " movsbl (%ecx), %eax", - " movsbl %al, %eax", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert_eq!(result.matches("movsbl").count(), 1, - "should eliminate redundant movsbl: {}", result); - } - - #[test] - fn test_addl_neg1_to_decl() { - let asm = " addl $-1, %edx\n".to_string(); - let result = peephole_optimize(asm); - assert!(result.contains("decl %edx"), "should convert to decl: {}", result); - } - - #[test] - fn test_subl_neg1_to_incl() { - let asm = " subl $-1, %esi\n".to_string(); - let result = peephole_optimize(asm); - assert!(result.contains("incl %esi"), "should convert to incl: {}", result); - } - - #[test] - fn test_addl_1_not_incl_before_adcl() { - // addl $1 followed by adcl must NOT be converted to incl, - // because incl does not set the carry flag (CF). - // This pattern is used in 64-bit negation: notl+notl+addl+adcl. 
- let asm = [ - " notl %eax", - " notl %edx", - " addl $1, %eax", - " adcl $0, %edx", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("addl $1, %eax"), "must keep addl before adcl: {}", result); - assert!(!result.contains("incl"), "must NOT convert to incl before adcl: {}", result); - } - - #[test] - fn test_subl_1_not_decl_before_sbbl() { - // subl $1 followed by sbbl must NOT be converted to decl, - // because decl does not set the carry flag. - let asm = [ - " subl $1, %eax", - " sbbl $0, %edx", - ].join("\n") + "\n"; - let result = peephole_optimize(asm); - assert!(result.contains("subl $1, %eax"), "must keep subl before sbbl: {}", result); - assert!(!result.contains("decl"), "must NOT convert to decl before sbbl: {}", result); - } -} diff --git a/src/backend/i686/codegen/prologue.rs b/src/backend/i686/codegen/prologue.rs deleted file mode 100644 index 57b6b20963..0000000000 --- a/src/backend/i686/codegen/prologue.rs +++ /dev/null @@ -1,662 +0,0 @@ -//! I686Codegen: prologue/epilogue and stack frame operations. 
- -use crate::ir::reexports::{Instruction, IrFunction, Value}; -use crate::common::types::IrType; -use crate::backend::generation::{ - is_i128_type, calculate_stack_space_common, run_regalloc_and_merge_clobbers, - filter_available_regs, find_param_alloca, collect_inline_asm_callee_saved_with_generic, -}; -use crate::backend::call_abi::{ParamClass, classify_params}; -use crate::emit; -use super::emit::{ - I686Codegen, phys_reg_name, i686_constraint_to_phys, i686_clobber_to_phys, - I686_CALLEE_SAVED, I686_CALLEE_SAVED_WITH_EBP, I686_CALLER_SAVED, -}; -use crate::backend::regalloc::PhysReg; -use crate::backend::traits::ArchCodegen; - -impl I686Codegen { - // ---- calculate_stack_space ---- - - pub(super) fn calculate_stack_space_impl(&mut self, func: &IrFunction) -> i64 { - self.is_variadic = func.is_variadic; - self.is_fastcall = func.is_fastcall; - self.current_return_type = func.return_type; - - // Dynamic alloca (VLAs) requires the frame pointer to track the stack, - // since ESP changes by runtime-computed amounts. - if self.state.has_dyn_alloca { - self.omit_frame_pointer = false; - } - - // Compute named parameter stack bytes for va_start (variadic functions). - if func.is_variadic { - let config = self.call_abi_config(); - let classification = crate::backend::call_abi::classify_params_full(func, &config); - self.va_named_stack_bytes = classification.total_stack_bytes; - } - - // Run register allocator before stack space computation. - // Use the _with_generic variant to conservatively mark all callee-saved - // registers as clobbered when generic register constraints (r, q, g) are - // present. On i686, the scratch allocator may pick esi/edi/ebx for generic - // constraints, which would clobber values the register allocator placed there. - let mut asm_clobbered_regs: Vec = Vec::new(); - - // When omitting the frame pointer, EBP is available as a callee-saved - // register, so use the extended set that includes EBP. 
- let callee_saved_set = if self.omit_frame_pointer { - I686_CALLEE_SAVED_WITH_EBP - } else { - I686_CALLEE_SAVED - }; - - collect_inline_asm_callee_saved_with_generic( - func, &mut asm_clobbered_regs, - i686_constraint_to_phys, - i686_clobber_to_phys, - callee_saved_set, - ); - // In PIC mode, %ebx (PhysReg(0)) is reserved as the GOT base pointer. - if self.state.pic_mode && !asm_clobbered_regs.contains(&PhysReg(0)) { - asm_clobbered_regs.push(PhysReg(0)); - } - let available_regs = filter_available_regs(callee_saved_set, &asm_clobbered_regs); - - let caller_saved_regs = I686_CALLER_SAVED.to_vec(); - - let (reg_assigned, cached_liveness) = run_regalloc_and_merge_clobbers( - func, available_regs, caller_saved_regs, &asm_clobbered_regs, - &mut self.reg_assignments, &mut self.used_callee_saved, - false, - ); - - // In PIC mode, %ebx must be saved/restored as a callee-saved register. - if self.state.pic_mode && !self.used_callee_saved.contains(&PhysReg(0)) { - self.used_callee_saved.insert(0, PhysReg(0)); - } - - let callee_saved_bytes = self.used_callee_saved.len() as i64 * 4; - - // The bias ensures that slots requiring >= 16-byte alignment land on - // 16-byte boundaries at runtime. 
The correct value depends on the - // stack overhead between the 16-byte-aligned call-site ESP and the - // reference point for slot addressing: - // - // With frame pointer: return addr (4) + saved ebp (4) = 8 - // Address of slot -X = EBP - X = (16n - 8) - X, aligned when X ≡ 8 mod 16 - // - // Without frame pointer: return addr (4) only - // Address of slot = 16n - 4 - space, aligned when space ≡ 12 mod 16 - let omit_fp = self.omit_frame_pointer; - let alignment_bias: i64 = if omit_fp { 12 } else { 8 }; - - calculate_stack_space_common(&mut self.state, func, callee_saved_bytes, |space, alloc_size, align| { - let effective_align = if align > 0 { align.max(4) } else { 4 }; - let alloc = (alloc_size + 3) & !3; - let required = space + alloc; - let new_space = if effective_align >= 16 { - let bias = alignment_bias; - let a = effective_align; - let rem = ((required % a) + a) % a; - let needed = if rem <= bias { bias - rem } else { a - rem + bias }; - required + needed - } else { - ((required + effective_align - 1) / effective_align) * effective_align - }; - (-new_space, new_space) - }, ®_assigned, callee_saved_set, cached_liveness, false) - } - - // ---- aligned_frame_size ---- - - pub(super) fn aligned_frame_size_impl(&self, raw_space: i64) -> i64 { - let callee_saved_bytes = self.used_callee_saved.len() as i64 * 4; - let raw_locals = raw_space - callee_saved_bytes; - // With frame pointer: overhead = callee_saved + 8 (saved ebp + return addr) - // Without frame pointer: overhead = callee_saved + 4 (return addr only) - let fixed_overhead = if self.omit_frame_pointer { - callee_saved_bytes + 4 - } else { - callee_saved_bytes + 8 - }; - let needed = raw_locals + fixed_overhead; - let aligned = (needed + 15) & !15; - aligned - fixed_overhead - } - - // ---- emit_prologue ---- - - pub(super) fn emit_prologue_impl(&mut self, _func: &IrFunction, frame_size: i64) { - if self.omit_frame_pointer { - // No frame pointer setup; use ESP-relative addressing. 
- // frame_base_offset and esp_adjust will be set after callee-saved pushes. - // TODO: Emit ESP-relative CFI directives (.cfi_def_cfa_offset after each - // push/sub) for proper unwinding when frame pointer is omitted. Currently - // the default .cfi_startproc CFA (ESP+4) is used, which is only valid at - // function entry. This is acceptable for now since -fomit-frame-pointer on - // i686 is primarily used by the Linux kernel boot code, which disables - // unwind tables via -fno-asynchronous-unwind-tables. - } else { - self.state.emit(" pushl %ebp"); - if self.state.emit_cfi { - self.state.emit(" .cfi_def_cfa_offset 8"); - self.state.emit(" .cfi_offset %ebp, -8"); - } - self.state.emit(" movl %esp, %ebp"); - if self.state.emit_cfi { - self.state.emit(" .cfi_def_cfa_register %ebp"); - } - } - - for ® in self.used_callee_saved.iter() { - let name = phys_reg_name(reg); - emit!(self.state, " pushl %{}", name); - } - - if self.state.pic_mode { - debug_assert!(self.used_callee_saved.contains(&PhysReg(0)), - "PIC mode requires ebx in used_callee_saved"); - self.state.emit(" call __x86.get_pc_thunk.bx"); - self.state.emit(" addl $_GLOBAL_OFFSET_TABLE_, %ebx"); - self.needs_pc_thunk_bx = true; - } - - if frame_size > 0 { - emit!(self.state, " subl ${}, %esp", frame_size); - } - - if self.omit_frame_pointer { - let callee_saved_bytes = self.used_callee_saved.len() as i64 * 4; - self.frame_base_offset = callee_saved_bytes + frame_size; - self.esp_adjust = 0; - } - } - - // ---- emit_epilogue ---- - - pub(super) fn emit_epilogue_impl(&mut self, _frame_size: i64) { - if self.omit_frame_pointer { - let callee_saved_bytes = self.used_callee_saved.len() as i64 * 4; - let total = self.frame_base_offset - callee_saved_bytes; - if total > 0 { - emit!(self.state, " addl ${}, %esp", total); - } - } else { - let callee_saved_bytes = self.used_callee_saved.len() as i64 * 4; - if callee_saved_bytes > 0 { - emit!(self.state, " leal -{}(%ebp), %esp", callee_saved_bytes); - } else { - 
self.state.emit(" movl %ebp, %esp"); - } - } - - for ® in self.used_callee_saved.iter().rev() { - let name = phys_reg_name(reg); - emit!(self.state, " popl %{}", name); - } - - if !self.omit_frame_pointer { - self.state.emit(" popl %ebp"); - } - } - - // ---- emit_store_params ---- - - pub(super) fn emit_store_params_impl(&mut self, func: &IrFunction) { - let config = self.call_abi_config(); - let param_classes = classify_params(func, &config); - self.state.param_classes = param_classes.clone(); - self.state.num_params = func.params.len(); - self.state.func_is_variadic = func.is_variadic; - - self.state.param_alloca_slots = (0..func.params.len()).map(|i| { - find_param_alloca(func, i).and_then(|(dest, ty)| { - self.state.get_slot(dest.0).map(|slot| (slot, ty)) - }) - }).collect(); - - let fastcall_reg_count = if self.is_fastcall { - self.count_fastcall_reg_params(func) - } else { - 0 - }; - self.fastcall_reg_param_count = fastcall_reg_count; - - if self.is_fastcall { - let mut total_stack_bytes: usize = 0; - for (i, _p) in func.params.iter().enumerate() { - if i < fastcall_reg_count { continue; } - let ty = func.params[i].ty; - let size = match ty { - IrType::I64 | IrType::U64 | IrType::F64 => 8, - IrType::F128 => 12, - _ if is_i128_type(ty) => 16, - _ => 4, - }; - total_stack_bytes += size; - } - self.fastcall_stack_cleanup = total_stack_bytes; - } else { - self.fastcall_stack_cleanup = 0; - } - - // Build a map of param_idx -> ParamRef dest Value for fast lookup. - // Used to handle the case where param alloca was eliminated by mem2reg - // but the register allocator assigned a callee-saved register. - let mut paramref_dests: Vec> = vec![None; func.params.len()]; - if self.is_fastcall { - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::ParamRef { dest, param_idx, .. 
} = inst { - if *param_idx < paramref_dests.len() { - paramref_dests[*param_idx] = Some(*dest); - } - } - } - } - } - - let stack_base: i64 = 8; - let mut fastcall_reg_idx = 0usize; - - // Build a map from physical register -> list of param indices that use it, - // so we can detect when two params share the same callee-saved register. - let mut reg_to_params: crate::common::fx_hash::FxHashMap> = crate::common::fx_hash::FxHashMap::default(); - if self.is_fastcall { - for (i, _) in func.params.iter().enumerate() { - if let Some(paramref_dest) = paramref_dests[i] { - if let Some(&phys_reg) = self.reg_assignments.get(¶mref_dest.0) { - reg_to_params.entry(phys_reg.0).or_default().push(i); - } - } - } - } - - for (i, _param) in func.params.iter().enumerate() { - let class = param_classes[i]; - - // Pre-store optimization for fastcall register params: when the param's - // alloca was eliminated (by mem2reg) but the ParamRef dest is register- - // allocated to a callee-saved register, store the fastcall ABI register - // (%ecx/%edx) directly to the assigned physical register. This is critical - // because: - // 1. Dead alloca means no stack slot exists for this param - // 2. %ecx/%edx are caller-saved and will be clobbered - // 3. We must save the value NOW, before any other code runs - // 4. emit_param_ref will see param_pre_stored and skip code generation - if self.is_fastcall && fastcall_reg_idx < fastcall_reg_count { - let param_ty = func.params[i].ty; - if self.is_fastcall_reg_eligible(param_ty) { - let has_alloca_slot = find_param_alloca(func, i) - .and_then(|(dest, _)| self.state.get_slot(dest.0)) - .is_some(); - if !has_alloca_slot { - let src_reg = if fastcall_reg_idx == 0 { "%ecx" } else { "%edx" }; - if let Some(paramref_dest) = paramref_dests[i] { - if let Some(&phys_reg) = self.reg_assignments.get(¶mref_dest.0) { - // Safety check: if another param's dest is also assigned - // to this register, skip pre-store to avoid conflicts. 
- let shared = reg_to_params.get(&phys_reg.0) - .is_some_and(|users| users.len() > 1); - if !shared { - // Store directly to the callee-saved register - let dest_reg = phys_reg_name(phys_reg); - emit!(self.state, " movl {}, %{}", src_reg, dest_reg); - self.state.param_pre_stored.insert(i); - } - } else if let Some(slot) = self.state.get_slot(paramref_dest.0) { - // Value was spilled to a stack slot - no register conflict - let slot_ref = self.slot_ref(slot); - emit!(self.state, " movl {}, {}", src_reg, slot_ref); - self.state.param_pre_stored.insert(i); - } - } - fastcall_reg_idx += 1; - continue; - } - } - } - - let (slot, ty, dest_id) = if let Some((dest, ty)) = find_param_alloca(func, i) { - if let Some(slot) = self.state.get_slot(dest.0) { - (slot, ty, dest.0) - } else { - if self.is_fastcall && fastcall_reg_idx < fastcall_reg_count && i < func.params.len() - && self.is_fastcall_reg_eligible(ty) { - fastcall_reg_idx += 1; - } - continue; - } - } else { - if self.is_fastcall && fastcall_reg_idx < fastcall_reg_count && i < func.params.len() { - let param_ty = func.params[i].ty; - if self.is_fastcall_reg_eligible(param_ty) { - fastcall_reg_idx += 1; - } - } - continue; - }; - - if self.is_fastcall && fastcall_reg_idx < fastcall_reg_count && self.is_fastcall_reg_eligible(ty) { - let src_reg_full = if fastcall_reg_idx == 0 { "%ecx" } else { "%edx" }; - let slot_ref = self.slot_ref(slot); - // For sub-int types, sign/zero-extend to full 32-bit before - // storing to the 4-byte SSA slot (avoids partial-write issues). 
- match ty { - IrType::I8 => { - let src_byte = if fastcall_reg_idx == 0 { "%cl" } else { "%dl" }; - emit!(self.state, " movsbl {}, {}", src_byte, src_reg_full); - emit!(self.state, " movl {}, {}", src_reg_full, slot_ref); - } - IrType::U8 => { - let src_byte = if fastcall_reg_idx == 0 { "%cl" } else { "%dl" }; - emit!(self.state, " movzbl {}, {}", src_byte, src_reg_full); - emit!(self.state, " movl {}, {}", src_reg_full, slot_ref); - } - IrType::I16 => { - let src_word = if fastcall_reg_idx == 0 { "%cx" } else { "%dx" }; - emit!(self.state, " movswl {}, {}", src_word, src_reg_full); - emit!(self.state, " movl {}, {}", src_reg_full, slot_ref); - } - IrType::U16 => { - let src_word = if fastcall_reg_idx == 0 { "%cx" } else { "%dx" }; - emit!(self.state, " movzwl {}, {}", src_word, src_reg_full); - emit!(self.state, " movl {}, {}", src_reg_full, slot_ref); - } - _ => { - emit!(self.state, " movl {}, {}", src_reg_full, slot_ref); - } - } - fastcall_reg_idx += 1; - continue; - } - - let stack_offset_adjust = if self.is_fastcall { fastcall_reg_count as i64 * 4 } else { 0 }; - - match class { - ParamClass::StackScalar { offset } => { - let src_offset = stack_base + offset - stack_offset_adjust; - if ty == IrType::F64 || ty == IrType::I64 || ty == IrType::U64 { - let src_ref = self.param_ref(src_offset); - let dst_ref = self.slot_ref(slot); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - let src_ref_hi = self.param_ref(src_offset + 4); - let dst_ref_hi = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, %eax", src_ref_hi); - emit!(self.state, " movl %eax, {}", dst_ref_hi); - } else { - let load_instr = self.mov_load_for_type(ty); - let src_ref = self.param_ref(src_offset); - let dst_ref = self.slot_ref(slot); - emit!(self.state, " {} {}, %eax", load_instr, src_ref); - // Always store full 32-bit value to SSA slot. 
The load - // instruction above already sign/zero-extended sub-int - // types into the full eax register. Using movb/movw here - // would leave garbage in the upper bytes of the 4-byte - // slot, which gets read back later by movl. - emit!(self.state, " movl %eax, {}", dst_ref); - } - } - ParamClass::StructStack { offset, size } => { - let src = stack_base + offset - stack_offset_adjust; - let mut copied = 0usize; - while copied + 4 <= size { - let src_ref = self.param_ref(src + copied as i64); - let dst_ref = self.slot_ref_offset(slot, copied as i64); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - copied += 4; - } - while copied < size { - let src_ref = self.param_ref(src + copied as i64); - let dst_ref = self.slot_ref_offset(slot, copied as i64); - emit!(self.state, " movb {}, %al", src_ref); - emit!(self.state, " movb %al, {}", dst_ref); - copied += 1; - } - } - ParamClass::LargeStructStack { offset, size } => { - let src = stack_base + offset - stack_offset_adjust; - let mut copied = 0usize; - while copied + 4 <= size { - let src_ref = self.param_ref(src + copied as i64); - let dst_ref = self.slot_ref_offset(slot, copied as i64); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - copied += 4; - } - while copied < size { - let src_ref = self.param_ref(src + copied as i64); - let dst_ref = self.slot_ref_offset(slot, copied as i64); - emit!(self.state, " movb {}, %al", src_ref); - emit!(self.state, " movb %al, {}", dst_ref); - copied += 1; - } - } - ParamClass::F128AlwaysStack { offset } => { - let src = stack_base + offset - stack_offset_adjust; - let src_ref = self.param_ref(src); - let dst_ref = self.slot_ref(slot); - emit!(self.state, " fldt {}", src_ref); - emit!(self.state, " fstpt {}", dst_ref); - self.state.f128_direct_slots.insert(dest_id); - } - ParamClass::I128Stack { offset } => { - let src = stack_base + offset - stack_offset_adjust; - for j in 
(0..16).step_by(4) { - let src_ref = self.param_ref(src + j as i64); - let dst_ref = self.slot_ref_offset(slot, j as i64); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - } - } - ParamClass::F128Stack { offset } => { - let src = stack_base + offset - stack_offset_adjust; - let src_ref = self.param_ref(src); - let dst_ref = self.slot_ref(slot); - emit!(self.state, " fldt {}", src_ref); - emit!(self.state, " fstpt {}", dst_ref); - self.state.f128_direct_slots.insert(dest_id); - } - ParamClass::IntReg { reg_idx } => { - // regparm: param arrives in EAX/EDX/ECX (reg_idx 0/1/2) - let regparm_regs_full = ["%eax", "%edx", "%ecx"]; - let regparm_regs_byte = ["%al", "%dl", "%cl"]; - let regparm_regs_word = ["%ax", "%dx", "%cx"]; - let src_full = regparm_regs_full[reg_idx]; - let slot_ref = self.slot_ref(slot); - match ty { - IrType::I8 => { - let src_byte = regparm_regs_byte[reg_idx]; - emit!(self.state, " movsbl {}, {}", src_byte, src_full); - emit!(self.state, " movl {}, {}", src_full, slot_ref); - } - IrType::U8 => { - let src_byte = regparm_regs_byte[reg_idx]; - emit!(self.state, " movzbl {}, {}", src_byte, src_full); - emit!(self.state, " movl {}, {}", src_full, slot_ref); - } - IrType::I16 => { - let src_word = regparm_regs_word[reg_idx]; - emit!(self.state, " movswl {}, {}", src_word, src_full); - emit!(self.state, " movl {}, {}", src_full, slot_ref); - } - IrType::U16 => { - let src_word = regparm_regs_word[reg_idx]; - emit!(self.state, " movzwl {}, {}", src_word, src_full); - emit!(self.state, " movl {}, {}", src_full, slot_ref); - } - _ => { - emit!(self.state, " movl {}, {}", src_full, slot_ref); - } - } - } - _ => { - // Remaining register classes (FloatReg, StructByValReg, etc.) - // don't apply to i686's ABI classification. 
- } - } - } - } - - // ---- emit_param_ref ---- - - pub(super) fn emit_param_ref_impl(&mut self, dest: &Value, param_idx: usize, ty: IrType) { - use crate::backend::call_abi::ParamClass; - - // If this param was pre-stored in the prologue (fastcall register param - // with eliminated alloca), the value is already in the correct physical - // register or stack slot. No code generation needed. - if self.state.param_pre_stored.contains(¶m_idx) { - return; - } - - if param_idx < self.state.param_alloca_slots.len() { - if let Some((alloca_slot, _alloca_ty)) = self.state.param_alloca_slots[param_idx] { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - if dest_slot.0 == alloca_slot.0 { - // The param value is already in the alloca slot (stored by - // emit_store_params). If dest also has a register assignment, - // we must initialize the register from the slot — otherwise - // the register contains garbage from the caller, and any - // subsequent read via operand_to_eax will use the register - // (uninitialized) instead of the slot. 
- if let Some(phys) = self.dest_reg(dest) { - let reg = phys_reg_name(phys); - let load_instr = self.mov_load_for_type(ty); - let src_ref = self.slot_ref(alloca_slot); - emit!(self.state, " {} {}, %eax", load_instr, src_ref); - emit!(self.state, " movl %eax, %{}", reg); - self.state.reg_cache.invalidate_acc(); - } - return; - } - } - if let Some(dest_slot) = self.state.get_slot(dest.0) { - if is_i128_type(ty) { - for i in (0..16).step_by(4) { - let src_ref = self.slot_ref_offset(alloca_slot, i as i64); - let dst_ref = self.slot_ref_offset(dest_slot, i as i64); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - } - } else if ty == IrType::F128 { - let src_ref = self.slot_ref(alloca_slot); - let dst_ref = self.slot_ref(dest_slot); - emit!(self.state, " fldt {}", src_ref); - emit!(self.state, " fstpt {}", dst_ref); - self.state.f128_direct_slots.insert(dest.0); - } else if ty == IrType::F64 || ty == IrType::I64 || ty == IrType::U64 { - let src_ref = self.slot_ref(alloca_slot); - let dst_ref = self.slot_ref(dest_slot); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - let src_ref_hi = self.slot_ref_offset(alloca_slot, 4); - let dst_ref_hi = self.slot_ref_offset(dest_slot, 4); - emit!(self.state, " movl {}, %eax", src_ref_hi); - emit!(self.state, " movl %eax, {}", dst_ref_hi); - } else { - let load_instr = self.mov_load_for_type(ty); - let src_ref = self.slot_ref(alloca_slot); - emit!(self.state, " {} {}, %eax", load_instr, src_ref); - self.store_eax_to(dest); - } - return; - } - } - } - - if self.is_fastcall && param_idx < self.fastcall_reg_param_count { - if let Some(Some((slot, _slot_ty))) = self.state.param_alloca_slots.get(param_idx) { - let load_instr = self.mov_load_for_type(ty); - let slot_ref = self.slot_ref(*slot); - emit!(self.state, " {} {}, %eax", load_instr, slot_ref); - self.store_eax_to(dest); - } - return; - } - - let stack_base: i64 = 8; - let 
stack_offset_adjust = if self.is_fastcall { self.fastcall_reg_param_count as i64 * 4 } else { 0 }; - let param_offset = if param_idx < self.state.param_classes.len() { - match self.state.param_classes[param_idx] { - ParamClass::StackScalar { offset } | - ParamClass::StructStack { offset, .. } | - ParamClass::LargeStructStack { offset, .. } | - ParamClass::F128AlwaysStack { offset } | - ParamClass::I128Stack { offset } | - ParamClass::F128Stack { offset } | - ParamClass::LargeStructByRefStack { offset, .. } => stack_base + offset - stack_offset_adjust, - ParamClass::IntReg { .. } => { - // Regparm: param was stored to its alloca slot in emit_store_params. - // This should have been handled by the alloca_slot path above. - // If we get here, just use a fallback offset. - stack_base + (param_idx as i64) * 4 - } - _ => stack_base + (param_idx as i64) * 4, - } - } else { - stack_base + (param_idx as i64) * 4 - }; - - if is_i128_type(ty) { - if let Some(slot) = self.state.get_slot(dest.0) { - for i in (0..16).step_by(4) { - let src_ref = self.param_ref(param_offset + i as i64); - let dst_ref = self.slot_ref_offset(slot, i as i64); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - } - } - } else if ty == IrType::F128 { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - let src_ref = self.param_ref(param_offset); - let dst_ref = self.slot_ref(dest_slot); - emit!(self.state, " fldt {}", src_ref); - emit!(self.state, " fstpt {}", dst_ref); - self.state.f128_direct_slots.insert(dest.0); - } - } else if ty == IrType::F64 || ty == IrType::I64 || ty == IrType::U64 { - if let Some(slot) = self.state.get_slot(dest.0) { - let src_ref = self.param_ref(param_offset); - let dst_ref = self.slot_ref(slot); - emit!(self.state, " movl {}, %eax", src_ref); - emit!(self.state, " movl %eax, {}", dst_ref); - let src_ref_hi = self.param_ref(param_offset + 4); - let dst_ref_hi = self.slot_ref_offset(slot, 4); - emit!(self.state, " movl {}, 
%eax", src_ref_hi); - emit!(self.state, " movl %eax, {}", dst_ref_hi); - } - } else { - let load_instr = self.mov_load_for_type(ty); - let src_ref = self.param_ref(param_offset); - emit!(self.state, " {} {}, %eax", load_instr, src_ref); - self.store_eax_to(dest); - } - } - - // ---- emit_epilogue_and_ret ---- - - pub(super) fn emit_epilogue_and_ret_impl(&mut self, frame_size: i64) { - self.emit_epilogue(frame_size); - if self.state.uses_sret { - self.state.emit(" ret $4"); - } else if self.is_fastcall && self.fastcall_stack_cleanup > 0 { - emit!(self.state, " ret ${}", self.fastcall_stack_cleanup); - } else { - self.state.emit(" ret"); - } - } - - // ---- store/load instr for type ---- - - pub(super) fn store_instr_for_type_impl(&self, ty: IrType) -> &'static str { - self.mov_store_for_type(ty) - } - - pub(super) fn load_instr_for_type_impl(&self, ty: IrType) -> &'static str { - self.mov_load_for_type(ty) - } -} diff --git a/src/backend/i686/codegen/returns.rs b/src/backend/i686/codegen/returns.rs deleted file mode 100644 index 6e14d8e8f4..0000000000 --- a/src/backend/i686/codegen/returns.rs +++ /dev/null @@ -1,87 +0,0 @@ -//! I686Codegen: return value operations. 
- -use crate::ir::reexports::{Operand, Value}; -use crate::common::types::IrType; -use crate::emit; -use crate::backend::traits::ArchCodegen; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_return_impl(&mut self, val: Option<&Operand>, frame_size: i64) { - if let Some(val) = val { - let ret_ty = self.current_return_type; - if ret_ty == IrType::I64 || ret_ty == IrType::U64 { - self.emit_load_acc_pair(val); - self.emit_epilogue_and_ret(frame_size); - return; - } - if ret_ty == IrType::F64 { - self.emit_f64_load_to_x87(val); - self.emit_epilogue_and_ret(frame_size); - return; - } - if ret_ty.is_long_double() { - self.emit_f128_load_to_x87(val); - self.emit_epilogue_and_ret(frame_size); - return; - } - } - // Delegate all other cases (I128, F32, scalar int) to default - crate::backend::traits::emit_return_default(self, val, frame_size); - } - - pub(super) fn emit_return_i128_to_regs_impl(&mut self) { - // eax:edx already holds the low 64 bits - } - - pub(super) fn emit_return_f128_to_reg_impl(&mut self) { - self.state.emit(" pushl %eax"); - self.state.emit(" fildl (%esp)"); - self.state.emit(" addl $4, %esp"); - } - - pub(super) fn emit_return_f32_to_reg_impl(&mut self) { - self.state.emit(" pushl %eax"); - self.state.emit(" flds (%esp)"); - self.state.emit(" addl $4, %esp"); - } - - pub(super) fn emit_return_f64_to_reg_impl(&mut self) { - self.state.emit(" pushl %edx"); - self.state.emit(" pushl %eax"); - self.state.emit(" fldl (%esp)"); - self.state.emit(" addl $8, %esp"); - } - - pub(super) fn emit_return_int_to_reg_impl(&mut self) { - // eax already holds the return value - } - - pub(super) fn emit_get_return_f64_second_impl(&mut self, dest: &Value) { - self.store_eax_to(dest); - } - - pub(super) fn emit_set_return_f64_second_impl(&mut self, src: &Operand) { - self.operand_to_eax(src); - } - - pub(super) fn emit_get_return_f32_second_impl(&mut self, dest: &Value) { - self.store_eax_to(dest); - } - - pub(super) fn 
emit_set_return_f32_second_impl(&mut self, src: &Operand) { - self.operand_to_eax(src); - } - - pub(super) fn emit_get_return_f128_second_impl(&mut self, dest: &Value) { - if let Some(slot) = self.state.get_slot(dest.0) { - let sr = self.slot_ref(slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - } - - pub(super) fn emit_set_return_f128_second_impl(&mut self, src: &Operand) { - self.emit_f128_load_to_x87(src); - } -} diff --git a/src/backend/i686/codegen/variadic.rs b/src/backend/i686/codegen/variadic.rs deleted file mode 100644 index 439784b4f3..0000000000 --- a/src/backend/i686/codegen/variadic.rs +++ /dev/null @@ -1,71 +0,0 @@ -//! I686Codegen: variadic argument operations (va_arg, va_start, va_copy). - -use crate::ir::reexports::Value; -use crate::common::types::IrType; -use crate::backend::generation::is_i128_type; -use crate::emit; -use super::emit::I686Codegen; - -impl I686Codegen { - pub(super) fn emit_va_arg_impl(&mut self, dest: &Value, va_list_ptr: &Value, result_ty: IrType) { - self.load_va_list_addr_to_edx(va_list_ptr); - self.state.emit(" movl (%edx), %ecx"); - - if is_i128_type(result_ty) { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - for i in (0..16).step_by(4) { - emit!(self.state, " movl {}(%ecx), %eax", i); - let sr = self.slot_ref_offset(dest_slot, i as i64); - emit!(self.state, " movl %eax, {}", sr); - } - } - self.state.emit(" addl $16, %ecx"); - } else if result_ty == IrType::F128 { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - self.state.emit(" fldt (%ecx)"); - let sr = self.slot_ref(dest_slot); - emit!(self.state, " fstpt {}", sr); - self.state.f128_direct_slots.insert(dest.0); - } - self.state.emit(" addl $12, %ecx"); - } else if result_ty == IrType::F64 || result_ty == IrType::I64 || result_ty == IrType::U64 { - if let Some(dest_slot) = self.state.get_slot(dest.0) { - let sr0 = self.slot_ref(dest_slot); - let sr4 = self.slot_ref_offset(dest_slot, 4); - 
self.state.emit(" movl (%ecx), %eax"); - emit!(self.state, " movl %eax, {}", sr0); - self.state.emit(" movl 4(%ecx), %eax"); - emit!(self.state, " movl %eax, {}", sr4); - } - self.state.emit(" addl $8, %ecx"); - } else { - let load_instr = self.mov_load_for_type(result_ty); - emit!(self.state, " {} (%ecx), %eax", load_instr); - self.store_eax_to(dest); - let advance = result_ty.size().max(4); - emit!(self.state, " addl ${}, %ecx", advance); - } - self.load_va_list_addr_to_edx(va_list_ptr); - self.state.emit(" movl %ecx, (%edx)"); - } - - pub(super) fn emit_va_start_impl(&mut self, va_list_ptr: &Value) { - let vararg_offset = 8 + self.va_named_stack_bytes as i64; - self.load_va_list_addr_to_edx(va_list_ptr); - // vararg_offset is an EBP-relative param offset - let pr = self.param_ref(vararg_offset); - emit!(self.state, " leal {}, %eax", pr); - self.state.emit(" movl %eax, (%edx)"); - } - - pub(super) fn emit_va_copy_impl(&mut self, dest_ptr: &Value, src_ptr: &Value) { - self.load_va_list_addr_to_edx(src_ptr); - self.state.emit(" movl (%edx), %eax"); - self.load_va_list_addr_to_edx(dest_ptr); - self.state.emit(" movl %eax, (%edx)"); - } - - pub(super) fn emit_va_arg_struct_impl(&mut self, _dest_ptr: &Value, _va_list_ptr: &Value, _size: usize) { - panic!("VaArgStruct should not be emitted for i686 target"); - } -} diff --git a/src/backend/i686/linker/README.md b/src/backend/i686/linker/README.md deleted file mode 100644 index 2ebe52065b..0000000000 --- a/src/backend/i686/linker/README.md +++ /dev/null @@ -1,566 +0,0 @@ -# i686 Built-in Linker -- Design Document - -## Overview - -The i686 built-in linker reads ELF32 relocatable object files (`.o`) and static -archives (`.a`, including thin archives), resolves symbols against system shared -libraries, applies i386 relocations, and emits a dynamically-linked (or static) -ELF32 executable or shared library (`-shared`). It replaces the external GNU -linker (`ld`) for the `i686-linux-gnu` target. 
- -The linker is invoked by the compiler driver, which first discovers CRT objects -(`crt1.o`, `crti.o`, `crtn.o`), GCC library directories, and system library -paths using a shared architecture-configuration module (`DirectLdArchConfig`). -The linker itself focuses purely on ELF linking logic. - - -## Architecture - -``` - +-----------------------+ +-----------------------+ - | CRT objects | | User .o files | - | (crt1.o, crti.o, ...) | | (from compilation) | - +-----------+-----------+ +-----------+-----------+ - | | - +-------------+---------------+ - | - v - +------------------------+ - | Input Parsing | - | parse_elf32() | Validate ELFCLASS32, EM_386 - | parse_archive() | Extract .a members - +------------------------+ - | - v - +------------------------+ - | Archive Resolution | Demand-driven extraction: - | resolve_archive_ | pull in members that satisfy - | members() | undefined symbols - +------------------------+ - | - v - +------------------------+ - | Section Merging | Group .text.*, .rodata.*, etc. - | output_section_name() | into canonical output sections - | | (with COMDAT deduplication) - +------------------------+ - | - v - +------------------------+ - | Symbol Resolution | Two-pass: definitions, then - | global_symbols | undefined -> dynamic lookup - +------------------------+ - | - v - +------------------------+ - | PLT/GOT Construction | Build PLT stubs, GOT entries, - | | .rel.plt, .rel.dyn - +------------------------+ - | - v - +------------------------+ - | Layout | Assign virtual addresses and - | LOAD segments | file offsets; page alignment - +------------------------+ - | - v - +------------------------+ - | Relocation Application| Apply R_386_32, R_386_PC32, - | | R_386_PLT32, R_386_GOT*, - | | TLS relocations, etc. 
- +------------------------+ - | - v - +------------------------+ - | ELF Emission | Write headers, segments, - | | synthetic sections, data - +------------------------+ - | - v - ELF32 executable -``` - - -## Differences from the x86-64 Linker - -| Aspect | x86-64 | i686 | -|---------------------------|---------------------------------------|---------------------------------------| -| ELF class | ELFCLASS64 | ELFCLASS32 | -| Machine type | EM_X86_64 (62) | EM_386 (3) | -| ELF header size | 64 bytes | 52 bytes | -| Program header size | 56 bytes | 32 bytes | -| Section header size | 64 bytes | 40 bytes | -| Symbol entry size | 24 bytes (Elf64_Sym) | 16 bytes (Elf32_Sym) | -| Relocation format | RELA (Elf64_Rela, 24 bytes) | REL (Elf32_Rel, 8 bytes) | -| Relocation `r_info` | `(sym << 32) \| type` | `(sym << 8) \| type` | -| Implicit addend | Stored in `r_addend` field | Read from section data at reloc site | -| Base address | `0x400000` | `0x08048000` | -| Dynamic linker | `/lib64/ld-linux-x86-64.so.2` | `/lib/ld-linux.so.2` | -| Relocation types | `R_X86_64_*` | `R_386_*` | -| Address size | 8 bytes | 4 bytes | -| GOT entry size | 8 bytes | 4 bytes | -| Dynamic tag size | 16 bytes (d_tag + d_val) | 8 bytes (d_tag + d_val) | -| PLT addressing | RIP-relative (`jmp *GOT(%rip)`) | Absolute (`jmp *[abs32]`) | -| PLT entry size | 16 bytes | 16 bytes | -| DT_PLTREL value | DT_RELA (7) | DT_REL (17) | -| `.rel.plt` entry type | R_X86_64_JUMP_SLOT (Rela, 24 B) | R_386_JMP_SLOT (Rel, 8 B) | -| `.rel.dyn` entry type | R_X86_64_GLOB_DAT (Rela) | R_386_GLOB_DAT (Rel) | -| Copy relocation | R_X86_64_COPY | R_386_COPY | -| PC-relative addressing | RIP-relative (native) | Requires GOT-based PIC tricks | - - -## Key Data Structures - -### Input structures - -| Type | Role | -|-----------------|---------------------------------------------------------------| -| `InputObject` | Parsed `.o` file: sections + symbols + filename | -| `InputSection` | One section from an input file: 
data, flags, relocations | -| `InputSymbol` | One symbol: name, value, size, binding, type, section index | -| `Elf32Sym` | Raw ELF32 symbol entry (16 bytes) | -| `Elf32Shdr` | Raw ELF32 section header (40 bytes) | - -### Linker-internal structures - -| Type | Role | -|-----------------|---------------------------------------------------------------| -| `LinkerSymbol` | Resolved symbol: address, binding, PLT/GOT indices, dynamic | -| | library info, copy relocation flag, version string, | -| | `uses_textrel` flag for weak dynamic data symbols | -| `OutputSection` | Merged output section: data, vaddr, file offset | -| `SectionMap` | Maps `(obj_idx, sec_idx)` to `(out_sec_idx, offset)` | -| `DynSymInfo` | Symbol exported by a shared library: type, version, binding | -| `DynStrTab` | Builder for `.dynstr` (dynamic string table) | - -### Output layout tracking - -Virtual addresses and file offsets are tracked as `u32` variables during the -layout phase. Each segment and synthetic section has corresponding `*_offset`, -`*_vaddr`, and `*_size` variables. - - -## Processing Algorithm - -### Step 1: Input Parsing - -**Object files** are parsed by `parse_elf32()`: - -1. Validate ELF magic, ELFCLASS32, ELFDATA2LSB, ET_REL, EM_386. -2. Read section headers and section name string table. -3. Find `.symtab` and its linked `.strtab`. -4. Parse all symbols into `InputSymbol` structs. -5. For each section, find associated `.rel.*` sections and parse their - `Elf32_Rel` entries. The implicit addend is read from the section data at - the relocation offset (i386 REL convention). - -**Archives** are parsed by `parse_archive()` (regular) or -`parse_thin_archive_i686()` (thin): - -1. Validate `!\n` magic (or `!\n` for thin archives). -2. For regular archives: extract each member whose content starts with ELF magic. -3. For thin archives: read member paths and load external `.o` files. -4. Each extracted member is then parsed by `parse_elf32()`. 
- -Archive members are placed into an archive pool for demand-driven extraction -(see Step 2). - -**Shared libraries** are scanned by `read_dynsyms_with_search()`: - -1. Validate ELF magic, ELFCLASS32, ET_DYN. If the file is a GNU linker script - (GROUP/INPUT directives), resolve the referenced `.so` files recursively. -2. Find `.dynsym`, `.gnu.version`, and `.gnu.verdef` sections. -3. Parse version definitions (verdef) to build a version-index-to-name map. -4. Enumerate all defined, global/weak symbols from `.dynsym`. -5. For each symbol, look up its version from `.gnu.version` and map it through - the verdef table. -6. Return a list of `DynSymInfo` entries with name, type, size, and version. - -### Step 2: Archive Resolution - -Archive members are extracted on demand using an iterative algorithm: - -1. Scan all already-linked objects to collect defined and undefined symbols. -2. Search the archive pool for members that define any currently-undefined symbol. -3. When a member is extracted, add its definitions and new undefined references. -4. Repeat until no more members can be extracted (fixed-point iteration). - -This avoids linking unused code from archives while correctly handling -transitive dependencies between archive members. The `--defsym` alias targets -are also considered as undefined symbols to ensure their definitions are pulled -from archives. - -### Step 3: Section Merging - -Input sections are merged into canonical output sections by name. COMDAT -groups (`SHT_GROUP` with `GRP_COMDAT` flag) are deduplicated: only the first -instance of each group signature is kept, and duplicate members are skipped. 
- -``` - Input section name -> Output section name - ────────────────────────── ──────────────────── - .text, .text.* .text - .init .init (kept separate) - .fini .fini (kept separate) - .rodata, .rodata.* .rodata - .data, .data.*, .tm_clone_table .data - .bss, .bss.*, SHT_NOBITS .bss - .tdata, .tdata.* .tdata (TLS initialized) - .tbss, .tbss.* .tbss (TLS zero-init) - .init_array, .init_array.* .init_array - .fini_array, .fini_array.* .fini_array - .eh_frame .eh_frame - .note.* (SHT_NOTE) .note -``` - -Non-allocatable sections (`SHT_NULL`, `SHT_SYMTAB`, `SHT_STRTAB`, `SHT_REL`, -`SHT_RELA`, `SHT_GROUP`, `.note.GNU-stack`, `.comment`) are skipped. - -Each input section's data is appended to the output section with alignment -padding. The `SectionMap` records the mapping from `(object_index, -input_section_index)` to `(output_section_index, offset_within_output)`. - -### Step 4: Symbol Resolution - -**First pass -- collect definitions:** - -For each defined symbol in each input object (skipping FILE, SECTION, and -UNDEF symbols), compute its output-section index and offset. If a name -collision occurs: -- Global beats weak. -- Defined beats undefined. -- Otherwise, keep the first definition. - -**Second pass -- resolve undefined references:** - -For each undefined symbol, check: -1. Already defined in `global_symbols`? Skip. -2. Available in `dynlib_syms` (from shared library scanning)? - - Function symbols (including IFUNC): mark `needs_plt = true`, `needs_got = true`. - - Data symbols: mark `needs_copy = true` (copy relocation). -3. Weak undefined? Insert with address 0 (allowed to remain unresolved). -4. Truly undefined? Will produce an error later, unless it is a linker- - defined symbol. - -**Section symbol resolution:** - -Section symbols (STT_SECTION) are mapped to synthetic names -`__section_{obj}_{idx}` for relocation resolution. 
- -**PLT/GOT marking:** - -A third scan over all relocations marks which symbols need PLT or GOT entries -based on relocation types: -- `R_386_PLT32` on a dynamic symbol -> `needs_plt = true`, `needs_got = true`. -- `R_386_GOT32` or `R_386_GOT32X` -> `needs_got = true`. -- `R_386_TLS_GOTIE` or `R_386_TLS_IE` -> `needs_got = true`. - -**Weak dynamic data handling:** - -After PLT/GOT lists are built, weak dynamic data symbols (non-function, -`STB_WEAK`) are converted from copy relocations to text relocations -(`DT_TEXTREL`). This avoids issues with copy relocations for weak symbols -that may not exist in all library versions. - -**COMMON symbol allocation:** - -Tentative definitions (`SHN_COMMON`) -- global variables declared without an -initializer -- are allocated space in `.bss` with proper alignment. This -happens after symbol resolution so that a real definition from another object -file takes precedence. - -**Undefined symbol check:** - -After resolution, any symbol that is undefined, not dynamic, not weak, and not -linker-defined triggers an error. Linker-defined symbols are managed by -`linker_common::is_linker_defined_symbol()` and include: -`_GLOBAL_OFFSET_TABLE_`, `__ehdr_start`, `__executable_start`, `_end`, -`_edata`, `_etext`, `__bss_start`, `__dso_handle`, `__rel_iplt_start`, etc. - -### Step 5: PLT and GOT Construction - -PLT and GOT symbols are sorted alphabetically for deterministic output. 
- -**GOT layout:** - -``` - .got: - [0] _DYNAMIC address (1 reserved entry) - [1..N] non-PLT GOT entries (dynamic imports via R_386_GLOB_DAT, - then local symbols resolved at link time) - - .got.plt: - [0] _DYNAMIC address (GOT.PLT reserved) - [1] 0 (link_map, filled by ld.so at runtime) - [2] 0 (resolver, filled by ld.so at runtime) - [3+i] PLT[i+1] + 6 (lazy binding: points to pushl in PLT stub) -``` - -**PLT layout:** - -``` - PLT[0]: (resolver stub, 16 bytes) - ff 35 [GOT.PLT+4] pushl GOT.PLT[1] - ff 25 [GOT.PLT+8] jmp *GOT.PLT[2] - 90 90 90 90 nop padding - - PLT[i]: (per-symbol stub, 16 bytes each) - ff 25 [GOT.PLT[3+i]] jmp *GOT.PLT[3+i] - 68 [i*8] pushl reloc byte-offset into .rel.plt - e9 [PLT[0] - here] jmp PLT[0] -``` - -On i686, PLT stubs use **absolute addressing** (`jmp *[abs32]`) rather than -RIP-relative addressing. This is the fundamental difference from x86-64 PLT -entries. - -**IFUNC support (static linking):** - -For static executables, `STT_GNU_IFUNC` symbols are handled through IPLT -(indirect PLT) stubs. Each IFUNC gets an 8-byte IPLT entry (`jmp *[GOT]` -+ nop padding), a dedicated GOT entry initialized to the resolver address, -and an `R_386_IRELATIVE` entry in `.rel.iplt`. The C runtime invokes the -resolver at startup and patches the GOT entry with the resolved address. 
- -### Step 6: Layout - -The linker uses a fixed base address of `0x08048000` (standard i386 convention) -and lays out LOAD segments with page alignment: - -``` - Segment 0 (RO): ELF header + program headers + .interp + .note + - .gnu.hash + .dynsym + .dynstr + .gnu.version + - .gnu.version_r + .rel.dyn + .rel.plt - - Segment 1 (RX): .init + .plt + .text + .fini + .iplt - - Segment 2 (RO): .rodata + .eh_frame + .eh_frame_hdr - (omitted if empty) - - Segment 3 (RW): .init_array + .fini_array + .dynamic + .got + - .got.plt + .data + .tdata + .bss (+ copy reloc space) -``` - -The virtual address of each segment is aligned to `PAGE_SIZE` (0x1000) and -satisfies `vaddr = file_offset (mod PAGE_SIZE)` so that mmap-based loading -works correctly. - -**Program headers emitted:** - -| Type | Segment | Flags | -|-------------------|--------------------------------------|-------------| -| `PT_PHDR` | Program header table itself | `PF_R` | -| `PT_INTERP` | `.interp` (`/lib/ld-linux.so.2`) | `PF_R` | -| `PT_LOAD` | Read-only headers segment | `PF_R` | -| `PT_LOAD` | Text segment (RX) | `PF_R|PF_X` | -| `PT_LOAD` | Read-only data segment | `PF_R` | -| `PT_LOAD` | Read-write data segment | `PF_R|PF_W` | -| `PT_DYNAMIC` | `.dynamic` section | `PF_R|PF_W` | -| `PT_GNU_STACK` | Stack permissions (non-executable) | `PF_R|PF_W` | -| `PT_GNU_EH_FRAME` | `.eh_frame_hdr` for unwinding | `PF_R` | -| `PT_TLS` | TLS template (`.tdata` + `.tbss`) | `PF_R` | - -`PT_INTERP` and `PT_DYNAMIC` are omitted in static linking mode. -`PT_TLS` is only emitted when TLS sections are present. -`PT_GNU_RELRO` is intentionally omitted to avoid conflicts with lazy PLT -binding when `.got` and `.got.plt` share the same page. - -**Copy relocations:** - -Data symbols imported from shared libraries (e.g., `stdin`, `environ`) are -handled via **R_386_COPY** relocations. 
Space is allocated at the end of BSS -for each copy-relocated symbol, and the dynamic linker copies the symbol's -value from the shared library into this space at load time. - -### Step 7: Symbol Address Resolution - -After layout, final virtual addresses are assigned: - -- **Defined symbols**: `output_section.addr + section_offset` -- **Dynamic function symbols with PLT**: `plt_vaddr + header + index * 16` -- **Dynamic data symbols with copy reloc**: BSS copy address -- **IFUNC symbols (static)**: overridden to IPLT entry address -- **Linker-defined symbols**: Computed from segment boundaries - (`_etext`, `__bss_start`, `_end`, `_GLOBAL_OFFSET_TABLE_`, etc.) - -### Step 8: Relocation Application - -For each input section's relocations, the linker computes and patches the -output section data: - -| Relocation type | Formula | Description | -|---------------------|---------------------|------------------------------------| -| `R_386_NONE` | (skip) | No-op | -| `R_386_32` | `S + A` | Absolute 32-bit | -| `R_386_PC32` | `S + A - P` | PC-relative 32-bit | -| `R_386_PLT32` | `S + A - P` | PLT-relative (same formula) | -| `R_386_GOTPC` | `GOT + A - P` | PC-relative offset to GOT base | -| `R_386_GOTOFF` | `S + A - GOT` | Offset from GOT base | -| `R_386_GOT32` | `G + A - GOT` | GOT entry offset from GOT base | -| `R_386_GOT32X` | `G + A - GOT` | Relaxable GOT entry reference | -| `R_386_TLS_TPOFF` | `S - TP_base` | Negative TP offset (initial-exec) | -| `R_386_TLS_LE` | `S - TP_base` | Local-exec TP offset | -| `R_386_TLS_LE_32` | `TP_base - S` | Negated TP offset | -| `R_386_TLS_TPOFF32` | `TP_base - S` | Negated TP offset (variant) | -| `R_386_TLS_IE` | GOT entry / tpoff | Initial-exec via GOT | -| `R_386_TLS_GOTIE` | GOT offset / tpoff | Initial-exec GOT-relative | -| `R_386_TLS_GD` | tpoff | General-dynamic (relaxed to LE) | -| `R_386_TLS_DTPMOD32`| 1 | Module ID (always 1 for exe) | -| `R_386_TLS_DTPOFF32`| `S - TLS_base` | DTP offset within TLS block | - -Where: -- 
`S` = symbol address (PLT address for dynamic function symbols) -- `A` = addend (read from section data, i386 REL convention) -- `P` = relocation site address (patch_addr) -- `GOT` = GOT base address (start of `.got.plt` when PLT exists, else `.got`) -- `G` = GOT entry address for the symbol -- `TP_base` = TLS segment address + TLS memory size (thread pointer base) - -The GOT base address follows the i386 convention: `_GLOBAL_OFFSET_TABLE_` -points to `.got.plt` when PLT entries exist, otherwise to `.got`. - -**GOT32X relaxation:** When `R_386_GOT32X` references a locally-defined symbol -that doesn't need a GOT entry, the linker rewrites `mov` instructions to `lea` -(opcode `0x8b` -> `0x8d`) and computes the offset directly, avoiding the GOT -indirection. - -### Step 9: Dynamic Section and Version Tables - -**`.dynamic` section** entries (each 8 bytes: `d_tag` + `d_val`): - -- `DT_NEEDED` for each shared library (libc.so.6, libm.so.6, etc.) -- `DT_GNU_HASH`, `DT_STRTAB`, `DT_SYMTAB`, `DT_STRSZ`, `DT_SYMENT` -- `DT_INIT` / `DT_FINI` (if `.init` / `.fini` sections exist) -- `DT_INIT_ARRAY` / `DT_INIT_ARRAYSZ` / `DT_FINI_ARRAY` / `DT_FINI_ARRAYSZ` -- `DT_DEBUG` -- `DT_PLTGOT`, `DT_PLTRELSZ`, `DT_PLTREL` (= 17, DT_REL), `DT_JMPREL` -- `DT_REL`, `DT_RELSZ`, `DT_RELENT` (= 8) -- `DT_VERNEED`, `DT_VERNEEDNUM`, `DT_VERSYM` (if versions are present) -- `DT_TEXTREL` (if weak dynamic data symbols require text relocations) -- `DT_NULL` - -**`.gnu.hash` section** uses the GNU hash algorithm, consistent with the x86-64 -and RISC-V linkers. Uses 32-bit bloom filter words (ELF32 word size). Copy-reloc -and textrel symbols (defined in this executable) are placed in the hashed -portion so the dynamic linker can find them for symbol interposition. - -**Symbol versioning:** - -The linker emits `.gnu.version` (versym) and `.gnu.version_r` (verneed) -sections when dynamic symbols have GLIBC version annotations (e.g., -`GLIBC_2.0`, `GLIBC_2.17`). 
Version information is extracted from the shared -library's `.gnu.verdef` section during `read_dynsyms_with_search()`. - -Symbols without version annotations use `VER_NDX_GLOBAL` (index 1) in the -versym table, which means "any version" to the dynamic linker. - -### Step 10: Output Emission - -The final ELF32 executable is written as a flat byte array: - -1. ELF32 header (52 bytes) with `ET_EXEC`, `EM_386`, entry point -2. Program headers (32 bytes each) -3. Segment data in file-offset order: - - Read-only: `.interp`, `.note`, `.gnu.hash`, `.dynsym`, `.dynstr`, - `.gnu.version`, `.gnu.version_r`, `.rel.dyn`, `.rel.plt` - - Text: `.init`, `.plt`, `.text`, `.fini`, `.iplt` - - Read-only data: `.rodata`, `.eh_frame`, `.eh_frame_hdr` - - Read-write data: `.init_array`, `.fini_array`, `.dynamic`, `.got`, - `.got.plt`, `.data`, `.tdata` - - BSS (`.bss` + `.tbss` + copy reloc space) occupies virtual address space - but no file space -4. File permissions set to 0755 - -Section headers (`e_shoff`, `e_shnum`) are set to 0 -- the executable does -not include a section header table. This is valid for execution (only program -headers matter) and reduces file size. - - -## Key Design Decisions and Trade-offs - -1. **REL relocation format**. i386 ELF uses `Elf32_Rel` (no explicit addend - field). The linker reads implicit addends from the section data at the - relocation offset. This matches the convention established by the assembler - and is compatible with all i386 object files produced by GCC and LLVM. - -2. **GNU hash table**. The linker emits a `.gnu.hash` section using the GNU - hash algorithm with 32-bit bloom filter words (matching ELF32 word size). - This is consistent with the x86-64 and RISC-V linkers. The dynsym table - is ordered with unhashed (undefined import) symbols first, followed by - hashed (defined copy-reloc and textrel) symbols sorted by bucket. - -3. **No section headers in output**. The executable omits section headers - entirely. 
The kernel and dynamic linker only need program headers to load - and execute the file. This simplifies output emission and produces slightly - smaller executables. Debuggers and tools like `objdump` lose some - information, but `readelf -l` (program headers) still works. - -4. **Fixed base address (0x08048000)**. The linker produces `ET_EXEC` - executables with a fixed base address, not PIE (`ET_DYN`). This simplifies - relocation computation (no need for relative addressing throughout) and - matches the traditional i386 Linux executable layout. The trade-off is no - ASLR for the main executable. - -5. **Lazy PLT binding**. GOT.PLT entries initially point to the `pushl` - instruction in each PLT stub (the "lazy" target). On first call, the - dynamic linker resolves the symbol and patches the GOT.PLT entry to point - directly to the resolved function. This is the standard lazy binding - mechanism for i386. - -6. **Copy relocations for data symbols**. When a program references a data - symbol from a shared library (e.g., `errno`, `stdin`), the linker allocates - space in BSS and emits `R_386_COPY`. The dynamic linker copies the symbol's - initial value from the shared library. This avoids indirection through GOT - for data accesses. Weak dynamic data symbols use text relocations - (`DT_TEXTREL`) instead, to avoid issues with symbols that may be absent. - -7. **Demand-driven archive extraction**. Archive members are extracted only - when they define a currently-undefined symbol, iterating until a fixed point - is reached. This avoids linking unused code from large archives while - correctly resolving transitive dependencies between members. - -8. **PT_GNU_RELRO omitted**. The `PT_GNU_RELRO` segment is intentionally not - emitted. On i386, `.got` and `.got.plt` can share the same virtual page. - Marking part of that page as read-only after relocation would prevent lazy - PLT binding from working. 
A proper implementation would require separating - `.got` and `.got.plt` onto different pages. - -9. **Library search strategy**. The linker searches for shared libraries in the - caller-provided library paths plus user-specified `-L` paths. It scans - directory entries for versioned filenames (`libfoo.so.6`, `libfoo.so.6.0.1`) - and follows symlinks via `canonicalize()`. If no `.so` is found, it falls - back to a static archive (`libfoo.a`). - -10. **TLS relaxation**. All TLS access models (GD, IE, LE) are relaxed to - direct TP-offset computation for statically-linked TLS. The linker does - not emit TLS descriptors or `R_386_TLS_*` dynamic relocations; all TLS - offsets are resolved at link time. - - -## File Inventory - -| File | Lines | Role | -|--------------|-------|-------------------------------------------------------------| -| `mod.rs` | ~50 | Module declarations, `DynStrTab` u32 wrapper, public re-exports | -| `types.rs` | ~332 | ELF32-specific constants (relocation types, dynamic tags, | -| | | section flags), struct definitions (`InputObject`, | -| | | `LinkerSymbol`, `OutputSection`, etc.), helper functions | -| `parse.rs` | ~195 | ELF32 object file parsing (`parse_elf32`), regular and | -| | | thin archive extraction | -| `dynsym.rs` | ~310 | Dynamic symbol reading from ELF32 shared libraries, | -| | | GNU version info parsing, linker script resolution | -| `reloc.rs` | ~302 | i386 relocation application: all R_386_* types including | -| | | GOT32X relaxation and TLS relocations | -| `gnu_hash.rs`| ~76 | GNU hash table (.gnu.hash) builder for ELF32 with 32-bit | -| | | bloom filter words | -| `input.rs` | ~362 | Phases 1-4: argument parsing, file collection, library | -| | | loading, shared lib scanning, archive resolution | -| `sections.rs`| ~131 | Phase 5: section merging, COMDAT deduplication, | -| | | output section type/flag assignment | -| `symbols.rs` | ~356 | Phases 6-9: symbol resolution, COMMON allocation, | -| | | PLT/GOT marking, 
undefined check, PLT/GOT list building | -| `link.rs` | ~216 | Orchestration: `link_builtin` and `link_shared` entry points| -| `emit.rs` | ~1,248| Phase 10: executable layout, address assignment, | -| | | relocation application, ELF32 emission | -| `shared.rs` | ~1,058| Shared library (.so) emission: PIC layout, dynamic | -| | | relocations, symbol export, NEEDED discovery | diff --git a/src/backend/i686/linker/dynsym.rs b/src/backend/i686/linker/dynsym.rs deleted file mode 100644 index f0abafb8b1..0000000000 --- a/src/backend/i686/linker/dynsym.rs +++ /dev/null @@ -1,310 +0,0 @@ -//! Dynamic symbol reading from ELF32 shared libraries. -//! -//! Reads dynamic symbols from .so files for the i686 linker, including -//! GNU version info (GLIBC versioning). Also handles GNU linker scripts -//! that reference actual .so files via GROUP/INPUT directives. -//! -//! This is ELF32-specific because shared libraries on i686 use 32-bit -//! ELF format with different field sizes than the ELF64 shared library -//! parser in `linker_common`. - -use super::types::*; - -/// Read dynamic symbol info, with library search paths for resolving linker script entries. 
-pub(super) fn read_dynsyms_with_search(path: &str, lib_search_paths: &[&str]) -> Result, String> { - const LOCAL_SHT_GNU_VERSYM: u32 = 0x6fffffff; - const SHT_GNU_VERDEF: u32 = 0x6ffffffd; - - let data = std::fs::read(path) - .map_err(|e| format!("cannot read {}: {}", path, e))?; - if data.len() < 52 || data[0..4] != ELF_MAGIC || data[4] != ELFCLASS32 { - // Check if this is a linker script - if let Ok(text) = std::str::from_utf8(&data) { - if let Some(entries) = parse_linker_script_entries(text) { - return resolve_linker_script_syms(path, &entries, lib_search_paths); - } - } - return Err(format!("{}: not a valid ELF32 file", path)); - } - let e_type = read_u16(&data, 16); - if e_type != ET_DYN { - return Err(format!("{}: not a shared library (type={})", path, e_type)); - } - - let e_shoff = read_u32(&data, 32) as usize; - let e_shentsize = read_u16(&data, 46) as usize; - let e_shnum = read_u16(&data, 48) as usize; - - // First pass: find dynsym, versym, verdef sections - let mut dynsym_idx = None; - let mut versym_shdr: Option<(usize, usize)> = None; - let mut verdef_shdr: Option<(usize, usize, usize)> = None; - - for i in 0..e_shnum { - let off = e_shoff + i * e_shentsize; - if off + 40 > data.len() { break; } - let sh_type = read_u32(&data, off + 4); - match sh_type { - SHT_DYNSYM => { dynsym_idx = Some(i); } - LOCAL_SHT_GNU_VERSYM => { - let sh_offset = read_u32(&data, off + 16) as usize; - let sh_size = read_u32(&data, off + 20) as usize; - versym_shdr = Some((sh_offset, sh_size)); - } - SHT_GNU_VERDEF => { - let sh_offset = read_u32(&data, off + 16) as usize; - let sh_size = read_u32(&data, off + 20) as usize; - let sh_link = read_u32(&data, off + 24) as usize; - verdef_shdr = Some((sh_offset, sh_size, sh_link)); - } - _ => {} - } - } - - // Parse version definitions to build index -> version string mapping - let ver_names = parse_verdef(&data, verdef_shdr, e_shoff, e_shentsize); - - let dynsym_i = match dynsym_idx { - Some(i) => i, - None => return 
Ok(Vec::new()), - }; - - // Read dynsym section - let off = e_shoff + dynsym_i * e_shentsize; - if off + 40 > data.len() { return Ok(Vec::new()); } - - let sh_offset = read_u32(&data, off + 16) as usize; - let sh_size = read_u32(&data, off + 20) as usize; - let sh_link = read_u32(&data, off + 24) as usize; - let sh_entsize = read_u32(&data, off + 36) as usize; - if sh_entsize == 0 { return Ok(Vec::new()); } - - // Get the string table - let str_off = e_shoff + sh_link * e_shentsize; - if str_off + 40 > data.len() { return Ok(Vec::new()); } - let str_sh_offset = read_u32(&data, str_off + 16) as usize; - let str_sh_size = read_u32(&data, str_off + 20) as usize; - if str_sh_offset + str_sh_size > data.len() { return Ok(Vec::new()); } - let strtab = &data[str_sh_offset..str_sh_offset + str_sh_size]; - - let count = sh_size / sh_entsize; - let mut syms = Vec::new(); - for j in 0..count { - let sym_off = sh_offset + j * sh_entsize; - if sym_off + 16 > data.len() { break; } - let st_name = read_u32(&data, sym_off) as usize; - let st_size = read_u32(&data, sym_off + 8); - let st_info = data[sym_off + 12]; - let st_shndx = read_u16(&data, sym_off + 14); - - if st_shndx == SHN_UNDEF { continue; } - let binding = st_info >> 4; - if binding != STB_GLOBAL && binding != STB_WEAK { continue; } - - // Look up version for this symbol - let (version, is_default_ver) = lookup_version(j, versym_shdr, &ver_names, &data); - - if st_name < strtab.len() { - let end = strtab[st_name..].iter().position(|&b| b == 0) - .map(|p| st_name + p).unwrap_or(strtab.len()); - let name = String::from_utf8_lossy(&strtab[st_name..end]).into_owned(); - if !name.is_empty() { - syms.push(DynSymInfo { - name, - sym_type: st_info & 0xf, - size: st_size, - binding, - version, - is_default_ver, - }); - } - } - } - - Ok(syms) -} - -/// Parse GNU version definitions to build index -> version string mapping. 
-fn parse_verdef( - data: &[u8], - verdef_shdr: Option<(usize, usize, usize)>, - e_shoff: usize, - e_shentsize: usize, -) -> std::collections::HashMap { - let mut ver_names = std::collections::HashMap::new(); - - let (vd_off, vd_size, vd_link) = match verdef_shdr { - Some(v) => v, - None => return ver_names, - }; - - // Get the string table for verdef - let vd_str_hdr = e_shoff + vd_link * e_shentsize; - let vd_strtab = if vd_str_hdr + 40 <= data.len() { - let s_off = read_u32(data, vd_str_hdr + 16) as usize; - let s_sz = read_u32(data, vd_str_hdr + 20) as usize; - if s_off + s_sz <= data.len() { &data[s_off..s_off + s_sz] } else { return ver_names; } - } else { - return ver_names; - }; - - let mut pos = vd_off; - let end = vd_off + vd_size; - while pos < end && pos + 20 <= data.len() { - let vd_ndx = read_u16(data, pos + 4); - let vd_cnt = read_u16(data, pos + 6); - let vd_aux = read_u32(data, pos + 12) as usize; - let vd_next = read_u32(data, pos + 16) as usize; - - if vd_cnt > 0 { - let aux_pos = pos + vd_aux; - if aux_pos + 8 <= data.len() { - let vda_name = read_u32(data, aux_pos) as usize; - if vda_name < vd_strtab.len() { - let name = read_cstr(vd_strtab, vda_name); - ver_names.insert(vd_ndx, name.to_string()); - } - } - } - - if vd_next == 0 { break; } - pos += vd_next; - } - - ver_names -} - -/// Look up the GNU version info for a symbol at dynsym index `j`. 
-fn lookup_version( - j: usize, - versym_shdr: Option<(usize, usize)>, - ver_names: &std::collections::HashMap, - data: &[u8], -) -> (Option, bool) { - if let Some((vs_off, _vs_size)) = versym_shdr { - let vs_entry = vs_off + j * 2; - if vs_entry + 2 <= data.len() { - let raw_ver = read_u16(data, vs_entry); - let hidden = raw_ver & 0x8000 != 0; - let ver_idx = raw_ver & 0x7fff; - if ver_idx >= 2 { - (ver_names.get(&ver_idx).cloned(), !hidden) - } else { - (None, !hidden) - } - } else { - (None, true) - } - } else { - (None, true) - } -} - -/// Resolve symbols from a GNU linker script (GROUP/INPUT directives). -fn resolve_linker_script_syms( - path: &str, - entries: &[LinkerScriptEntry], - lib_search_paths: &[&str], -) -> Result, String> { - let script_dir = std::path::Path::new(path).parent() - .map(|p| p.to_string_lossy().to_string()); - let mut all_syms = Vec::new(); - for entry in entries { - let resolved = match entry { - LinkerScriptEntry::Path(lib_path) => { - resolve_script_path(lib_path, script_dir.as_deref(), lib_search_paths) - } - LinkerScriptEntry::Lib(lib_name) => { - lib_search_paths.iter() - .map(|dir| format!("{}/lib{}.so", dir, lib_name)) - .find(|p| std::path::Path::new(p).exists()) - } - }; - if let Some(resolved_path) = resolved { - if let Ok(syms) = read_dynsyms_with_search(&resolved_path, lib_search_paths) { - all_syms.extend(syms); - } - } - } - if !all_syms.is_empty() { - Ok(all_syms) - } else { - Err(format!("{}: linker script but no resolvable libraries found", path)) - } -} - -/// Extract the SONAME from an ELF32 shared library's .dynamic section. -/// -/// Returns the SONAME string if present, or None if not found. 
-pub(super) fn parse_soname_elf32(path: &str) -> Option { - let data = std::fs::read(path).ok()?; - if data.len() < 52 || data[0..4] != ELF_MAGIC || data[4] != ELFCLASS32 { - return None; - } - let e_type = read_u16(&data, 16); - if e_type != ET_DYN { return None; } - - let e_shoff = read_u32(&data, 32) as usize; - let e_shentsize = read_u16(&data, 46) as usize; - let e_shnum = read_u16(&data, 48) as usize; - - if e_shoff == 0 || e_shnum == 0 { return None; } - - // Find .dynamic section - for i in 0..e_shnum { - let off = e_shoff + i * e_shentsize; - if off + 40 > data.len() { break; } - let sh_type = read_u32(&data, off + 4); - if sh_type == 6 { // SHT_DYNAMIC - let dyn_off = read_u32(&data, off + 16) as usize; - let dyn_size = read_u32(&data, off + 20) as usize; - let link = read_u32(&data, off + 24) as usize; - - // Get linked string table - let str_sec_off = e_shoff + link * e_shentsize; - if str_sec_off + 40 > data.len() { return None; } - let str_off = read_u32(&data, str_sec_off + 16) as usize; - let str_size = read_u32(&data, str_sec_off + 20) as usize; - if str_off + str_size > data.len() { return None; } - let strtab = &data[str_off..str_off + str_size]; - - // Scan .dynamic entries for DT_SONAME (tag = 14) - let mut pos = dyn_off; - while pos + 8 <= dyn_off + dyn_size && pos + 8 <= data.len() { - let tag = read_i32(&data, pos); - let val = read_u32(&data, pos + 4) as usize; - if tag == 0 { break; } // DT_NULL - if tag == 14 { // DT_SONAME - return Some(read_cstr(strtab, val)); - } - pos += 8; - } - return None; - } - } - None -} - -/// Resolve a path from a linker script entry. 
-fn resolve_script_path( - lib_path: &str, - script_dir: Option<&str>, - lib_search_paths: &[&str], -) -> Option { - if std::path::Path::new(lib_path).exists() { - return Some(lib_path.to_string()); - } - if let Some(dir) = script_dir { - let p = format!("{}/{}", dir, lib_path); - if std::path::Path::new(&p).exists() { - return Some(p); - } - } - for search_dir in lib_search_paths { - let p = format!("{}/{}", search_dir, lib_path); - if std::path::Path::new(&p).exists() { - return Some(p); - } - } - None -} diff --git a/src/backend/i686/linker/emit.rs b/src/backend/i686/linker/emit.rs deleted file mode 100644 index e1735bd232..0000000000 --- a/src/backend/i686/linker/emit.rs +++ /dev/null @@ -1,1248 +0,0 @@ -//! Executable emission for the i686 linker. -//! -//! Phase 10: lays out segments, assigns addresses, applies relocations, -//! builds PLT/GOT/dynamic sections, and writes the final ELF32 executable. - -use std::collections::{HashMap, BTreeSet}; - -use super::types::*; -use super::reloc::{self, RelocContext}; -use super::gnu_hash::build_gnu_hash_32; -use super::DynStrTab; -use crate::backend::linker_common; - -pub(super) fn emit_executable( - inputs: &[InputObject], - output_sections: &mut Vec, - section_name_to_idx: &HashMap, - section_map: &SectionMap, - global_symbols: &mut HashMap, - _sym_resolution: &HashMap<(usize, usize), String>, - _dynlib_syms: &HashMap, bool, u8)>, - plt_symbols: &[String], - got_dyn_symbols: &[String], - got_local_symbols: &[String], - num_plt: usize, - _num_got_total: usize, - ifunc_symbols: &[String], - is_static: bool, - is_nostdlib: bool, - _needed_libs_param: &[&str], - output_path: &str, -) -> Result<(), String> { - let num_ifunc = ifunc_symbols.len(); - - // ── Build dynamic symbol/string tables ──────────────────────────────── - let mut needed_libs: Vec = Vec::new(); - if !is_static && !is_nostdlib { - needed_libs.push("libc.so.6".to_string()); - } - for sym in global_symbols.values() { - if sym.is_dynamic && 
!sym.dynlib.is_empty() && !needed_libs.contains(&sym.dynlib) { - needed_libs.push(sym.dynlib.clone()); - } - } - - let mut dynstr = DynStrTab::new(); - let _ = dynstr.add(""); - let mut needed_offsets: Vec = Vec::new(); - for lib in &needed_libs { - needed_offsets.push(dynstr.add(lib)); - } - - // Build dynsym entries - let mut dynsym_entries: Vec = Vec::new(); - dynsym_entries.push(Elf32Sym { name: 0, value: 0, size: 0, info: 0, other: 0, shndx: 0 }); - - let mut dynsym_map: HashMap = HashMap::new(); - let mut dynsym_names: Vec = Vec::new(); - - // PLT symbols (unhashed imports) - for name in plt_symbols { - let idx = dynsym_entries.len(); - let name_off = dynstr.add(name); - // Preserve original binding (STB_WEAK vs STB_GLOBAL) - let (bind, stype) = if let Some(sym) = global_symbols.get(name) { - (sym.binding, if sym.sym_type != 0 { sym.sym_type } else { STT_FUNC }) - } else { - (STB_GLOBAL, STT_FUNC) - }; - dynsym_entries.push(Elf32Sym { - name: name_off, value: 0, size: 0, - info: (bind << 4) | stype, other: 0, shndx: SHN_UNDEF, - }); - dynsym_map.insert(name.clone(), idx); - dynsym_names.push(name.clone()); - } - - // GOT-only symbols: only dynamic (imported) symbols go in .dynsym - // Local GOT symbols are resolved at link time and don't need dynamic entries - for name in got_dyn_symbols { - let idx = dynsym_entries.len(); - let name_off = dynstr.add(name); - let sym = &global_symbols[name]; - dynsym_entries.push(Elf32Sym { - name: name_off, value: 0, size: sym.size, - info: (sym.binding << 4) | sym.sym_type, other: 0, - shndx: SHN_UNDEF, - }); - dynsym_map.insert(name.clone(), idx); - dynsym_names.push(name.clone()); - } - - let gnu_hash_symoffset = dynsym_entries.len(); - - // Copy-reloc symbols (hashed: defined in this executable) - let mut copy_syms_for_dynsym: Vec = global_symbols.iter() - .filter(|(_, s)| s.needs_copy && s.is_dynamic) - .map(|(n, _)| n.clone()) - .collect(); - copy_syms_for_dynsym.sort(); - - for name in ©_syms_for_dynsym { - let idx = 
dynsym_entries.len(); - let name_off = dynstr.add(name); - let sym = &global_symbols[name]; - dynsym_entries.push(Elf32Sym { - name: name_off, value: 0, size: sym.size, - info: (STB_GLOBAL << 4) | STT_OBJECT, other: 0, shndx: SHN_UNDEF, - }); - dynsym_map.insert(name.clone(), idx); - dynsym_names.push(name.clone()); - } - - // Textrel symbols (hashed: need dynamic R_386_32 relocs) - let mut textrel_syms_for_dynsym: Vec = global_symbols.iter() - .filter(|(_, s)| s.uses_textrel && s.is_dynamic) - .map(|(n, _)| n.clone()) - .collect(); - textrel_syms_for_dynsym.sort(); - - for name in &textrel_syms_for_dynsym { - let idx = dynsym_entries.len(); - let name_off = dynstr.add(name); - let sym = &global_symbols[name]; - dynsym_entries.push(Elf32Sym { - name: name_off, value: 0, size: sym.size, - info: (sym.binding << 4) | sym.sym_type, other: 0, shndx: SHN_UNDEF, - }); - dynsym_map.insert(name.clone(), idx); - dynsym_names.push(name.clone()); - } - - // All hashed symbols = copy + textrel - let mut all_hashed_syms: Vec = Vec::new(); - all_hashed_syms.extend(copy_syms_for_dynsym.iter().cloned()); - all_hashed_syms.extend(textrel_syms_for_dynsym.iter().cloned()); - - // Build .gnu.hash and reorder hashed dynsym entries - let (gnu_hash_data, sorted_indices) = build_gnu_hash_32(&all_hashed_syms, gnu_hash_symoffset as u32); - - if !sorted_indices.is_empty() { - let hashed_start = gnu_hash_symoffset; - let hashed_names_start = hashed_start - 1; - - let orig_entries: Vec = (0..sorted_indices.len()) - .map(|i| dynsym_entries[hashed_start + i].clone()) - .collect(); - let orig_names: Vec = (0..sorted_indices.len()) - .map(|i| dynsym_names[hashed_names_start + i].clone()) - .collect(); - - for (new_pos, &orig_idx) in sorted_indices.iter().enumerate() { - dynsym_entries[hashed_start + new_pos] = orig_entries[orig_idx].clone(); - dynsym_names[hashed_names_start + new_pos] = orig_names[orig_idx].clone(); - } - - for (i, name) in dynsym_names[hashed_names_start..].iter().enumerate() { - 
dynsym_map.insert(name.clone(), hashed_start + i); - } - } - - // ── Build version tables ────────────────────────────────────────────── - let mut lib_versions: HashMap> = HashMap::new(); - for name in &dynsym_names { - if let Some(gs) = global_symbols.get(name) { - if gs.is_dynamic { - if let Some(ref ver) = gs.version { - lib_versions.entry(gs.dynlib.clone()).or_default().insert(ver.clone()); - } - } - } - } - - let mut ver_index_map: HashMap<(String, String), u16> = HashMap::new(); - let mut ver_idx: u16 = 2; - let mut lib_ver_list: Vec<(String, Vec)> = Vec::new(); - let mut sorted_libs: Vec = lib_versions.keys().cloned().collect(); - sorted_libs.sort(); - for lib in &sorted_libs { - let vers: Vec = lib_versions[lib].iter().cloned().collect(); - for v in &vers { - ver_index_map.insert((lib.clone(), v.clone()), ver_idx); - ver_idx += 1; - } - lib_ver_list.push((lib.clone(), vers)); - } - - // Rebuild dynstr with version strings - let mut dynstr2 = DynStrTab::new(); - let _ = dynstr2.add(""); - for lib in &needed_libs { dynstr2.add(lib); } - for name in plt_symbols { dynstr2.add(name); } - for name in got_dyn_symbols { dynstr2.add(name); } - for name in &all_hashed_syms { dynstr2.add(name); } - for (_, vers) in &lib_ver_list { - for v in vers { dynstr2.add(v); } - } - let dynstr_data = dynstr2.as_bytes().to_vec(); - - // Rebuild offsets - let mut needed_offsets: Vec = Vec::new(); - for lib in &needed_libs { - needed_offsets.push(dynstr2.get_offset(lib)); - } - for (i, entry) in dynsym_entries.iter_mut().enumerate() { - if i == 0 { continue; } - let name = &dynsym_names[i - 1]; - entry.name = dynstr2.get_offset(name); - } - - // Build .gnu.version (versym) - let mut versym_data: Vec = Vec::new(); - for (i, _) in dynsym_entries.iter().enumerate() { - if i == 0 { - versym_data.extend_from_slice(&0u16.to_le_bytes()); - } else { - let sym_name = if i - 1 < dynsym_names.len() { &dynsym_names[i - 1] } else { "" }; - let gs = global_symbols.get(sym_name); - if let 
Some(gs) = gs { - if gs.is_dynamic && !gs.dynlib.is_empty() { - if let Some(ref ver) = gs.version { - let idx = ver_index_map.get(&(gs.dynlib.clone(), ver.clone())) - .copied().unwrap_or(1); - versym_data.extend_from_slice(&idx.to_le_bytes()); - } else { - versym_data.extend_from_slice(&1u16.to_le_bytes()); - } - } else { - versym_data.extend_from_slice(&0u16.to_le_bytes()); - } - } else { - versym_data.extend_from_slice(&0u16.to_le_bytes()); - } - } - } - - // Build .gnu.version_r (verneed) - let mut verneed_data: Vec = Vec::new(); - let mut verneed_count: u32 = 0; - for (lib_i, (lib, vers)) in lib_ver_list.iter().enumerate() { - if !needed_libs.contains(lib) { continue; } - let lib_name_off = dynstr2.get_offset(lib); - let is_last_lib = lib_i == lib_ver_list.len() - 1; - - verneed_data.extend_from_slice(&1u16.to_le_bytes()); - verneed_data.extend_from_slice(&(vers.len() as u16).to_le_bytes()); - verneed_data.extend_from_slice(&lib_name_off.to_le_bytes()); - verneed_data.extend_from_slice(&16u32.to_le_bytes()); - let next_off = if is_last_lib { 0u32 } else { 16 + vers.len() as u32 * 16 }; - verneed_data.extend_from_slice(&next_off.to_le_bytes()); - verneed_count += 1; - - for (v_i, ver) in vers.iter().enumerate() { - let ver_name_off = dynstr2.get_offset(ver); - let v_idx = ver_index_map[&(lib.clone(), ver.clone())]; - let is_last_ver = v_i == vers.len() - 1; - - verneed_data.extend_from_slice(&linker_common::sysv_hash(ver.as_bytes()).to_le_bytes()); - verneed_data.extend_from_slice(&0u16.to_le_bytes()); - verneed_data.extend_from_slice(&v_idx.to_le_bytes()); - verneed_data.extend_from_slice(&ver_name_off.to_le_bytes()); - let vna_next: u32 = if is_last_ver { 0 } else { 16 }; - verneed_data.extend_from_slice(&vna_next.to_le_bytes()); - } - } - - // ── Layout ──────────────────────────────────────────────────────────── - let ehdr_size: u32 = 52; - let phdr_size: u32 = 32; - let has_tls_sections = output_sections.iter() - .any(|s| s.flags & SHF_TLS != 0 && s.flags & 
SHF_ALLOC != 0); - - let mut num_phdrs: u32 = 1; // PHDR - if !is_static { num_phdrs += 1; } // INTERP - num_phdrs += 4; // LOAD x4 (headers, text, rodata, data) - if !is_static { num_phdrs += 1; } // DYNAMIC - num_phdrs += 1; // GNU_STACK - num_phdrs += 1; // GNU_EH_FRAME - if has_tls_sections { num_phdrs += 1; } - - let phdrs_total_size = num_phdrs * phdr_size; - - let interp_data = INTERP.to_vec(); - - // Section layout tracking - let mut file_offset: u32 = ehdr_size; - let mut vaddr: u32 = BASE_ADDR + ehdr_size; - - let phdr_offset = file_offset; - let phdr_vaddr = vaddr; - file_offset += phdrs_total_size; - vaddr += phdrs_total_size; - - // INTERP - let interp_offset = file_offset; - let interp_vaddr = vaddr; - let interp_size = interp_data.len() as u32; - if !is_static { file_offset += interp_size; vaddr += interp_size; } - - // Note section - let note_sec_idx = section_name_to_idx.get(".note").copied(); - let note_size = note_sec_idx.map(|i| output_sections[i].data.len() as u32).unwrap_or(0); - if note_size > 0 { - if let Some(idx) = note_sec_idx { - output_sections[idx].file_offset = file_offset; - output_sections[idx].addr = vaddr; - } - file_offset += note_size; - vaddr += note_size; - } - - // .gnu.hash - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let gnu_hash_offset = file_offset; - let gnu_hash_vaddr = vaddr; - let gnu_hash_size = gnu_hash_data.len() as u32; - if !is_static { file_offset += gnu_hash_size; vaddr += gnu_hash_size; } - - // .dynsym - let dynsym_offset = file_offset; - let dynsym_vaddr = vaddr; - let dynsym_entsize: u32 = 16; - let dynsym_size = (dynsym_entries.len() as u32) * dynsym_entsize; - if !is_static { file_offset += dynsym_size; vaddr += dynsym_size; } - - // .dynstr - let dynstr_offset = file_offset; - let dynstr_vaddr = vaddr; - let dynstr_size = dynstr_data.len() as u32; - if !is_static { file_offset += dynstr_size; vaddr += dynstr_size; } - - // .gnu.version - let versym_offset = file_offset; - let 
versym_vaddr = vaddr; - let versym_size = versym_data.len() as u32; - if !is_static && versym_size > 0 { file_offset += versym_size; vaddr += versym_size; } - - // .gnu.version_r - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let verneed_offset = file_offset; - let verneed_vaddr = vaddr; - let verneed_size = verneed_data.len() as u32; - if !is_static && verneed_size > 0 { file_offset += verneed_size; vaddr += verneed_size; } - - // .rel.dyn - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let rel_dyn_offset = file_offset; - let rel_dyn_vaddr = vaddr; - let num_copy_relocs = copy_syms_for_dynsym.len(); - // Count actual R_386_32 relocations against textrel symbols - let num_text_relocs: usize = if textrel_syms_for_dynsym.is_empty() { 0 } else { - let mut count = 0usize; - for obj in inputs { - for sec in &obj.sections { - for &(_, rel_type, sym_idx, _) in &sec.relocations { - if rel_type == R_386_32 { - if let Some(sym) = obj.symbols.get(sym_idx as usize) { - if let Some(gs) = global_symbols.get(&sym.name) { - if gs.uses_textrel { count += 1; } - } - } - } - } - } - } - count - }; - let num_rel_dyn = got_dyn_symbols.len() + num_copy_relocs + num_text_relocs; - let rel_dyn_size = (num_rel_dyn as u32) * 8; - if !is_static { file_offset += rel_dyn_size; vaddr += rel_dyn_size; } - - // .rel.plt - let rel_plt_offset = file_offset; - let rel_plt_vaddr = vaddr; - let rel_plt_size = (num_plt as u32) * 8; - if !is_static { file_offset += rel_plt_size; vaddr += rel_plt_size; } - - let ro_headers_end = file_offset; - - // ── Segment 1 (RX): .init + .plt + .text + .fini ── - file_offset = align_up(file_offset, PAGE_SIZE); - vaddr = align_up(vaddr, PAGE_SIZE); - vaddr = (vaddr & !0xfff) | (file_offset & 0xfff); - if vaddr < BASE_ADDR + file_offset { vaddr += PAGE_SIZE; } - - let text_seg_file_start = file_offset; - let text_seg_vaddr_start = vaddr; - - // .init - let init_sec_idx = section_name_to_idx.get(".init").copied(); - let 
init_vaddr; - let init_size; - if let Some(idx) = init_sec_idx { - let a = output_sections[idx].align.max(4); - file_offset = align_up(file_offset, a); vaddr = align_up(vaddr, a); - init_vaddr = vaddr; - init_size = output_sections[idx].data.len() as u32; - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; - file_offset += init_size; vaddr += init_size; - } else { - init_vaddr = vaddr; init_size = 0; - } - - // .plt - let plt_entry_size: u32 = 16; - let plt_header_size: u32 = if num_plt > 0 { 16 } else { 0 }; - let plt_total_size = plt_header_size + (num_plt as u32) * plt_entry_size; - file_offset = align_up(file_offset, 16); vaddr = align_up(vaddr, 16); - let plt_offset = file_offset; - let plt_vaddr = vaddr; - if plt_total_size > 0 { file_offset += plt_total_size; vaddr += plt_total_size; } - - // .text - if let Some(idx) = section_name_to_idx.get(".text").copied() { - let a = output_sections[idx].align.max(16); - file_offset = align_up(file_offset, a); vaddr = align_up(vaddr, a); - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; - file_offset += output_sections[idx].data.len() as u32; - vaddr += output_sections[idx].data.len() as u32; - } - - // .fini - let fini_sec_idx = section_name_to_idx.get(".fini").copied(); - let fini_vaddr; - let fini_size; - if let Some(idx) = fini_sec_idx { - let a = output_sections[idx].align.max(4); - file_offset = align_up(file_offset, a); vaddr = align_up(vaddr, a); - fini_vaddr = vaddr; - fini_size = output_sections[idx].data.len() as u32; - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; - file_offset += fini_size; vaddr += fini_size; - } else { - fini_vaddr = 0; fini_size = 0; - } - - // Layout custom executable sections (for __start_/__stop_ symbol auto-generation) - layout_custom_sections(section_name_to_idx, output_sections, - &mut file_offset, &mut vaddr, SHF_EXECINSTR); - - // .iplt (IFUNC PLT entries for static 
linking) - let iplt_entry_size: u32 = 8; - let iplt_total_size = (num_ifunc as u32) * iplt_entry_size; - file_offset = align_up(file_offset, 8); vaddr = align_up(vaddr, 8); - let iplt_offset = file_offset; - let iplt_vaddr = vaddr; - if iplt_total_size > 0 { file_offset += iplt_total_size; vaddr += iplt_total_size; } - - let text_seg_file_end = file_offset; - let text_seg_vaddr_end = vaddr; - - // ── Segment 2 (RO): .rodata + .eh_frame ── - file_offset = align_up(file_offset, PAGE_SIZE); vaddr = align_up(vaddr, PAGE_SIZE); - vaddr = (vaddr & !0xfff) | (file_offset & 0xfff); - if vaddr <= text_seg_vaddr_end { - vaddr = align_up(text_seg_vaddr_end, PAGE_SIZE) | (file_offset & 0xfff); - } - - let rodata_seg_file_start = file_offset; - let rodata_seg_vaddr_start = vaddr; - - if let Some(idx) = section_name_to_idx.get(".rodata").copied() { - let a = output_sections[idx].align.max(4); - file_offset = align_up(file_offset, a); vaddr = align_up(vaddr, a); - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; - file_offset += output_sections[idx].data.len() as u32; - vaddr += output_sections[idx].data.len() as u32; - } - - let eh_frame_sec_idx = section_name_to_idx.get(".eh_frame").copied(); - if let Some(idx) = eh_frame_sec_idx { - let a = output_sections[idx].align.max(4); - file_offset = align_up(file_offset, a); vaddr = align_up(vaddr, a); - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; - file_offset += output_sections[idx].data.len() as u32; - vaddr += output_sections[idx].data.len() as u32; - } - - // Build .eh_frame_hdr: count FDEs and reserve space after .eh_frame - let mut eh_frame_hdr_vaddr = 0u32; - let mut eh_frame_hdr_offset = 0u32; - let mut eh_frame_hdr_size = 0u32; - if let Some(idx) = eh_frame_sec_idx { - let fde_count = crate::backend::linker_common::count_eh_frame_fdes(&output_sections[idx].data); - if fde_count > 0 { - eh_frame_hdr_size = (12 + 8 * fde_count) as u32; - file_offset = 
align_up(file_offset, 4); - vaddr = align_up(vaddr, 4); - eh_frame_hdr_offset = file_offset; - eh_frame_hdr_vaddr = vaddr; - file_offset += eh_frame_hdr_size; - vaddr += eh_frame_hdr_size; - } - } - - // Layout custom read-only sections (for __start_/__stop_ symbol auto-generation) - layout_custom_sections(section_name_to_idx, output_sections, - &mut file_offset, &mut vaddr, 0); - - let rodata_seg_file_end = file_offset; - let rodata_seg_vaddr_end = vaddr; - - // ── Segment 3 (RW): .init_array + .fini_array + .dynamic + .got + .got.plt + .data + .bss ── - file_offset = align_up(file_offset, PAGE_SIZE); vaddr = align_up(vaddr, PAGE_SIZE); - vaddr = (vaddr & !0xfff) | (file_offset & 0xfff); - if vaddr <= rodata_seg_vaddr_end { - vaddr = align_up(rodata_seg_vaddr_end, PAGE_SIZE) | (file_offset & 0xfff); - } - - let data_seg_file_start = file_offset; - let data_seg_vaddr_start = vaddr; - - // .init_array - let (init_array_vaddr, init_array_size) = layout_section( - ".init_array", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4, - ); - - // .fini_array - let (fini_array_vaddr, fini_array_size) = layout_section( - ".fini_array", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4, - ); - - // Layout custom writable sections (for __start_/__stop_ symbol auto-generation) - layout_custom_sections(section_name_to_idx, output_sections, - &mut file_offset, &mut vaddr, SHF_WRITE); - - // .dynamic - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let dynamic_offset = file_offset; - let dynamic_vaddr = vaddr; - let num_dynamic_entries = count_dynamic_entries( - &needed_libs, init_vaddr, init_size, fini_vaddr, fini_size, - init_array_size, fini_array_size, num_plt, num_rel_dyn, verneed_size, - num_text_relocs, - ); - let dynamic_size = num_dynamic_entries * 8; - if !is_static { file_offset += dynamic_size; vaddr += dynamic_size; } - - // .got - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let 
got_offset = file_offset; - let got_vaddr = vaddr; - let got_reserved: usize = 1; - let got_non_plt_entries = got_dyn_symbols.len() + got_local_symbols.len(); - let got_entry_size: u32 = 4; - let got_size = (got_reserved + got_non_plt_entries) as u32 * got_entry_size; - let needs_got_section = !is_static || got_non_plt_entries > 0 || num_plt > 0; - if needs_got_section { file_offset += got_size; vaddr += got_size; } - - // .got.plt - let gotplt_offset = file_offset; - let gotplt_vaddr = vaddr; - let gotplt_reserved: u32 = 3; - let gotplt_size = (gotplt_reserved + num_plt as u32) * 4; - if !is_static && num_plt > 0 { file_offset += gotplt_size; vaddr += gotplt_size; } - - // IFUNC GOT - let ifunc_got_offset = file_offset; - let ifunc_got_vaddr = vaddr; - let ifunc_got_size = (num_ifunc as u32) * 4; - if ifunc_got_size > 0 { file_offset += ifunc_got_size; vaddr += ifunc_got_size; } - - // .rel.iplt - let rel_iplt_offset = file_offset; - let rel_iplt_vaddr = vaddr; - let rel_iplt_size = (num_ifunc as u32) * 8; - if rel_iplt_size > 0 { file_offset += rel_iplt_size; vaddr += rel_iplt_size; } - - // .data - if let Some(idx) = section_name_to_idx.get(".data").copied() { - let a = output_sections[idx].align.max(4); - file_offset = align_up(file_offset, a); vaddr = align_up(vaddr, a); - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; - file_offset += output_sections[idx].data.len() as u32; - vaddr += output_sections[idx].data.len() as u32; - } - - // TLS sections - let (tls_addr, tls_file_offset, tls_file_size, tls_mem_size, tls_align) = - layout_tls(section_name_to_idx, output_sections, &mut file_offset, &mut vaddr); - let has_tls = tls_addr != 0; - - let data_seg_file_end = file_offset; - - // .bss - let bss_vaddr; - if let Some(idx) = section_name_to_idx.get(".bss").copied() { - let a = output_sections[idx].align.max(4); - vaddr = align_up(vaddr, a); - bss_vaddr = vaddr; - output_sections[idx].addr = vaddr; - 
output_sections[idx].file_offset = file_offset; - vaddr += output_sections[idx].data.len() as u32; - } else { - bss_vaddr = vaddr; - } - - // Allocate BSS space for copy relocations - let mut copy_reloc_symbols: Vec = global_symbols.iter() - .filter(|(_, s)| s.needs_copy && s.is_dynamic) - .map(|(n, _)| n.clone()).collect(); - copy_reloc_symbols.sort(); - - for name in ©_reloc_symbols { - if let Some(sym) = global_symbols.get_mut(name) { - let al = if sym.size >= 4 { 4 } else { 1 }; - vaddr = align_up(vaddr, al); - sym.copy_addr = vaddr; - sym.address = vaddr; - vaddr += sym.size.max(4); - } - } - - let data_seg_vaddr_end = vaddr; - - let got_base = if num_plt > 0 { gotplt_vaddr } else { got_vaddr }; - - // ── Assign symbol addresses ────────────────────────────────────────── - assign_symbol_addresses( - global_symbols, output_sections, got_base, - plt_vaddr, plt_header_size, plt_entry_size, - bss_vaddr, data_seg_vaddr_end, data_seg_vaddr_start, - text_seg_vaddr_end, dynamic_vaddr, is_static, - init_array_vaddr, init_array_size, fini_array_vaddr, fini_array_size, - rel_iplt_vaddr, rel_iplt_size, - ); - - // Override IFUNC symbol addresses to point to IPLT entries - let mut ifunc_resolver_addrs: Vec = Vec::new(); - for (i, name) in ifunc_symbols.iter().enumerate() { - if let Some(sym) = global_symbols.get_mut(name) { - ifunc_resolver_addrs.push(sym.address); - sym.address = iplt_vaddr + (i as u32) * iplt_entry_size; - } - } - - // ── Build IPLT data ────────────────────────────────────────────────── - let mut iplt_data: Vec = Vec::new(); - for i in 0..num_ifunc { - let got_entry_addr = ifunc_got_vaddr + (i as u32) * 4; - iplt_data.push(0xff); iplt_data.push(0x25); - iplt_data.extend_from_slice(&got_entry_addr.to_le_bytes()); - iplt_data.push(0x66); iplt_data.push(0x90); - } - - let mut ifunc_got_data: Vec = Vec::new(); - for &resolver_addr in &ifunc_resolver_addrs { - ifunc_got_data.extend_from_slice(&resolver_addr.to_le_bytes()); - } - - let mut rel_iplt_data: Vec 
= Vec::new(); - for i in 0..num_ifunc { - let r_offset = ifunc_got_vaddr + (i as u32) * 4; - rel_iplt_data.extend_from_slice(&r_offset.to_le_bytes()); - rel_iplt_data.extend_from_slice(&R_386_IRELATIVE.to_le_bytes()); - } - - // ── Build PLT ──────────────────────────────────────────────────────── - let plt_data = build_plt(num_plt, plt_vaddr, plt_header_size, plt_entry_size, - gotplt_vaddr, gotplt_reserved); - - // ── Apply relocations ──────────────────────────────────────────────── - let text_relocs; - { - let mut reloc_ctx = RelocContext { - global_symbols, - output_sections, - section_map, - got_base, - got_vaddr, - gotplt_vaddr, - got_reserved, - gotplt_reserved, - plt_vaddr, - plt_header_size, - plt_entry_size, - num_plt, - tls_addr, - tls_mem_size, - has_tls, - }; - text_relocs = reloc::apply_relocations(inputs, &mut reloc_ctx)?; - } - - // Build .eh_frame_hdr from relocated .eh_frame data - let eh_frame_hdr_data = if eh_frame_hdr_size > 0 { - if let Some(idx) = eh_frame_sec_idx { - let sec = &output_sections[idx]; - crate::backend::linker_common::build_eh_frame_hdr( - &sec.data, - sec.addr as u64, - eh_frame_hdr_vaddr as u64, - false, // 32-bit - ) - } else { - Vec::new() - } - } else { - Vec::new() - }; - - // ── Build GOT data ─────────────────────────────────────────────────── - let mut got_data: Vec = Vec::new(); - if needs_got_section { - got_data.extend_from_slice(&(if is_static { 0u32 } else { dynamic_vaddr }).to_le_bytes()); - // Dynamic GOT symbols first (filled by dynamic linker via GLOB_DAT) - for name in got_dyn_symbols { - if let Some(gs) = global_symbols.get(name) { - if has_tls && gs.sym_type == STT_TLS { - let tpoff = gs.address as i32 - tls_addr as i32 - tls_mem_size as i32; - got_data.extend_from_slice(&(tpoff as u32).to_le_bytes()); - } else { - got_data.extend_from_slice(&0u32.to_le_bytes()); - } - } else { - got_data.extend_from_slice(&0u32.to_le_bytes()); - } - } - // Local GOT symbols (filled at link time with resolved addresses) - 
for name in got_local_symbols { - if let Some(gs) = global_symbols.get(name) { - if has_tls && gs.sym_type == STT_TLS { - let tpoff = gs.address as i32 - tls_addr as i32 - tls_mem_size as i32; - got_data.extend_from_slice(&(tpoff as u32).to_le_bytes()); - } else { - got_data.extend_from_slice(&gs.address.to_le_bytes()); - } - } else { - got_data.extend_from_slice(&0u32.to_le_bytes()); - } - } - } - - // GOT.PLT data - let mut gotplt_data: Vec = Vec::new(); - if !is_static && num_plt > 0 { - gotplt_data.extend_from_slice(&dynamic_vaddr.to_le_bytes()); - gotplt_data.extend_from_slice(&0u32.to_le_bytes()); - gotplt_data.extend_from_slice(&0u32.to_le_bytes()); - for i in 0..num_plt { - let lazy_addr = plt_vaddr + plt_header_size + (i as u32) * plt_entry_size + 6; - gotplt_data.extend_from_slice(&lazy_addr.to_le_bytes()); - } - } - - // .rel.plt data - let mut rel_plt_data: Vec = Vec::new(); - for (i, name) in plt_symbols.iter().enumerate() { - let gotplt_entry_addr = gotplt_vaddr + (gotplt_reserved + i as u32) * 4; - let dynsym_idx = dynsym_map[name] as u32; - let r_info = (dynsym_idx << 8) | 7; // R_386_JMP_SLOT - rel_plt_data.extend_from_slice(&gotplt_entry_addr.to_le_bytes()); - rel_plt_data.extend_from_slice(&r_info.to_le_bytes()); - } - - // .rel.dyn data (only dynamic GOT symbols need GLOB_DAT) - let mut rel_dyn_data: Vec = Vec::new(); - for (i, name) in got_dyn_symbols.iter().enumerate() { - let got_entry_addr = got_vaddr + (got_reserved as u32 + i as u32) * 4; - let dynsym_idx = dynsym_map[name] as u32; - let r_info = (dynsym_idx << 8) | 6; // R_386_GLOB_DAT - rel_dyn_data.extend_from_slice(&got_entry_addr.to_le_bytes()); - rel_dyn_data.extend_from_slice(&r_info.to_le_bytes()); - } - for name in ©_reloc_symbols { - if let Some(gs) = global_symbols.get(name) { - if let Some(&dynsym_idx) = dynsym_map.get(name) { - let r_info = ((dynsym_idx as u32) << 8) | 5; // R_386_COPY - rel_dyn_data.extend_from_slice(&gs.copy_addr.to_le_bytes()); - 
rel_dyn_data.extend_from_slice(&r_info.to_le_bytes()); - } - } - } - // Text relocations for WEAK dynamic data symbols (R_386_32) - for (addr, ref name) in &text_relocs { - if let Some(&dynsym_idx) = dynsym_map.get(name) { - let r_info = ((dynsym_idx as u32) << 8) | R_386_32; - rel_dyn_data.extend_from_slice(&addr.to_le_bytes()); - rel_dyn_data.extend_from_slice(&r_info.to_le_bytes()); - } - } - - // .dynamic data - let mut dynamic_data: Vec = Vec::new(); - if !is_static { - for &off in &needed_offsets { push_dyn(&mut dynamic_data, DT_NEEDED, off); } - push_dyn(&mut dynamic_data, DT_GNU_HASH_TAG, gnu_hash_vaddr); - push_dyn(&mut dynamic_data, DT_STRTAB, dynstr_vaddr); - push_dyn(&mut dynamic_data, DT_SYMTAB, dynsym_vaddr); - push_dyn(&mut dynamic_data, DT_STRSZ, dynstr_size); - push_dyn(&mut dynamic_data, DT_SYMENT, dynsym_entsize); - if init_vaddr != 0 && init_size > 0 { push_dyn(&mut dynamic_data, DT_INIT, init_vaddr); } - if fini_vaddr != 0 && fini_size > 0 { push_dyn(&mut dynamic_data, DT_FINI, fini_vaddr); } - if init_array_size > 0 { - push_dyn(&mut dynamic_data, DT_INIT_ARRAY, init_array_vaddr); - push_dyn(&mut dynamic_data, DT_INIT_ARRAYSZ, init_array_size); - } - if fini_array_size > 0 { - push_dyn(&mut dynamic_data, DT_FINI_ARRAY, fini_array_vaddr); - push_dyn(&mut dynamic_data, DT_FINI_ARRAYSZ, fini_array_size); - } - push_dyn(&mut dynamic_data, DT_DEBUG, 0); - if num_plt > 0 { - push_dyn(&mut dynamic_data, DT_PLTGOT, gotplt_vaddr); - push_dyn(&mut dynamic_data, DT_PLTRELSZ, rel_plt_size); - push_dyn(&mut dynamic_data, DT_PLTREL, 17); - push_dyn(&mut dynamic_data, DT_JMPREL, rel_plt_vaddr); - } - if num_rel_dyn > 0 { - push_dyn(&mut dynamic_data, DT_REL, rel_dyn_vaddr); - push_dyn(&mut dynamic_data, DT_RELSZ, rel_dyn_size); - push_dyn(&mut dynamic_data, DT_RELENT, 8); - } - if verneed_size > 0 { - push_dyn(&mut dynamic_data, DT_VERNEED, verneed_vaddr); - push_dyn(&mut dynamic_data, DT_VERNEEDNUM, verneed_count); - push_dyn(&mut dynamic_data, DT_VERSYM, 
versym_vaddr); - } - if num_text_relocs > 0 { - push_dyn(&mut dynamic_data, DT_TEXTREL, 0); - } - push_dyn(&mut dynamic_data, DT_NULL, 0); - } - - // Entry point - let entry_point = global_symbols.get("_start") - .map(|s| s.address) - .unwrap_or_else(|| global_symbols.get("main").map(|s| s.address).unwrap_or(BASE_ADDR)); - - // Patch dynsym for copy-reloc symbols - for name in ©_syms_for_dynsym { - if let Some(sym) = global_symbols.get(name) { - if let Some(&idx) = dynsym_map.get(name) { - dynsym_entries[idx].value = sym.copy_addr; - dynsym_entries[idx].shndx = 1; - } - } - } - - // Serialize dynsym - let mut dynsym_data: Vec = Vec::new(); - for sym in &dynsym_entries { - dynsym_data.extend_from_slice(&sym.name.to_le_bytes()); - dynsym_data.extend_from_slice(&sym.value.to_le_bytes()); - dynsym_data.extend_from_slice(&sym.size.to_le_bytes()); - dynsym_data.push(sym.info); - dynsym_data.push(sym.other); - dynsym_data.extend_from_slice(&sym.shndx.to_le_bytes()); - } - - // ── Write ELF file ─────────────────────────────────────────────────── - let total_file_size = file_offset as usize; - let mut output = vec![0u8; total_file_size]; - - // ELF header - write_elf_header(&mut output, entry_point, ehdr_size, num_phdrs); - - // Program headers - let mut phdr_pos = phdr_offset as usize; - let write_ph = |output: &mut Vec, pos: &mut usize, - p_type: u32, p_off: u32, p_va: u32, - p_filesz: u32, p_memsz: u32, p_flags: u32, p_align: u32| { - output[*pos..*pos + 4].copy_from_slice(&p_type.to_le_bytes()); - output[*pos + 4..*pos + 8].copy_from_slice(&p_off.to_le_bytes()); - output[*pos + 8..*pos + 12].copy_from_slice(&p_va.to_le_bytes()); - output[*pos + 12..*pos + 16].copy_from_slice(&p_va.to_le_bytes()); - output[*pos + 16..*pos + 20].copy_from_slice(&p_filesz.to_le_bytes()); - output[*pos + 20..*pos + 24].copy_from_slice(&p_memsz.to_le_bytes()); - output[*pos + 24..*pos + 28].copy_from_slice(&p_flags.to_le_bytes()); - output[*pos + 28..*pos + 
32].copy_from_slice(&p_align.to_le_bytes()); - *pos += phdr_size as usize; - }; - - write_ph(&mut output, &mut phdr_pos, PT_PHDR, phdr_offset, phdr_vaddr, - phdrs_total_size, phdrs_total_size, PF_R, 4); - if !is_static { - write_ph(&mut output, &mut phdr_pos, PT_INTERP, interp_offset, interp_vaddr, - interp_size, interp_size, PF_R, 1); - } - write_ph(&mut output, &mut phdr_pos, PT_LOAD, 0, BASE_ADDR, - ro_headers_end, ro_headers_end, PF_R, PAGE_SIZE); - write_ph(&mut output, &mut phdr_pos, PT_LOAD, text_seg_file_start, text_seg_vaddr_start, - text_seg_file_end - text_seg_file_start, text_seg_vaddr_end - text_seg_vaddr_start, - PF_R | PF_X, PAGE_SIZE); - if rodata_seg_file_end - rodata_seg_file_start > 0 { - write_ph(&mut output, &mut phdr_pos, PT_LOAD, rodata_seg_file_start, rodata_seg_vaddr_start, - rodata_seg_file_end - rodata_seg_file_start, rodata_seg_vaddr_end - rodata_seg_vaddr_start, - PF_R, PAGE_SIZE); - } - write_ph(&mut output, &mut phdr_pos, PT_LOAD, data_seg_file_start, data_seg_vaddr_start, - data_seg_file_end - data_seg_file_start, data_seg_vaddr_end - data_seg_vaddr_start, - PF_R | PF_W, PAGE_SIZE); - if !is_static { - write_ph(&mut output, &mut phdr_pos, PT_DYNAMIC, dynamic_offset, dynamic_vaddr, - dynamic_data.len() as u32, dynamic_data.len() as u32, PF_R | PF_W, 4); - } - write_ph(&mut output, &mut phdr_pos, PT_GNU_STACK, 0, 0, 0, 0, PF_R | PF_W, 0x10); - write_ph(&mut output, &mut phdr_pos, PT_GNU_EH_FRAME, - eh_frame_hdr_offset, eh_frame_hdr_vaddr, - eh_frame_hdr_size, eh_frame_hdr_size, PF_R, 4); - if has_tls { - write_ph(&mut output, &mut phdr_pos, PT_TLS, tls_file_offset, tls_addr, - tls_file_size, tls_mem_size, PF_R, tls_align); - } - - // Write section data - let write_data = |output: &mut Vec, offset: u32, data: &[u8]| { - if !data.is_empty() { - let off = offset as usize; - output[off..off + data.len()].copy_from_slice(data); - } - }; - - if !is_static { - write_data(&mut output, interp_offset, &interp_data); - write_data(&mut output, 
gnu_hash_offset, &gnu_hash_data); - write_data(&mut output, dynsym_offset, &dynsym_data); - write_data(&mut output, dynstr_offset, &dynstr_data); - if !versym_data.is_empty() { write_data(&mut output, versym_offset, &versym_data); } - if !verneed_data.is_empty() { write_data(&mut output, verneed_offset, &verneed_data); } - if !rel_dyn_data.is_empty() { write_data(&mut output, rel_dyn_offset, &rel_dyn_data); } - if !rel_plt_data.is_empty() { write_data(&mut output, rel_plt_offset, &rel_plt_data); } - write_data(&mut output, dynamic_offset, &dynamic_data); - if !gotplt_data.is_empty() { write_data(&mut output, gotplt_offset, &gotplt_data); } - } - write_data(&mut output, plt_offset, &plt_data); - write_data(&mut output, got_offset, &got_data); - write_data(&mut output, iplt_offset, &iplt_data); - write_data(&mut output, ifunc_got_offset, &ifunc_got_data); - write_data(&mut output, rel_iplt_offset, &rel_iplt_data); - - // Write .eh_frame_hdr - if !eh_frame_hdr_data.is_empty() && eh_frame_hdr_offset > 0 { - write_data(&mut output, eh_frame_hdr_offset, &eh_frame_hdr_data); - } - - // Write all output sections - for sec in output_sections.iter() { - if sec.sh_type == SHT_NOBITS || sec.data.is_empty() { continue; } - let off = sec.file_offset as usize; - let end = off + sec.data.len(); - if end <= output.len() { - output[off..end].copy_from_slice(&sec.data); - } - } - - // Write to file - std::fs::write(output_path, &output) - .map_err(|e| format!("failed to write output: {}", e))?; - - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let _ = std::fs::set_permissions(output_path, std::fs::Permissions::from_mode(0o755)); - } - - Ok(()) -} - -// ── Helpers for emit_executable ────────────────────────────────────────────── - -pub(super) fn layout_section( - name: &str, - section_name_to_idx: &HashMap, - output_sections: &mut [OutputSection], - file_offset: &mut u32, - vaddr: &mut u32, - min_align: u32, -) -> (u32, u32) { - if let Some(idx) = 
section_name_to_idx.get(name).copied() { - let a = output_sections[idx].align.max(min_align); - *file_offset = align_up(*file_offset, a); - *vaddr = align_up(*vaddr, a); - let sec_vaddr = *vaddr; - let sec_size = output_sections[idx].data.len() as u32; - output_sections[idx].addr = *vaddr; - output_sections[idx].file_offset = *file_offset; - *file_offset += sec_size; - *vaddr += sec_size; - (sec_vaddr, sec_size) - } else { - (0, 0) - } -} - -/// Layout custom sections with a given flag requirement. -/// -/// Custom sections are those not in the standard set (.text, .rodata, .data, etc.) -/// that have the SHF_ALLOC flag plus the specified additional flag. This is needed -/// for `__start_
` / `__stop_
` symbol auto-generation and for -/// sections placed via `__attribute__((section("name")))`. -/// -/// `required_flag` selects which type: -/// - SHF_EXECINSTR: custom executable sections (placed in text segment) -/// - 0: custom read-only sections (placed in rodata segment, no write/exec) -/// - SHF_WRITE: custom writable sections (placed in data segment) -pub(super) fn layout_custom_sections( - section_name_to_idx: &HashMap, - output_sections: &mut [OutputSection], - file_offset: &mut u32, - vaddr: &mut u32, - required_flag: u32, -) { - let standard_sections: &[&str] = &[ - ".text", ".rodata", ".data", ".bss", ".init", ".fini", - ".init_array", ".fini_array", ".eh_frame", ".note", - ".tdata", ".tbss", ".tm_clone_table", - ]; - let mut custom: Vec = section_name_to_idx.keys() - .filter(|name| { - if standard_sections.contains(&name.as_str()) { return false; } - let idx = match section_name_to_idx.get(name.as_str()) { - Some(&i) => i, - None => return false, - }; - let sec = &output_sections[idx]; - if sec.flags & SHF_ALLOC == 0 { return false; } - // Classify by flags: 0 = read-only (no write, no exec), - // SHF_EXECINSTR = executable, SHF_WRITE = writable (but not executable, - // since writable+executable sections go in the text segment). 
- match required_flag { - 0 => sec.flags & SHF_WRITE == 0 && sec.flags & SHF_EXECINSTR == 0, - f => sec.flags & f != 0 && (f != SHF_WRITE || sec.flags & SHF_EXECINSTR == 0), - } - }) - .cloned() - .collect(); - custom.sort(); // Deterministic order - for name in &custom { - layout_section(name, section_name_to_idx, output_sections, file_offset, vaddr, 4); - } -} - -pub(super) fn layout_tls( - section_name_to_idx: &HashMap, - output_sections: &mut [OutputSection], - file_offset: &mut u32, - vaddr: &mut u32, -) -> (u32, u32, u32, u32, u32) { - let mut tls_addr = 0u32; - let mut tls_file_offset = 0u32; - let mut tls_file_size = 0u32; - let mut tls_mem_size = 0u32; - let mut tls_align = 1u32; - - if let Some(&idx) = section_name_to_idx.get(".tdata") { - let a = output_sections[idx].align.max(4); - *file_offset = align_up(*file_offset, a); - *vaddr = align_up(*vaddr, a); - output_sections[idx].addr = *vaddr; - output_sections[idx].file_offset = *file_offset; - tls_addr = *vaddr; - tls_file_offset = *file_offset; - tls_align = a; - let sz = output_sections[idx].data.len() as u32; - tls_file_size = sz; - tls_mem_size = sz; - *file_offset += sz; - *vaddr += sz; - } - - if let Some(&idx) = section_name_to_idx.get(".tbss") { - let a = output_sections[idx].align.max(4); - let aligned = align_up(tls_mem_size, a); - if tls_addr == 0 { - tls_addr = align_up(*vaddr, a); - tls_file_offset = *file_offset; - tls_align = a; - } - output_sections[idx].addr = tls_addr + aligned; - output_sections[idx].file_offset = *file_offset; - tls_mem_size = aligned + output_sections[idx].data.len() as u32; - if a > tls_align { tls_align = a; } - } - - tls_mem_size = align_up(tls_mem_size, tls_align); - (tls_addr, tls_file_offset, tls_file_size, tls_mem_size, tls_align) -} - -fn count_dynamic_entries( - needed_libs: &[String], - init_vaddr: u32, init_size: u32, - fini_vaddr: u32, fini_size: u32, - init_array_size: u32, fini_array_size: u32, - num_plt: usize, num_rel_dyn: usize, verneed_size: u32, - 
num_text_relocs: usize, -) -> u32 { - let mut n: u32 = needed_libs.len() as u32; - n += 5; // GNU_HASH, STRTAB, SYMTAB, STRSZ, SYMENT - if init_vaddr != 0 && init_size > 0 { n += 1; } - if fini_vaddr != 0 && fini_size > 0 { n += 1; } - if init_array_size > 0 { n += 2; } - if fini_array_size > 0 { n += 2; } - n += 1; // DEBUG - if num_plt > 0 { n += 4; } - if num_rel_dyn > 0 { n += 3; } - if verneed_size > 0 { n += 3; } - if num_text_relocs > 0 { n += 1; } // DT_TEXTREL - n += 1; // DT_NULL - n -} - -fn assign_symbol_addresses( - global_symbols: &mut HashMap, - output_sections: &[OutputSection], - got_base: u32, - plt_vaddr: u32, plt_header_size: u32, plt_entry_size: u32, - bss_vaddr: u32, data_seg_vaddr_end: u32, data_seg_vaddr_start: u32, - text_seg_vaddr_end: u32, dynamic_vaddr: u32, is_static: bool, - init_array_vaddr: u32, init_array_size: u32, - fini_array_vaddr: u32, fini_array_size: u32, - rel_iplt_vaddr: u32, rel_iplt_size: u32, -) { - global_symbols.entry("_GLOBAL_OFFSET_TABLE_".to_string()).or_insert(LinkerSymbol { - address: got_base, size: 0, sym_type: STT_OBJECT, binding: STB_LOCAL, - visibility: STV_DEFAULT, is_defined: true, needs_plt: false, needs_got: false, - output_section: usize::MAX, section_offset: 0, plt_index: 0, got_index: 0, - is_dynamic: false, dynlib: String::new(), needs_copy: false, copy_addr: 0, version: None, uses_textrel: false, - }); - if let Some(sym) = global_symbols.get_mut("_GLOBAL_OFFSET_TABLE_") { - sym.address = got_base; - sym.is_defined = true; - } - - let linker_addrs = LinkerSymbolAddresses { - base_addr: BASE_ADDR as u64, - got_addr: got_base as u64, - dynamic_addr: if is_static { 0 } else { dynamic_vaddr as u64 }, - bss_addr: bss_vaddr as u64, - bss_size: (data_seg_vaddr_end - bss_vaddr) as u64, - text_end: text_seg_vaddr_end as u64, - data_start: data_seg_vaddr_start as u64, - init_array_start: init_array_vaddr as u64, - init_array_size: init_array_size as u64, - fini_array_start: fini_array_vaddr as u64, - 
fini_array_size: fini_array_size as u64, - preinit_array_start: 0, - preinit_array_size: 0, - rela_iplt_start: rel_iplt_vaddr as u64, - rela_iplt_size: rel_iplt_size as u64, - }; - let standard_syms = get_standard_linker_symbols(&linker_addrs); - let linker_sym_map: HashMap<&str, u64> = standard_syms.iter() - .filter(|s| !s.name.starts_with("__rela_iplt")) - .map(|s| (s.name, s.value)) - .collect(); - - for (name, sym) in global_symbols.iter_mut() { - if sym.is_dynamic { - if sym.needs_plt { - sym.address = plt_vaddr + plt_header_size + (sym.plt_index as u32) * plt_entry_size; - } - continue; - } - if sym.output_section < output_sections.len() { - sym.address = output_sections[sym.output_section].addr + sym.section_offset; - } - if let Some(&value) = linker_sym_map.get(name.as_str()) { - sym.address = value as u32; - if name == "__dso_handle" { sym.is_defined = true; } - } - match name.as_str() { - "__bss_start__" => sym.address = bss_vaddr, - "edata" => sym.address = bss_vaddr, - "end" | "__end__" => sym.address = data_seg_vaddr_end, - "__rel_iplt_start" => sym.address = rel_iplt_vaddr, - "__rel_iplt_end" => sym.address = rel_iplt_vaddr + rel_iplt_size, - _ => {} - } - // Auto-generate __start_
/ __stop_
symbols (GNU ld feature). - // Uses data.len() for __stop_ (equals mem_size for PROGBITS; custom sections are always PROGBITS). - if let Some(sec_name) = name.strip_prefix("__start_") { - if linker_common::is_valid_c_identifier_for_section(sec_name) { - if let Some(sec) = output_sections.iter().find(|s| s.name == sec_name) { - sym.address = sec.addr; - sym.is_defined = true; - } - } - } else if let Some(sec_name) = name.strip_prefix("__stop_") { - if linker_common::is_valid_c_identifier_for_section(sec_name) { - if let Some(sec) = output_sections.iter().find(|s| s.name == sec_name) { - sym.address = sec.addr + sec.data.len() as u32; - sym.is_defined = true; - } - } - } - } -} - -pub(super) fn build_plt( - num_plt: usize, plt_vaddr: u32, plt_header_size: u32, plt_entry_size: u32, - gotplt_vaddr: u32, gotplt_reserved: u32, -) -> Vec { - let mut plt_data: Vec = Vec::new(); - if num_plt == 0 { return plt_data; } - - // PLT[0]: resolver stub - let got1 = gotplt_vaddr + 4; - let got2 = gotplt_vaddr + 8; - plt_data.push(0xff); plt_data.push(0x35); - plt_data.extend_from_slice(&got1.to_le_bytes()); - plt_data.push(0xff); plt_data.push(0x25); - plt_data.extend_from_slice(&got2.to_le_bytes()); - while plt_data.len() < plt_header_size as usize { plt_data.push(0x90); } - - // PLT[N] - for i in 0..num_plt { - let gotplt_entry = gotplt_vaddr + (gotplt_reserved + i as u32) * 4; - let plt_entry_addr = plt_vaddr + plt_header_size + (i as u32) * plt_entry_size; - - plt_data.push(0xff); plt_data.push(0x25); - plt_data.extend_from_slice(&gotplt_entry.to_le_bytes()); - plt_data.push(0x68); - plt_data.extend_from_slice(&(i as u32 * 8).to_le_bytes()); - plt_data.push(0xe9); - let target = plt_vaddr as i32 - (plt_entry_addr as i32 + plt_entry_size as i32); - plt_data.extend_from_slice(&target.to_le_bytes()); - } - - plt_data -} - -fn write_elf_header(output: &mut [u8], entry_point: u32, ehdr_size: u32, num_phdrs: u32) { - output[0..4].copy_from_slice(&ELF_MAGIC); - output[4] = ELFCLASS32; 
- output[5] = ELFDATA2LSB; - output[6] = EV_CURRENT; - output[7] = 0; - output[16..18].copy_from_slice(&ET_EXEC.to_le_bytes()); - output[18..20].copy_from_slice(&EM_386.to_le_bytes()); - output[20..24].copy_from_slice(&1u32.to_le_bytes()); - output[24..28].copy_from_slice(&entry_point.to_le_bytes()); - output[28..32].copy_from_slice(&ehdr_size.to_le_bytes()); - output[32..36].copy_from_slice(&0u32.to_le_bytes()); - output[36..40].copy_from_slice(&0u32.to_le_bytes()); - output[40..42].copy_from_slice(&(ehdr_size as u16).to_le_bytes()); - output[42..44].copy_from_slice(&32u16.to_le_bytes()); - output[44..46].copy_from_slice(&(num_phdrs as u16).to_le_bytes()); - output[46..48].copy_from_slice(&40u16.to_le_bytes()); - output[48..50].copy_from_slice(&0u16.to_le_bytes()); - output[50..52].copy_from_slice(&0u16.to_le_bytes()); -} diff --git a/src/backend/i686/linker/gnu_hash.rs b/src/backend/i686/linker/gnu_hash.rs deleted file mode 100644 index 510bf73579..0000000000 --- a/src/backend/i686/linker/gnu_hash.rs +++ /dev/null @@ -1,76 +0,0 @@ -//! GNU hash table building for ELF32. -//! -//! Builds the `.gnu.hash` section for dynamically-linked i686 executables. -//! Uses 32-bit bloom filter words (ELF32 word size) and the GNU hash algorithm. - -use crate::backend::linker_common; - -/// Build a .gnu.hash section for ELF32. -/// -/// `hashed_names`: names of symbols that go into the hash (at indices >= symoffset) -/// `symoffset`: first hashed symbol's index in .dynsym -/// -/// Returns `(hash_data, sorted_indices)` where `sorted_indices` maps new position -/// to original index in `hashed_names`, so the caller can reorder dynsym entries. 
-pub(super) fn build_gnu_hash_32(hashed_names: &[String], symoffset: u32) -> (Vec, Vec) { - let num_hashed = hashed_names.len(); - let nbuckets = if num_hashed == 0 { 1 } else { num_hashed.next_power_of_two().max(1) } as u32; - let bloom_size: u32 = 1; - let bloom_shift: u32 = 5; - - // Compute hashes - let orig_hashes: Vec = hashed_names.iter() - .map(|name| linker_common::gnu_hash(name.as_bytes())) - .collect(); - - // Sort by bucket for proper chain grouping - let mut indices: Vec = (0..num_hashed).collect(); - indices.sort_by_key(|&i| orig_hashes[i] % nbuckets); - let sym_hashes: Vec = indices.iter().map(|&i| orig_hashes[i]).collect(); - - // Build bloom filter (single 32-bit word for ELF32) - let mut bloom_word: u32 = 0; - for &h in &sym_hashes { - bloom_word |= 1u32 << (h % 32); - bloom_word |= 1u32 << ((h >> bloom_shift) % 32); - } - - // Build buckets and chains - let mut buckets = vec![0u32; nbuckets as usize]; - let mut chains = vec![0u32; num_hashed]; - for (i, &h) in sym_hashes.iter().enumerate() { - let bucket = (h % nbuckets) as usize; - if buckets[bucket] == 0 { - buckets[bucket] = symoffset + i as u32; - } - chains[i] = h & !1; - } - - // Mark the last symbol in each bucket chain with bit 0 set - for bucket_idx in 0..nbuckets as usize { - if buckets[bucket_idx] == 0 { continue; } - let mut last_in_bucket = 0; - for (i, &h) in sym_hashes.iter().enumerate() { - if (h % nbuckets) as usize == bucket_idx { - last_in_bucket = i; - } - } - chains[last_in_bucket] |= 1; - } - - // Serialize - let mut data = Vec::new(); - data.extend_from_slice(&nbuckets.to_le_bytes()); - data.extend_from_slice(&symoffset.to_le_bytes()); - data.extend_from_slice(&bloom_size.to_le_bytes()); - data.extend_from_slice(&bloom_shift.to_le_bytes()); - data.extend_from_slice(&bloom_word.to_le_bytes()); - for &b in &buckets { - data.extend_from_slice(&b.to_le_bytes()); - } - for &c in &chains { - data.extend_from_slice(&c.to_le_bytes()); - } - - (data, indices) -} diff --git 
a/src/backend/i686/linker/input.rs b/src/backend/i686/linker/input.rs deleted file mode 100644 index 6369ababd3..0000000000 --- a/src/backend/i686/linker/input.rs +++ /dev/null @@ -1,362 +0,0 @@ -//! Input processing for the i686 linker. -//! -//! Phases 1-4 of the linking pipeline: argument parsing, input file collection, -//! library resolution, object parsing, and demand-driven archive extraction. - -use std::collections::{HashMap, HashSet}; -use std::path::Path; - -use super::types::*; -use super::parse::*; -use super::dynsym::*; - -// Phase 1: Argument parsing -// ══════════════════════════════════════════════════════════════════════════════ - -pub(super) fn parse_user_args(user_args: &[String]) -> (Vec, Vec, Vec, Vec, Vec<(String, String)>) { - let mut extra_libs = Vec::new(); - let mut extra_lib_files = Vec::new(); - let mut extra_lib_paths = Vec::new(); - let mut extra_objects = Vec::new(); - let mut defsym_defs: Vec<(String, String)> = Vec::new(); - - for arg in user_args { - if arg == "-nostdlib" || arg == "-shared" || arg == "-static" || arg == "-r" { - continue; - } else if let Some(libarg) = arg.strip_prefix("-l") { - if let Some(rest) = libarg.strip_prefix(':') { - extra_lib_files.push(rest.to_string()); - } else { - extra_libs.push(libarg.to_string()); - } - } else if let Some(rest) = arg.strip_prefix("-L") { - extra_lib_paths.push(rest.to_string()); - } else if arg == "-rdynamic" || arg == "--export-dynamic" { - // Accepted but not currently used - } else if let Some(wl_args) = arg.strip_prefix("-Wl,") { - let parts: Vec<&str> = wl_args.split(',').collect(); - let mut j = 0; - while j < parts.len() { - let part = parts[j]; - if let Some(libarg) = part.strip_prefix("-l") { - if let Some(rest) = libarg.strip_prefix(':') { - extra_lib_files.push(rest.to_string()); - } else { - extra_libs.push(libarg.to_string()); - } - } else if let Some(rest) = part.strip_prefix("-L") { - extra_lib_paths.push(rest.to_string()); - } else if let Some(defsym_arg) = 
part.strip_prefix("--defsym=") { - // --defsym=SYMBOL=EXPR: define a symbol alias - // TODO: only supports symbol-to-symbol aliasing, not arbitrary expressions - if let Some(eq_pos) = defsym_arg.find('=') { - defsym_defs.push((defsym_arg[..eq_pos].to_string(), defsym_arg[eq_pos + 1..].to_string())); - } - } else if part == "--defsym" && j + 1 < parts.len() { - // Two-argument form: --defsym SYM=VAL - j += 1; - let defsym_arg = parts[j]; - if let Some(eq_pos) = defsym_arg.find('=') { - defsym_defs.push((defsym_arg[..eq_pos].to_string(), defsym_arg[eq_pos + 1..].to_string())); - } - } - j += 1; - } - } else if !arg.starts_with('-') && Path::new(arg.as_str()).exists() { - extra_objects.push(arg.clone()); - } - } - - (extra_libs, extra_lib_files, extra_lib_paths, extra_objects, defsym_defs) -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 2: Collect input files -// ══════════════════════════════════════════════════════════════════════════════ - -pub(super) fn collect_input_files( - object_files: &[&str], - extra_objects: &[String], - crt_before: &[&str], - crt_after: &[&str], - is_nostdlib: bool, - is_static: bool, - lib_paths: &[&str], -) -> Vec { - let mut all_objects = Vec::new(); - - for path in crt_before { - if Path::new(path).exists() { - all_objects.push(path.to_string()); - } - } - for obj in object_files { - all_objects.push(obj.to_string()); - } - for obj in extra_objects { - all_objects.push(obj.clone()); - } - for path in crt_after { - if Path::new(path).exists() { - all_objects.push(path.to_string()); - } - } - - // Add libc_nonshared.a for dynamic linking - if !is_nostdlib && !is_static { - for dir in lib_paths { - let path = format!("{}/libc_nonshared.a", dir); - if Path::new(&path).exists() { - all_objects.push(path); - break; - } - } - } - - all_objects -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 3: Library resolution -// 
══════════════════════════════════════════════════════════════════════════════ - -pub(super) fn load_libraries( - is_static: bool, - _is_nostdlib: bool, - needed_libs: &[&str], - extra_libs: &[String], - extra_lib_files: &[String], - all_lib_dirs: &[String], -) -> (HashMap, bool, u8)>, Vec) { - let mut dynlib_syms: HashMap, bool, u8)> = HashMap::new(); - let mut static_lib_objects: Vec = Vec::new(); - let all_lib_refs: Vec<&str> = all_lib_dirs.iter().map(|s| s.as_str()).collect(); - - if !is_static { - let mut libs_to_scan: Vec = needed_libs.iter().map(|s| s.to_string()).collect(); - libs_to_scan.extend(extra_libs.iter().cloned()); - - for lib in &libs_to_scan { - if !scan_shared_lib(lib, &all_lib_refs, &mut dynlib_syms) { - // No .so found, try static archive - let ar_filename = format!("lib{}.a", lib); - for dir in &all_lib_refs { - let path = format!("{}/{}", dir, ar_filename); - if Path::new(&path).exists() { - static_lib_objects.push(path); - break; - } - } - } - } - } - - // Handle -l flags in static linking mode - if is_static { - let mut libs_to_scan: Vec = needed_libs.iter().map(|s| s.to_string()).collect(); - libs_to_scan.extend(extra_libs.iter().cloned()); - for lib in &libs_to_scan { - let ar_filename = format!("lib{}.a", lib); - for dir in &all_lib_refs { - let path = format!("{}/{}", dir, ar_filename); - if Path::new(&path).exists() { - static_lib_objects.push(path); - break; - } - } - } - } - - // Handle -l:filename - for filename in extra_lib_files { - for dir in &all_lib_refs { - let path = format!("{}/{}", dir, filename); - if Path::new(&path).exists() { - if filename.ends_with(".a") || filename.ends_with(".o") { - static_lib_objects.push(path); - } else if !is_static { - let real_path = std::fs::canonicalize(&path).ok(); - let check_path = real_path.as_ref() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or(path.clone()); - if let Ok(syms) = read_dynsyms_with_search(&check_path, &all_lib_refs) { - let lib_soname = filename.clone(); - for 
sym in syms { - insert_dynsym(&mut dynlib_syms, sym, &lib_soname); - } - } - static_lib_objects.push(path); - } else { - static_lib_objects.push(path); - } - break; - } - } - } - - (dynlib_syms, static_lib_objects) -} - -/// Try to find and scan a shared library, returning true if found. -pub(super) fn scan_shared_lib( - lib: &str, - lib_refs: &[&str], - dynlib_syms: &mut HashMap, bool, u8)>, -) -> bool { - let so_base = format!("lib{}.so", lib); - for dir in lib_refs { - let mut candidates: Vec = vec![format!("{}/{}", dir, so_base)]; - if let Ok(entries) = std::fs::read_dir(dir) { - for entry in entries.flatten() { - let fname = entry.file_name().to_string_lossy().into_owned(); - if fname.starts_with(&so_base) && fname.len() > so_base.len() - && fname.as_bytes()[so_base.len()] == b'.' - { - candidates.push(format!("{}/{}", dir, fname)); - } - } - } - for cand in &candidates { - let real_path = std::fs::canonicalize(cand).ok(); - let check_path = real_path.as_ref() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or(cand.clone()); - if let Ok(syms) = read_dynsyms_with_search(&check_path, lib_refs) { - // Read the actual SONAME from the ELF file; fall back to hardcoded defaults - let lib_soname = parse_soname_elf32(&check_path) - .unwrap_or_else(|| { - if lib == "c" { "libc.so.6".to_string() } - else if lib == "m" { "libm.so.6".to_string() } - else { format!("lib{}.so", lib) } - }); - for sym in syms { - insert_dynsym(dynlib_syms, sym, &lib_soname); - } - return true; - } - } - } - false -} - -pub(super) fn insert_dynsym( - dynlib_syms: &mut HashMap, bool, u8)>, - sym: DynSymInfo, - lib_soname: &str, -) { - let entry = dynlib_syms.entry(sym.name.clone()); - match entry { - std::collections::hash_map::Entry::Vacant(e) => { - e.insert((lib_soname.to_string(), sym.sym_type, sym.size, sym.version, sym.is_default_ver, sym.binding)); - } - std::collections::hash_map::Entry::Occupied(mut e) => { - if sym.is_default_ver && !e.get().4 { - 
e.insert((lib_soname.to_string(), sym.sym_type, sym.size, sym.version, sym.is_default_ver, sym.binding)); - } - } - } -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 4: Parse objects with demand-driven archive extraction -// ══════════════════════════════════════════════════════════════════════════════ - -pub(super) fn load_and_parse_objects(all_objects: &[String], defsym_defs: &[(String, String)]) -> Result<(Vec, Vec), String> { - let mut inputs: Vec = Vec::new(); - let mut archive_pool: Vec = Vec::new(); - - for obj_path in all_objects { - let data = std::fs::read(obj_path) - .map_err(|e| format!("cannot read {}: {}", obj_path, e))?; - if data.len() >= 8 && &data[0..8] == b"!\n" { - let members = parse_archive(&data, obj_path)?; - for (name, mdata) in members { - let member_name = format!("{}({})", obj_path, name); - if let Ok(obj) = parse_elf32(&mdata, &member_name) { - archive_pool.push(obj); - } - } - } else if is_thin_archive(&data) { - let members = parse_thin_archive_i686(&data, obj_path)?; - for (name, mdata) in members { - let member_name = format!("{}({})", obj_path, name); - if let Ok(obj) = parse_elf32(&mdata, &member_name) { - archive_pool.push(obj); - } - } - } else { - inputs.push(parse_elf32(&data, obj_path)?); - } - } - - // Demand-driven archive member extraction - resolve_archive_members(&mut inputs, &mut archive_pool, defsym_defs); - - Ok((inputs, archive_pool)) -} - -/// Pull in archive members that satisfy undefined symbols, iterating until stable. 
-pub(super) fn resolve_archive_members(inputs: &mut Vec, archive_pool: &mut Vec, defsym_defs: &[(String, String)]) { - let mut defined: HashSet = HashSet::new(); - let mut undefined: HashSet = HashSet::new(); - - for obj in inputs.iter() { - for sym in &obj.symbols { - if sym.name.is_empty() || sym.sym_type == STT_FILE || sym.sym_type == STT_SECTION { - continue; - } - if sym.section_index != SHN_UNDEF { - defined.insert(sym.name.clone()); - } else { - undefined.insert(sym.name.clone()); - } - } - } - undefined.retain(|s| !defined.contains(s)); - - // For --defsym aliases (e.g. fmod=__ieee754_fmod), if the alias is - // undefined we also need the target symbol to be pulled from archives. - for (alias, target) in defsym_defs { - if undefined.contains(alias) && !defined.contains(target) { - undefined.insert(target.clone()); - } - } - - let mut changed = true; - while changed { - changed = false; - let mut i = 0; - while i < archive_pool.len() { - let resolves = archive_pool[i].symbols.iter().any(|sym| { - !sym.name.is_empty() - && sym.sym_type != STT_FILE - && sym.sym_type != STT_SECTION - && sym.section_index != SHN_UNDEF - && undefined.contains(&sym.name) - }); - if resolves { - let obj = archive_pool.remove(i); - for sym in &obj.symbols { - if sym.name.is_empty() || sym.sym_type == STT_FILE || sym.sym_type == STT_SECTION { - continue; - } - if sym.section_index != SHN_UNDEF { - defined.insert(sym.name.clone()); - undefined.remove(&sym.name); - } else if !defined.contains(&sym.name) { - undefined.insert(sym.name.clone()); - } - } - inputs.push(obj); - changed = true; - } else { - i += 1; - } - } - } -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 5: Section merging -// ══════════════════════════════════════════════════════════════════════════════ - diff --git a/src/backend/i686/linker/link.rs b/src/backend/i686/linker/link.rs deleted file mode 100644 index e921a0fe9b..0000000000 --- a/src/backend/i686/linker/link.rs 
+++ /dev/null @@ -1,216 +0,0 @@ -//! i686 linker orchestration. -//! -//! Contains the two public entry points (`link_builtin` and `link_shared`) that -//! orchestrate the linking pipeline: parse arguments, load inputs, merge sections, -//! resolve symbols, build PLT/GOT, and emit the ELF32 executable or shared library. - -use std::collections::HashMap; -use std::path::Path; - -use super::types::*; -use super::input::*; -use super::sections::merge_sections; -use super::symbols::*; -use super::emit::emit_executable; -use super::shared::{resolve_dynamic_symbols_for_shared, emit_shared_library_32}; - -/// Built-in linker entry point with pre-resolved CRT objects and library paths. -pub fn link_builtin( - object_files: &[&str], - output_path: &str, - user_args: &[String], - lib_paths: &[&str], - needed_libs_param: &[&str], - crt_objects_before: &[&str], - crt_objects_after: &[&str], -) -> Result<(), String> { - let is_nostdlib = user_args.iter().any(|a| a == "-nostdlib"); - let is_static = user_args.iter().any(|a| a == "-static"); - - // Phase 1: Parse arguments and collect file lists - let (extra_libs, extra_lib_files, extra_lib_paths, extra_objects, defsym_defs) = parse_user_args(user_args); - - let all_lib_dirs: Vec = extra_lib_paths.into_iter() - .chain(lib_paths.iter().map(|s| s.to_string())) - .collect(); - - // Phase 2: Collect all input objects in link order - let all_objects = collect_input_files( - object_files, &extra_objects, crt_objects_before, crt_objects_after, - is_nostdlib, is_static, lib_paths, - ); - - // Phase 3: Load dynamic library symbols and resolve static libs from -l flags - let (dynlib_syms, static_lib_objects) = load_libraries( - is_static, is_nostdlib, needed_libs_param, &extra_libs, &extra_lib_files, - &all_lib_dirs, - ); - - // Phase 4: Parse all input objects and archives - let mut all_objs = all_objects; - for lib_path in &static_lib_objects { - all_objs.push(lib_path.clone()); - } - - let (inputs, _archive_pool) = 
load_and_parse_objects(&all_objs, &defsym_defs)?; - - // Phase 5: Merge sections - let (mut output_sections, mut section_name_to_idx, section_map) = merge_sections(&inputs); - - // Phase 6: Resolve symbols - let (mut global_symbols, sym_resolution) = resolve_symbols( - &inputs, &output_sections, §ion_map, &dynlib_syms, - ); - - // Phase 6b: Allocate COMMON symbols in .bss - allocate_common_symbols(&inputs, &mut output_sections, &mut section_name_to_idx, &mut global_symbols); - - // Phase 7: Mark PLT/GOT needs and check undefined - mark_plt_got_needs(&inputs, &mut global_symbols, is_static); - - // Apply --defsym definitions: alias one symbol to another - for (alias, target) in &defsym_defs { - if let Some(target_sym) = global_symbols.get(target).cloned() { - global_symbols.insert(alias.clone(), target_sym); - } - } - - check_undefined_symbols(&global_symbols)?; - - // Phase 8: Build PLT/GOT structures - let (plt_symbols, got_dyn_symbols, got_local_symbols, num_plt, num_got_total) = build_plt_got_lists(&mut global_symbols); - - // Phase 8b: Mark WEAK dynamic data symbols for text relocations instead of COPY - if !is_static { - let weak_data_syms: Vec = global_symbols.iter() - .filter(|(_, s)| s.is_dynamic && s.needs_copy && s.binding == STB_WEAK - && s.sym_type != STT_FUNC && s.sym_type != STT_GNU_IFUNC) - .map(|(n, _)| n.clone()) - .collect(); - for name in &weak_data_syms { - if let Some(sym) = global_symbols.get_mut(name) { - sym.needs_copy = false; - sym.uses_textrel = true; - } - } - } - - // Phase 9: Collect IFUNC symbols for static linking - let ifunc_symbols = collect_ifunc_symbols(&global_symbols, is_static); - - // Phase 10: Layout + emit - emit_executable( - &inputs, &mut output_sections, §ion_name_to_idx, §ion_map, - &mut global_symbols, &sym_resolution, - &dynlib_syms, &plt_symbols, &got_dyn_symbols, &got_local_symbols, - num_plt, num_got_total, &ifunc_symbols, - is_static, is_nostdlib, needed_libs_param, - output_path, - ) -} - -// 
══════════════════════════════════════════════════════════════════════════════ -// Shared library linker (-shared) -// ══════════════════════════════════════════════════════════════════════════════ - -/// Create a shared library (.so) from ELF32 object files. -/// -/// Produces an ELF32 `ET_DYN` file with base address 0, exporting all defined -/// global symbols. Used when the compiler is invoked with `-shared`. -pub fn link_shared( - object_files: &[&str], - output_path: &str, - user_args: &[String], - lib_paths: &[&str], -) -> Result<(), String> { - // Parse user args for -L, -l, -Wl,-soname=, bare .o/.a files - let mut extra_lib_paths: Vec = Vec::new(); - let mut libs_to_load: Vec = Vec::new(); - let mut extra_object_files: Vec = Vec::new(); - let mut soname: Option = None; - let mut i = 0; - let args: Vec<&str> = user_args.iter().map(|s| s.as_str()).collect(); - while i < args.len() { - let arg = args[i]; - if let Some(path) = arg.strip_prefix("-L") { - let p = if path.is_empty() && i + 1 < args.len() { i += 1; args[i] } else { path }; - extra_lib_paths.push(p.to_string()); - } else if let Some(lib) = arg.strip_prefix("-l") { - let l = if lib.is_empty() && i + 1 < args.len() { i += 1; args[i] } else { lib }; - libs_to_load.push(l.to_string()); - } else if let Some(wl_arg) = arg.strip_prefix("-Wl,") { - let parts: Vec<&str> = wl_arg.split(',').collect(); - let mut j = 0; - while j < parts.len() { - let part = parts[j]; - if let Some(sn) = part.strip_prefix("-soname=") { - soname = Some(sn.to_string()); - } else if part == "-soname" && j + 1 < parts.len() { - j += 1; - soname = Some(parts[j].to_string()); - } else if let Some(lpath) = part.strip_prefix("-L") { - extra_lib_paths.push(lpath.to_string()); - } else if let Some(lib) = part.strip_prefix("-l") { - libs_to_load.push(lib.to_string()); - } - j += 1; - } - } else if arg == "-shared" || arg == "-nostdlib" || arg == "-o" { - if arg == "-o" { i += 1; } - } else if !arg.starts_with('-') && 
Path::new(arg).exists() { - extra_object_files.push(arg.to_string()); - } - i += 1; - } - - // Collect all objects to parse - let mut all_objs: Vec = object_files.iter().map(|s| s.to_string()).collect(); - all_objs.extend(extra_object_files); - - // Parse all input objects - let defsym_defs: Vec<(String, String)> = Vec::new(); - let (inputs, _archive_pool) = load_and_parse_objects(&all_objs, &defsym_defs)?; - - // Merge sections - let (mut output_sections, section_name_to_idx, section_map) = merge_sections(&inputs); - - // Resolve symbols (no dynamic library symbols for shared lib output) - let dynlib_syms: HashMap, bool, u8)> = HashMap::new(); - let (mut global_symbols, _sym_resolution) = resolve_symbols( - &inputs, &output_sections, §ion_map, &dynlib_syms, - ); - - // Load -l libraries (resolve into archives and load them) - let lib_path_strings: Vec = lib_paths.iter().map(|s| s.to_string()).collect(); - let mut all_lib_paths: Vec = extra_lib_paths; - all_lib_paths.extend(lib_path_strings.iter().cloned()); - - if !libs_to_load.is_empty() { - for lib_name in &libs_to_load { - // Search for static archive only in shared library mode - for dir in &all_lib_paths { - let cand = format!("{}/lib{}.a", dir, lib_name); - if Path::new(&cand).exists() { - let objs = vec![cand]; - let (extra_inputs, _) = load_and_parse_objects(&objs, &defsym_defs)?; - // Add symbols from these archives - for _inp in &extra_inputs { - // TODO: properly merge archive objects - } - break; - } - } - } - } - - // Discover NEEDED dependencies by scanning for undefined symbols - let mut needed_sonames: Vec = Vec::new(); - resolve_dynamic_symbols_for_shared(&inputs, &global_symbols, &mut needed_sonames, &all_lib_paths); - - // Emit shared library - emit_shared_library_32( - &inputs, &mut global_symbols, &mut output_sections, - §ion_name_to_idx, §ion_map, - &needed_sonames, output_path, soname, - ) -} diff --git a/src/backend/i686/linker/mod.rs b/src/backend/i686/linker/mod.rs deleted file mode 
100644 index b9461963ce..0000000000 --- a/src/backend/i686/linker/mod.rs +++ /dev/null @@ -1,52 +0,0 @@ -//! Native i686 (32-bit x86) ELF linker. -//! -//! Links ELF32 relocatable objects (.o) and archives (.a) into a dynamically- -//! linked or static ELF32 executable. Supports PLT/GOT for dynamic symbols, -//! TLS (all i386 models), GNU hash tables, GLIBC version tables, copy -//! relocations, COMDAT group deduplication, and IFUNC (IRELATIVE) for static. -//! -//! ## Module structure -//! -//! - `types` - ELF32 constants, structures, and linker state types -//! - `parse` - ELF32 object file parsing -//! - `dynsym` - Dynamic symbol reading from shared libraries -//! - `reloc` - i386 relocation application -//! - `gnu_hash` - GNU hash table building -//! - `input` - Phases 1-4: argument parsing, file loading, archive resolution -//! - `sections` - Phase 5: section merging and COMDAT deduplication -//! - `symbols` - Phases 6-9: symbol resolution, PLT/GOT marking, IFUNC collection -//! - `shared` - Shared library (.so) emission -//! - `emit` - Phase 10: executable layout and ELF32 emission -//! - `link` - Orchestration: `link_builtin` and `link_shared` entry points - -#[allow(dead_code)] // ELF constants defined for completeness; not all used yet -mod types; -mod parse; -mod dynsym; -mod reloc; -mod gnu_hash; -mod input; -mod sections; -mod symbols; -mod shared; -mod emit; -mod link; - -use crate::backend::linker_common; - -// ── DynStrTab using linker_common ───────────────────────────────────────── -// Wraps linker_common::DynStrTab (usize offsets) for i686's u32 needs. 
- -struct DynStrTab(linker_common::DynStrTab); - -impl DynStrTab { - fn new() -> Self { Self(linker_common::DynStrTab::new()) } - fn add(&mut self, s: &str) -> u32 { self.0.add(s) as u32 } - fn get_offset(&self, s: &str) -> u32 { self.0.get_offset(s) as u32 } - fn as_bytes(&self) -> &[u8] { self.0.as_bytes() } -} - -#[cfg(not(feature = "gcc_linker"))] -pub use link::link_builtin; -#[cfg(not(feature = "gcc_linker"))] -pub use link::link_shared; diff --git a/src/backend/i686/linker/parse.rs b/src/backend/i686/linker/parse.rs deleted file mode 100644 index d1487ed8f7..0000000000 --- a/src/backend/i686/linker/parse.rs +++ /dev/null @@ -1,195 +0,0 @@ -//! ELF32 object file parsing for the i686 linker. -//! -//! Handles parsing of relocatable ELF32 .o files, regular archives (.a), -//! and thin archives. This is separate from the ELF64 parser in `linker_common` -//! because ELF32 has different field widths (u32 vs u64 for addresses/offsets). - -use std::collections::HashMap; - -use super::types::*; - -/// Parse an ELF32 relocatable object file. 
-pub(super) fn parse_elf32(data: &[u8], filename: &str) -> Result { - if data.len() < 52 { - return Err(format!("{}: too small for ELF header", filename)); - } - if data[0..4] != ELF_MAGIC { - return Err(format!("{}: not an ELF file", filename)); - } - if data[4] != ELFCLASS32 { - return Err(format!("{}: not ELF32", filename)); - } - if data[5] != ELFDATA2LSB { - return Err(format!("{}: not little-endian", filename)); - } - let e_type = read_u16(data, 16); - if e_type != ET_REL { - return Err(format!("{}: not a relocatable object (type={})", filename, e_type)); - } - let e_machine = read_u16(data, 18); - if e_machine != EM_386 { - return Err(format!("{}: not i386 (machine={})", filename, e_machine)); - } - - let e_shoff = read_u32(data, 32) as usize; - let e_shentsize = read_u16(data, 46) as usize; - let e_shnum = read_u16(data, 48) as usize; - let e_shstrndx = read_u16(data, 50) as usize; - - // Parse section headers - let mut shdrs = Vec::with_capacity(e_shnum); - for i in 0..e_shnum { - let off = e_shoff + i * e_shentsize; - shdrs.push(Elf32Shdr { - name: read_u32(data, off), - sh_type: read_u32(data, off + 4), - flags: read_u32(data, off + 8), - addr: read_u32(data, off + 12), - offset: read_u32(data, off + 16), - size: read_u32(data, off + 20), - link: read_u32(data, off + 24), - info: read_u32(data, off + 28), - addralign: read_u32(data, off + 32), - entsize: read_u32(data, off + 36), - }); - } - - // Read section name string table - let shstrtab = &shdrs[e_shstrndx]; - let shstrtab_data = &data[shstrtab.offset as usize..(shstrtab.offset + shstrtab.size) as usize]; - - // Find symtab and strtab - let mut symtab_idx = None; - let mut strtab_data: &[u8] = &[]; - for (i, shdr) in shdrs.iter().enumerate() { - if shdr.sh_type == SHT_SYMTAB { - symtab_idx = Some(i); - let str_idx = shdr.link as usize; - let str_shdr = &shdrs[str_idx]; - strtab_data = &data[str_shdr.offset as usize..(str_shdr.offset + str_shdr.size) as usize]; - } - } - - // Parse symbols - let mut 
symbols = Vec::new(); - if let Some(si) = symtab_idx { - let sym_shdr = &shdrs[si]; - let sym_count = sym_shdr.size / sym_shdr.entsize; - for j in 0..sym_count { - let off = (sym_shdr.offset + j * sym_shdr.entsize) as usize; - let st_name = read_u32(data, off); - let st_value = read_u32(data, off + 4); - let st_size = read_u32(data, off + 8); - let st_info = data[off + 12]; - let st_other = data[off + 13]; - let st_shndx = read_u16(data, off + 14); - let mut sym_name = read_cstr(strtab_data, st_name as usize); - // Defense-in-depth: strip @PLT suffix from symbol names. - if sym_name.ends_with("@PLT") { - sym_name.truncate(sym_name.len() - 4); - } - symbols.push(InputSymbol { - name: sym_name, - value: st_value, - size: st_size, - binding: st_info >> 4, - sym_type: st_info & 0xf, - visibility: st_other & 3, - section_index: st_shndx, - }); - } - } - - // Build relocation map: section index -> list of REL section indices - let mut rel_map: HashMap> = HashMap::new(); - for (i, shdr) in shdrs.iter().enumerate() { - if shdr.sh_type == SHT_REL { - rel_map.entry(shdr.info as usize).or_default().push(i); - } - } - - // Parse sections with their relocations - let mut sections = Vec::with_capacity(e_shnum); - for (i, shdr) in shdrs.iter().enumerate() { - let sec_name = read_cstr(shstrtab_data, shdr.name as usize); - let sec_data = if shdr.sh_type != SHT_NOBITS && shdr.size > 0 { - data[shdr.offset as usize..(shdr.offset + shdr.size) as usize].to_vec() - } else { - vec![0u8; shdr.size as usize] - }; - - let mut relocs = Vec::new(); - if let Some(rel_indices) = rel_map.get(&i) { - for &ri in rel_indices { - let rel_shdr = &shdrs[ri]; - let count = rel_shdr.size / rel_shdr.entsize.max(8); - for j in 0..count { - let roff = (rel_shdr.offset + j * rel_shdr.entsize.max(8)) as usize; - let r_offset = read_u32(data, roff); - let r_info = read_u32(data, roff + 4); - let sym_idx = r_info >> 8; - let rel_type = r_info & 0xff; - // For REL (not RELA), the addend is implicit in the 
section data - let addend = if rel_type != R_386_NONE && (r_offset as usize + 4) <= sec_data.len() { - read_i32(&sec_data, r_offset as usize) - } else { - 0 - }; - relocs.push((r_offset, rel_type, sym_idx, addend)); - } - } - } - - sections.push(InputSection { - name: sec_name, - sh_type: shdr.sh_type, - flags: shdr.flags, - data: sec_data, - align: shdr.addralign.max(1), - relocations: relocs, - input_index: i, - entsize: shdr.entsize, - link: shdr.link, - info: shdr.info, - }); - } - - Ok(InputObject { - sections, - symbols, - filename: filename.to_string(), - }) -} - -/// Parse a regular (.a) archive, returning ELF32 members. -pub(super) fn parse_archive(data: &[u8], _filename: &str) -> Result)>, String> { - let raw_members = parse_archive_members(data)?; - let mut members = Vec::new(); - for (name, offset, size) in raw_members { - let content = &data[offset..offset + size]; - if content.len() >= 4 && content[0..4] == ELF_MAGIC { - members.push((name, content.to_vec())); - } - } - Ok(members) -} - -/// Parse a GNU thin archive, reading member data from external files. -pub(super) fn parse_thin_archive_i686(data: &[u8], archive_path: &str) -> Result)>, String> { - let member_names = parse_thin_archive_members(data)?; - let archive_dir = std::path::Path::new(archive_path) - .parent() - .unwrap_or_else(|| std::path::Path::new(".")); - let mut members = Vec::new(); - for name in member_names { - let member_path = archive_dir.join(&name); - let content = std::fs::read(&member_path).map_err(|e| { - format!("thin archive {}: failed to read member '{}': {}", - archive_path, member_path.display(), e) - })?; - if content.len() >= 4 && content[0..4] == ELF_MAGIC { - members.push((name, content)); - } - } - Ok(members) -} diff --git a/src/backend/i686/linker/reloc.rs b/src/backend/i686/linker/reloc.rs deleted file mode 100644 index 48ddfa366c..0000000000 --- a/src/backend/i686/linker/reloc.rs +++ /dev/null @@ -1,302 +0,0 @@ -//! 
i386 relocation application for the i686 linker. -//! -//! Applies relocations from input objects to merged output sections. -//! All i386 relocation types are handled here, separated from the main -//! linking logic to keep the code manageable. -//! -//! The relocation types supported include absolute (R_386_32), PC-relative -//! (R_386_PC32, R_386_PLT32), GOT-related (R_386_GOT32, R_386_GOT32X, -//! R_386_GOTPC, R_386_GOTOFF), and TLS relocations. - -use std::collections::HashMap; - -use super::types::*; - -/// Context for relocation application, containing all addresses needed -/// to resolve relocations. -pub(super) struct RelocContext<'a> { - pub global_symbols: &'a HashMap, - pub output_sections: &'a mut Vec, - pub section_map: &'a SectionMap, - pub got_base: u32, - pub got_vaddr: u32, - pub gotplt_vaddr: u32, - pub got_reserved: usize, - pub gotplt_reserved: u32, - #[allow(dead_code)] // Set by linker layout; available for future PLT-relative relocations - pub plt_vaddr: u32, - #[allow(dead_code)] // Set by linker layout; available for future PLT-relative relocations - pub plt_header_size: u32, - #[allow(dead_code)] // Set by linker layout; available for future PLT-relative relocations - pub plt_entry_size: u32, - pub num_plt: usize, - pub tls_addr: u32, - pub tls_mem_size: u32, - pub has_tls: bool, -} - -/// Apply all relocations from input objects to the output sections. -/// Returns a list of text relocations (address, dynsym_index) for symbols using textrel. 
-pub(super) fn apply_relocations( - inputs: &[InputObject], - ctx: &mut RelocContext, -) -> Result, String> { - let mut text_relocs: Vec<(u32, String)> = Vec::new(); - for (obj_idx, obj) in inputs.iter().enumerate() { - for sec in &obj.sections { - if sec.relocations.is_empty() { continue; } - - let _out_name = match output_section_name(&sec.name, sec.flags, sec.sh_type) { - Some(n) => n, - None => continue, - }; - let (out_sec_idx, sec_base_offset) = match ctx.section_map.get(&(obj_idx, sec.input_index)) { - Some(&v) => v, - None => continue, - }; - - for &(rel_offset, rel_type, sym_idx, addend) in &sec.relocations { - let tr = apply_one_reloc( - obj_idx, obj, sec, out_sec_idx, sec_base_offset, - rel_offset, rel_type, sym_idx, addend, - ctx, - )?; - if let Some(t) = tr { - text_relocs.push(t); - } - } - } - } - Ok(text_relocs) -} - -/// Apply a single relocation. -/// Returns Some((patch_addr, sym_name)) if a text relocation entry is needed. -fn apply_one_reloc( - obj_idx: usize, - obj: &InputObject, - _sec: &InputSection, - out_sec_idx: usize, - sec_base_offset: u32, - rel_offset: u32, - rel_type: u32, - sym_idx: u32, - addend: i32, - ctx: &mut RelocContext, -) -> Result, String> { - let patch_offset = sec_base_offset + rel_offset; - let patch_addr = ctx.output_sections[out_sec_idx].addr + patch_offset; - - let sym = if (sym_idx as usize) < obj.symbols.len() { - &obj.symbols[sym_idx as usize] - } else { - return Err(format!("invalid symbol index {} in reloc", sym_idx)); - }; - - let sym_addr = resolve_sym_addr(obj_idx, sym, ctx); - - // Check if this symbol goes through PLT - let is_dyn = !sym.name.is_empty() && ctx.global_symbols.get(&sym.name) - .map(|gs| gs.is_dynamic && gs.needs_plt).unwrap_or(false); - - let mut relax_got32x = false; - let mut text_reloc: Option<(u32, String)> = None; - - let value: u32 = match rel_type { - R_386_NONE => return Ok(None), - R_386_32 => { - // Check if this symbol uses text relocations (WEAK dynamic data) - if 
!sym.name.is_empty() { - if let Some(gs) = ctx.global_symbols.get(&sym.name) { - if gs.uses_textrel { - // Record a text relocation; write 0 for now (dynamic linker fills it) - text_reloc = Some((patch_addr, sym.name.clone())); - addend as u32 - } else { - (sym_addr as i32 + addend) as u32 - } - } else { - (sym_addr as i32 + addend) as u32 - } - } else { - (sym_addr as i32 + addend) as u32 - } - } - R_386_PC32 | R_386_PLT32 => { - let s = if is_dyn { - ctx.global_symbols.get(&sym.name).map(|gs| gs.address).unwrap_or(0) - } else { - sym_addr - }; - (s as i32 + addend - patch_addr as i32) as u32 - } - R_386_GOTPC => { - (ctx.got_base as i32 + addend - patch_addr as i32) as u32 - } - R_386_GOTOFF => { - (sym_addr as i32 + addend - ctx.got_base as i32) as u32 - } - R_386_GOT32 | R_386_GOT32X => { - resolve_got_reloc(sym, sym_addr, addend, rel_type, ctx, &mut relax_got32x) - } - R_386_TLS_TPOFF | R_386_TLS_LE => { - // Negative offset from TP - let tpoff = sym_addr as i32 - ctx.tls_addr as i32 - ctx.tls_mem_size as i32; - (tpoff + addend) as u32 - } - R_386_TLS_LE_32 | R_386_TLS_TPOFF32 => { - // ccc emits `add` with TLS_TPOFF32, so compute negative offset - // (same as TLS_TPOFF/TLS_LE) to match the `add` instruction. 
- let tpoff = sym_addr as i32 - ctx.tls_addr as i32 - ctx.tls_mem_size as i32; - (tpoff + addend) as u32 - } - R_386_TLS_IE => { - resolve_tls_ie(sym, sym_addr, addend, ctx) - } - R_386_TLS_GOTIE => { - resolve_tls_gotie(sym, sym_addr, addend, ctx) - } - R_386_TLS_GD => { - if ctx.has_tls && sym.sym_type == STT_TLS { - let tpoff = sym_addr as i32 - ctx.tls_addr as i32 - ctx.tls_mem_size as i32; - (tpoff + addend) as u32 - } else { - addend as u32 - } - } - R_386_TLS_DTPMOD32 => 1u32, - R_386_TLS_DTPOFF32 => { - if ctx.has_tls { - (sym_addr as i32 - ctx.tls_addr as i32 + addend) as u32 - } else { - addend as u32 - } - } - other => { - return Err(format!( - "unsupported i686 relocation type {} at {}:0x{:x}", - other, obj.filename, rel_offset - )); - } - }; - - // Patch the output section data - let out_sec = &mut ctx.output_sections[out_sec_idx]; - let off = patch_offset as usize; - if off + 4 <= out_sec.data.len() { - // For GOT32X relaxation, rewrite mov (0x8b) → lea (0x8d) - if relax_got32x && off >= 2 && out_sec.data[off - 2] == 0x8b { - out_sec.data[off - 2] = 0x8d; - } - out_sec.data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - - Ok(text_reloc) -} - -/// Resolve a symbol's address, handling local, section, and global symbols. -fn resolve_sym_addr(obj_idx: usize, sym: &InputSymbol, ctx: &RelocContext) -> u32 { - if sym.sym_type == STT_SECTION { - if sym.section_index != SHN_UNDEF && sym.section_index != SHN_ABS { - match ctx.section_map.get(&(obj_idx, sym.section_index as usize)) { - Some(&(sec_out_idx, sec_out_offset)) => { - ctx.output_sections[sec_out_idx].addr + sec_out_offset - } - None => 0, - } - } else { - 0 - } - } else if sym.name.is_empty() { - 0 - } else if sym.binding == STB_LOCAL { - // Local symbols resolve per-object via section_map to avoid - // collisions between identically-named locals (e.g. .LC0). 
- resolve_via_section_map(obj_idx, sym, ctx) - } else { - match ctx.global_symbols.get(&sym.name) { - Some(gs) => gs.address, - None => resolve_via_section_map(obj_idx, sym, ctx), - } - } -} - -/// Resolve a symbol address through the section map + symbol value. -fn resolve_via_section_map(obj_idx: usize, sym: &InputSymbol, ctx: &RelocContext) -> u32 { - if sym.section_index != SHN_UNDEF && sym.section_index != SHN_ABS { - match ctx.section_map.get(&(obj_idx, sym.section_index as usize)) { - Some(&(sec_out_idx, sec_out_offset)) => { - ctx.output_sections[sec_out_idx].addr + sec_out_offset + sym.value - } - None => sym.value, - } - } else if sym.section_index == SHN_ABS { - sym.value - } else { - 0 - } -} - -/// Resolve R_386_GOT32 or R_386_GOT32X relocations. -pub(super) fn resolve_got_reloc( - sym: &InputSymbol, - sym_addr: u32, - addend: i32, - rel_type: u32, - ctx: &RelocContext, - relax_got32x: &mut bool, -) -> u32 { - if let Some(gs) = ctx.global_symbols.get(&sym.name) { - if gs.is_dynamic { - let got_entry_addr = if gs.needs_plt { - ctx.gotplt_vaddr + (ctx.gotplt_reserved + gs.plt_index as u32) * 4 - } else { - ctx.got_vaddr + (ctx.got_reserved as u32 + (gs.got_index - ctx.num_plt) as u32) * 4 - }; - (got_entry_addr as i32 + addend - ctx.got_base as i32) as u32 - } else if gs.needs_got { - let got_entry_addr = ctx.got_vaddr + (ctx.got_reserved as u32 + (gs.got_index - ctx.num_plt) as u32) * 4; - (got_entry_addr as i32 + addend - ctx.got_base as i32) as u32 - } else if rel_type == R_386_GOT32X { - *relax_got32x = true; - (sym_addr as i32 + addend - ctx.got_base as i32) as u32 - } else { - (sym_addr as i32 + addend - ctx.got_base as i32) as u32 - } - } else if rel_type == R_386_GOT32X { - *relax_got32x = true; - (sym_addr as i32 + addend - ctx.got_base as i32) as u32 - } else { - (sym_addr as i32 + addend - ctx.got_base as i32) as u32 - } -} - -/// Resolve R_386_TLS_IE relocation. 
-pub(super) fn resolve_tls_ie(sym: &InputSymbol, sym_addr: u32, addend: i32, ctx: &RelocContext) -> u32 { - if let Some(gs) = ctx.global_symbols.get(&sym.name) { - if gs.needs_got { - let got_entry_addr = ctx.got_vaddr + (ctx.got_reserved as u32 + (gs.got_index - ctx.num_plt) as u32) * 4; - (got_entry_addr as i32 + addend) as u32 - } else { - let tpoff = sym_addr as i32 - ctx.tls_addr as i32 - ctx.tls_mem_size as i32; - (tpoff + addend) as u32 - } - } else { - addend as u32 - } -} - -/// Resolve R_386_TLS_GOTIE relocation. -pub(super) fn resolve_tls_gotie(sym: &InputSymbol, sym_addr: u32, addend: i32, ctx: &RelocContext) -> u32 { - if let Some(gs) = ctx.global_symbols.get(&sym.name) { - if gs.needs_got { - let got_entry_addr = ctx.got_vaddr + (ctx.got_reserved as u32 + (gs.got_index - ctx.num_plt) as u32) * 4; - (got_entry_addr as i32 + addend - ctx.got_base as i32) as u32 - } else { - let tpoff = sym_addr as i32 - ctx.tls_addr as i32 - ctx.tls_mem_size as i32; - (tpoff + addend) as u32 - } - } else { - addend as u32 - } -} diff --git a/src/backend/i686/linker/sections.rs b/src/backend/i686/linker/sections.rs deleted file mode 100644 index c0a43af0f5..0000000000 --- a/src/backend/i686/linker/sections.rs +++ /dev/null @@ -1,131 +0,0 @@ -//! Section merging for the i686 linker. -//! -//! Phase 5 of the linking pipeline: merges input sections from all objects -//! into output sections, handling COMDAT group deduplication and section -//! type/flag assignment. 
- -use std::collections::{HashMap, HashSet}; - -use super::types::*; - -pub(super) fn merge_sections( - inputs: &[InputObject], -) -> (Vec, HashMap, SectionMap) { - let mut output_sections: Vec = Vec::new(); - let mut section_name_to_idx: HashMap = HashMap::new(); - let mut section_map: SectionMap = HashMap::new(); - let mut included_comdat_sections: HashSet = HashSet::new(); - - // COMDAT group deduplication - let comdat_skip = compute_comdat_skip(inputs); - - for (obj_idx, obj) in inputs.iter().enumerate() { - for sec in obj.sections.iter() { - if comdat_skip.contains(&(obj_idx, sec.input_index)) { - continue; - } - let out_name = match output_section_name(&sec.name, sec.flags, sec.sh_type) { - Some(n) => n, - None => continue, - }; - - // COMDAT deduplication by section name - if sec.flags & SHF_GROUP != 0 && !included_comdat_sections.insert(sec.name.clone()) { - continue; - } - - let out_idx = if let Some(&idx) = section_name_to_idx.get(&out_name) { - idx - } else { - let idx = output_sections.len(); - let (sh_type, flags) = section_type_and_flags(&out_name, sec); - section_name_to_idx.insert(out_name.clone(), idx); - output_sections.push(OutputSection { - name: out_name, - sh_type, - flags, - data: Vec::new(), - align: 1, - addr: 0, - file_offset: 0, - }); - idx - }; - - let out_sec = &mut output_sections[out_idx]; - // .init and .fini must be concatenated without padding - let align = if out_sec.name == ".init" || out_sec.name == ".fini" { - 1 - } else { - sec.align.max(1) - }; - if align > out_sec.align { - out_sec.align = align; - } - let padding = (align - (out_sec.data.len() as u32 % align)) % align; - out_sec.data.extend(std::iter::repeat_n(0u8, padding as usize)); - let offset = out_sec.data.len() as u32; - - section_map.insert((obj_idx, sec.input_index), (out_idx, offset)); - - if sec.sh_type != SHT_NOBITS { - out_sec.data.extend_from_slice(&sec.data); - } else { - out_sec.data.extend(std::iter::repeat_n(0u8, sec.data.len())); - } - } - } - - 
(output_sections, section_name_to_idx, section_map) -} - -pub(super) fn compute_comdat_skip(inputs: &[InputObject]) -> HashSet<(usize, usize)> { - let mut comdat_skip = HashSet::new(); - let mut seen_groups: HashSet = HashSet::new(); - - for (obj_idx, obj) in inputs.iter().enumerate() { - for sec in obj.sections.iter() { - if sec.sh_type != SHT_GROUP { continue; } - if sec.data.len() < 4 { continue; } - let flags = read_u32(&sec.data, 0); - if flags & 1 == 0 { continue; } - let sig_name = if (sec.info as usize) < obj.symbols.len() { - obj.symbols[sec.info as usize].name.clone() - } else { - continue; - }; - if !seen_groups.insert(sig_name) { - let mut off = 4; - while off + 4 <= sec.data.len() { - let member_idx = read_u32(&sec.data, off) as usize; - comdat_skip.insert((obj_idx, member_idx)); - off += 4; - } - } - } - } - - comdat_skip -} - -pub(super) fn section_type_and_flags(out_name: &str, sec: &InputSection) -> (u32, u32) { - match out_name { - ".text" => (SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR), - ".rodata" => (SHT_PROGBITS, SHF_ALLOC), - ".data" => (SHT_PROGBITS, SHF_ALLOC | SHF_WRITE), - ".bss" => (SHT_NOBITS, SHF_ALLOC | SHF_WRITE), - ".tdata" => (SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_TLS), - ".tbss" => (SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_TLS), - ".init" | ".fini" => (SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR), - ".init_array" => (SHT_INIT_ARRAY, SHF_ALLOC | SHF_WRITE), - ".fini_array" => (SHT_FINI_ARRAY, SHF_ALLOC | SHF_WRITE), - ".eh_frame" => (SHT_PROGBITS, SHF_ALLOC), - ".note" => (SHT_NOTE, SHF_ALLOC), - _ => (sec.sh_type, sec.flags & (SHF_ALLOC | SHF_WRITE | SHF_EXECINSTR)), - } -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 6: Symbol resolution -// ══════════════════════════════════════════════════════════════════════════════ - diff --git a/src/backend/i686/linker/shared.rs b/src/backend/i686/linker/shared.rs deleted file mode 100644 index 2a41ccc8fa..0000000000 --- 
a/src/backend/i686/linker/shared.rs +++ /dev/null @@ -1,1057 +0,0 @@ -//! Shared library (.so) emission for the i686 linker. -//! -//! Produces ELF32 shared libraries (ET_DYN) with PLT/GOT, PIC relocations, -//! `.dynamic` section, GNU hash tables, and GLIBC version tables. - -use std::collections::HashMap; -use std::path::Path; - -use super::types::*; -use super::reloc::{RelocContext, resolve_got_reloc, resolve_tls_ie, resolve_tls_gotie}; -use super::gnu_hash::build_gnu_hash_32; -use super::emit::{layout_section, layout_custom_sections, layout_tls, build_plt}; -use super::DynStrTab; -use crate::backend::linker_common; - - -/// Discover NEEDED shared library dependencies for a shared library build. -pub(super) fn resolve_dynamic_symbols_for_shared( - inputs: &[InputObject], - global_symbols: &HashMap, - needed_sonames: &mut Vec, - lib_paths: &[String], -) { - // Collect undefined symbol names - let mut undefined: Vec = Vec::new(); - for obj in inputs.iter() { - for sym in &obj.symbols { - if sym.binding == STB_LOCAL { continue; } - if sym.section_index == SHN_UNDEF && !sym.name.is_empty() - && !global_symbols.get(&sym.name).map(|gs| gs.is_defined).unwrap_or(false) - && !undefined.contains(&sym.name) - { - undefined.push(sym.name.clone()); - } - } - } - if undefined.is_empty() { return; } - - // Search system libraries for these symbols - let lib_names = ["libc.so.6", "libm.so.6", "libpthread.so.0", "libdl.so.2", "librt.so.1", "libgcc_s.so.1", "ld-linux.so.2"]; - let mut libs: Vec = Vec::new(); - for lib_name in &lib_names { - for dir in lib_paths { - let candidate = format!("{}/{}", dir, lib_name); - if Path::new(&candidate).exists() { - libs.push(candidate); - break; - } - } - } - // Also check i686-specific paths - let extra_dirs = ["/lib/i386-linux-gnu", "/usr/lib/i386-linux-gnu", "/lib32", "/usr/lib32"]; - for lib_name in &lib_names { - for dir in &extra_dirs { - let candidate = format!("{}/{}", dir, lib_name); - if Path::new(&candidate).exists() && 
!libs.contains(&candidate) { - libs.push(candidate); - break; - } - } - } - - for lib_path in &libs { - let data = match std::fs::read(lib_path) { Ok(d) => d, Err(_) => continue }; - let soname_val = linker_common::parse_soname(&data).unwrap_or_else(|| { - Path::new(lib_path).file_name().map(|n| n.to_string_lossy().to_string()).unwrap_or_default() - }); - if needed_sonames.contains(&soname_val) { continue; } - let dyn_syms = match linker_common::parse_shared_library_symbols(&data, lib_path) { - Ok(s) => s, Err(_) => continue, - }; - let provides_any = undefined.iter().any(|name| dyn_syms.iter().any(|ds| ds.name == *name)); - if provides_any { - needed_sonames.push(soname_val); - } - } -} - -/// Emit an ELF32 shared library (.so) file. -/// -/// Key differences from emit_executable: -/// - ELF type is ET_DYN (not ET_EXEC) -/// - Base address is 0 (position-independent) -/// - No PT_INTERP segment -/// - All defined global symbols exported to .dynsym -/// - R_386_RELATIVE relocations for internal absolute addresses -pub(super) fn emit_shared_library_32( - inputs: &[InputObject], - global_symbols: &mut HashMap, - output_sections: &mut Vec, - section_name_to_idx: &HashMap, - section_map: &SectionMap, - needed_sonames: &[String], - output_path: &str, - soname: Option, -) -> Result<(), String> { - let base_addr: u32 = 0; - - // ── Build dynamic string table ──────────────────────────────────────── - let mut dynstr = DynStrTab::new(); - let _ = dynstr.add(""); - for lib in needed_sonames { dynstr.add(lib); } - if let Some(ref sn) = soname { dynstr.add(sn); } - - // ── Identify PLT symbols (undefined function calls) ─────────────────── - let mut plt_names: Vec = Vec::new(); - for obj in inputs.iter() { - for sec in &obj.sections { - for &(_, rel_type, sym_idx, _) in &sec.relocations { - let si = sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.name.is_empty() || sym.binding == STB_LOCAL { continue; } - match rel_type { - 
R_386_PC32 | R_386_PLT32 => { - if let Some(gs) = global_symbols.get(&sym.name) { - if !gs.is_defined && !plt_names.contains(&sym.name) { - plt_names.push(sym.name.clone()); - } - } else if sym.section_index == SHN_UNDEF && !plt_names.contains(&sym.name) { - plt_names.push(sym.name.clone()); - } - } - _ => {} - } - } - } - } - - // Ensure PLT symbols are in global_symbols - for name in &plt_names { - global_symbols.entry(name.clone()).or_insert(LinkerSymbol { - address: 0, size: 0, sym_type: STT_FUNC, binding: STB_GLOBAL, - visibility: STV_DEFAULT, is_defined: false, needs_plt: true, needs_got: true, - output_section: usize::MAX, section_offset: 0, plt_index: 0, got_index: 0, - is_dynamic: true, dynlib: String::new(), needs_copy: false, copy_addr: 0, - version: None, uses_textrel: false, - }); - if let Some(gs) = global_symbols.get_mut(name) { - gs.needs_plt = true; - gs.is_dynamic = true; - } - } - - // Assign PLT indices - for (i, name) in plt_names.iter().enumerate() { - if let Some(gs) = global_symbols.get_mut(name) { - gs.plt_index = i; - } - } - let num_plt = plt_names.len(); - - // ── Identify GOT symbols ────────────────────────────────────────────── - let mut got_names: Vec = Vec::new(); - for obj in inputs.iter() { - for sec in &obj.sections { - for &(_, rel_type, sym_idx, _) in &sec.relocations { - let si = sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.name.is_empty() { continue; } - match rel_type { - R_386_GOT32 | R_386_GOT32X | R_386_TLS_IE | R_386_TLS_GOTIE | R_386_TLS_GD => { - if !got_names.contains(&sym.name) && !plt_names.contains(&sym.name) { - got_names.push(sym.name.clone()); - } - } - _ => {} - } - } - } - } - - // Assign GOT indices (PLT symbols first, then GOT-only symbols) - for (i, name) in got_names.iter().enumerate() { - if let Some(gs) = global_symbols.get_mut(name) { - gs.needs_got = true; - gs.got_index = num_plt + i; - } - } - let _num_got = num_plt + got_names.len(); - - // ── 
Collect all exported symbols ────────────────────────────────────── - let mut exported_names: Vec = Vec::new(); - { - let mut sorted: Vec<&String> = global_symbols.keys().collect(); - sorted.sort(); - for name in sorted { - let gs = &global_symbols[name]; - if gs.is_defined && gs.binding != STB_LOCAL { - exported_names.push(name.clone()); - } - } - } - - // Build dynsym: null entry + undefined PLT imports + defined exports - let mut dynsym_names: Vec = Vec::new(); - let mut dynsym_entries: Vec = Vec::new(); - dynsym_entries.push(Elf32Sym { name: 0, value: 0, size: 0, info: 0, other: 0, shndx: 0 }); - - // Undefined symbols first (PLT imports + GOT imports that are undefined) - let mut undef_names: Vec = Vec::new(); - for name in &plt_names { - undef_names.push(name.clone()); - } - for name in &got_names { - if let Some(gs) = global_symbols.get(name) { - if !gs.is_defined && !undef_names.contains(name) { - undef_names.push(name.clone()); - } - } - } - - for name in &undef_names { - let name_off = dynstr.add(name); - let (bind, stype) = if let Some(gs) = global_symbols.get(name) { - (gs.binding, if gs.sym_type != 0 { gs.sym_type } else { STT_FUNC }) - } else { - (STB_GLOBAL, STT_FUNC) - }; - dynsym_entries.push(Elf32Sym { - name: name_off, value: 0, size: 0, - info: (bind << 4) | stype, other: 0, shndx: SHN_UNDEF, - }); - dynsym_names.push(name.clone()); - } - - let gnu_hash_symoffset = dynsym_entries.len(); - - // Defined/exported symbols (hashed) - for name in &exported_names { - if undef_names.contains(name) { continue; } - let name_off = dynstr.add(name); - let gs = &global_symbols[name]; - // Section index will be filled in after layout - dynsym_entries.push(Elf32Sym { - name: name_off, value: 0, size: gs.size, - info: (gs.binding << 4) | gs.sym_type, other: 0, - shndx: 1, // placeholder, will be fixed - }); - dynsym_names.push(name.clone()); - } - - // Build .gnu.hash for the defined symbols - let defined_for_hash: Vec = dynsym_names[gnu_hash_symoffset - 
1..].to_vec(); - let (gnu_hash_data, sorted_indices) = build_gnu_hash_32(&defined_for_hash, gnu_hash_symoffset as u32); - - // Reorder hashed entries - if !sorted_indices.is_empty() { - let hashed_start = gnu_hash_symoffset; - let names_start = hashed_start - 1; - let orig_entries: Vec = (0..sorted_indices.len()) - .map(|i| dynsym_entries[hashed_start + i].clone()) - .collect(); - let orig_names: Vec = (0..sorted_indices.len()) - .map(|i| dynsym_names[names_start + i].clone()) - .collect(); - for (new_pos, &orig_idx) in sorted_indices.iter().enumerate() { - dynsym_entries[hashed_start + new_pos] = orig_entries[orig_idx].clone(); - dynsym_names[names_start + new_pos] = orig_names[orig_idx].clone(); - } - } - - let dynsym_map: HashMap = dynsym_names.iter().enumerate() - .map(|(i, n)| (n.clone(), i + 1)) - .collect(); - - // Rebuild dynstr with soname - let dynstr_data = dynstr.as_bytes().to_vec(); - - // ── Pre-scan: count R_386_RELATIVE relocations needed ──────────────── - // In shared libraries, R_386_32 against defined symbols becomes R_386_RELATIVE - let mut num_relative = 0usize; - for (obj_idx, obj) in inputs.iter().enumerate() { - for sec in &obj.sections { - for &(_, rel_type, sym_idx, _) in &sec.relocations { - if rel_type == R_386_32 { - let si = sym_idx as usize; - if si >= obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - if sym.sym_type == STT_SECTION { - if section_map.get(&(obj_idx, sym.section_index as usize)).is_some() { - num_relative += 1; - } - } else if !sym.name.is_empty() { - let is_defined = global_symbols.get(&sym.name) - .map(|gs| gs.is_defined).unwrap_or(false); - if is_defined { - num_relative += 1; - } - } - } - } - } - } - - // GOT entries for undefined symbols need GLOB_DAT - let mut num_glob_dat = 0usize; - for name in &got_names { - if let Some(gs) = global_symbols.get(name) { - if !gs.is_defined { num_glob_dat += 1; } - } - } - - let num_rel_dyn = num_relative + num_glob_dat; - let num_rel_plt = num_plt; - - // ── 
Layout ──────────────────────────────────────────────────────────── - let ehdr_size: u32 = 52; - let phdr_size: u32 = 32; - - // Program headers: PHDR, LOAD(ro headers), LOAD(text), LOAD(rodata), LOAD(data), DYNAMIC, GNU_STACK - let mut num_phdrs: u32 = 7; - let has_tls = output_sections.iter().any(|s| s.flags & SHF_TLS != 0 && s.flags & SHF_ALLOC != 0); - if has_tls { num_phdrs += 1; } - - let phdrs_total_size = num_phdrs * phdr_size; - - let mut file_offset: u32 = ehdr_size; - let mut vaddr: u32 = base_addr + ehdr_size; - - let phdr_offset = file_offset; - let phdr_vaddr = vaddr; - file_offset += phdrs_total_size; - vaddr += phdrs_total_size; - - // .gnu.hash - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let gnu_hash_offset = file_offset; - let gnu_hash_vaddr = vaddr; - let gnu_hash_size = gnu_hash_data.len() as u32; - file_offset += gnu_hash_size; vaddr += gnu_hash_size; - - // .dynsym - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let dynsym_offset = file_offset; - let dynsym_vaddr = vaddr; - let dynsym_entsize: u32 = 16; - let dynsym_size = (dynsym_entries.len() as u32) * dynsym_entsize; - file_offset += dynsym_size; vaddr += dynsym_size; - - // .dynstr - let dynstr_offset = file_offset; - let dynstr_vaddr = vaddr; - let dynstr_size = dynstr_data.len() as u32; - file_offset += dynstr_size; vaddr += dynstr_size; - - // .rel.dyn - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let rel_dyn_offset = file_offset; - let rel_dyn_vaddr = vaddr; - let rel_dyn_size = (num_rel_dyn as u32) * 8; - file_offset += rel_dyn_size; vaddr += rel_dyn_size; - - // .rel.plt - let rel_plt_offset = file_offset; - let rel_plt_vaddr = vaddr; - let rel_plt_size = (num_rel_plt as u32) * 8; - file_offset += rel_plt_size; vaddr += rel_plt_size; - - let ro_headers_end = file_offset; - - // ── Segment 1 (RX): .text + .plt ── - file_offset = align_up(file_offset, PAGE_SIZE); - vaddr = align_up(vaddr, PAGE_SIZE); - // 
Ensure congruent file_offset and vaddr (mod PAGE_SIZE) - vaddr = (vaddr & !0xfff) | (file_offset & 0xfff); - - let text_seg_file_start = file_offset; - let text_seg_vaddr_start = vaddr; - - // .init - let (init_vaddr, init_size) = layout_section( - ".init", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4); - - // .plt - let plt_entry_size: u32 = 16; - let plt_header_size: u32 = if num_plt > 0 { 16 } else { 0 }; - let plt_total_size = plt_header_size + (num_plt as u32) * plt_entry_size; - file_offset = align_up(file_offset, 16); vaddr = align_up(vaddr, 16); - let plt_offset = file_offset; - let plt_vaddr = vaddr; - file_offset += plt_total_size; vaddr += plt_total_size; - - // .text - let (text_vaddr, text_size) = layout_section( - ".text", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 16); - let _ = (text_vaddr, text_size); - - // .fini - let (fini_vaddr, fini_size) = layout_section( - ".fini", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4); - - // Layout custom executable sections (for __start_/__stop_ symbol auto-generation) - layout_custom_sections(section_name_to_idx, output_sections, - &mut file_offset, &mut vaddr, SHF_EXECINSTR); - - let text_seg_file_end = file_offset; - let text_seg_vaddr_end = vaddr; - - // ── Segment 2 (R): .rodata + .eh_frame ── - file_offset = align_up(file_offset, PAGE_SIZE); - vaddr = align_up(vaddr, PAGE_SIZE); - vaddr = (vaddr & !0xfff) | (file_offset & 0xfff); - - let rodata_seg_file_start = file_offset; - let rodata_seg_vaddr_start = vaddr; - - let (_, _) = layout_section( - ".rodata", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 16); - let (_, _) = layout_section( - ".eh_frame", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4); - - // Layout custom read-only sections (for __start_/__stop_ symbol auto-generation) - layout_custom_sections(section_name_to_idx, output_sections, - &mut file_offset, &mut vaddr, 0); - - let 
rodata_seg_file_end = file_offset; - let rodata_seg_vaddr_end = vaddr; - - // ── Segment 3 (RW): .data + .got + .got.plt + .dynamic + .bss ── - file_offset = align_up(file_offset, PAGE_SIZE); - vaddr = align_up(vaddr, PAGE_SIZE); - vaddr = (vaddr & !0xfff) | (file_offset & 0xfff); - - let data_seg_file_start = file_offset; - let data_seg_vaddr_start = vaddr; - - // TLS sections - let (tls_addr, _tls_file_offset, _tls_file_size, tls_mem_size, _tls_align) = - layout_tls(section_name_to_idx, output_sections, &mut file_offset, &mut vaddr); - - // .init_array - let (init_array_vaddr, init_array_size) = layout_section( - ".init_array", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4); - - // .fini_array - let (fini_array_vaddr, fini_array_size) = layout_section( - ".fini_array", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 4); - - // Layout custom writable sections (for __start_/__stop_ symbol auto-generation) - layout_custom_sections(section_name_to_idx, output_sections, - &mut file_offset, &mut vaddr, SHF_WRITE); - - // .data - let (_, _) = layout_section( - ".data", section_name_to_idx, output_sections, &mut file_offset, &mut vaddr, 16); - - // .got (combined GOT for both data and PLT) - let got_reserved: usize = 1; // GOT[0] = dynamic addr - let gotplt_reserved: u32 = 3; // GOT.PLT[0..2] = dynamic/link_map/dl_resolve - let got_total_entries = got_reserved + got_names.len(); - let gotplt_entries = gotplt_reserved as usize + num_plt; - let got_size = (got_total_entries as u32) * 4; - let gotplt_size = (gotplt_entries as u32) * 4; - - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let got_offset = file_offset; - let got_vaddr = vaddr; - let got_base = got_vaddr; // _GLOBAL_OFFSET_TABLE_ points here - file_offset += got_size; vaddr += got_size; - - let gotplt_offset = file_offset; - let gotplt_vaddr = vaddr; - file_offset += gotplt_size; vaddr += gotplt_size; - - // .dynamic - let dynamic_entry_size: u32 
= 8; - let mut num_dynamic: u32 = 0; - num_dynamic += needed_sonames.len() as u32; // DT_NEEDED - if soname.is_some() { num_dynamic += 1; } // DT_SONAME - num_dynamic += 5; // GNU_HASH, STRTAB, SYMTAB, STRSZ, SYMENT - if init_vaddr != 0 && init_size > 0 { num_dynamic += 1; } // DT_INIT - if fini_vaddr != 0 && fini_size > 0 { num_dynamic += 1; } // DT_FINI - if init_array_size > 0 { num_dynamic += 2; } - if fini_array_size > 0 { num_dynamic += 2; } - if num_rel_plt > 0 { num_dynamic += 4; } // PLTGOT, PLTRELSZ, PLTREL, JMPREL - if num_rel_dyn > 0 { num_dynamic += 3; } // REL, RELSZ, RELENT - num_dynamic += 1; // DT_NULL - - file_offset = align_up(file_offset, 4); vaddr = align_up(vaddr, 4); - let dynamic_offset = file_offset; - let dynamic_vaddr = vaddr; - let dynamic_size = num_dynamic * dynamic_entry_size; - file_offset += dynamic_size; vaddr += dynamic_size; - - // .bss - let bss_vaddr = vaddr; - if let Some(&idx) = section_name_to_idx.get(".bss") { - let a = output_sections[idx].align.max(4); - vaddr = align_up(vaddr, a); - output_sections[idx].addr = vaddr; - output_sections[idx].file_offset = file_offset; // BSS doesn't occupy file space - let bss_size = output_sections[idx].data.len() as u32; - vaddr += bss_size; - } - let _ = bss_vaddr; - - let data_seg_file_end = file_offset; - let data_seg_vaddr_end = vaddr; - - // ── Assign symbol addresses ─────────────────────────────────────────── - // Set _GLOBAL_OFFSET_TABLE_ - global_symbols.entry("_GLOBAL_OFFSET_TABLE_".to_string()).or_insert(LinkerSymbol { - address: got_base, size: 0, sym_type: STT_OBJECT, binding: STB_LOCAL, - visibility: STV_DEFAULT, is_defined: true, needs_plt: false, needs_got: false, - output_section: usize::MAX, section_offset: 0, plt_index: 0, got_index: 0, - is_dynamic: false, dynlib: String::new(), needs_copy: false, copy_addr: 0, - version: None, uses_textrel: false, - }); - if let Some(gs) = global_symbols.get_mut("_GLOBAL_OFFSET_TABLE_") { - gs.address = got_base; - gs.is_defined = 
true; - } - - for (name, sym) in global_symbols.iter_mut() { - if sym.needs_plt && !sym.is_defined { - sym.address = plt_vaddr + plt_header_size + (sym.plt_index as u32) * plt_entry_size; - continue; - } - if sym.output_section < output_sections.len() { - sym.address = output_sections[sym.output_section].addr + sym.section_offset; - } - // Standard linker symbols - match name.as_str() { - "_edata" | "edata" => sym.address = data_seg_vaddr_start + (data_seg_file_end - data_seg_file_start), - "_end" | "end" => sym.address = data_seg_vaddr_end, - "__bss_start" | "__bss_start__" => { - sym.address = if let Some(&idx) = section_name_to_idx.get(".bss") { - output_sections[idx].addr - } else { - data_seg_vaddr_end - }; - } - _ => {} - } - } - - // ── Apply relocations ───────────────────────────────────────────────── - // For shared libraries, we need to handle relocations differently: - // R_386_32 -> write resolved value, emit R_386_RELATIVE - // R_386_PC32 -> resolved normally (PC-relative, no dynamic reloc needed) - let mut relative_relocs: Vec = Vec::new(); // addresses needing R_386_RELATIVE - let mut glob_dat_relocs: Vec<(u32, usize)> = Vec::new(); // (got_addr, dynsym_idx) - - { - let reloc_ctx = RelocContext { - global_symbols: &*global_symbols, - output_sections, - section_map, - got_base, - got_vaddr, - gotplt_vaddr, - got_reserved, - gotplt_reserved, - plt_vaddr, - plt_header_size, - plt_entry_size, - num_plt, - tls_addr, - tls_mem_size, - has_tls, - }; - - for (obj_idx, obj) in inputs.iter().enumerate() { - for sec in &obj.sections { - if sec.relocations.is_empty() { continue; } - - let (out_sec_idx, sec_base_offset) = match section_map.get(&(obj_idx, sec.input_index)) { - Some(&v) => v, - None => continue, - }; - - for &(rel_offset, rel_type, sym_idx, addend) in &sec.relocations { - let patch_offset = sec_base_offset + rel_offset; - let patch_addr = reloc_ctx.output_sections[out_sec_idx].addr + patch_offset; - - let si = sym_idx as usize; - if si >= 
obj.symbols.len() { continue; } - let sym = &obj.symbols[si]; - - let sym_addr = resolve_sym_addr_shared(obj_idx, sym, &reloc_ctx); - - match rel_type { - R_386_NONE => {} - R_386_32 => { - let value = (sym_addr as i32 + addend) as u32; - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - // Determine if this needs a RELATIVE dynamic relocation - let needs_relative = if sym.sym_type == STT_SECTION { - section_map.contains_key(&(obj_idx, sym.section_index as usize)) - } else if !sym.name.is_empty() { - global_symbols.get(&sym.name).map(|gs| gs.is_defined).unwrap_or(false) - } else { - false - }; - if needs_relative { - relative_relocs.push(patch_addr); - } - } - R_386_PC32 | R_386_PLT32 => { - let target = if !sym.name.is_empty() { - if let Some(gs) = global_symbols.get(&sym.name) { - if gs.needs_plt && !gs.is_defined { - gs.address // PLT address - } else { - sym_addr - } - } else { - sym_addr - } - } else { - sym_addr - }; - let value = (target as i32 + addend - patch_addr as i32) as u32; - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - R_386_GOTPC => { - let value = (got_base as i32 + addend - patch_addr as i32) as u32; - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - R_386_GOTOFF => { - let value = (sym_addr as i32 + addend - got_base as i32) as u32; - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - R_386_GOT32 | R_386_GOT32X => { - // In 
shared libraries, GOT relocations work via GOT entries - let mut relax = false; - let value = resolve_got_reloc(sym, sym_addr, addend, rel_type, - &reloc_ctx, &mut relax); - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - if relax && off >= 2 && sec_data[off - 2] == 0x8b { - sec_data[off - 2] = 0x8d; - } - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - R_386_TLS_TPOFF | R_386_TLS_LE => { - if has_tls { - let tpoff = sym_addr as i32 - tls_addr as i32 - tls_mem_size as i32; - let value = (tpoff + addend) as u32; - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - } - R_386_TLS_LE_32 | R_386_TLS_TPOFF32 => { - if has_tls { - // ccc emits `add` with TLS_TPOFF32, so use negative offset - let value = (sym_addr as i32 - tls_addr as i32 - tls_mem_size as i32 + addend) as u32; - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - } - R_386_TLS_IE => { - let value = resolve_tls_ie(sym, sym_addr, addend, &reloc_ctx); - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - R_386_TLS_GOTIE => { - let value = resolve_tls_gotie(sym, sym_addr, addend, &reloc_ctx); - let off = patch_offset as usize; - let sec_data = &mut reloc_ctx.output_sections[out_sec_idx].data; - if off + 4 <= sec_data.len() { - sec_data[off..off + 4].copy_from_slice(&value.to_le_bytes()); - } - } - _ => { - // Silently skip unsupported relocations in shared libraries - eprintln!("warning: unsupported relocation type {} for '{}' in shared 
library", rel_type, sym.name); - } - } - } - } - } - } - - // Build GOT entries for undefined symbols -> GLOB_DAT - for name in &got_names { - if let Some(gs) = global_symbols.get(name) { - if !gs.is_defined { - let got_entry_addr = got_vaddr + (got_reserved as u32 + (gs.got_index - num_plt) as u32) * 4; - let dynsym_idx = dynsym_map.get(name).copied().unwrap_or(0); - glob_dat_relocs.push((got_entry_addr, dynsym_idx)); - } - } - } - - // ── Build PLT ──────────────────────────────────────────────────────── - let plt_data = build_plt(num_plt, plt_vaddr, plt_header_size, plt_entry_size, - gotplt_vaddr, gotplt_reserved); - - // ── Build GOT data ─────────────────────────────────────────────────── - let mut got_data: Vec = Vec::new(); - // GOT[0] = address of .dynamic (filled by dynamic linker) - got_data.extend_from_slice(&dynamic_vaddr.to_le_bytes()); - // GOT entries for data symbols - for name in &got_names { - if let Some(gs) = global_symbols.get(name) { - if gs.is_defined { - got_data.extend_from_slice(&gs.address.to_le_bytes()); - } else { - got_data.extend_from_slice(&0u32.to_le_bytes()); - } - } else { - got_data.extend_from_slice(&0u32.to_le_bytes()); - } - } - - // .got.plt - let mut gotplt_data: Vec = Vec::new(); - gotplt_data.extend_from_slice(&dynamic_vaddr.to_le_bytes()); // GOT.PLT[0] = .dynamic - gotplt_data.extend_from_slice(&0u32.to_le_bytes()); // GOT.PLT[1] = link_map (filled by ld.so) - gotplt_data.extend_from_slice(&0u32.to_le_bytes()); // GOT.PLT[2] = dl_resolve (filled by ld.so) - // GOT.PLT[3..] 
= PLT lazy stubs (point back to PLT[N]+6) - for i in 0..num_plt { - let plt_stub_addr = plt_vaddr + plt_header_size + (i as u32) * plt_entry_size + 6; - gotplt_data.extend_from_slice(&plt_stub_addr.to_le_bytes()); - } - - // ── Build .rel.dyn ─────────────────────────────────────────────────── - let mut rel_dyn_data: Vec = Vec::new(); - // R_386_RELATIVE entries - for &addr in &relative_relocs { - rel_dyn_data.extend_from_slice(&addr.to_le_bytes()); - rel_dyn_data.extend_from_slice(&R_386_RELATIVE.to_le_bytes()); - } - // R_386_GLOB_DAT entries - for &(addr, dynsym_idx) in &glob_dat_relocs { - rel_dyn_data.extend_from_slice(&addr.to_le_bytes()); - let r_info = ((dynsym_idx as u32) << 8) | R_386_GLOB_DAT; - rel_dyn_data.extend_from_slice(&r_info.to_le_bytes()); - } - - // ── Build .rel.plt ─────────────────────────────────────────────────── - let mut rel_plt_data: Vec = Vec::new(); - for (i, name) in plt_names.iter().enumerate() { - let gotplt_entry = gotplt_vaddr + (gotplt_reserved + i as u32) * 4; - let dynsym_idx = dynsym_map.get(name).copied().unwrap_or(0) as u32; - rel_plt_data.extend_from_slice(&gotplt_entry.to_le_bytes()); - let r_info = (dynsym_idx << 8) | R_386_JMP_SLOT; - rel_plt_data.extend_from_slice(&r_info.to_le_bytes()); - } - - // ── Build .dynamic section ─────────────────────────────────────────── - let mut dynamic_data: Vec = Vec::new(); - for lib in needed_sonames { - push_dyn(&mut dynamic_data, DT_NEEDED, dynstr.get_offset(lib)); - } - if let Some(ref sn) = soname { - push_dyn(&mut dynamic_data, DT_SONAME, dynstr.get_offset(sn)); - } - push_dyn(&mut dynamic_data, DT_GNU_HASH_TAG, gnu_hash_vaddr); - push_dyn(&mut dynamic_data, DT_STRTAB, dynstr_vaddr); - push_dyn(&mut dynamic_data, DT_SYMTAB, dynsym_vaddr); - push_dyn(&mut dynamic_data, DT_STRSZ, dynstr_size); - push_dyn(&mut dynamic_data, DT_SYMENT, dynsym_entsize); - if init_vaddr != 0 && init_size > 0 { - push_dyn(&mut dynamic_data, DT_INIT, init_vaddr); - } - if fini_vaddr != 0 && fini_size > 
0 { - push_dyn(&mut dynamic_data, DT_FINI, fini_vaddr); - } - if init_array_size > 0 { - push_dyn(&mut dynamic_data, DT_INIT_ARRAY, init_array_vaddr); - push_dyn(&mut dynamic_data, DT_INIT_ARRAYSZ, init_array_size); - } - if fini_array_size > 0 { - push_dyn(&mut dynamic_data, DT_FINI_ARRAY, fini_array_vaddr); - push_dyn(&mut dynamic_data, DT_FINI_ARRAYSZ, fini_array_size); - } - if num_rel_plt > 0 { - push_dyn(&mut dynamic_data, DT_PLTGOT, gotplt_vaddr); - push_dyn(&mut dynamic_data, DT_PLTRELSZ, rel_plt_size); - push_dyn(&mut dynamic_data, DT_PLTREL, DT_REL as u32); - push_dyn(&mut dynamic_data, DT_JMPREL, rel_plt_vaddr); - } - if num_rel_dyn > 0 { - push_dyn(&mut dynamic_data, DT_REL, rel_dyn_vaddr); - push_dyn(&mut dynamic_data, DT_RELSZ, rel_dyn_size); - push_dyn(&mut dynamic_data, DT_RELENT, 8); - } - push_dyn(&mut dynamic_data, DT_NULL, 0); - - // Update dynsym entries with resolved addresses - for (i, name) in dynsym_names.iter().enumerate() { - if let Some(gs) = global_symbols.get(name) { - if gs.is_defined { - dynsym_entries[i + 1].value = gs.address; - // Determine section index for dynsym - if gs.output_section < output_sections.len() { - // Find the section number. 
For simplicity, mark as SHN_ABS=0xfff1 - // Real implementations map to proper section indices but - // dynamic symbols usually don't need exact shndx - dynsym_entries[i + 1].shndx = SHN_ABS; - } - } - } - } - - // ── Write output file ───────────────────────────────────────────────── - let total_file_size = data_seg_file_end as usize; - let mut output = vec![0u8; total_file_size]; - - // ELF header (ET_DYN, e_entry = 0) - output[0..4].copy_from_slice(&ELF_MAGIC); - output[4] = ELFCLASS32; - output[5] = ELFDATA2LSB; - output[6] = EV_CURRENT; - output[7] = 0; // ELFOSABI_NONE - output[16..18].copy_from_slice(&ET_DYN.to_le_bytes()); - output[18..20].copy_from_slice(&EM_386.to_le_bytes()); - output[20..24].copy_from_slice(&1u32.to_le_bytes()); // e_version - output[24..28].copy_from_slice(&0u32.to_le_bytes()); // e_entry = 0 for .so - output[28..32].copy_from_slice(&ehdr_size.to_le_bytes()); // e_phoff - output[32..36].copy_from_slice(&0u32.to_le_bytes()); // e_shoff = 0 (no section headers) - output[36..40].copy_from_slice(&0u32.to_le_bytes()); // e_flags - output[40..42].copy_from_slice(&(ehdr_size as u16).to_le_bytes()); // e_ehsize - output[42..44].copy_from_slice(&32u16.to_le_bytes()); // e_phentsize - output[44..46].copy_from_slice(&(num_phdrs as u16).to_le_bytes()); // e_phnum - output[46..48].copy_from_slice(&40u16.to_le_bytes()); // e_shentsize - output[48..50].copy_from_slice(&0u16.to_le_bytes()); // e_shnum - output[50..52].copy_from_slice(&0u16.to_le_bytes()); // e_shstrndx - - // Write program headers - let mut ph_off = phdr_offset as usize; - - // Helper to write a PHDR - let write_phdr = |out: &mut [u8], off: usize, p_type: u32, flags: u32, - f_off: u32, va: u32, f_sz: u32, m_sz: u32, align: u32| { - out[off..off+4].copy_from_slice(&p_type.to_le_bytes()); - out[off+4..off+8].copy_from_slice(&f_off.to_le_bytes()); - out[off+8..off+12].copy_from_slice(&va.to_le_bytes()); - out[off+12..off+16].copy_from_slice(&va.to_le_bytes()); // paddr = vaddr - 
out[off+16..off+20].copy_from_slice(&f_sz.to_le_bytes()); - out[off+20..off+24].copy_from_slice(&m_sz.to_le_bytes()); - out[off+24..off+28].copy_from_slice(&flags.to_le_bytes()); - out[off+28..off+32].copy_from_slice(&align.to_le_bytes()); - }; - - // PHDR - write_phdr(&mut output, ph_off, PT_PHDR, PF_R, - phdr_offset, phdr_vaddr, phdrs_total_size, phdrs_total_size, 4); - ph_off += 32; - - // LOAD: RO headers (ELF header + phdrs + .gnu.hash + .dynsym + .dynstr + .rel.*) - write_phdr(&mut output, ph_off, PT_LOAD, PF_R, - 0, base_addr, ro_headers_end, ro_headers_end, PAGE_SIZE); - ph_off += 32; - - // LOAD: RX (text segment) - let text_file_sz = text_seg_file_end - text_seg_file_start; - let text_mem_sz = text_seg_vaddr_end - text_seg_vaddr_start; - write_phdr(&mut output, ph_off, PT_LOAD, PF_R | PF_X, - text_seg_file_start, text_seg_vaddr_start, text_file_sz, text_mem_sz, PAGE_SIZE); - ph_off += 32; - - // LOAD: RO (rodata segment) - let rodata_file_sz = rodata_seg_file_end - rodata_seg_file_start; - let rodata_mem_sz = rodata_seg_vaddr_end - rodata_seg_vaddr_start; - if rodata_file_sz > 0 { - write_phdr(&mut output, ph_off, PT_LOAD, PF_R, - rodata_seg_file_start, rodata_seg_vaddr_start, rodata_file_sz, rodata_mem_sz, PAGE_SIZE); - } else { - // Empty rodata segment - write_phdr(&mut output, ph_off, PT_LOAD, PF_R, - rodata_seg_file_start, rodata_seg_vaddr_start, 0, 0, PAGE_SIZE); - } - ph_off += 32; - - // LOAD: RW (data segment) - let data_file_sz = data_seg_file_end - data_seg_file_start; - let data_mem_sz = data_seg_vaddr_end - data_seg_vaddr_start; - write_phdr(&mut output, ph_off, PT_LOAD, PF_R | PF_W, - data_seg_file_start, data_seg_vaddr_start, data_file_sz, data_mem_sz, PAGE_SIZE); - ph_off += 32; - - // DYNAMIC - write_phdr(&mut output, ph_off, PT_DYNAMIC, PF_R | PF_W, - dynamic_offset, dynamic_vaddr, dynamic_size, dynamic_size, 4); - ph_off += 32; - - // GNU_STACK - write_phdr(&mut output, ph_off, PT_GNU_STACK, PF_R | PF_W, - 0, 0, 0, 0, 0); - ph_off += 
32; - - // TLS - if has_tls { - let tls_f_size = if let Some(&idx) = section_name_to_idx.get(".tdata") { - output_sections[idx].data.len() as u32 - } else { 0 }; - write_phdr(&mut output, ph_off, PT_TLS, PF_R, - if tls_addr > 0 { output_sections[section_name_to_idx[".tdata"]].file_offset } else { 0 }, - tls_addr, tls_f_size, tls_mem_size, _tls_align); - let _ = ph_off; // suppress unused warning - } - - // Write .gnu.hash - let off = gnu_hash_offset as usize; - if off + gnu_hash_data.len() <= output.len() { - output[off..off + gnu_hash_data.len()].copy_from_slice(&gnu_hash_data); - } - - // Write .dynsym - for (i, entry) in dynsym_entries.iter().enumerate() { - let off = dynsym_offset as usize + i * 16; - if off + 16 > output.len() { break; } - output[off..off+4].copy_from_slice(&entry.name.to_le_bytes()); - output[off+4..off+8].copy_from_slice(&entry.value.to_le_bytes()); - output[off+8..off+12].copy_from_slice(&entry.size.to_le_bytes()); - output[off+12] = entry.info; - output[off+13] = entry.other; - output[off+14..off+16].copy_from_slice(&entry.shndx.to_le_bytes()); - } - - // Write .dynstr - let off = dynstr_offset as usize; - if off + dynstr_data.len() <= output.len() { - output[off..off + dynstr_data.len()].copy_from_slice(&dynstr_data); - } - - // Write .rel.dyn - let off = rel_dyn_offset as usize; - if off + rel_dyn_data.len() <= output.len() { - output[off..off + rel_dyn_data.len()].copy_from_slice(&rel_dyn_data); - } - - // Write .rel.plt - let off = rel_plt_offset as usize; - if off + rel_plt_data.len() <= output.len() { - output[off..off + rel_plt_data.len()].copy_from_slice(&rel_plt_data); - } - - // Write .plt - let off = plt_offset as usize; - if off + plt_data.len() <= output.len() { - output[off..off + plt_data.len()].copy_from_slice(&plt_data); - } - - // Write output sections (text, rodata, data, etc.) 
- for sec in output_sections.iter() { - if sec.sh_type == SHT_NOBITS { continue; } - let off = sec.file_offset as usize; - if off + sec.data.len() <= output.len() && !sec.data.is_empty() { - output[off..off + sec.data.len()].copy_from_slice(&sec.data); - } - } - - // Write .got - let off = got_offset as usize; - if off + got_data.len() <= output.len() { - output[off..off + got_data.len()].copy_from_slice(&got_data); - } - - // Write .got.plt - let off = gotplt_offset as usize; - if off + gotplt_data.len() <= output.len() { - output[off..off + gotplt_data.len()].copy_from_slice(&gotplt_data); - } - - // Write .dynamic - let off = dynamic_offset as usize; - if off + dynamic_data.len() <= output.len() { - output[off..off + dynamic_data.len()].copy_from_slice(&dynamic_data); - } - - // Write to file - std::fs::write(output_path, &output) - .map_err(|e| format!("failed to write {}: {}", output_path, e))?; - - // Set executable permission - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let _ = std::fs::set_permissions(output_path, - std::fs::Permissions::from_mode(0o755)); - } - - Ok(()) -} - -/// Resolve symbol address in shared library context. 
-pub(super) fn resolve_sym_addr_shared(obj_idx: usize, sym: &InputSymbol, ctx: &RelocContext) -> u32 { - if sym.sym_type == STT_SECTION { - if sym.section_index != SHN_UNDEF && sym.section_index != SHN_ABS { - match ctx.section_map.get(&(obj_idx, sym.section_index as usize)) { - Some(&(sec_out_idx, sec_out_offset)) => { - ctx.output_sections[sec_out_idx].addr + sec_out_offset - } - None => 0, - } - } else { - 0 - } - } else if sym.name.is_empty() { - 0 - } else if sym.binding == STB_LOCAL { - // Local symbols - resolve via section map - if sym.section_index != SHN_UNDEF && sym.section_index != SHN_ABS { - match ctx.section_map.get(&(obj_idx, sym.section_index as usize)) { - Some(&(sec_out_idx, sec_out_offset)) => { - ctx.output_sections[sec_out_idx].addr + sec_out_offset + sym.value - } - None => sym.value, - } - } else if sym.section_index == SHN_ABS { - sym.value - } else { - 0 - } - } else { - match ctx.global_symbols.get(&sym.name) { - Some(gs) => gs.address, - None => { - if sym.section_index != SHN_UNDEF && sym.section_index != SHN_ABS { - match ctx.section_map.get(&(obj_idx, sym.section_index as usize)) { - Some(&(sec_out_idx, sec_out_offset)) => { - ctx.output_sections[sec_out_idx].addr + sec_out_offset + sym.value - } - None => sym.value, - } - } else { - 0 - } - } - } - } -} - -// ══════════════════════════════════════════════════════════════════════════════ diff --git a/src/backend/i686/linker/symbols.rs b/src/backend/i686/linker/symbols.rs deleted file mode 100644 index d5a5d2f4eb..0000000000 --- a/src/backend/i686/linker/symbols.rs +++ /dev/null @@ -1,356 +0,0 @@ -//! Symbol resolution for the i686 linker. -//! -//! Phases 6-9: global symbol resolution, COMMON symbol allocation, -//! PLT/GOT marking, undefined symbol checking, PLT/GOT list building, -//! and IFUNC collection. 
- -use std::collections::HashMap; - -use super::types::*; -use crate::backend::linker_common; - -pub(super) fn resolve_symbols( - inputs: &[InputObject], - _output_sections: &[OutputSection], - section_map: &SectionMap, - dynlib_syms: &HashMap, bool, u8)>, -) -> (HashMap, HashMap<(usize, usize), String>) { - let mut global_symbols: HashMap = HashMap::new(); - let mut sym_resolution: HashMap<(usize, usize), String> = HashMap::new(); - - // First pass: collect definitions - for (obj_idx, obj) in inputs.iter().enumerate() { - for (sym_idx, sym) in obj.symbols.iter().enumerate() { - if sym.name.is_empty() || sym.sym_type == STT_FILE || sym.sym_type == STT_SECTION { - continue; - } - if sym.section_index == SHN_UNDEF { continue; } - - let (out_sec_idx, sec_offset) = if sym.section_index != SHN_ABS && sym.section_index != SHN_COMMON { - section_map.get(&(obj_idx, sym.section_index as usize)) - .copied().unwrap_or((usize::MAX, 0)) - } else { - (usize::MAX, 0) - }; - - let new_sym = LinkerSymbol { - address: 0, - size: sym.size, - sym_type: sym.sym_type, - binding: sym.binding, - visibility: sym.visibility, - is_defined: true, - needs_plt: false, - needs_got: false, - output_section: out_sec_idx, - section_offset: sec_offset + sym.value, - plt_index: 0, - got_index: 0, - is_dynamic: false, - dynlib: String::new(), - needs_copy: false, - copy_addr: 0, - version: None, - uses_textrel: false, - }; - - match global_symbols.get(&sym.name) { - None => { - // Note: STB_LOCAL symbols are deliberately inserted here when no - // entry exists yet. Unlike ELF64 backends that skip locals entirely, - // the i686 backend must allow locals as fallback definitions because - // glibc's static archives contain cross-object references that resolve - // through local symbols. The Some arm below prevents locals from - // *overriding* any existing entry (global, weak, or other local). 
- global_symbols.insert(sym.name.clone(), new_sym); - } - Some(existing) => { - // Local symbols must not override any existing entry. - // They have file scope only and should not shadow globals - // or weaks from other objects (e.g. a static "data" in one - // file must not shadow a global "data" in another). - if sym.binding == STB_LOCAL { - // Already have a definition; keep it. - } else if sym.binding == STB_GLOBAL && (existing.binding == STB_WEAK || existing.binding == STB_LOCAL) - || (!existing.is_defined && new_sym.is_defined) - { - global_symbols.insert(sym.name.clone(), new_sym); - } - } - } - - sym_resolution.insert((obj_idx, sym_idx), sym.name.clone()); - } - } - - // Second pass: resolve undefined references against dynamic libraries - for (obj_idx, obj) in inputs.iter().enumerate() { - for (sym_idx, sym) in obj.symbols.iter().enumerate() { - if sym.name.is_empty() || sym.sym_type == STT_FILE { continue; } - sym_resolution.insert((obj_idx, sym_idx), sym.name.clone()); - - if sym.section_index == SHN_UNDEF { - if global_symbols.contains_key(&sym.name) { continue; } - - if let Some((lib, dyn_sym_type, dyn_size, dyn_ver, _is_default, dyn_binding)) = dynlib_syms.get(&sym.name) { - let is_func = *dyn_sym_type == STT_FUNC || *dyn_sym_type == STT_GNU_IFUNC; - global_symbols.insert(sym.name.clone(), LinkerSymbol { - address: 0, - size: *dyn_size, - sym_type: *dyn_sym_type, - binding: *dyn_binding, - visibility: STV_DEFAULT, - is_defined: false, - needs_plt: is_func, - needs_got: is_func, - output_section: usize::MAX, - section_offset: 0, - plt_index: 0, - got_index: 0, - is_dynamic: true, - dynlib: lib.clone(), - needs_copy: !is_func, - copy_addr: 0, - version: dyn_ver.clone(), - uses_textrel: false, - }); - } else { - global_symbols.entry(sym.name.clone()).or_insert(LinkerSymbol { - address: 0, - size: 0, - sym_type: sym.sym_type, - binding: sym.binding, - visibility: STV_DEFAULT, - is_defined: false, - needs_plt: false, - needs_got: false, - output_section: 
usize::MAX, - section_offset: 0, - plt_index: 0, - got_index: 0, - is_dynamic: false, - dynlib: String::new(), - needs_copy: false, - copy_addr: 0, - version: None, - uses_textrel: false, - }); - } - } - } - } - - // Resolve section symbols - for (obj_idx, obj) in inputs.iter().enumerate() { - for (sym_idx, sym) in obj.symbols.iter().enumerate() { - if sym.sym_type == STT_SECTION && sym.section_index != SHN_UNDEF { - sym_resolution.insert((obj_idx, sym_idx), - format!("__section_{}_{}", obj_idx, sym.section_index)); - } - } - } - - (global_symbols, sym_resolution) -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 6b: Allocate COMMON symbols in .bss -// ══════════════════════════════════════════════════════════════════════════════ - -/// Allocate COMMON symbols (tentative definitions) in the .bss section. -/// -/// In C, a global variable declared without an initializer (e.g. `int x;`) may be -/// emitted as a COMMON symbol (SHN_COMMON) by the compiler. These symbols need -/// space allocated in .bss during linking. For each COMMON symbol, we: -/// 1. Find or create the .bss output section -/// 2. Align the current offset to the symbol's alignment requirement -/// 3. Update the LinkerSymbol to point into the .bss section -pub(super) fn allocate_common_symbols( - inputs: &[InputObject], - output_sections: &mut Vec, - section_name_to_idx: &mut HashMap, - global_symbols: &mut HashMap, -) { - // Collect COMMON symbols: (name, alignment, size) - // For COMMON symbols, InputSymbol.value is the alignment requirement, .size is the size. 
- let mut common_syms: Vec<(String, u32, u32)> = Vec::new(); - for obj in inputs.iter() { - for sym in &obj.symbols { - if sym.section_index == SHN_COMMON && !sym.name.is_empty() { - // Only add if this symbol is still in global_symbols with output_section == usize::MAX - // (i.e., it wasn't overridden by a real definition from another object) - if let Some(gs) = global_symbols.get(&sym.name) { - if gs.output_section == usize::MAX && gs.is_defined && !gs.is_dynamic { - // Check we haven't already added this symbol (could appear in multiple objects) - if !common_syms.iter().any(|(n, _, _)| n == &sym.name) { - common_syms.push((sym.name.clone(), sym.value.max(1), sym.size)); - } - } - } - } - } - } - - if common_syms.is_empty() { return; } - - // Find or create .bss section - let bss_idx = if let Some(&idx) = section_name_to_idx.get(".bss") { - idx - } else { - let idx = output_sections.len(); - output_sections.push(OutputSection { - name: ".bss".to_string(), - sh_type: SHT_NOBITS, - flags: SHF_ALLOC | SHF_WRITE, - data: Vec::new(), - align: 4, - addr: 0, - file_offset: 0, - }); - section_name_to_idx.insert(".bss".to_string(), idx); - idx - }; - - let mut bss_off = output_sections[bss_idx].data.len() as u32; - for (name, alignment, size) in &common_syms { - let a = (*alignment).max(1); - bss_off = (bss_off + a - 1) & !(a - 1); - - if let Some(sym) = global_symbols.get_mut(name) { - sym.output_section = bss_idx; - sym.section_offset = bss_off; - } - - if *alignment > output_sections[bss_idx].align { - output_sections[bss_idx].align = *alignment; - } - bss_off += size; - } - - // Extend .bss data to reflect the new size - let new_len = bss_off as usize; - if new_len > output_sections[bss_idx].data.len() { - output_sections[bss_idx].data.resize(new_len, 0); - } -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 7: PLT/GOT marking + undefined check -// 
══════════════════════════════════════════════════════════════════════════════ - -pub(super) fn mark_plt_got_needs( - inputs: &[InputObject], - global_symbols: &mut HashMap, - _is_static: bool, -) { - for obj in inputs.iter() { - for sec in &obj.sections { - for &(_, rel_type, sym_idx, _) in &sec.relocations { - let sym = if (sym_idx as usize) < obj.symbols.len() { - &obj.symbols[sym_idx as usize] - } else { continue; }; - - if sym.sym_type == STT_SECTION || sym.name.is_empty() { continue; } - - match rel_type { - R_386_PLT32 => { - if let Some(gs) = global_symbols.get_mut(&sym.name) { - if gs.is_dynamic { - gs.needs_plt = true; - gs.needs_got = true; - } - } - } - R_386_GOT32 | R_386_GOT32X => { - if let Some(gs) = global_symbols.get_mut(&sym.name) { - gs.needs_got = true; - } - } - R_386_TLS_GOTIE | R_386_TLS_IE => { - if let Some(gs) = global_symbols.get_mut(&sym.name) { - gs.needs_got = true; - } - } - _ => {} - } - } - } - } -} - -pub(super) fn check_undefined_symbols(global_symbols: &HashMap) -> Result<(), String> { - let truly_undefined: Vec<&String> = global_symbols.iter() - .filter(|(n, s)| !s.is_defined && !s.is_dynamic && s.binding != STB_WEAK - && !linker_common::is_linker_defined_symbol(n)) - .map(|(n, _)| n) - .collect(); - - if !truly_undefined.is_empty() { - return Err(format!("undefined symbols: {}", truly_undefined.iter() - .map(|s| s.as_str()).collect::>().join(", "))); - } - Ok(()) -} - -// ══════════════════════════════════════════════════════════════════════════════ -// Phase 8: PLT/GOT list building -// ══════════════════════════════════════════════════════════════════════════════ - -pub(super) fn build_plt_got_lists( - global_symbols: &mut HashMap, -) -> (Vec, Vec, Vec, usize, usize) { - let mut plt_symbols: Vec = Vec::new(); - let mut got_dyn_symbols: Vec = Vec::new(); - let mut got_local_symbols: Vec = Vec::new(); - - for (name, sym) in global_symbols.iter() { - if sym.needs_plt { - plt_symbols.push(name.clone()); - } else if sym.needs_got 
&& !sym.needs_plt { - if sym.is_dynamic { - got_dyn_symbols.push(name.clone()); - } else { - got_local_symbols.push(name.clone()); - } - } - } - plt_symbols.sort(); - got_dyn_symbols.sort(); - got_local_symbols.sort(); - - for (i, name) in plt_symbols.iter().enumerate() { - if let Some(sym) = global_symbols.get_mut(name) { - sym.plt_index = i; - sym.got_index = i; - } - } - // Dynamic GOT symbols come first (they need .dynsym entries + GLOB_DAT) - for (i, name) in got_dyn_symbols.iter().enumerate() { - if let Some(sym) = global_symbols.get_mut(name) { - sym.got_index = plt_symbols.len() + i; - } - } - // Local GOT symbols come after (filled at link time, no .dynsym needed) - for (i, name) in got_local_symbols.iter().enumerate() { - if let Some(sym) = global_symbols.get_mut(name) { - sym.got_index = plt_symbols.len() + got_dyn_symbols.len() + i; - } - } - - let num_plt = plt_symbols.len(); - let num_got_total = plt_symbols.len() + got_dyn_symbols.len() + got_local_symbols.len(); - (plt_symbols, got_dyn_symbols, got_local_symbols, num_plt, num_got_total) -} - -pub(super) fn collect_ifunc_symbols( - global_symbols: &HashMap, - is_static: bool, -) -> Vec { - if !is_static { return Vec::new(); } - let mut ifunc_symbols: Vec = global_symbols.iter() - .filter(|(_, s)| s.is_defined && s.sym_type == STT_GNU_IFUNC) - .map(|(n, _)| n.clone()) - .collect(); - ifunc_symbols.sort(); - ifunc_symbols -} diff --git a/src/backend/i686/linker/types.rs b/src/backend/i686/linker/types.rs deleted file mode 100644 index d89a588fce..0000000000 --- a/src/backend/i686/linker/types.rs +++ /dev/null @@ -1,332 +0,0 @@ -//! ELF32 types and constants for the i686 linker. -//! -//! Contains all i686/ELF32-specific constants, structures, and the linker symbol -//! types used throughout the linking process. Architecture-specific constants -//! (relocation types, dynamic tags with i32 types for ELF32) live here rather -//! than in the shared `elf` module which uses u64/i64 for ELF64. 
- -use std::collections::HashMap; - -// Re-export shared ELF constants used throughout the linker -pub(super) use crate::backend::elf::{ - ELF_MAGIC, ELFCLASS32, ELFDATA2LSB, EV_CURRENT, - ET_EXEC, ET_DYN, ET_REL, EM_386, - PT_LOAD, PT_DYNAMIC, PT_INTERP, PT_PHDR, PT_TLS, PT_GNU_STACK, PT_GNU_EH_FRAME, - SHT_NULL, SHT_PROGBITS, SHT_SYMTAB, SHT_STRTAB, SHT_RELA, - SHT_NOBITS, SHT_REL, SHT_DYNSYM, SHT_GROUP, - SHT_INIT_ARRAY, SHT_FINI_ARRAY, - STB_LOCAL, STB_GLOBAL, STB_WEAK, - STT_OBJECT, STT_FUNC, STT_SECTION, STT_FILE, STT_TLS, STT_GNU_IFUNC, - STV_DEFAULT, - SHN_UNDEF, SHN_ABS, SHN_COMMON, - PF_X, PF_W, PF_R, - read_u16, read_u32, read_cstr, read_i32, - parse_archive_members, parse_thin_archive_members, is_thin_archive, - parse_linker_script_entries, LinkerScriptEntry, - LinkerSymbolAddresses, get_standard_linker_symbols, -}; - -// ── ELF32-specific constants ────────────────────────────────────────────────── -// These either differ in type (i32 vs i64 for DT_*) or aren't in the shared module. 
- -pub(super) const SHT_NOTE: u32 = 7; -#[allow(dead_code)] // ELF standard section type, defined for reference -pub(super) const SHT_GNU_HASH: u32 = 0x6ffffff6; -#[allow(dead_code)] // ELF standard section type, defined for reference -pub(super) const SHT_GNU_VERSYM_CONST: u32 = 0x6fffffff; -#[allow(dead_code)] // ELF standard section type, defined for reference -pub(super) const SHT_GNU_VERNEED: u32 = 0x6ffffffe; - -// Section flags (i686 uses u32 instead of shared module's u64) -pub(super) const SHF_WRITE: u32 = 0x1; -pub(super) const SHF_ALLOC: u32 = 0x2; -pub(super) const SHF_EXECINSTR: u32 = 0x4; -#[allow(dead_code)] // ELF standard section flag, defined for reference -pub(super) const SHF_MERGE: u32 = 0x10; -#[allow(dead_code)] // ELF standard section flag, defined for reference -pub(super) const SHF_STRINGS: u32 = 0x20; -#[allow(dead_code)] // ELF standard section flag, defined for reference -pub(super) const SHF_INFO_LINK: u32 = 0x40; -pub(super) const SHF_GROUP: u32 = 0x200; -pub(super) const SHF_TLS: u32 = 0x400; - -// i386 relocation types -pub(super) const R_386_NONE: u32 = 0; -pub(super) const R_386_32: u32 = 1; -pub(super) const R_386_PC32: u32 = 2; -pub(super) const R_386_GOT32: u32 = 3; -pub(super) const R_386_PLT32: u32 = 4; -pub(super) const R_386_GOTOFF: u32 = 9; -pub(super) const R_386_GOTPC: u32 = 10; -pub(super) const R_386_TLS_TPOFF: u32 = 14; -pub(super) const R_386_TLS_IE: u32 = 15; -pub(super) const R_386_TLS_GOTIE: u32 = 16; -pub(super) const R_386_TLS_LE: u32 = 17; -pub(super) const R_386_TLS_GD: u32 = 18; -pub(super) const R_386_TLS_LE_32: u32 = 34; -pub(super) const R_386_TLS_DTPMOD32: u32 = 35; -pub(super) const R_386_TLS_DTPOFF32: u32 = 36; -pub(super) const R_386_TLS_TPOFF32: u32 = 37; -pub(super) const R_386_COPY: u32 = 5; -pub(super) const R_386_GLOB_DAT: u32 = 6; -pub(super) const R_386_JMP_SLOT: u32 = 7; -pub(super) const R_386_RELATIVE: u32 = 8; -pub(super) const R_386_IRELATIVE: u32 = 42; -pub(super) const R_386_GOT32X: u32 = 
43; - -// Dynamic tags (i32 for ELF32, vs i64 in the shared module) -pub(super) const DT_NULL: i32 = 0; -pub(super) const DT_NEEDED: i32 = 1; -pub(super) const DT_PLTRELSZ: i32 = 2; -pub(super) const DT_PLTGOT: i32 = 3; -pub(super) const DT_STRTAB: i32 = 5; -pub(super) const DT_SYMTAB: i32 = 6; -pub(super) const DT_STRSZ: i32 = 10; -pub(super) const DT_SYMENT: i32 = 11; -pub(super) const DT_INIT: i32 = 12; -pub(super) const DT_FINI: i32 = 13; -pub(super) const DT_REL: i32 = 17; -pub(super) const DT_RELSZ: i32 = 18; -pub(super) const DT_RELENT: i32 = 19; -pub(super) const DT_PLTREL: i32 = 20; -pub(super) const DT_DEBUG: i32 = 21; -pub(super) const DT_TEXTREL: i32 = 22; -pub(super) const DT_JMPREL: i32 = 23; -pub(super) const DT_INIT_ARRAY: i32 = 25; -pub(super) const DT_FINI_ARRAY: i32 = 26; -pub(super) const DT_INIT_ARRAYSZ: i32 = 27; -pub(super) const DT_FINI_ARRAYSZ: i32 = 28; -pub(super) const DT_SONAME: i32 = 14; -pub(super) const DT_FLAGS: i32 = 30; -pub(super) const DT_GNU_HASH_TAG: i32 = 0x6ffffef5u32 as i32; -pub(super) const DT_VERNEED: i32 = 0x6ffffffe_u32 as i32; -pub(super) const DT_VERNEEDNUM: i32 = 0x6fffffff_u32 as i32; -pub(super) const DT_VERSYM: i32 = 0x6ffffff0_u32 as i32; - -pub(super) const PAGE_SIZE: u32 = 0x1000; -pub(super) const BASE_ADDR: u32 = 0x08048000; -pub(super) const INTERP: &[u8] = b"/lib/ld-linux.so.2\0"; - -// ── ELF32 structures ──────────────────────────────────────────────────────── - -#[derive(Clone, Debug)] -pub(super) struct Elf32Sym { - pub name: u32, - pub value: u32, - pub size: u32, - pub info: u8, - pub other: u8, - pub shndx: u16, -} - -#[allow(dead_code)] // Convenience accessors; not all used by every code path -impl Elf32Sym { - pub fn binding(&self) -> u8 { self.info >> 4 } - pub fn sym_type(&self) -> u8 { self.info & 0xf } -} - -#[derive(Clone, Debug)] -pub(super) struct Elf32Shdr { - pub name: u32, - pub sh_type: u32, - pub flags: u32, - #[allow(dead_code)] // Populated during ELF parsing; not yet read by linker 
- pub addr: u32, - pub offset: u32, - pub size: u32, - pub link: u32, - pub info: u32, - pub addralign: u32, - pub entsize: u32, -} - -// ── Input object types ──────────────────────────────────────────────────────── - -/// A parsed section from an input .o file. -#[derive(Clone)] -pub(super) struct InputSection { - pub name: String, - pub sh_type: u32, - pub flags: u32, - pub data: Vec, - pub align: u32, - /// Relocations: (offset, rel_type, sym_idx_in_input, addend) - pub relocations: Vec<(u32, u32, u32, i32)>, - /// Index in the input file's section header table. - pub input_index: usize, - #[allow(dead_code)] // Populated during ELF parsing; preserved for future SHF_MERGE support - pub entsize: u32, - #[allow(dead_code)] // Populated during ELF parsing; preserved for section linking - pub link: u32, - pub info: u32, -} - -/// A parsed symbol from an input .o file. -#[derive(Clone, Debug)] -pub(super) struct InputSymbol { - pub name: String, - pub value: u32, - pub size: u32, - pub binding: u8, - pub sym_type: u8, - #[allow(dead_code)] // Parsed from ELF; needed for future STV_HIDDEN/STV_PROTECTED handling - pub visibility: u8, - pub section_index: u16, -} - -/// A parsed input file. -pub(super) struct InputObject { - pub sections: Vec, - pub symbols: Vec, - pub filename: String, -} - -// ── Linker state types ──────────────────────────────────────────────────────── - -/// Resolved symbol info in the linker. 
-#[derive(Clone, Debug)] -pub(super) struct LinkerSymbol { - pub address: u32, - pub size: u32, - pub sym_type: u8, - pub binding: u8, - #[allow(dead_code)] // Tracked for future STV_HIDDEN/STV_PROTECTED handling - pub visibility: u8, - pub is_defined: bool, - pub needs_plt: bool, - pub needs_got: bool, - pub output_section: usize, - pub section_offset: u32, - pub plt_index: usize, - pub got_index: usize, - pub is_dynamic: bool, - pub dynlib: String, - pub needs_copy: bool, - pub copy_addr: u32, - pub version: Option, - /// Whether this dynamic data symbol uses text relocations instead of COPY. - pub uses_textrel: bool, -} - -/// A merged output section. -pub(super) struct OutputSection { - pub name: String, - pub sh_type: u32, - pub flags: u32, - pub data: Vec, - pub align: u32, - pub addr: u32, - pub file_offset: u32, -} - -/// Maps (object_index, section_index) -> (output_section_index, offset_in_output). -pub(super) type SectionMap = HashMap<(usize, usize), (usize, u32)>; - -/// Info about a dynamic symbol from a shared library. -pub(super) struct DynSymInfo { - pub name: String, - pub sym_type: u8, - pub size: u32, - #[allow(dead_code)] // Parsed from .so; needed for future weak-vs-global symbol preference - pub binding: u8, - pub version: Option, - pub is_default_ver: bool, -} - -// ── Helpers ────────────────────────────────────────────────────────────────── - -pub(super) fn align_up(value: u32, align: u32) -> u32 { - if align == 0 { return value; } - (value + align - 1) & !(align - 1) -} - -/// Append an ELF32 dynamic entry (8 bytes: tag + value). -pub(super) fn push_dyn(data: &mut Vec, tag: i32, val: u32) { - data.extend_from_slice(&tag.to_le_bytes()); - data.extend_from_slice(&val.to_le_bytes()); -} - -/// Determine the output section name for an input section. -/// -/// Returns `None` for sections that should not be included in the output -/// (metadata sections, non-allocated sections, etc.). 
-pub(super) fn output_section_name(name: &str, flags: u32, sh_type: u32) -> Option { - // Skip non-allocatable sections, symbol tables, relocation sections, etc. - if sh_type == SHT_NULL || sh_type == SHT_SYMTAB || sh_type == SHT_STRTAB - || sh_type == SHT_REL || sh_type == SHT_RELA || sh_type == SHT_GROUP { - return None; - } - if name == ".note.GNU-stack" || name == ".comment" { - return None; - } - - // Group by canonical output section name - if name.starts_with(".text") || name == ".init" || name == ".fini" { - if name == ".init" { return Some(".init".to_string()); } - if name == ".fini" { return Some(".fini".to_string()); } - return Some(".text".to_string()); - } - if name.starts_with(".rodata") { - return Some(".rodata".to_string()); - } - if name == ".eh_frame" { - return Some(".eh_frame".to_string()); - } - if name == ".tbss" || name.starts_with(".tbss.") { - return Some(".tbss".to_string()); - } - if name == ".tdata" || name.starts_with(".tdata.") { - return Some(".tdata".to_string()); - } - if flags & SHF_TLS != 0 { - return if sh_type == SHT_NOBITS { - Some(".tbss".to_string()) - } else { - Some(".tdata".to_string()) - }; - } - if name.starts_with(".data") { - return Some(".data".to_string()); - } - if name.starts_with(".bss") || sh_type == SHT_NOBITS { - return Some(".bss".to_string()); - } - if name == ".init_array" || name.starts_with(".init_array.") { - return Some(".init_array".to_string()); - } - if name == ".fini_array" || name.starts_with(".fini_array.") { - return Some(".fini_array".to_string()); - } - if name.starts_with(".note.") && sh_type == SHT_NOTE { - return Some(".note".to_string()); - } - if name.starts_with(".tm_clone_table") { - return Some(".data".to_string()); - } - - // For alloc sections whose names are valid C identifiers, preserve the - // original name. This is needed for __start_
/ __stop_
- // symbol auto-generation (GNU ld feature). Without this, custom sections - // like "my_cbs" would be merged into .text/.data/.rodata and the linker - // could not resolve __start_my_cbs / __stop_my_cbs symbols. - if flags & SHF_ALLOC != 0 { - if crate::backend::linker_common::is_valid_c_identifier_for_section(name) { - return Some(name.to_string()); - } - // Fall back to flag-based grouping for other alloc sections - // (e.g. .gcc_except_table, .stapsdt.base, .gnu.warning.*) - if flags & SHF_EXECINSTR != 0 { - return Some(".text".to_string()); - } - if flags & SHF_WRITE != 0 { - return if sh_type == SHT_NOBITS { - Some(".bss".to_string()) - } else { - Some(".data".to_string()) - }; - } - return Some(".rodata".to_string()); - } - - None -} diff --git a/src/backend/i686/mod.rs b/src/backend/i686/mod.rs deleted file mode 100644 index 64ad504866..0000000000 --- a/src/backend/i686/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub(crate) mod codegen; -#[cfg_attr(feature = "gcc_assembler", allow(dead_code))] // Built-in assembler unused when gcc handles assembly -pub(crate) mod assembler; -#[cfg_attr(feature = "gcc_linker", allow(dead_code))] // Built-in linker unused when gcc handles linking -pub(crate) mod linker; - -pub(crate) use codegen::emit::I686Codegen; diff --git a/src/backend/inline_asm.rs b/src/backend/inline_asm.rs deleted file mode 100644 index 3b7c38f07d..0000000000 --- a/src/backend/inline_asm.rs +++ /dev/null @@ -1,1122 +0,0 @@ -//! Shared inline assembly framework. -//! -//! All four backends use the same 4-phase inline assembly processing: -//! 1. Classify constraints and assign registers (specific first, then scratch) -//! 2. Load input values into registers, pre-load read-write outputs -//! 3. Substitute operand references in template and emit -//! 4. Store output registers back to stack slots -//! -//! Each backend implements `InlineAsmEmitter` to provide arch-specific register -//! classification, loading, and storage. 
The shared `emit_inline_asm_common` -//! orchestrates the phases. - -use std::borrow::Cow; -use crate::ir::reexports::{BlockId, Operand, Value}; -use crate::common::types::{AddressSpace, IrType}; -pub use crate::common::asm_constraints::constraint_is_immediate_only; -use super::state::CodegenState; - -/// Operand classification for inline asm. Each backend classifies its constraints -/// into these categories so the shared framework can orchestrate register -/// assignment, tied operand resolution, and GCC numbering. -#[derive(Debug, Clone, PartialEq)] -pub enum AsmOperandKind { - /// General-purpose register (e.g., x86 "r", ARM "r", RISC-V "r"). - GpReg, - /// Floating-point register (RISC-V "f"). - FpReg, - /// Memory operand (all arches "m"). - Memory, - /// Specific named register (x86 "a"→"rax", RISC-V "a0", etc.). - Specific(String), - /// Tied to another operand by index (e.g., "0", "1"). - Tied(usize), - /// Immediate value (RISC-V "I", "i", "n"). - Immediate, - /// Address for atomic ops (RISC-V "A"). - Address, - /// Zero-or-register (RISC-V "rJ", "J"). - ZeroOrReg, - /// Condition code output (GCC =@cc, e.g. =@cce, =@ccne). - /// The string is the condition suffix (e.g. "e", "ne", "s", "ns"). - ConditionCode(String), - /// x87 FPU stack top register st(0), selected by "t" constraint. - X87St0, - /// x87 FPU stack second register st(1), selected by "u" constraint. - X87St1, - /// GP register with accessible high-byte form (x86 "Q" constraint). - /// On x86-64, only rax/rbx/rcx/rdx have %ah/%bh/%ch/%dh forms. - /// Used when the asm template uses the %h modifier to access the - /// second byte (bits 8-15) of the register. - QReg, -} - -/// Per-operand state tracked by the shared inline asm framework. -/// Backends populate arch-specific fields (mem_addr, mem_offset, imm_value) -/// during constraint classification. 
-#[derive(Debug, Clone)] -pub struct AsmOperand { - pub kind: AsmOperandKind, - pub reg: String, - /// High register for 64-bit register pairs on i686. Empty when not a pair. - /// On i686, 64-bit values in "r" constraints require two 32-bit GP registers: - /// `reg` holds the low 32 bits, `reg_hi` holds the high 32 bits. - pub reg_hi: String, - pub name: Option, - /// x86: memory address string like "offset(%rbp)". - pub mem_addr: String, - /// RISC-V/ARM: stack offset for memory/address operands. - pub mem_offset: i64, - /// Immediate value for "I"/"i" constraints. - pub imm_value: Option, - /// Symbol name for "i" constraint operands that reference global/function addresses. - /// Used by %P and %a modifiers to emit raw symbol names in inline asm templates. - pub imm_symbol: Option, - /// IR type of this operand, used for correctly-sized loads/stores. - pub operand_type: IrType, - /// Original constraint string, used for fallback decisions. - pub constraint: String, - /// Segment prefix for memory operands (e.g., "%gs:" or "%fs:"). - /// Set from AddressSpace for __seg_gs/__seg_fs pointer dereferences. - pub seg_prefix: String, -} - -impl AsmOperand { - pub fn new(kind: AsmOperandKind, name: Option) -> Self { - Self { kind, reg: String::new(), reg_hi: String::new(), name, mem_addr: String::new(), mem_offset: 0, imm_value: None, imm_symbol: None, operand_type: IrType::I64, constraint: String::new(), seg_prefix: String::new() } - } - - /// Copy register assignment and addressing metadata from another operand. - /// Used for tied operands and "+" read-write propagation. 
- pub fn copy_metadata_from(&mut self, source: &AsmOperand) { - self.reg = source.reg.clone(); - self.reg_hi = source.reg_hi.clone(); - self.mem_addr = source.mem_addr.clone(); - self.mem_offset = source.mem_offset; - if matches!(source.kind, AsmOperandKind::Memory) { - self.kind = AsmOperandKind::Memory; - } else if matches!(source.kind, AsmOperandKind::Address) { - self.kind = AsmOperandKind::Address; - } else if matches!(source.kind, AsmOperandKind::FpReg) { - self.kind = AsmOperandKind::FpReg; - } else if matches!(source.kind, AsmOperandKind::X87St0) { - self.kind = AsmOperandKind::X87St0; - } else if matches!(source.kind, AsmOperandKind::X87St1) { - self.kind = AsmOperandKind::X87St1; - } - } -} - -/// Trait that backends implement to provide architecture-specific inline asm behavior. -/// The shared `emit_inline_asm_common` function calls these methods to handle the -/// architecture-dependent parts of inline assembly processing. -pub trait InlineAsmEmitter { - /// Mutable access to the codegen state (for emitting instructions). - fn asm_state(&mut self) -> &mut CodegenState; - - /// Classify a constraint string into an AsmOperandKind, and optionally - /// return the specific register name for Specific constraints. - fn classify_constraint(&self, constraint: &str) -> AsmOperandKind; - - /// Set up arch-specific operand metadata after classification. - /// Called once per operand. For memory/address operands, set mem_addr or mem_offset. - /// For immediate operands, set imm_value. - fn setup_operand_metadata(&self, op: &mut AsmOperand, val: &Operand, is_output: bool); - - /// Assign the next available scratch register for the given operand kind. - /// `excluded` contains register names that are already claimed by specific-register - /// constraints (e.g., "rcx" from "c" constraint) and must not be reused. - fn assign_scratch_reg(&mut self, kind: &AsmOperandKind, excluded: &[String]) -> String; - - /// Load an input value into its assigned register. 
Called during Phase 2. - fn load_input_to_reg(&mut self, op: &AsmOperand, val: &Operand, constraint: &str); - - /// Pre-load a read-write ("+") output's current value into its register. - fn preload_readwrite_output(&mut self, op: &AsmOperand, ptr: &Value); - - /// Substitute operand references in a single template line and return the result. - fn substitute_template_line(&self, line: &str, operands: &[AsmOperand], gcc_to_internal: &[usize], operand_types: &[IrType], goto_labels: &[(String, BlockId)]) -> String; - - /// Store an output register value back to its stack slot after the asm executes. - /// `all_output_regs` contains the register names of ALL output operands, used to - /// avoid clobbering other output registers when picking scratch registers. - fn store_output_from_reg(&mut self, op: &AsmOperand, ptr: &Value, constraint: &str, all_output_regs: &[&str]); - - /// Resolve memory operand addresses that require indirection (non-alloca pointers). - /// `excluded` contains registers claimed by specific-register constraints, - /// used to avoid conflicts when allocating a temp register for the address. - fn resolve_memory_operand(&mut self, _op: &mut AsmOperand, _val: &Operand, _excluded: &[String]) -> bool { - false - } - - /// Set up a memory operand for a register-to-memory fallback (e.g., "g" constraint - /// on i686 when all GP registers are exhausted). Unlike `setup_operand_metadata` for - /// Memory kind (which expects the value to be an address/pointer for "m" constraints), - /// this sets up a direct memory reference to the value's stack slot. For non-alloca - /// values, the slot holds the value directly at `offset(frame_pointer)`. - /// Constants are promoted to immediates instead. - fn setup_memory_fallback(&self, _op: &mut AsmOperand, _val: &Operand) { - // Default: no-op. Only i686 needs this because it's the only backend that - // can exhaust all GP registers. x86-64/ARM/RISC-V have enough registers. 
- } - - /// Returns true if the given type requires a register pair for GP register constraints. - /// On i686, 64-bit types (I64/U64) need two 32-bit registers to represent a single value. - /// Defaults to false (most architectures have 64-bit GP registers). - fn needs_register_pair(&self, _ty: IrType) -> bool { false } - - /// Reset scratch register allocation state (called at start of each inline asm). - fn reset_scratch_state(&mut self); - - /// Check whether a constant value fits the immediate constraint range for a given - /// constraint string. Backends can override this to provide architecture-specific - /// immediate ranges. The default implementation uses x86 semantics. - /// - /// For example, on RISC-V the 'K' constraint means a 5-bit unsigned CSR immediate - /// (0-31), while on x86 it means 0-255. Without this override, the shared framework - /// would incorrectly promote values like 128 to immediates on RISC-V. - fn constant_fits_immediate(&self, constraint: &str, value: i64) -> bool { - constant_fits_immediate_constraint(constraint, value) - } -} - -/// Check whether a constraint string contains an immediate alternative character. -/// Used by both IR lowering (to decide whether to try constant evaluation) and -/// the shared inline asm framework (to promote GpReg operands to Immediate when -/// the input is a compile-time constant). -/// -/// This covers the architecture-neutral immediate constraint letters ('I', 'i', 'n') -/// and the RISC-V 'K' constraint (5-bit unsigned CSR immediate, used in "rK" for -/// csrw/csrs/csrc instructions). Without recognizing 'K' here, the IR lowering -/// won't attempt constant evaluation for "rK" constraints, causing values like 0 -/// to be materialized through stack spills instead of as bare immediates. -/// Other architecture-specific immediate letters (e.g., x86 'N', 'e') are handled -/// separately by each backend's `classify_constraint`. 
-pub fn constraint_has_immediate_alt(constraint: &str) -> bool { - let stripped = constraint.trim_start_matches(['=', '+', '&', '%']); - // Named tied operands ("[name]") don't have immediates - if stripped.starts_with('[') && stripped.ends_with(']') { - return false; - } - stripped.chars().any(|c| matches!(c, 'I' | 'i' | 'n' | 'K')) -} - -/// Check whether a constant value fits the immediate constraint range for a given -/// multi-alternative constraint string. For example, x86 "Ir" with value 602 should -/// NOT be promoted to immediate (602 > 31), but "Ir" with value 5 should (5 <= 31). -/// -/// x86 immediate constraint ranges (from GCC docs): -/// 'i' - any integer constant (always fits) -/// 'n' - any integer constant (always fits, same as 'i' for our purposes) -/// 'I' - 0..31 (for shift counts / bit positions) -/// 'N' - 0..255 (unsigned byte) -/// 'e' - -(2^31)..((2^31)-1) (signed 32-bit) -/// 'K' - 0..0xFF (same as N for our purposes) -/// 'M' - 0..3 (for lea scale) -/// 'L' - 0xFF, 0xFFFF (mask constants) -/// 'J' - 0..0xFFFFFFFF (unsigned 32-bit) -/// 'O' - 0..127 (unsigned 7-bit) -/// -/// If the constraint contains 'i' or 'n', any constant fits. Otherwise, the value -/// must fit the range of at least one uppercase immediate letter present. 
-pub fn constant_fits_immediate_constraint(constraint: &str, value: i64) -> bool { - let stripped = constraint.trim_start_matches(['=', '+', '&', '%']); - // If constraint has 'i' or 'n', any constant value is accepted - if stripped.contains('i') || stripped.contains('n') { - return true; - } - // Check each uppercase immediate letter to see if value fits its range - for ch in stripped.chars() { - let fits = match ch { - 'I' => (0..=31).contains(&value), - 'N' | 'K' => (0..=255).contains(&value), - 'e' | 'E' => (-(1i64 << 31)..=((1i64 << 31) - 1)).contains(&value), - 'M' => (0..=3).contains(&value), - 'J' => (0..=0xFFFF_FFFF).contains(&value), - 'L' => value == 0xFF || value == 0xFFFF, - 'O' => (0..=127).contains(&value), - 'G' | 'H' => false, // floating-point immediate constraints, not integer - _ => continue, // not an immediate constraint letter - }; - if fits { - return true; - } - } - false -} - -/// Check whether a constraint string contains a memory alternative character. -/// Handles both single-character ("m") and multi-character constraints ("rm", "mq"). -/// Also recognizes "Q" which is an AArch64-specific memory constraint meaning -/// "a memory address with a single base register" (used for atomic ops like ldaxr/stlxr). -pub fn constraint_has_memory_alt(constraint: &str) -> bool { - let stripped = constraint.trim_start_matches(['=', '+', '&', '%']); - // Named tied operands ("[name]") are not memory constraints - if stripped.starts_with('[') && stripped.ends_with(']') { - return false; - } - // 'g' means "general operand" (register, memory, or immediate) — includes memory - stripped.chars().any(|c| c == 'm' || c == 'Q' || c == 'g') -} - -/// Check whether a constraint is memory-only (has memory alternative but no register -/// alternative). For constraints like "rm", "qm", "g" that allow both register and -/// memory, returns false — the backend will prefer registers, so the IR lowering -/// should provide a value (not an address). 
Only pure "m"/"o"/"V"/"Q" constraints need -/// the address for memory operand formatting. -/// Note: "Q" is AArch64-specific meaning "single base register memory address" and is -/// always memory-only (no register alternative), used for atomic ops like ldaxr/stlxr. -/// The `is_arm` flag controls whether 'Q' is treated as a memory constraint: -/// - On AArch64: 'Q' = memory-only (single base register addressing) -/// - On x86/x86-64: 'Q' = legacy byte register (rax/rbx/rcx/rdx with %h form) -/// - On RISC-V: 'Q' is not a standard constraint -pub fn constraint_is_memory_only(constraint: &str, is_arm: bool) -> bool { - let stripped = constraint.trim_start_matches(['=', '+', '&', '%']); - // Named tied operands ("[name]") are never memory-only - if stripped.starts_with('[') && stripped.ends_with(']') { - return false; - } - // 'Q' is memory-only ONLY on AArch64; on x86 it's a register constraint. - let has_mem = stripped.chars().any(|c| { - matches!(c, 'm' | 'o' | 'V' | 'p') || (c == 'Q' && is_arm) - }); - if !has_mem { - return false; - } - // Check for any register alternative (GP, FP, or specific register) - let has_reg = stripped.chars().any(|c| matches!(c, - 'r' | 'q' | 'R' | 'l' | // GP register - 'g' | // general (reg + mem + imm) - 'x' | 'v' | 'Y' | // FP register - 'a' | 'b' | 'c' | 'd' | 'S' | 'D' // specific register - )); - // Also check for tied operand (digits) — those get a register - let has_tied = stripped.chars().any(|c| c.is_ascii_digit()); - !has_reg && !has_tied -} - -/// Check whether a constraint requires an address (lvalue) rather than a value (rvalue). -/// This covers both memory-only constraints (m, o, V, Q) and address constraints. -/// -/// The `is_riscv` flag controls whether "A" is treated as an address constraint: -/// - On RISC-V, "A" means "address operand for AMO/LR/SC instructions" — the inline -/// asm template receives the address in a register, formatted as "(reg)". 
-/// - On x86, "A" means the accumulator register (rax/eax:edx), NOT an address. -/// -/// This is used by the IR lowering to decide whether to call lower_lvalue() (getting -/// the address) or lower_expr() (loading the value) for inline asm input operands. -pub fn constraint_needs_address(constraint: &str, is_riscv: bool, is_arm: bool) -> bool { - if constraint_is_memory_only(constraint, is_arm) { - return true; - } - // RISC-V "A" constraint: address for AMO/LR/SC instructions - if is_riscv { - let stripped = constraint.trim_start_matches(['=', '+', '&', '%']); - if stripped == "A" { - return true; - } - } - false -} - -/// Expand GCC dialect alternatives in an inline assembly template string. -/// -/// GCC inline assembly supports dialect alternatives: `{att|intel}` where the -/// first alternative is AT&T syntax and the second is Intel syntax. Since our -/// compiler always emits AT&T syntax, we select the first alternative from each -/// `{...|...}` group. -/// -/// Examples: -/// `pushf{l|d}` -> `pushfl` -/// `mov{l}\t{%0, %1|%1, %0}` -> `movl\t%0, %1` -/// `pop{l}\t%0` -> `popl\t%0` -/// `no_braces` -> `no_braces` (unchanged) -/// -/// This handles the syntax used in GCC's `` for i686 CPUID detection. 
-pub fn expand_dialect_alternatives(template: &str) -> Cow<'_, str> { - // Fast path: if there are no braces, return as-is - if !template.contains('{') { - return Cow::Borrowed(template); - } - - let mut result = String::with_capacity(template.len()); - let chars: Vec = template.chars().collect(); - let mut i = 0; - while i < chars.len() { - if chars[i] == '%' && i + 1 < chars.len() && chars[i + 1] == '{' { - // GCC escape sequence %{ -> literal '{' - result.push('{'); - i += 2; - } else if chars[i] == '%' && i + 1 < chars.len() && chars[i + 1] == '}' { - // GCC escape sequence %} -> literal '}' - result.push('}'); - i += 2; - } else if chars[i] == '{' { - // Could be a dialect alternatives group {att|intel} or literal braces - // used in ARM NEON instructions like ld1 {v0.4s}, [x0]. - // Scan ahead to find matching '}' and check for '|'. - let brace_start = i; - i += 1; // skip '{' - let start = i; - let mut depth = 1; - let mut pipe_pos = None; - // Find the matching '}' and the '|' separator - while i < chars.len() && depth > 0 { - if chars[i] == '{' { - depth += 1; - } else if chars[i] == '}' { - depth -= 1; - if depth == 0 { break; } - } else if chars[i] == '|' && depth == 1 && pipe_pos.is_none() { - pipe_pos = Some(i); - } - i += 1; - } - if let Some(p) = pipe_pos { - // This IS a dialect alternatives group: {alt1|alt2} - // Extract the first alternative (before '|') - let alt: String = chars[start..p].iter().collect(); - result.push_str(&alt); - } else { - // No pipe found: these are literal braces (e.g., ARM NEON {v0.4s}). - // Preserve the braces and their content as-is. - let content: String = chars[brace_start..i].iter().collect(); - result.push_str(&content); - if i < chars.len() && chars[i] == '}' { - result.push('}'); - } - } - if i < chars.len() && chars[i] == '}' { - i += 1; // skip '}' - } - } else { - result.push(chars[i]); - i += 1; - } - } - Cow::Owned(result) -} - -/// Shared inline assembly emission logic. 
All four backends call this from their -/// `emit_inline_asm` implementation, providing an `InlineAsmEmitter` to handle -/// arch-specific details. -pub fn emit_inline_asm_common( - emitter: &mut dyn InlineAsmEmitter, - template: &str, - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], - clobbers: &[String], - operand_types: &[IrType], - goto_labels: &[(String, BlockId)], - input_symbols: &[Option], -) { - emit_inline_asm_common_impl(emitter, template, outputs, inputs, clobbers, operand_types, goto_labels, input_symbols, &[]); -} - -pub fn emit_inline_asm_common_impl( - emitter: &mut dyn InlineAsmEmitter, - template: &str, - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], - clobbers: &[String], - operand_types: &[IrType], - goto_labels: &[(String, BlockId)], - input_symbols: &[Option], - seg_overrides: &[AddressSpace], -) { - emitter.reset_scratch_state(); - - // Phase 1: Classify all operands and assign registers - let (mut operands, input_tied_to) = classify_all_operands(emitter, outputs, inputs); - // Pre-populate operand types early so assign_scratch_registers can use them - // for register pair decisions on i686 (64-bit types need two 32-bit registers). 
- for (i, ty) in operand_types.iter().enumerate() { - if i < operands.len() { - operands[i].operand_type = *ty; - } - } - resolve_symbols_and_immediates(&mut operands, outputs, input_symbols); - let specific_regs = collect_excluded_registers(&operands, clobbers); - assign_scratch_registers(emitter, &mut operands, &input_tied_to, &specific_regs, outputs, inputs); - resolve_tied_and_types(&mut operands, &input_tied_to, outputs, operand_types); - let (_, gcc_to_internal) = finalize_operands_and_build_gcc_map( - emitter, &mut operands, outputs, inputs, &specific_regs, seg_overrides, - ); - - // Phase 2: Load input values into their assigned registers - load_inputs(emitter, &operands, outputs, inputs); - - // Phase 3: Substitute operand references in template and emit - // First, expand GCC dialect alternatives {att|intel} -> att - let expanded = expand_dialect_alternatives(template); - let lines: Vec<&str> = expanded.split('\n').collect(); - for line in &lines { - let line = line.trim().trim_start_matches('\t').trim(); - if line.is_empty() { - continue; - } - let resolved = emitter.substitute_template_line(line, &operands, &gcc_to_internal, operand_types, goto_labels); - emitter.asm_state().emit_fmt(format_args!(" {}", resolved)); - } - - // Phase 4: Store output register values back to their stack slots - let all_output_regs: Vec<&str> = outputs.iter().enumerate() - .filter(|(_, (c, _, _))| c.contains('=') || c.contains('+')) - .map(|(i, _)| operands[i].reg.as_str()) - .collect(); - for (i, (constraint, ptr, _)) in outputs.iter().enumerate() { - if constraint.contains('=') || constraint.contains('+') { - emitter.store_output_from_reg(&operands[i], ptr, constraint, &all_output_regs); - } - } -} - -/// Phase 1a: Classify all output and input operands, returning the operand -/// vector and the input-tied-to mapping. 
-fn classify_all_operands( - emitter: &mut dyn InlineAsmEmitter, - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], -) -> (Vec, Vec>) { - let total_operands = outputs.len() + inputs.len(); - let mut operands: Vec = Vec::with_capacity(total_operands); - - // Classify outputs - for (constraint, ptr, name) in outputs { - let kind = emitter.classify_constraint(constraint); - let mut op = AsmOperand::new(kind, name.clone()); - op.constraint = constraint.clone(); - if let AsmOperandKind::Specific(ref reg) = op.kind { - op.reg = reg.clone(); - } - let val = Operand::Value(*ptr); - emitter.setup_operand_metadata(&mut op, &val, true); - operands.push(op); - } - - // Track which inputs are tied (to avoid assigning scratch regs) - let mut input_tied_to: Vec> = Vec::with_capacity(inputs.len()); - - // Classify inputs - for (constraint, val, name) in inputs { - // Handle named tied operands: "[name]" resolves to the output with that name - let kind = if constraint.starts_with('[') && constraint.ends_with(']') { - let tied_name = &constraint[1..constraint.len()-1]; - let tied_idx = outputs.iter().position(|(_, _, oname)| { - oname.as_deref() == Some(tied_name) - }); - if let Some(idx) = tied_idx { - AsmOperandKind::Tied(idx) - } else { - emitter.classify_constraint(constraint) - } - } else { - emitter.classify_constraint(constraint) - }; - let mut op = AsmOperand::new(kind.clone(), name.clone()); - op.constraint = constraint.clone(); - if let AsmOperandKind::Specific(ref reg) = op.kind { - op.reg = reg.clone(); - } - if let AsmOperandKind::Tied(idx) = &kind { - input_tied_to.push(Some(*idx)); - } else { - input_tied_to.push(None); - } - emitter.setup_operand_metadata(&mut op, val, false); - - // For multi-alternative constraints (e.g., "Ir", "ri", "In") that were classified - // as GpReg but have a constant input value, promote to Immediate so the value - // is emitted as $value instead of loaded into a register. 
Only do this when - // the constraint actually contains an immediate alternative character AND the - // constant value fits the range of that immediate constraint. - if matches!(op.kind, AsmOperandKind::GpReg | AsmOperandKind::QReg) { - if let Operand::Const(c) = val { - if let Some(v) = c.to_i64() { - if emitter.constant_fits_immediate(constraint, v) { - op.imm_value = Some(v); - op.kind = AsmOperandKind::Immediate; - } - } - } - } - - // For pure immediate constraints ("i", "n", etc.) that were classified directly - // as Immediate by classify_constraint, populate imm_value from the constant - // operand. Without this, the imm_value stays None and gets replaced with 0 - // in resolve_symbols_and_immediates, causing incorrect assembly output - // (e.g., `bic x9, x10, 0` instead of `bic x9, x10, 36028797018963968`). - if matches!(op.kind, AsmOperandKind::Immediate) && op.imm_value.is_none() { - if let Operand::Const(c) = val { - if let Some(v) = c.to_i64() { - op.imm_value = Some(v); - } - } - } - - // For pure immediate-only constraints ("i", "n", etc.) that are still GpReg/QReg - // because the operand is a Value (not a Const), promote to Immediate with a - // placeholder value of 0. This happens in standalone bodies of static inline - // functions where "i" constraint parameters can't be resolved to constants. - // The standalone body is safe because: (1) always_inline functions are DCE'd - // if never called directly, and (2) .pushsection metadata with 0 won't be - // linked into the final binary. Without this, the backend would load the value - // into a register and substitute the register name (e.g., "x9") into data - // directives like .hword, causing linker errors ("undefined reference to x9"). 
- if matches!(op.kind, AsmOperandKind::GpReg | AsmOperandKind::QReg) && matches!(val, Operand::Value(_)) - && constraint_is_immediate_only(constraint) { - op.imm_value = Some(0); - op.kind = AsmOperandKind::Immediate; - } - - operands.push(op); - } - - (operands, input_tied_to) -} - -/// Phase 1b: Resolve input symbol names and handle unresolved immediates. -fn resolve_symbols_and_immediates( - operands: &mut [AsmOperand], - outputs: &[(String, Value, Option)], - input_symbols: &[Option], -) { - let num_outputs = outputs.len(); - - // Populate symbol names for input operands from input_symbols. - for (i, sym) in input_symbols.iter().enumerate() { - let op_idx = num_outputs + i; - if op_idx < operands.len() { - if let Some(ref s) = sym { - operands[op_idx].imm_symbol = Some(s.clone()); - // Promote to Immediate so the symbol is emitted directly - if matches!(operands[op_idx].kind, AsmOperandKind::GpReg | AsmOperandKind::QReg) { - operands[op_idx].kind = AsmOperandKind::Immediate; - } - } - } - } - - // For "+" read-write constraints, copy imm_symbol from synthetic inputs to outputs - { - let mut plus_idx = 0; - for (i, (constraint, _, _)) in outputs.iter().enumerate() { - if constraint.contains('+') { - let plus_input_idx = num_outputs + plus_idx; - if plus_input_idx < operands.len() { - if let Some(ref sym) = operands[plus_input_idx].imm_symbol.clone() { - operands[i].imm_symbol = Some(sym.clone()); - } - } - plus_idx += 1; - } - } - } - - // For Immediate operands that have neither an imm_value nor an imm_symbol, - // resolve to either a placeholder $0 or fall back to GpReg. 
- for op in operands.iter_mut() { - if matches!(op.kind, AsmOperandKind::Immediate) && op.imm_value.is_none() && op.imm_symbol.is_none() { - if constraint_is_immediate_only(&op.constraint) { - op.imm_value = Some(0); - } else { - op.kind = AsmOperandKind::GpReg; - } - } - } -} - -/// Phase 1c: Collect registers excluded from scratch allocation (specific-register -/// constraints and clobber registers). -fn collect_excluded_registers( - operands: &[AsmOperand], - clobbers: &[String], -) -> Vec { - let mut specific_regs: Vec = operands.iter() - .filter(|op| matches!(op.kind, AsmOperandKind::Specific(_))) - .map(|op| op.reg.clone()) - .collect(); - - for clobber in clobbers { - if clobber == "cc" || clobber == "memory" { - continue; - } - specific_regs.push(clobber.clone()); - // ARM64: wN and xN are the same physical register - if let Some(suffix) = clobber.strip_prefix('w') { - if suffix.chars().all(|c| c.is_ascii_digit()) { - specific_regs.push(format!("x{}", suffix)); - } - } else if let Some(suffix) = clobber.strip_prefix('x') { - if suffix.chars().all(|c| c.is_ascii_digit()) { - specific_regs.push(format!("w{}", suffix)); - } - } else if let Some(suffix) = clobber.strip_prefix('r') { - // GCC treats r0-r30 as aliases for x0-x30 on AArch64. - if suffix.chars().all(|c| c.is_ascii_digit()) { - if let Ok(n) = suffix.parse::() { - if n <= 30 { - specific_regs.push(format!("x{}", n)); - specific_regs.push(format!("w{}", n)); - } - } - } - } - // ARM64: v/d/s/q registers are all views of the same physical FP/SIMD register. - // Add all aliases so scratch allocation avoids conflicts. 
- let fp_suffix = clobber.strip_prefix('v') - .or_else(|| clobber.strip_prefix('d')) - .or_else(|| clobber.strip_prefix('s')) - .or_else(|| clobber.strip_prefix('q')); - if let Some(suffix) = fp_suffix { - if !suffix.is_empty() && suffix.chars().all(|c| c.is_ascii_digit()) { - for prefix in &["v", "d", "s", "q"] { - let alias = format!("{}{}", prefix, suffix); - if !specific_regs.contains(&alias) { - specific_regs.push(alias); - } - } - } - } - // x86-64: normalize sub-register names to 64-bit canonical form - if let Some(canonical) = x86_normalize_reg_to_64bit(clobber) { - if *canonical != **clobber { - specific_regs.push(canonical.into_owned()); - } - } - } - - specific_regs -} - -/// Phase 1d: Assign scratch registers to operands that need them, with -/// memory fallback when registers are exhausted. -fn assign_scratch_registers( - emitter: &mut dyn InlineAsmEmitter, - operands: &mut [AsmOperand], - input_tied_to: &[Option], - specific_regs: &[String], - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], -) { - let total_operands = outputs.len() + inputs.len(); - - // Count synthetic "+" inputs. These are the first `num_plus` entries in the - // inputs array, one per output that has a "+" constraint. They will be - // overwritten by finalize_operands_and_build_gcc_map with the output's - // register, so we must NOT allocate scratch registers for them (doing so - // would waste the limited register pool). - let num_plus = outputs.iter().filter(|(c, _, _)| c.contains('+')).count(); - - // First pass: assign registers to output operands only. - // Build a dynamic exclusion list that grows as registers are assigned, - // so the wraparound fallback never reuses an already-assigned register. 
- let mut output_excluded: Vec = specific_regs.to_vec(); - for i in 0..outputs.len() { - assign_one_scratch(emitter, operands, input_tied_to, &output_excluded, outputs, inputs, i, num_plus); - // Track the just-assigned register so it won't be reused by the - // next output operand when the scratch pool wraps around. - if !operands[i].reg.is_empty() { - let reg = &operands[i].reg; - if !output_excluded.contains(reg) { - output_excluded.push(reg.clone()); - } - if !operands[i].reg_hi.is_empty() { - let reg_hi = &operands[i].reg_hi; - if !output_excluded.contains(reg_hi) { - output_excluded.push(reg_hi.clone()); - } - } - } - } - - // Collect registers assigned to early-clobber ("&") and read-write ("+") - // outputs. Early-clobber means the output is written before all inputs are - // consumed. Read-write ("+") means the register is pre-loaded with an input - // value before the asm executes. In both cases, input operands must NOT - // share a register with the output, or the input loading phase would - // overwrite the pre-loaded value. 
- let mut input_excluded: Vec = specific_regs.to_vec(); - for i in 0..outputs.len() { - if (outputs[i].0.contains('&') || outputs[i].0.contains('+')) && !operands[i].reg.is_empty() { - let reg = &operands[i].reg; - // Normalize to 64-bit canonical name for x86 (e.g., "ecx" -> "rcx") - let canonical = x86_normalize_reg_to_64bit(reg) - .map(|c| c.into_owned()) - .unwrap_or_else(|| reg.clone()); - if !input_excluded.contains(&canonical) { - input_excluded.push(canonical); - } - // Also exclude the register name as-is in case it's already canonical - // or a non-x86 target - if !input_excluded.contains(reg) { - input_excluded.push(reg.clone()); - } - // Exclude high register of a register pair if present - if !operands[i].reg_hi.is_empty() { - let reg_hi = &operands[i].reg_hi; - if !input_excluded.contains(reg_hi) { - input_excluded.push(reg_hi.clone()); - } - } - } - } - - // Second pass: assign registers to non-synthetic input operands using the - // extended exclusion list that includes early-clobber output registers. - // Also track assigned registers to avoid reuse when the pool wraps around. - for i in outputs.len()..total_operands { - assign_one_scratch(emitter, operands, input_tied_to, &input_excluded, outputs, inputs, i, num_plus); - if !operands[i].reg.is_empty() { - let reg = &operands[i].reg; - if !input_excluded.contains(reg) { - input_excluded.push(reg.clone()); - } - if !operands[i].reg_hi.is_empty() { - let reg_hi = &operands[i].reg_hi; - if !input_excluded.contains(reg_hi) { - input_excluded.push(reg_hi.clone()); - } - } - } - } -} - -/// Helper: assign a scratch register to a single operand at index `i`. 
-fn assign_one_scratch( - emitter: &mut dyn InlineAsmEmitter, - operands: &mut [AsmOperand], - input_tied_to: &[Option], - excluded: &[String], - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], - i: usize, - num_plus: usize, -) { - if !operands[i].reg.is_empty() { - return; - } - match &operands[i].kind { - AsmOperandKind::Memory | AsmOperandKind::Immediate => {}, - AsmOperandKind::Tied(_) => {}, - AsmOperandKind::X87St0 => { operands[i].reg = "st(0)".to_string(); } - AsmOperandKind::X87St1 => { operands[i].reg = "st(1)".to_string(); } - kind => { - if i >= outputs.len() { - let input_idx = i - outputs.len(); - if input_tied_to[input_idx].is_some() { - return; - } - // Skip synthetic "+" inputs: they are the first `num_plus` entries - // in the inputs array and will get their registers from the - // corresponding output in finalize_operands_and_build_gcc_map. - if input_idx < num_plus { - return; - } - } - let reg = emitter.assign_scratch_reg(kind, excluded); - if reg.is_empty() && constraint_has_memory_alt(&operands[i].constraint) { - operands[i].kind = AsmOperandKind::Memory; - if i < outputs.len() { - let val = Operand::Value(outputs[i].1); - emitter.setup_memory_fallback(&mut operands[i], &val); - } else { - let input_idx = i - outputs.len(); - let val = inputs[input_idx].1; - emitter.setup_memory_fallback(&mut operands[i], &val); - } - } else { - operands[i].reg = reg; - // For 64-bit register pairs on 32-bit architectures (i686), - // allocate a second GP register for the high 32 bits. - if matches!(kind, AsmOperandKind::GpReg) && emitter.needs_register_pair(operands[i].operand_type) { - let reg_hi = emitter.assign_scratch_reg(kind, excluded); - operands[i].reg_hi = reg_hi; - } - } - } - } -} - -/// Phase 1e: Resolve tied operands and populate operand types. 
-fn resolve_tied_and_types( - operands: &mut [AsmOperand], - input_tied_to: &[Option], - outputs: &[(String, Value, Option)], - operand_types: &[IrType], -) { - let total_operands = operands.len(); - for i in 0..total_operands { - let tied_target = if let AsmOperandKind::Tied(tied_to) = operands[i].kind { - Some(tied_to) - } else if i >= outputs.len() { - let input_idx = i - outputs.len(); - if operands[i].reg.is_empty() { - input_tied_to[input_idx] - } else { - None - } - } else { - None - }; - if let Some(target) = tied_target { - if target < operands.len() { - let source = operands[target].clone(); - operands[i].copy_metadata_from(&source); - } - } - } - - // Populate operand types - for (i, ty) in operand_types.iter().enumerate() { - if i < operands.len() { - operands[i].operand_type = *ty; - } - } -} - -/// Phase 1f: Resolve memory operands, handle "+" read-write constraints, -/// build GCC operand numbering, and apply segment prefixes. -fn finalize_operands_and_build_gcc_map( - emitter: &mut dyn InlineAsmEmitter, - operands: &mut [AsmOperand], - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], - specific_regs: &[String], - seg_overrides: &[AddressSpace], -) -> (usize, Vec) { - let total_operands = outputs.len() + inputs.len(); - - // Resolve memory operand addresses for outputs - for (i, (_, ptr, _)) in outputs.iter().enumerate() { - if matches!(operands[i].kind, AsmOperandKind::Memory) { - let val = Operand::Value(*ptr); - emitter.resolve_memory_operand(&mut operands[i], &val, specific_regs); - } - } - - // Handle "+" read-write constraints: synthetic inputs share the output's register. 
- let num_plus = outputs.iter().filter(|(c,_,_)| c.contains('+')).count(); - let mut plus_idx = 0; - for (i, (constraint, _, _)) in outputs.iter().enumerate() { - if constraint.contains('+') { - let plus_input_idx = outputs.len() + plus_idx; - if plus_input_idx < total_operands { - let source = operands[i].clone(); - operands[plus_input_idx].copy_metadata_from(&source); - operands[plus_input_idx].kind = source.kind; - operands[plus_input_idx].operand_type = source.operand_type; - } - plus_idx += 1; - } - } - - // Build GCC operand number -> internal index mapping. - let num_gcc_operands = outputs.len() + inputs.len(); - let mut gcc_to_internal: Vec = Vec::with_capacity(num_gcc_operands); - gcc_to_internal.extend(0..outputs.len()); - gcc_to_internal.extend((num_plus..inputs.len()).map(|i| outputs.len() + i)); - for i in 0..num_plus { - gcc_to_internal.push(outputs.len() + i); - } - - // Resolve memory operand addresses for non-synthetic input operands - for (i, (_, val, _)) in inputs.iter().enumerate() { - if i < num_plus { continue; } - let op_idx = outputs.len() + i; - if matches!(operands[op_idx].kind, AsmOperandKind::Memory) { - emitter.resolve_memory_operand(&mut operands[op_idx], val, specific_regs); - } - } - - // Apply segment prefixes to memory operands (for __seg_gs/__seg_fs) - if !seg_overrides.is_empty() { - for (i, op) in operands.iter_mut().enumerate() { - if i < seg_overrides.len() { - match seg_overrides[i] { - AddressSpace::SegGs => op.seg_prefix = "%gs:".to_string(), - AddressSpace::SegFs => op.seg_prefix = "%fs:".to_string(), - AddressSpace::Default => {} - } - } - } - } - - (num_plus, gcc_to_internal) -} - -/// Phase 2: Load input values into their assigned registers, handling -/// x87 FPU stack ordering (st(1) before st(0) due to LIFO semantics). 
-fn load_inputs( - emitter: &mut dyn InlineAsmEmitter, - operands: &[AsmOperand], - outputs: &[(String, Value, Option)], - inputs: &[(String, Operand, Option)], -) { - // First pass: load non-x87 inputs - for (i, (constraint, val, _)) in inputs.iter().enumerate() { - let op_idx = outputs.len() + i; - match &operands[op_idx].kind { - AsmOperandKind::Memory | AsmOperandKind::Immediate => continue, - AsmOperandKind::X87St0 | AsmOperandKind::X87St1 => continue, - _ => {} - } - if operands[op_idx].reg.is_empty() { - continue; - } - emitter.load_input_to_reg(&operands[op_idx], val, constraint); - } - - // x87 FPU stack inputs must be loaded in reverse stack order: st(1) first, then st(0), - // because each fld pushes onto the stack (LIFO). - let mut x87_inputs: Vec<(usize, usize)> = Vec::new(); - for (i, (_, _, _)) in inputs.iter().enumerate() { - let op_idx = outputs.len() + i; - match &operands[op_idx].kind { - AsmOperandKind::X87St0 => x87_inputs.push((i, 0)), - AsmOperandKind::X87St1 => x87_inputs.push((i, 1)), - _ => {} - } - } - let mut x87_rw_outputs: Vec<(usize, usize)> = Vec::new(); - for (i, (constraint, _, _)) in outputs.iter().enumerate() { - if constraint.contains('+') { - match &operands[i].kind { - AsmOperandKind::X87St0 => x87_rw_outputs.push((i, 0)), - AsmOperandKind::X87St1 => x87_rw_outputs.push((i, 1)), - _ => {} - } - } - } - // Sort by stack position descending (st(1) loaded first, then st(0)) - x87_inputs.sort_by(|a, b| b.1.cmp(&a.1)); - x87_rw_outputs.sort_by(|a, b| b.1.cmp(&a.1)); - // Load x87 read-write outputs first (preload), then regular x87 inputs - for (out_idx, _) in &x87_rw_outputs { - emitter.preload_readwrite_output(&operands[*out_idx], &outputs[*out_idx].1); - } - for (inp_idx, _) in &x87_inputs { - let op_idx = outputs.len() + inp_idx; - emitter.load_input_to_reg(&operands[op_idx], &inputs[*inp_idx].1, &inputs[*inp_idx].0); - } -} - -/// Substitute `%l[name]` and `%lN` goto label references in an already-substituted line. 
-/// In GCC asm goto, `%l[name]` resolves to the assembly label for the C goto label `name`, -/// and `%lN` resolves to the label at index N (relative to the total number of -/// output+input operands, so label 0 is at GCC operand index = num_operands). -/// -/// This is called as a post-processing step after regular operand substitution, -/// to handle any remaining `%l[...]` or `%l` patterns that weren't consumed. -pub fn substitute_goto_labels(line: &str, goto_labels: &[(String, BlockId)], num_operands: usize) -> String { - if goto_labels.is_empty() { - return line.to_string(); - } - let mut result = String::new(); - let chars: Vec = line.chars().collect(); - let mut i = 0; - while i < chars.len() { - if chars[i] == '%' && i + 1 < chars.len() && chars[i + 1] == 'l' { - // Check for %l[name] or %l - if i + 2 < chars.len() && chars[i + 2] == '[' { - // %l[name] - named goto label reference - let mut j = i + 3; - while j < chars.len() && chars[j] != ']' { - j += 1; - } - let name: String = chars[i + 3..j].iter().collect(); - if j < chars.len() { j += 1; } // skip ] - // Look up the label name - if let Some((_, block_id)) = goto_labels.iter().find(|(n, _)| n == &name) { - result.push_str(&block_id.to_string()); - i = j; - continue; - } - // Not found - emit as-is - result.push(chars[i]); - i += 1; - } else if i + 2 < chars.len() && chars[i + 2].is_ascii_digit() { - // %l - positional goto label reference - // In GCC, %l0 refers to the first goto label (GCC numbers labels after operands) - let mut j = i + 2; - let mut num = 0usize; - while j < chars.len() && chars[j].is_ascii_digit() { - num = num * 10 + (chars[j] as usize - '0' as usize); - j += 1; - } - // GCC numbers goto labels after all output+input operands. - // %l where N >= num_operands refers to label (N - num_operands). - // If N < num_operands, this is not a valid label reference. 
- let label_idx = num.wrapping_sub(num_operands); - if label_idx < goto_labels.len() { - result.push_str(&goto_labels[label_idx].1.to_string()); - i = j; - continue; - } - // Not found - emit as-is - result.push(chars[i]); - i += 1; - } else { - result.push(chars[i]); - i += 1; - } - } else { - result.push(chars[i]); - i += 1; - } - } - result -} - -/// Normalize an x86 register name to its 64-bit canonical form. -/// -/// x86 registers have multiple aliases for the same physical register -/// (e.g., al/ax/eax/rax all refer to RAX). Inline asm clobbers may use -/// any of these forms, but the scratch register allocator uses 64-bit names. -/// Returns `Some(canonical)` if the name is a recognized x86 register, -/// `None` otherwise (non-x86 clobbers, "cc", "memory", etc.). -fn x86_normalize_reg_to_64bit(name: &str) -> Option> { - // Map of all sub-register names to their 64-bit parent. - // Legacy 8-bit: al/ah/bl/bh/cl/ch/dl/dh - // Legacy 8-bit (REX): sil/dil/spl/bpl - // 16-bit: ax/bx/cx/dx/si/di/sp/bp - // 32-bit: eax/ebx/ecx/edx/esi/edi/esp/ebp - // 64-bit: rax/rbx/rcx/rdx/rsi/rdi/rsp/rbp (already canonical) - // Extended: r8-r15, r8d-r15d, r8w-r15w, r8b-r15b - match name { - // RAX family - "al" | "ah" | "ax" | "eax" | "rax" => Some(Cow::Borrowed("rax")), - // RBX family - "bl" | "bh" | "bx" | "ebx" | "rbx" => Some(Cow::Borrowed("rbx")), - // RCX family - "cl" | "ch" | "cx" | "ecx" | "rcx" => Some(Cow::Borrowed("rcx")), - // RDX family - "dl" | "dh" | "dx" | "edx" | "rdx" => Some(Cow::Borrowed("rdx")), - // RSI family - "sil" | "si" | "esi" | "rsi" => Some(Cow::Borrowed("rsi")), - // RDI family - "dil" | "di" | "edi" | "rdi" => Some(Cow::Borrowed("rdi")), - // RSP family - "spl" | "sp" | "esp" | "rsp" => Some(Cow::Borrowed("rsp")), - // RBP family - "bpl" | "bp" | "ebp" | "rbp" => Some(Cow::Borrowed("rbp")), - _ => { - // Extended registers: r8-r15 and their sub-register forms - // r8d/r8w/r8b -> r8, r9d/r9w/r9b -> r9, etc. 
- let s = name.strip_prefix('r')?; - // Must start with a digit after 'r' - if !s.starts_with(|c: char| c.is_ascii_digit()) { - return None; - } - // Extract the number part - let num_end = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len()); - let num_str = &s[..num_end]; - let num: u32 = num_str.parse().ok()?; - if (8..=15).contains(&num) { - Some(Cow::Owned(format!("r{}", num))) - } else { - None - } - } - } -} diff --git a/src/backend/linker_common/README.md b/src/backend/linker_common/README.md deleted file mode 100644 index 2a36feac56..0000000000 --- a/src/backend/linker_common/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# linker_common/ - -Shared linker infrastructure used by all four backend linkers (x86, ARM, RISC-V, i686). - -## Why this exists - -The x86, ARM, and RISC-V linkers share nearly identical logic for parsing ELF64 -objects, resolving symbols, loading archives, and emitting ELF headers. Before -this module, each backend had its own copy. `linker_common` extracts that -duplicated code behind the `GlobalSymbolOps` trait so each backend only -implements architecture-specific pieces (relocations, PLT/GOT, ELF headers). - -The i686 linker uses ELF32 (different field widths), so it uses a subset of -this module (mainly `DynSymbol`, `is_linker_defined_symbol`, `DynStrTab`, -hash functions, and argument parsing). 
- -## Module layout - -| File | Purpose | -|------|---------| -| `types.rs` | Core ELF64 data types: `Elf64Section`, `Elf64Symbol`, `Elf64Rela`, `Elf64Object`, `DynSymbol` | -| `parse_object.rs` | Parse ELF64 relocatable objects (.o) into `Elf64Object` | -| `parse_shared.rs` | Extract dynamic symbols and SONAME from shared libraries (.so) | -| `symbols.rs` | `GlobalSymbolOps` trait, `InputSection`/`OutputSection`, linker-defined symbol table | -| `merge.rs` | Merge input sections into output sections, allocate COMMON symbols | -| `dynamic.rs` | Match undefined globals against shared library exports, register object symbols | -| `archive.rs` | Load archives (.a, thin archives), iterative symbol resolution | -| `resolve_lib.rs` | Resolve `-l` library names to filesystem paths | -| `args.rs` | Parse `-Wl,` linker flags into structured `LinkerArgs` | -| `check.rs` | Post-link undefined symbol validation | -| `write.rs` | ELF64 section/program header emission helpers | -| `dynstr.rs` | `.dynstr` string table builder with deduplication | -| `hash.rs` | GNU and SysV ELF hash functions | -| `section_map.rs` | Input-to-output section name mapping (`.text.foo` -> `.text`) | -| `eh_frame.rs` | `.eh_frame_hdr` builder for stack unwinding | -| `gc_sections.rs` | `--gc-sections` dead section elimination | - -## Key design decisions - -- **`GlobalSymbolOps` trait**: Each backend has its own `GlobalSymbol` struct - (different fields for dynamic linking state). The trait abstracts over these - so shared functions like `register_symbols_elf64` and `match_shared_library_dynsyms` - work generically. - -- **ELF64 only in shared code**: The i686 backend uses ELF32 with `u32` fields - instead of `u64`. Rather than making everything generic over word size - (which adds complexity for little benefit), the shared code is ELF64-only - and i686 maintains its own ELF32 parser. 
- -- **Linker script support**: `load_shared_library_elf64` handles the case where - `libc.so` is actually a text file with `GROUP(...)` directives pointing to - the real `.so` files. This is common on modern Linux distributions. diff --git a/src/backend/linker_common/archive.rs b/src/backend/linker_common/archive.rs deleted file mode 100644 index ab338eeae7..0000000000 --- a/src/backend/linker_common/archive.rs +++ /dev/null @@ -1,225 +0,0 @@ -//! Archive and file loading for ELF64 linkers. -//! -//! Provides iterative archive member resolution (the `--start-group` algorithm), -//! regular and thin archive loading, and a generic file dispatch function that -//! handles archives, linker scripts, shared libraries, and object files. - -use std::collections::HashMap; -use std::path::Path; - -use crate::backend::elf::{ - ELF_MAGIC, ET_DYN, - STT_SECTION, STT_FILE, - read_u16, - parse_archive_members, parse_thin_archive_members, is_thin_archive, - parse_linker_script_entries, LinkerScriptEntry, -}; -use super::types::Elf64Object; -use super::symbols::GlobalSymbolOps; -use super::parse_object::parse_elf64_object; -use super::dynamic::register_symbols_elf64; -use super::resolve_lib::resolve_lib; - -/// Check if an archive member defines any currently-undefined, non-dynamic symbol. -fn member_resolves_undefined_generic( - obj: &Elf64Object, globals: &HashMap, -) -> bool { - for sym in &obj.symbols { - if sym.is_undefined() || sym.is_local() { continue; } - if sym.sym_type() == STT_SECTION || sym.sym_type() == STT_FILE { continue; } - if sym.name.is_empty() { continue; } - if let Some(existing) = globals.get(&sym.name) { - if !existing.is_defined() && !existing.is_dynamic() { return true; } - } - } - false -} - -/// Iterative archive member resolution (the --start-group algorithm). -/// -/// Given a list of parsed archive member objects, pull in members that define -/// any currently-undefined global symbol. Repeat until no more progress. 
-fn resolve_archive_members( - member_objects: &mut Vec, - objects: &mut Vec, - globals: &mut HashMap, - should_replace_extra: fn(&G) -> bool, -) { - let mut changed = true; - while changed { - changed = false; - let mut i = 0; - while i < member_objects.len() { - if member_resolves_undefined_generic(&member_objects[i], globals) { - let obj = member_objects.remove(i); - let obj_idx = objects.len(); - register_symbols_elf64(obj_idx, &obj, globals, should_replace_extra); - objects.push(obj); - changed = true; - } else { - i += 1; - } - } - } -} - -/// Load a regular archive (.a), parsing members and pulling in those that -/// resolve undefined symbols. -/// -/// When `whole_archive` is true, all members are unconditionally included -/// (equivalent to GNU ld's `--whole-archive` flag). This is essential for -/// shared library creation from convenience archives (e.g., libtool). -pub fn load_archive_elf64( - data: &[u8], archive_path: &str, - objects: &mut Vec, globals: &mut HashMap, - expected_machine: u16, should_replace_extra: fn(&G) -> bool, - whole_archive: bool, -) -> Result<(), String> { - let members = parse_archive_members(data)?; - let mut member_objects: Vec = Vec::new(); - for (name, offset, size) in &members { - let member_data = &data[*offset..*offset + *size]; - if member_data.len() < 4 || member_data[0..4] != ELF_MAGIC { continue; } - if expected_machine != 0 && member_data.len() >= 20 { - let e_machine = read_u16(member_data, 18); - if e_machine != expected_machine { continue; } - } - let full_name = format!("{}({})", archive_path, name); - if let Ok(obj) = parse_elf64_object(member_data, &full_name, expected_machine) { - member_objects.push(obj); - } - } - if whole_archive { - // --whole-archive: include ALL members unconditionally - for obj in member_objects.drain(..) 
{ - let obj_idx = objects.len(); - register_symbols_elf64(obj_idx, &obj, globals, should_replace_extra); - objects.push(obj); - } - } else { - resolve_archive_members(&mut member_objects, objects, globals, should_replace_extra); - } - Ok(()) -} - -/// Load a GNU thin archive. Members are external files referenced by name -/// relative to the archive's directory. -/// -/// When `whole_archive` is true, all members are unconditionally included. -pub fn load_thin_archive_elf64( - data: &[u8], archive_path: &str, - objects: &mut Vec, globals: &mut HashMap, - expected_machine: u16, should_replace_extra: fn(&G) -> bool, - whole_archive: bool, -) -> Result<(), String> { - let member_names = parse_thin_archive_members(data)?; - let archive_dir = Path::new(archive_path) - .parent() - .unwrap_or_else(|| Path::new(".")); - - let mut member_objects: Vec = Vec::new(); - for name in &member_names { - let member_path = archive_dir.join(name); - let member_data = std::fs::read(&member_path).map_err(|e| { - format!("thin archive {}: failed to read member '{}': {}", - archive_path, member_path.display(), e) - })?; - if member_data.len() < 4 || member_data[0..4] != ELF_MAGIC { continue; } - let full_name = format!("{}({})", archive_path, name); - if let Ok(obj) = parse_elf64_object(&member_data, &full_name, expected_machine) { - member_objects.push(obj); - } - } - if whole_archive { - for obj in member_objects.drain(..) { - let obj_idx = objects.len(); - register_symbols_elf64(obj_idx, &obj, globals, should_replace_extra); - objects.push(obj); - } - } else { - resolve_archive_members(&mut member_objects, objects, globals, should_replace_extra); - } - Ok(()) -} - -/// Load a file, dispatching by format (archive, thin archive, linker script, -/// shared library, or object file). -/// -/// The `on_shared_lib` callback handles shared libraries (.so files). This allows -/// x86 and ARM to handle dynamic symbol extraction differently. Pass a no-op -/// closure for static-only linking. 
-/// -/// Currently unused: x86 and ARM linkers have their own `load_file` implementations. -/// This generic version will be used as those linkers migrate to shared infrastructure. -#[allow(dead_code)] // Planned shared infrastructure; x86/ARM linkers will migrate to this -pub fn load_file_elf64( - path: &str, - objects: &mut Vec, - globals: &mut HashMap, - expected_machine: u16, - lib_paths: &[String], - prefer_static: bool, - should_replace_extra: fn(&G) -> bool, - on_shared_lib: &mut dyn FnMut(&str, &[u8]) -> Result<(), String>, -) -> Result<(), String> { - if std::env::var("LINKER_DEBUG").is_ok() { - eprintln!("load_file: {}", path); - } - - let data = std::fs::read(path).map_err(|e| format!("failed to read '{}': {}", path, e))?; - - // Regular archive - if data.len() >= 8 && &data[0..8] == b"!\n" { - return load_archive_elf64(&data, path, objects, globals, expected_machine, should_replace_extra, false); - } - - // Thin archive - if is_thin_archive(&data) { - return load_thin_archive_elf64(&data, path, objects, globals, expected_machine, should_replace_extra, false); - } - - // Not ELF? 
Try linker script (handles GROUP and INPUT directives) - if data.len() >= 4 && data[0..4] != ELF_MAGIC { - if let Ok(text) = std::str::from_utf8(&data) { - if let Some(entries) = parse_linker_script_entries(text) { - let script_dir = Path::new(path).parent().map(|p| p.to_string_lossy().to_string()); - for entry in &entries { - match entry { - LinkerScriptEntry::Path(lib_path) => { - if Path::new(lib_path).exists() { - load_file_elf64(lib_path, objects, globals, expected_machine, lib_paths, prefer_static, should_replace_extra, on_shared_lib)?; - } else if let Some(ref dir) = script_dir { - let resolved = format!("{}/{}", dir, lib_path); - if Path::new(&resolved).exists() { - load_file_elf64(&resolved, objects, globals, expected_machine, lib_paths, prefer_static, should_replace_extra, on_shared_lib)?; - } - } - } - LinkerScriptEntry::Lib(lib_name) => { - if let Some(resolved_path) = resolve_lib(lib_name, lib_paths, prefer_static) { - load_file_elf64(&resolved_path, objects, globals, expected_machine, lib_paths, prefer_static, should_replace_extra, on_shared_lib)?; - } - } - } - } - return Ok(()); - } - } - return Err(format!("{}: not a valid ELF object or archive", path)); - } - - // Shared library - if data.len() >= 18 { - let e_type = u16::from_le_bytes([data[16], data[17]]); - if e_type == ET_DYN { - return on_shared_lib(path, &data); - } - } - - // Regular ELF object - let obj = parse_elf64_object(&data, path, expected_machine)?; - let obj_idx = objects.len(); - register_symbols_elf64(obj_idx, &obj, globals, should_replace_extra); - objects.push(obj); - Ok(()) -} diff --git a/src/backend/linker_common/args.rs b/src/backend/linker_common/args.rs deleted file mode 100644 index 7c8eecae13..0000000000 --- a/src/backend/linker_common/args.rs +++ /dev/null @@ -1,121 +0,0 @@ -//! Shared linker argument parsing. -//! -//! Extracts linker flags from the `user_args` passed through `-Wl,` and -//! direct `-L`/`-l` flags. Used by x86, ARM, and RISC-V linkers. 
- -use std::path::Path; - -/// Parsed linker arguments from user_args. -/// -/// Contains all the flags that are common across backends. Not all backends -/// use every field; unused fields are simply ignored. -#[derive(Debug, Default)] -pub struct LinkerArgs { - /// Extra library search paths from `-L` flags. - pub extra_lib_paths: Vec, - /// Library names from `-l` flags (without the `lib` prefix or `.a`/`.so` suffix). - pub libs_to_load: Vec, - /// Bare file paths (`.o`, `.a` files) passed as arguments. - pub extra_object_files: Vec, - /// Whether `--export-dynamic` / `-rdynamic` was passed. - pub export_dynamic: bool, - /// RPATH entries from `-Wl,-rpath=` or `-Wl,-rpath,`. - pub rpath_entries: Vec, - /// Use DT_RUNPATH instead of DT_RPATH (from `--enable-new-dtags`). - pub use_runpath: bool, - /// Symbol definitions from `--defsym=SYM=VAL`. - /// TODO: only supports symbol-to-symbol aliasing, not arbitrary expressions. - pub defsym_defs: Vec<(String, String)>, - /// Enable garbage collection of unused sections (from `--gc-sections`). - pub gc_sections: bool, - /// Whether `-static` was passed. - pub is_static: bool, -} - -/// Parse user linker arguments into a structured `LinkerArgs`. -/// -/// Handles `-L`, `-l`, `-Wl,` (with nested flags like `--defsym`, `--export-dynamic`, -/// `-rpath`, `--gc-sections`), `-rdynamic`, `-static`, and bare file paths. 
-pub fn parse_linker_args(user_args: &[String]) -> LinkerArgs { - let mut result = LinkerArgs::default(); - let args: Vec<&str> = user_args.iter().map(|s| s.as_str()).collect(); - let mut pending_rpath = false; // for -Wl,-rpath -Wl,/path two-arg form - let mut i = 0; - while i < args.len() { - let arg = args[i]; - if arg == "-rdynamic" { - result.export_dynamic = true; - } else if arg == "-static" { - result.is_static = true; - } else if let Some(path) = arg.strip_prefix("-L") { - let p = if path.is_empty() && i + 1 < args.len() { i += 1; args[i] } else { path }; - result.extra_lib_paths.push(p.to_string()); - } else if let Some(lib) = arg.strip_prefix("-l") { - let l = if lib.is_empty() && i + 1 < args.len() { i += 1; args[i] } else { lib }; - result.libs_to_load.push(l.to_string()); - } else if let Some(wl_arg) = arg.strip_prefix("-Wl,") { - let parts: Vec<&str> = wl_arg.split(',').collect(); - // Handle -Wl,-rpath -Wl,/path two-arg form - if pending_rpath && !parts.is_empty() { - result.rpath_entries.push(parts[0].to_string()); - pending_rpath = false; - i += 1; - continue; - } - let mut j = 0; - while j < parts.len() { - let part = parts[j]; - if part == "--export-dynamic" || part == "-export-dynamic" || part == "-E" { - result.export_dynamic = true; - } else if let Some(rp) = part.strip_prefix("-rpath=") { - result.rpath_entries.push(rp.to_string()); - } else if part == "-rpath" && j + 1 < parts.len() { - j += 1; - result.rpath_entries.push(parts[j].to_string()); - } else if part == "-rpath" { - // -rpath without following value in this -Wl, group; - // the path comes in the next -Wl, argument - pending_rpath = true; - } else if part == "--enable-new-dtags" { - result.use_runpath = true; - } else if part == "--disable-new-dtags" { - result.use_runpath = false; - } else if let Some(lpath) = part.strip_prefix("-L") { - result.extra_lib_paths.push(lpath.to_string()); - } else if let Some(lib) = part.strip_prefix("-l") { - 
result.libs_to_load.push(lib.to_string()); - } else if let Some(defsym_arg) = part.strip_prefix("--defsym=") { - if let Some(eq_pos) = defsym_arg.find('=') { - result.defsym_defs.push(( - defsym_arg[..eq_pos].to_string(), - defsym_arg[eq_pos + 1..].to_string(), - )); - } - } else if part == "--defsym" && j + 1 < parts.len() { - j += 1; - let defsym_arg = parts[j]; - if let Some(eq_pos) = defsym_arg.find('=') { - result.defsym_defs.push(( - defsym_arg[..eq_pos].to_string(), - defsym_arg[eq_pos + 1..].to_string(), - )); - } - } else if part == "--gc-sections" { - result.gc_sections = true; - } else if part == "--no-gc-sections" { - result.gc_sections = false; - } else if part == "-static" { - result.is_static = true; - } - // TODO: --whole-archive / --no-whole-archive are positional flags - // that need per-file tracking; currently handled in link_shared's - // custom parser (x86). Add here when link_builtin needs it. - j += 1; - } - } else if !arg.starts_with('-') && Path::new(arg).exists() { - result.extra_object_files.push(arg.to_string()); - } - i += 1; - } - result -} diff --git a/src/backend/linker_common/check.rs b/src/backend/linker_common/check.rs deleted file mode 100644 index a33de4cb82..0000000000 --- a/src/backend/linker_common/check.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! Post-link undefined symbol checking. -//! -//! Validates that all required symbols have been resolved after linking, -//! filtering out dynamic, weak, and linker-defined symbols. - -use std::collections::HashMap; - -use crate::backend::elf::STB_WEAK; -use super::symbols::{GlobalSymbolOps, is_linker_defined_symbol}; - -/// Check for undefined symbols in the global symbol table and return an error -/// if any truly undefined symbols are found. -/// -/// Filters out dynamic symbols, weak symbols, and linker-defined symbols -/// using the `GlobalSymbolOps` trait methods. `max_report` limits how many -/// symbols are shown in the error message (typically 20). 
-pub fn check_undefined_symbols_elf64( - globals: &HashMap, - max_report: usize, -) -> Result<(), String> { - let mut truly_undefined: Vec<&String> = globals.iter() - .filter(|(name, sym)| { - !sym.is_defined() && !sym.is_dynamic() - && (sym.info() >> 4) != STB_WEAK - && !is_linker_defined_symbol(name) - }) - .map(|(name, _)| name) - .collect(); - if truly_undefined.is_empty() { - return Ok(()); - } - truly_undefined.sort(); - truly_undefined.truncate(max_report); - Err(format!( - "undefined symbols: {}", - truly_undefined.iter().map(|s| s.as_str()).collect::>().join(", ") - )) -} diff --git a/src/backend/linker_common/dynamic.rs b/src/backend/linker_common/dynamic.rs deleted file mode 100644 index 443bf020e4..0000000000 --- a/src/backend/linker_common/dynamic.rs +++ /dev/null @@ -1,229 +0,0 @@ -//! Shared dynamic linking: symbol matching, library loading, and symbol registration. -//! -//! Extracts the duplicated shared-library symbol matching logic from x86 and ARM -//! linkers into a single generic implementation. Also provides `register_symbols_elf64()` -//! for populating the global symbol table from object files. - -use std::collections::HashMap; -use std::path::Path; - -use crate::backend::elf::{ - ELF_MAGIC, - STB_WEAK, - STT_OBJECT, STT_SECTION, STT_FILE, - SHN_COMMON, - parse_linker_script_entries, LinkerScriptEntry, -}; -use super::types::{Elf64Object, DynSymbol}; -use super::symbols::{GlobalSymbolOps, is_linker_defined_symbol}; -use super::parse_shared::{parse_shared_library_symbols, parse_soname}; -use super::resolve_lib::resolve_lib; - -/// Match dynamic symbols from a shared library against undefined globals. -/// -/// For each undefined, non-dynamic global that matches a library export: -/// 1. Replace it with a dynamic symbol entry (via `GlobalSymbolOps::new_dynamic`) -/// 2. 
Track WEAK STT_OBJECT matches for alias registration -/// -/// After the first pass, a second pass registers any STT_OBJECT aliases at the -/// same (value, size) as matched WEAK symbols. This ensures COPY relocations -/// work correctly (e.g., `environ` is WEAK, `__environ` is GLOBAL in libc). -/// -/// Returns `true` if at least one symbol was matched (i.e., this library is needed). -pub fn match_shared_library_dynsyms( - dyn_syms: &[DynSymbol], - soname: &str, - globals: &mut HashMap, -) -> bool { - let mut lib_needed = false; - let mut matched_weak_objects: Vec<(u64, u64)> = Vec::new(); - - // First pass: match undefined symbols against library exports - for dsym in dyn_syms { - if let Some(existing) = globals.get(&dsym.name) { - if !existing.is_defined() && !existing.is_dynamic() { - lib_needed = true; - globals.insert(dsym.name.clone(), G::new_dynamic(dsym, soname)); - // Track WEAK STT_OBJECT for alias detection - let bind = dsym.info >> 4; - let stype = dsym.info & 0xf; - if bind == STB_WEAK && stype == STT_OBJECT - && !matched_weak_objects.contains(&(dsym.value, dsym.size)) - { - matched_weak_objects.push((dsym.value, dsym.size)); - } - } - } - } - - // Second pass: register aliases for matched WEAK STT_OBJECT symbols - if !matched_weak_objects.is_empty() { - for dsym in dyn_syms { - let stype = dsym.info & 0xf; - if stype == STT_OBJECT - && matched_weak_objects.contains(&(dsym.value, dsym.size)) - && !globals.contains_key(&dsym.name) - { - lib_needed = true; - globals.insert(dsym.name.clone(), G::new_dynamic(dsym, soname)); - } - } - } - - lib_needed -} - -/// Load a shared library file and match its exports against undefined globals. -/// -/// Handles linker script indirection (e.g., libc.so may be a text file pointing -/// to the real .so). Uses as-needed semantics: only adds DT_NEEDED if at least -/// one symbol was actually resolved. 
-pub fn load_shared_library_elf64( - path: &str, - globals: &mut HashMap, - needed_sonames: &mut Vec, - lib_paths: &[String], -) -> Result<(), String> { - let data = std::fs::read(path).map_err(|e| format!("failed to read '{}': {}", path, e))?; - - // Handle linker scripts (e.g., libc.so is often a text file with GROUP/INPUT) - if data.len() >= 4 && data[0..4] != ELF_MAGIC { - if let Ok(text) = std::str::from_utf8(&data) { - if let Some(entries) = parse_linker_script_entries(text) { - let script_dir = Path::new(path).parent() - .map(|p| p.to_string_lossy().to_string()); - for entry in &entries { - let resolved_path = match entry { - LinkerScriptEntry::Path(lib_path) => { - if Path::new(lib_path).exists() { - Some(lib_path.clone()) - } else if let Some(ref dir) = script_dir { - let p = format!("{}/{}", dir, lib_path); - if Path::new(&p).exists() { Some(p) } else { None } - } else { - None - } - } - LinkerScriptEntry::Lib(lib_name) => { - resolve_lib(lib_name, lib_paths, false) - } - }; - if let Some(resolved) = resolved_path { - let lib_data = std::fs::read(&resolved) - .map_err(|e| format!("failed to read '{}': {}", resolved, e))?; - if lib_data.len() >= 8 && &lib_data[0..8] == b"!\n" { - // Archives in linker scripts (like libc_nonshared.a) - // are silently skipped during shared lib loading - continue; - } - load_shared_library_elf64(&resolved, globals, needed_sonames, lib_paths)?; - } - } - return Ok(()); - } - } - return Err(format!("{}: not a valid ELF shared library", path)); - } - - let soname = parse_soname(&data).unwrap_or_else(|| { - Path::new(path).file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| path.to_string()) - }); - - let dyn_syms = parse_shared_library_symbols(&data, path)?; - let lib_needed = match_shared_library_dynsyms(&dyn_syms, &soname, globals); - - if lib_needed && !needed_sonames.contains(&soname) { - needed_sonames.push(soname); - } - Ok(()) -} - -/// Resolve remaining undefined symbols by searching default 
system libraries. -/// -/// After all explicit -l libraries have been loaded, this function searches -/// the standard system libraries (libc, libm, libgcc_s) for any remaining -/// undefined, non-weak, non-linker-defined symbols. -/// -/// `lib_search_paths` provides directories to search for the default libs. -/// `default_lib_names` lists the .so filenames to try (e.g., ["libc.so.6"]). -pub fn resolve_dynamic_symbols_elf64( - globals: &mut HashMap, - needed_sonames: &mut Vec, - lib_search_paths: &[String], - default_lib_names: &[&str], -) -> Result<(), String> { - // Check if there are any truly undefined symbols worth resolving - let has_undefined = globals.iter().any(|(name, sym)| { - !sym.is_defined() && !sym.is_dynamic() - && !is_linker_defined_symbol(name) - }); - if !has_undefined { return Ok(()); } - - // Find default libraries in the search paths - for lib_name in default_lib_names { - let lib_path = lib_search_paths.iter() - .map(|dir| format!("{}/{}", dir, lib_name)) - .find(|candidate| Path::new(candidate).exists()); - - if let Some(lib_path) = lib_path { - let data = match std::fs::read(&lib_path) { Ok(d) => d, Err(_) => continue }; - let soname = parse_soname(&data).unwrap_or_else(|| { - Path::new(&lib_path).file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_default() - }); - let dyn_syms = match parse_shared_library_symbols(&data, &lib_path) { - Ok(s) => s, Err(_) => continue, - }; - - let lib_needed = match_shared_library_dynsyms(&dyn_syms, &soname, globals); - if lib_needed && !needed_sonames.contains(&soname) { - needed_sonames.push(soname); - } - } - } - Ok(()) -} - -/// Register symbols from an object file into the global symbol table. -/// -/// Handles defined symbols, COMMON symbols, and undefined references. -/// For defined symbols, a GLOBAL definition replaces a WEAK one. -/// The `should_replace_extra` callback allows x86's linker to also check -/// `is_dynamic` when deciding whether to replace an existing symbol. 
-pub fn register_symbols_elf64( - obj_idx: usize, - obj: &Elf64Object, - globals: &mut HashMap, - should_replace_extra: fn(existing: &G) -> bool, -) { - for sym in &obj.symbols { - if sym.sym_type() == STT_SECTION || sym.sym_type() == STT_FILE { continue; } - if sym.name.is_empty() || sym.is_local() { continue; } - - let is_defined = !sym.is_undefined() && sym.shndx != SHN_COMMON; - - if is_defined { - let should_replace = match globals.get(&sym.name) { - None => true, - Some(e) => !e.is_defined() || should_replace_extra(e) - || (e.info() >> 4 == STB_WEAK && sym.is_global()), - }; - if should_replace { - globals.insert(sym.name.clone(), G::new_defined(obj_idx, sym)); - } - } else if sym.shndx == SHN_COMMON { - let should_insert = match globals.get(&sym.name) { - None => true, - Some(e) => !e.is_defined(), - }; - if should_insert { - globals.insert(sym.name.clone(), G::new_common(obj_idx, sym)); - } - } else if !globals.contains_key(&sym.name) { - globals.insert(sym.name.clone(), G::new_undefined(sym)); - } - } -} diff --git a/src/backend/linker_common/dynstr.rs b/src/backend/linker_common/dynstr.rs deleted file mode 100644 index 1e200003ea..0000000000 --- a/src/backend/linker_common/dynstr.rs +++ /dev/null @@ -1,39 +0,0 @@ -//! Dynamic string table builder for `.dynstr` section emission. -//! -//! Used by linkers that produce dynamically-linked executables (x86, i686, RISC-V). -//! Deduplicates strings and tracks offsets. - -use std::collections::HashMap; - -/// Dynamic string table builder. -/// -/// Used by linkers that produce dynamically-linked executables (x86, i686, RISC-V). -/// Deduplicates strings and tracks offsets for .dynstr section emission. 
-pub struct DynStrTab { - data: Vec, - offsets: HashMap, -} - -impl DynStrTab { - pub fn new() -> Self { - Self { data: vec![0], offsets: HashMap::new() } - } - - pub fn add(&mut self, s: &str) -> usize { - if s.is_empty() { return 0; } - if let Some(&off) = self.offsets.get(s) { return off; } - let off = self.data.len(); - self.data.extend_from_slice(s.as_bytes()); - self.data.push(0); - self.offsets.insert(s.to_string(), off); - off - } - - pub fn get_offset(&self, s: &str) -> usize { - if s.is_empty() { 0 } else { self.offsets.get(s).copied().unwrap_or(0) } - } - - pub fn as_bytes(&self) -> &[u8] { - &self.data - } -} diff --git a/src/backend/linker_common/eh_frame.rs b/src/backend/linker_common/eh_frame.rs deleted file mode 100644 index 51f999f9ac..0000000000 --- a/src/backend/linker_common/eh_frame.rs +++ /dev/null @@ -1,398 +0,0 @@ -//! .eh_frame_hdr builder for stack unwinding. -//! -//! Builds the .eh_frame_hdr section pointed to by PT_GNU_EH_FRAME. Contains a -//! binary search table mapping PC addresses to their FDE entries in .eh_frame. -//! -//! Format: -//! u8 version = 1 -//! u8 eh_frame_ptr_enc = DW_EH_PE_pcrel | DW_EH_PE_sdata4 (0x1b) -//! u8 fde_count_enc = DW_EH_PE_udata4 (0x03) -//! u8 table_enc = DW_EH_PE_datarel | DW_EH_PE_sdata4 (0x3b) -//! i32 eh_frame_ptr (PC-relative offset to .eh_frame start) -//! u32 fde_count (number of FDEs in the table) -//! For each FDE: -//! i32 initial_location (relative to eh_frame_hdr start) -//! i32 fde_address (relative to eh_frame_hdr start) - -/// Count the number of FDE entries in an .eh_frame section by scanning structure. -/// This only reads length and CIE_id fields, so it works on unrelocated data. -/// Used during layout to reserve space for .eh_frame_hdr (12 + 8 * count bytes). 
-pub fn count_eh_frame_fdes(data: &[u8]) -> usize { - let mut count = 0; - let mut pos = 0; - while pos + 4 <= data.len() { - let length = read_u32_le(data, pos) as u64; - if length == 0 { - // Zero terminator from a merged input section; skip it - pos += 4; - continue; - } - let (actual_length, header_size) = if length == 0xFFFFFFFF { - if pos + 12 > data.len() { break; } - (read_u64_le(data, pos + 4), 12usize) - } else { - (length, 4usize) - }; - let entry_data_start = pos + header_size; - let entry_end = entry_data_start + actual_length as usize; - if entry_end > data.len() || entry_data_start + 4 > data.len() { break; } - let cie_id = if length == 0xFFFFFFFF { - if entry_data_start + 8 > data.len() { break; } - read_u64_le(data, entry_data_start) - } else { - read_u32_le(data, entry_data_start) as u64 - }; - if cie_id != 0 { count += 1; } - pos = entry_end; - } - count -} - -/// Build .eh_frame_hdr data from the merged .eh_frame section. -/// -/// `eh_frame_data`: the merged .eh_frame section bytes -/// `eh_frame_vaddr`: virtual address where .eh_frame is loaded -/// `eh_frame_hdr_vaddr`: virtual address where .eh_frame_hdr will be loaded -/// `is_64bit`: true for 64-bit ELF, false for 32-bit -/// -/// Returns the .eh_frame_hdr section data, or empty vec if parsing fails. 
-pub fn build_eh_frame_hdr( - eh_frame_data: &[u8], - eh_frame_vaddr: u64, - eh_frame_hdr_vaddr: u64, - is_64bit: bool, -) -> Vec { - // Parse .eh_frame to find all FDEs and their initial_location values - let fdes = parse_eh_frame_fdes(eh_frame_data, eh_frame_vaddr, is_64bit); - - // Header: 4 bytes + eh_frame_ptr (4 bytes) + fde_count (4 bytes) - let header_size = 4 + 4 + 4; - let table_entry_size = 8; // two i32s per entry - let total_size = header_size + fdes.len() * table_entry_size; - let mut data = vec![0u8; total_size]; - - // Version - data[0] = 1; - // eh_frame_ptr encoding: DW_EH_PE_pcrel | DW_EH_PE_sdata4 - data[1] = 0x1b; - // fde_count encoding: DW_EH_PE_udata4 - data[2] = 0x03; - // table encoding: DW_EH_PE_datarel | DW_EH_PE_sdata4 - data[3] = 0x3b; - - // eh_frame_ptr: PC-relative offset from &data[4] to eh_frame - let eh_frame_ptr = eh_frame_vaddr as i64 - (eh_frame_hdr_vaddr as i64 + 4); - write_i32_le(&mut data, 4, eh_frame_ptr as i32); - - // fde_count - write_i32_le(&mut data, 8, fdes.len() as i32); - - // Table entries: sorted by initial_location - // Each entry is (initial_location - eh_frame_hdr_vaddr, fde_address - eh_frame_hdr_vaddr) - for (i, fde) in fdes.iter().enumerate() { - let off = header_size + i * table_entry_size; - let loc_rel = fde.initial_location as i64 - eh_frame_hdr_vaddr as i64; - let fde_rel = fde.fde_vaddr as i64 - eh_frame_hdr_vaddr as i64; - write_i32_le(&mut data, off, loc_rel as i32); - write_i32_le(&mut data, off + 4, fde_rel as i32); - } - - data -} - -/// An FDE entry parsed from .eh_frame -struct EhFrameFde { - initial_location: u64, - fde_vaddr: u64, -} - -/// Parse .eh_frame section to extract FDE entries. -/// -/// Returns a sorted list of FDEs by initial_location. 
-fn parse_eh_frame_fdes(data: &[u8], base_vaddr: u64, is_64bit: bool) -> Vec { - let mut fdes = Vec::new(); - let mut pos = 0; - - while pos + 4 <= data.len() { - let length = read_u32_le(data, pos) as u64; - if length == 0 { - // Zero terminator from a merged input section; skip it - pos += 4; - continue; - } - - let is_extended = length == 0xFFFFFFFF; - let (actual_length, header_size) = if is_extended { - if pos + 12 > data.len() { break; } - (read_u64_le(data, pos + 4), 12usize) - } else { - (length, 4usize) - }; - - let entry_start = pos; - let entry_data_start = pos + header_size; - let entry_end = entry_data_start + actual_length as usize; - if entry_end > data.len() { break; } - - // CIE_id field (4 or 8 bytes depending on extended) - if entry_data_start + 4 > data.len() { break; } - let cie_id = if is_extended { - if entry_data_start + 8 > data.len() { break; } - read_u64_le(data, entry_data_start) - } else { - read_u32_le(data, entry_data_start) as u64 - }; - - // CIE has cie_id == 0; FDE has cie_id != 0 (it's a pointer back to CIE) - if cie_id != 0 { - // This is an FDE - // The CIE_pointer is relative: entry_data_start - cie_id points to the CIE - let cie_id_field_size = if is_extended { 8 } else { 4 }; - let cie_pos = (entry_data_start as u64).wrapping_sub(cie_id) as usize; - - // Parse the CIE to get the FDE encoding - let fde_encoding = parse_cie_fde_encoding(data, cie_pos, is_64bit); - - // After CIE_pointer comes: initial_location, address_range, ... 
- let iloc_offset = entry_data_start + cie_id_field_size; - if iloc_offset + 4 > data.len() { pos = entry_end; continue; } - - let fde_vaddr = base_vaddr + entry_start as u64; - - // Decode initial_location based on the CIE's FDE encoding - let initial_location = decode_eh_pointer( - data, iloc_offset, fde_encoding, - base_vaddr + iloc_offset as u64, - is_64bit, - ); - - if let Some(iloc) = initial_location { - fdes.push(EhFrameFde { - initial_location: iloc, - fde_vaddr, - }); - } - } - - pos = entry_end; - } - - // Sort by initial_location for binary search - fdes.sort_by_key(|f| f.initial_location); - fdes -} - -/// Parse a CIE to extract the FDE pointer encoding (R augmentation). -/// -/// Returns the encoding byte, or 0x00 (DW_EH_PE_absptr) if not found. -fn parse_cie_fde_encoding(data: &[u8], cie_pos: usize, _is_64bit: bool) -> u8 { - if cie_pos + 4 > data.len() { return 0x00; } - - let length = read_u32_le(data, cie_pos) as u64; - if length == 0 || length == 0xFFFFFFFF { return 0x00; } - - let header_size = 4usize; - let cie_data_start = cie_pos + header_size; - let cie_end = cie_data_start + length as usize; - if cie_end > data.len() { return 0x00; } - - // CIE_id must be 0 - if cie_data_start + 4 > data.len() { return 0x00; } - let cie_id = read_u32_le(data, cie_data_start); - if cie_id != 0 { return 0x00; } - - // version (1 byte) - if cie_data_start + 5 > data.len() { return 0x00; } - let _version = data[cie_data_start + 4]; - - // augmentation string (null-terminated) - let aug_start = cie_data_start + 5; - let mut aug_end = aug_start; - while aug_end < cie_end && data[aug_end] != 0 { - aug_end += 1; - } - if aug_end >= cie_end { return 0x00; } - let aug_str: Vec = data[aug_start..aug_end].to_vec(); - let mut cur = aug_end + 1; // skip null terminator - - // code_alignment_factor (ULEB128) - let (_, n) = read_uleb128(data, cur); - cur += n; - // data_alignment_factor (SLEB128) - let (_, n) = read_sleb128(data, cur); - cur += n; - // 
return_address_register (ULEB128) - let (_, n) = read_uleb128(data, cur); - cur += n; - - // Parse augmentation data - if !aug_str.is_empty() && aug_str[0] == b'z' { - // Augmentation data length (ULEB128) - let (aug_data_len, n) = read_uleb128(data, cur); - cur += n; - let aug_data_end = cur + aug_data_len as usize; - - // Walk augmentation string after 'z' - for &ch in &aug_str[1..] { - if cur >= aug_data_end { break; } - match ch { - b'R' => { - // FDE encoding - if cur < data.len() { - return data[cur]; - } - return 0x00; - } - b'L' => { - // LSDA encoding (skip 1 byte) - cur += 1; - } - b'P' => { - // Personality encoding + pointer - if cur >= data.len() { return 0x00; } - let enc = data[cur]; - cur += 1; - let ptr_size = eh_pointer_size(enc, _is_64bit); - cur += ptr_size; - } - b'S' | b'B' => { - // Signal frame / has ABI tag - no data - } - _ => break, - } - } - } - - // Default: absolute pointer encoding - 0x00 -} - -/// Decode an eh_frame pointer value based on its encoding. -fn decode_eh_pointer(data: &[u8], offset: usize, encoding: u8, pc: u64, is_64bit: bool) -> Option { - if encoding == 0xFF { return None; } // DW_EH_PE_omit - - let base_enc = encoding & 0x0F; - let rel = encoding & 0x70; - - let (raw_val, _size) = match base_enc { - 0x00 => { // DW_EH_PE_absptr - if is_64bit { - if offset + 8 > data.len() { return None; } - (read_u64_le(data, offset) as i64, 8) - } else { - if offset + 4 > data.len() { return None; } - (read_u32_le(data, offset) as i32 as i64, 4) - } - } - 0x01 => { // DW_EH_PE_uleb128 - let (v, _) = read_uleb128(data, offset); - (v as i64, 0) - } - 0x02 => { // DW_EH_PE_udata2 - if offset + 2 > data.len() { return None; } - (u16::from_le_bytes([data[offset], data[offset+1]]) as i64, 2) - } - 0x03 => { // DW_EH_PE_udata4 - if offset + 4 > data.len() { return None; } - (read_u32_le(data, offset) as i64, 4) - } - 0x04 => { // DW_EH_PE_udata8 - if offset + 8 > data.len() { return None; } - (read_u64_le(data, offset) as i64, 8) - } - 0x09 
=> { // DW_EH_PE_sleb128 - let (v, _) = read_sleb128(data, offset); - (v, 0) - } - 0x0A => { // DW_EH_PE_sdata2 - if offset + 2 > data.len() { return None; } - (i16::from_le_bytes([data[offset], data[offset+1]]) as i64, 2) - } - 0x0B => { // DW_EH_PE_sdata4 - if offset + 4 > data.len() { return None; } - (read_i32_le(data, offset) as i64, 4) - } - 0x0C => { // DW_EH_PE_sdata8 - if offset + 8 > data.len() { return None; } - (read_u64_le(data, offset) as i64, 8) - } - _ => return None, - }; - - let base_val = match rel { - 0x00 => 0i64, // DW_EH_PE_absptr - 0x10 => pc as i64, // DW_EH_PE_pcrel - 0x20 => 0i64, // DW_EH_PE_textrel (not commonly used) - 0x30 => 0i64, // DW_EH_PE_datarel - _ => 0i64, - }; - - Some((base_val + raw_val) as u64) -} - -/// Return the byte size of an encoded pointer. -fn eh_pointer_size(encoding: u8, is_64bit: bool) -> usize { - match encoding & 0x0F { - 0x00 => if is_64bit { 8 } else { 4 }, // absptr - 0x02 | 0x0A => 2, // udata2/sdata2 - 0x03 | 0x0B => 4, // udata4/sdata4 - 0x04 | 0x0C => 8, // udata8/sdata8 - _ => 0, - } -} - -// ── Local binary helpers (avoid depending on elf::io to keep this self-contained) ── - -fn read_u32_le(data: &[u8], off: usize) -> u32 { - u32::from_le_bytes([data[off], data[off+1], data[off+2], data[off+3]]) -} - -fn read_i32_le(data: &[u8], off: usize) -> i32 { - i32::from_le_bytes([data[off], data[off+1], data[off+2], data[off+3]]) -} - -fn read_u64_le(data: &[u8], off: usize) -> u64 { - u64::from_le_bytes([ - data[off], data[off+1], data[off+2], data[off+3], - data[off+4], data[off+5], data[off+6], data[off+7], - ]) -} - -fn write_i32_le(data: &mut [u8], off: usize, val: i32) { - let b = val.to_le_bytes(); - data[off..off+4].copy_from_slice(&b); -} - -fn read_uleb128(data: &[u8], mut off: usize) -> (u64, usize) { - let start = off; - let mut result = 0u64; - let mut shift = 0; - loop { - if off >= data.len() { return (result, off - start); } - let byte = data[off]; - off += 1; - result |= ((byte & 0x7F) as 
u64) << shift; - if byte & 0x80 == 0 { break; } - shift += 7; - } - (result, off - start) -} - -fn read_sleb128(data: &[u8], mut off: usize) -> (i64, usize) { - let start = off; - let mut result = 0i64; - let mut shift = 0; - let mut byte; - loop { - if off >= data.len() { return (result, off - start); } - byte = data[off]; - off += 1; - result |= ((byte & 0x7F) as i64) << shift; - shift += 7; - if byte & 0x80 == 0 { break; } - } - if shift < 64 && byte & 0x40 != 0 { - result |= -(1i64 << shift); - } - (result, off - start) -} diff --git a/src/backend/linker_common/gc_sections.rs b/src/backend/linker_common/gc_sections.rs deleted file mode 100644 index 1101b43459..0000000000 --- a/src/backend/linker_common/gc_sections.rs +++ /dev/null @@ -1,114 +0,0 @@ -//! Garbage collection (`--gc-sections`) for ELF64 linkers. -//! -//! Performs BFS reachability from entry points (_start, main) and init/fini -//! arrays, following relocations transitively to find all reachable sections. -//! Returns the set of dead (unreachable) input sections to discard. - -use std::collections::{HashMap, HashSet, VecDeque}; -use crate::backend::elf::{ - SHF_ALLOC, SHF_EXCLUDE, - SHT_NULL, SHT_STRTAB, SHT_SYMTAB, SHT_RELA, SHT_REL, SHT_GROUP, - STB_GLOBAL, STB_WEAK, - SHN_UNDEF, SHN_ABS, SHN_COMMON, -}; -use super::Elf64Object; - -/// Perform `--gc-sections`: BFS reachability from entry points, return the set -/// of dead (unreachable) `(object_idx, section_idx)` pairs. -/// -/// Starting from entry-point sections (`_start`, `main`) and any init/fini -/// arrays, follows relocations transitively to find all reachable sections. 
-pub fn gc_collect_sections_elf64( - objects: &[Elf64Object], -) -> HashSet<(usize, usize)> { - // Build the set of all allocatable input sections - let mut all_sections: HashSet<(usize, usize)> = HashSet::new(); - for (obj_idx, obj) in objects.iter().enumerate() { - for (sec_idx, sec) in obj.sections.iter().enumerate() { - if sec.flags & SHF_ALLOC == 0 { continue; } - if matches!(sec.sh_type, SHT_NULL | SHT_STRTAB | SHT_SYMTAB | SHT_RELA | SHT_REL | SHT_GROUP) { continue; } - if sec.flags & SHF_EXCLUDE != 0 { continue; } - all_sections.insert((obj_idx, sec_idx)); - } - } - - // Build a map from symbol name -> (obj_idx, sec_idx) for defined symbols - let mut sym_to_section: HashMap<&str, (usize, usize)> = HashMap::new(); - for (obj_idx, obj) in objects.iter().enumerate() { - for sym in &obj.symbols { - if sym.shndx == SHN_UNDEF || sym.shndx == SHN_ABS || sym.shndx == SHN_COMMON { continue; } - let binding = sym.info >> 4; - if binding != STB_GLOBAL && binding != STB_WEAK { continue; } - if sym.name.is_empty() { continue; } - let sec_idx = sym.shndx as usize; - if sec_idx < obj.sections.len() { - sym_to_section.entry(sym.name.as_str()).or_insert((obj_idx, sec_idx)); - } - } - } - - // Seed the worklist with entry-point sections and sections that must be kept - let mut live: HashSet<(usize, usize)> = HashSet::new(); - let mut worklist: VecDeque<(usize, usize)> = VecDeque::new(); - - let mark_live = |key: (usize, usize), live: &mut HashSet<(usize, usize)>, wl: &mut VecDeque<(usize, usize)>| { - if all_sections.contains(&key) && live.insert(key) { - wl.push_back(key); - } - }; - - // Mark sections containing entry-point symbols as live - let entry_symbols = ["_start", "main", "__libc_csu_init", "__libc_csu_fini"]; - for &entry_name in &entry_symbols { - if let Some(&key) = sym_to_section.get(entry_name) { - mark_live(key, &mut live, &mut worklist); - } - } - - // Mark init/fini array sections as live (these are called by the runtime) - for (obj_idx, obj) in 
objects.iter().enumerate() { - for (sec_idx, sec) in obj.sections.iter().enumerate() { - if sec.flags & SHF_ALLOC == 0 { continue; } - let name = &sec.name; - // Keep init/fini arrays and .ctors/.dtors (runtime calls these) - if name == ".init_array" || name.starts_with(".init_array.") - || name == ".fini_array" || name.starts_with(".fini_array.") - || name == ".ctors" || name.starts_with(".ctors.") - || name == ".dtors" || name.starts_with(".dtors.") - || name == ".preinit_array" || name.starts_with(".preinit_array.") - || name == ".init" || name == ".fini" - || name == ".note.GNU-stack" - || name == ".note.gnu.build-id" - { - mark_live((obj_idx, sec_idx), &mut live, &mut worklist); - } - } - } - - // BFS: follow relocations from live sections to discover more live sections - while let Some((obj_idx, sec_idx)) = worklist.pop_front() { - let obj = &objects[obj_idx]; - // Follow relocations from this section - if sec_idx < obj.relocations.len() { - for rela in &obj.relocations[sec_idx] { - let sym_idx = rela.sym_idx as usize; - if sym_idx >= obj.symbols.len() { continue; } - let sym = &obj.symbols[sym_idx]; - - if sym.shndx != SHN_UNDEF && sym.shndx != SHN_ABS && sym.shndx != SHN_COMMON { - // Symbol is defined in this object file - let target = (obj_idx, sym.shndx as usize); - mark_live(target, &mut live, &mut worklist); - } else if !sym.name.is_empty() { - // Symbol is undefined here; look up in global symbol table - if let Some(&target) = sym_to_section.get(sym.name.as_str()) { - mark_live(target, &mut live, &mut worklist); - } - } - } - } - } - - // Return the dead sections (all sections minus live ones) - all_sections.difference(&live).copied().collect() -} diff --git a/src/backend/linker_common/hash.rs b/src/backend/linker_common/hash.rs deleted file mode 100644 index ac96003c20..0000000000 --- a/src/backend/linker_common/hash.rs +++ /dev/null @@ -1,27 +0,0 @@ -//! ELF hash functions for `.gnu.hash` and `.hash` section generation. -//! -//! 
Provides GNU and SysV hash computations used by linkers when building -//! the dynamic symbol hash tables. - -/// Compute the GNU hash of a symbol name. -pub fn gnu_hash(name: &[u8]) -> u32 { - let mut h: u32 = 5381; - for &b in name { - h = h.wrapping_mul(33).wrapping_add(b as u32); - } - h -} - -/// Compute the SysV ELF hash of a symbol name. -pub fn sysv_hash(name: &[u8]) -> u32 { - let mut h: u32 = 0; - for &b in name { - h = (h << 4).wrapping_add(b as u32); - let g = h & 0xf0000000; - if g != 0 { - h ^= g >> 24; - } - h &= !g; - } - h -} diff --git a/src/backend/linker_common/merge.rs b/src/backend/linker_common/merge.rs deleted file mode 100644 index 94495aabab..0000000000 --- a/src/backend/linker_common/merge.rs +++ /dev/null @@ -1,158 +0,0 @@ -//! Section merging and common symbol allocation for ELF64 linkers. -//! -//! Groups input sections by mapped name, computes output offsets with proper -//! alignment, sorts output sections by permission profile, and allocates -//! SHN_COMMON symbols into `.bss`. - -use std::collections::{HashMap, HashSet}; - -use crate::backend::elf::{ - SHT_NULL, SHT_PROGBITS, SHT_SYMTAB, SHT_STRTAB, SHT_RELA, SHT_REL, - SHT_NOBITS, SHT_GROUP, - SHF_WRITE, SHF_ALLOC, SHF_EXECINSTR, SHF_TLS, SHF_EXCLUDE, - SHN_COMMON, -}; -use super::types::Elf64Object; -use super::symbols::{InputSection, OutputSection, GlobalSymbolOps}; -use super::section_map::map_section_name; - -/// Merge input sections from all objects into output sections. -/// -/// Groups input sections by mapped name (e.g., `.text.foo` -> `.text`), -/// computes output offsets with proper alignment, and sorts output sections -/// by permission profile: RO -> Exec -> RW(progbits) -> RW(nobits). 
-pub fn merge_sections_elf64( - objects: &[Elf64Object], output_sections: &mut Vec, - section_map: &mut HashMap<(usize, usize), (usize, u64)>, -) { - let no_dead = HashSet::new(); - merge_sections_elf64_gc(objects, output_sections, section_map, &no_dead); -} - -/// Merge input sections into output sections, optionally skipping dead sections. -/// -/// When `dead_sections` is non-empty (from --gc-sections), sections in the set -/// are excluded from the output, effectively garbage-collecting unreferenced code. -pub fn merge_sections_elf64_gc( - objects: &[Elf64Object], output_sections: &mut Vec, - section_map: &mut HashMap<(usize, usize), (usize, u64)>, - dead_sections: &HashSet<(usize, usize)>, -) { - let mut output_map: HashMap = HashMap::new(); - - for obj_idx in 0..objects.len() { - for sec_idx in 0..objects[obj_idx].sections.len() { - let sec = &objects[obj_idx].sections[sec_idx]; - if sec.flags & SHF_ALLOC == 0 { continue; } - if matches!(sec.sh_type, SHT_NULL | SHT_STRTAB | SHT_SYMTAB | SHT_RELA | SHT_REL | SHT_GROUP) { continue; } - if sec.flags & SHF_EXCLUDE != 0 { continue; } - if !dead_sections.is_empty() && dead_sections.contains(&(obj_idx, sec_idx)) { continue; } - - let output_name = map_section_name(&sec.name).to_string(); - let alignment = sec.addralign.max(1); - - let out_idx = if let Some(&idx) = output_map.get(&output_name) { - if alignment > output_sections[idx].alignment { - output_sections[idx].alignment = alignment; - } - idx - } else { - let idx = output_sections.len(); - output_map.insert(output_name.clone(), idx); - output_sections.push(OutputSection { - name: output_name, sh_type: sec.sh_type, flags: sec.flags, - alignment, inputs: Vec::new(), data: Vec::new(), - addr: 0, file_offset: 0, mem_size: 0, - }); - idx - }; - - if sec.sh_type == SHT_PROGBITS { output_sections[out_idx].sh_type = SHT_PROGBITS; } - output_sections[out_idx].flags |= sec.flags & (SHF_WRITE | SHF_EXECINSTR | SHF_ALLOC | SHF_TLS); - 
output_sections[out_idx].inputs.push(InputSection { - object_idx: obj_idx, section_idx: sec_idx, output_offset: 0, size: sec.size, - }); - } - } - - for out_sec in output_sections.iter_mut() { - let mut off: u64 = 0; - for input in &mut out_sec.inputs { - let a = objects[input.object_idx].sections[input.section_idx].addralign.max(1); - off = (off + a - 1) & !(a - 1); - input.output_offset = off; - off += input.size; - } - out_sec.mem_size = off; - } - - for (out_idx, out_sec) in output_sections.iter().enumerate() { - for input in &out_sec.inputs { - section_map.insert((input.object_idx, input.section_idx), (out_idx, input.output_offset)); - } - } - - // Sort: RO -> Exec -> RW(progbits) -> RW(nobits) - let len = output_sections.len(); - let mut opts: Vec> = output_sections.drain(..).map(Some).collect(); - let mut sort_indices: Vec = (0..len).collect(); - sort_indices.sort_by_key(|&i| { - let sec = opts[i].as_ref().unwrap(); - let is_exec = sec.flags & SHF_EXECINSTR != 0; - let is_write = sec.flags & SHF_WRITE != 0; - let is_nobits = sec.sh_type == SHT_NOBITS; - if is_exec { (1u32, is_nobits as u32) } - else if !is_write { (0, is_nobits as u32) } - else { (2, is_nobits as u32) } - }); - - let mut index_remap: HashMap = HashMap::new(); - for (new_idx, &old_idx) in sort_indices.iter().enumerate() { - index_remap.insert(old_idx, new_idx); - } - for &old_idx in &sort_indices { - output_sections.push(opts[old_idx].take().unwrap()); - } - - let old_map: Vec<_> = section_map.drain().collect(); - for ((obj_idx, sec_idx), (old_out_idx, off)) in old_map { - if let Some(&new_out_idx) = index_remap.get(&old_out_idx) { - section_map.insert((obj_idx, sec_idx), (new_out_idx, off)); - } - } -} - -/// Allocate SHN_COMMON symbols into the .bss output section. 
-pub fn allocate_common_symbols_elf64( - globals: &mut HashMap, output_sections: &mut Vec, -) { - let common_syms: Vec<(String, u64, u64)> = globals.iter() - .filter(|(_, sym)| sym.section_idx() == SHN_COMMON && sym.is_defined()) - .map(|(name, sym)| (name.clone(), sym.value().max(1), sym.size())).collect(); - if common_syms.is_empty() { return; } - - let bss_idx = output_sections.iter().position(|s| s.name == ".bss").unwrap_or_else(|| { - let idx = output_sections.len(); - output_sections.push(OutputSection { - name: ".bss".to_string(), sh_type: SHT_NOBITS, - flags: SHF_ALLOC | SHF_WRITE, alignment: 1, - inputs: Vec::new(), data: Vec::new(), - addr: 0, file_offset: 0, mem_size: 0, - }); - idx - }); - - let mut bss_off = output_sections[bss_idx].mem_size; - for (name, alignment, size) in &common_syms { - let a = (*alignment).max(1); - bss_off = (bss_off + a - 1) & !(a - 1); - if let Some(sym) = globals.get_mut(name) { - sym.set_common_bss(bss_off); - } - if *alignment > output_sections[bss_idx].alignment { - output_sections[bss_idx].alignment = *alignment; - } - bss_off += size; - } - output_sections[bss_idx].mem_size = bss_off; -} diff --git a/src/backend/linker_common/mod.rs b/src/backend/linker_common/mod.rs deleted file mode 100644 index d80b62cd88..0000000000 --- a/src/backend/linker_common/mod.rs +++ /dev/null @@ -1,128 +0,0 @@ -//! Shared linker infrastructure for all backends. -//! -//! Split into focused submodules: -//! -//! - `types`: ELF64 object file types (Elf64Section, Elf64Symbol, Elf64Rela, Elf64Object, DynSymbol) -//! - `parse_object`: ELF64 relocatable object file parser -//! - `parse_shared`: Shared library (.so) symbol parsing and SONAME extraction -//! - `section_map`: Input-to-output section name mapping -//! - `dynstr`: Dynamic string table builder for .dynstr emission -//! - `hash`: GNU and SysV ELF hash functions -//! - `symbols`: InputSection, OutputSection, GlobalSymbolOps trait, linker-defined symbols -//! 
- `merge`: Section merging and common symbol allocation -//! - `dynamic`: Dynamic symbol matching, library loading, and symbol registration -//! - `archive`: Archive loading and generic file dispatch -//! - `resolve_lib`: Library name resolution helper -//! - `write`: ELF64 binary emission helpers (section/program headers, alignment) -//! - `args`: Shared linker argument parsing -//! - `check`: Post-link undefined symbol checking -//! - `eh_frame`: .eh_frame_hdr builder for stack unwinding -//! - `gc_sections`: Garbage collection (`--gc-sections`) for ELF64 linkers -//! -//! This module extracts the duplicated linker code that was copied across x86, -//! ARM, RISC-V, and (partially) i686 backends. It provides: -//! -//! - **ELF64 object parser**: `parse_elf64_object()` replaces near-identical -//! `parse_object()` functions in x86, ARM, and RISC-V linkers. -//! - **Shared library parser**: `parse_shared_library_symbols()` and `parse_soname()` -//! for extracting dynamic symbols from .so files. -//! - **Dynamic symbol matching**: `match_shared_library_dynsyms()`, -//! `load_shared_library_elf64()`, and `resolve_dynamic_symbols_elf64()` for -//! matching undefined globals against shared library exports with WEAK alias -//! detection and as-needed semantics. -//! - **Linker-defined symbols**: `LINKER_DEFINED_SYMBOLS` constant and -//! `is_linker_defined_symbol()` for the superset of symbols the linker -//! provides during layout (used by all 4 backends). -//! - **Archive loading**: `load_archive_members()` and `member_resolves_undefined()` -//! for iterative archive resolution (the --start-group algorithm). -//! - **Section mapping**: `map_section_name()` for input-to-output section mapping. -//! - **DynStrTab**: Dynamic string table builder for dynamic linking. -//! - **GNU hash**: `build_gnu_hash()` for .gnu.hash section generation. -//! - **ELF64 writing helpers**: `write_elf64_shdr()`, `write_elf64_phdr()`, -//! 
`write_elf64_phdr_at()`, `align_up_64()`, `pad_to()` for binary emission. -//! - **Argument parsing**: `parse_linker_args()` and `LinkerArgs` for shared -//! `-Wl,` flag parsing across backends. -//! - **Undefined symbol checking**: `check_undefined_symbols_elf64()` for -//! post-link validation via the `GlobalSymbolOps` trait. -//! -//! Each backend linker still handles its own: -//! - Architecture-specific relocation application -//! - PLT/GOT layout (different instruction sequences per arch) -//! - ELF header emission (different e_machine, base addresses) -//! - Dynamic linking specifics (version tables, etc.) - -// ── Submodule declarations ────────────────────────────────────────────── - -mod types; -mod parse_object; -mod parse_shared; -mod section_map; -mod dynstr; -mod hash; -mod symbols; -mod merge; -mod dynamic; -mod archive; -mod resolve_lib; -mod write; -mod args; -mod check; -mod eh_frame; -mod gc_sections; - -// ── Re-exports ────────────────────────────────────────────────────────── -// -// Re-export all public items at the linker_common:: level so that external -// callers see no change from the previous flat-file layout. 
- -// types.rs -pub use types::{Elf64Section, Elf64Symbol, Elf64Rela, Elf64Object, DynSymbol}; - -// parse_object.rs -pub use parse_object::parse_elf64_object; - -// parse_shared.rs -pub use parse_shared::{parse_shared_library_symbols, parse_soname}; - -// dynstr.rs -pub use dynstr::DynStrTab; - -// hash.rs -pub use hash::{gnu_hash, sysv_hash}; - -// symbols.rs -pub use symbols::{ - OutputSection, GlobalSymbolOps, - is_linker_defined_symbol, - is_valid_c_identifier_for_section, resolve_start_stop_symbols, -}; - -// merge.rs -pub use merge::{merge_sections_elf64, merge_sections_elf64_gc, allocate_common_symbols_elf64}; - -// dynamic.rs -pub use dynamic::{ - load_shared_library_elf64, - resolve_dynamic_symbols_elf64, register_symbols_elf64, -}; - -// archive.rs -pub use archive::{load_archive_elf64, load_thin_archive_elf64}; - -// resolve_lib.rs -pub use resolve_lib::resolve_lib; - -// write.rs -pub use write::{write_elf64_shdr, write_elf64_phdr, write_elf64_phdr_at, align_up_64, pad_to}; - -// args.rs -pub use args::parse_linker_args; - -// check.rs -pub use check::check_undefined_symbols_elf64; - -// eh_frame.rs -pub use eh_frame::{count_eh_frame_fdes, build_eh_frame_hdr}; - -// gc_sections.rs -pub use gc_sections::gc_collect_sections_elf64; diff --git a/src/backend/linker_common/parse_object.rs b/src/backend/linker_common/parse_object.rs deleted file mode 100644 index d7892ebf97..0000000000 --- a/src/backend/linker_common/parse_object.rs +++ /dev/null @@ -1,178 +0,0 @@ -//! ELF64 relocatable object file parser. -//! -//! This single function replaces the near-identical `parse_object()` functions -//! in x86/linker/elf.rs, arm/linker/elf.rs, and riscv/linker/elf_read.rs. -//! The only parameter that differed was the expected e_machine value. 
use crate::backend::elf::{
    ELF_MAGIC, ELFCLASS64, ELFDATA2LSB, ET_REL,
    SHT_NOBITS, SHT_SYMTAB, SHT_RELA,
    read_u16, read_u32, read_u64, read_i64, read_cstr,
};
use super::types::{Elf64Section, Elf64Symbol, Elf64Rela, Elf64Object};

/// Parse an ELF64 relocatable object file (.o).
///
/// `expected_machine` is the ELF e_machine value to validate (e.g., EM_X86_64,
/// EM_AARCH64, EM_RISCV). Pass 0 to skip machine validation.
///
/// On success returns an `Elf64Object` carrying the section headers (with
/// names resolved from `.shstrtab`), a copy of every section's raw bytes,
/// the symbol table, and RELA relocations indexed by the section they apply
/// to. Every failure path returns a `String` error prefixed with `source_name`.
pub fn parse_elf64_object(data: &[u8], source_name: &str, expected_machine: u16) -> Result<Elf64Object, String> {
    // ── ELF identification: magic, 64-bit class, little-endian ──────────
    if data.len() < 64 {
        // 64 bytes = sizeof(Elf64_Ehdr); anything smaller cannot hold the header.
        return Err(format!("{}: file too small for ELF header", source_name));
    }
    if data[0..4] != ELF_MAGIC {
        return Err(format!("{}: not an ELF file", source_name));
    }
    if data[4] != ELFCLASS64 {
        return Err(format!("{}: not 64-bit ELF", source_name));
    }
    if data[5] != ELFDATA2LSB {
        return Err(format!("{}: not little-endian ELF", source_name));
    }

    // Only relocatable objects (.o) are accepted by this parser.
    let e_type = read_u16(data, 16);
    if e_type != ET_REL {
        return Err(format!("{}: not a relocatable object (type={})", source_name, e_type));
    }

    if expected_machine != 0 {
        let e_machine = read_u16(data, 18);
        if e_machine != expected_machine {
            return Err(format!("{}: wrong machine type (expected={}, got={})",
                source_name, expected_machine, e_machine));
        }
    }

    // Section header table location, read from fixed Elf64_Ehdr offsets.
    let e_shoff = read_u64(data, 40) as usize;
    let e_shentsize = read_u16(data, 58) as usize;
    let e_shnum = read_u16(data, 60) as usize;
    let e_shstrndx = read_u16(data, 62) as usize;

    if e_shoff == 0 || e_shnum == 0 {
        return Err(format!("{}: no section headers", source_name));
    }

    // Parse section headers
    let mut sections = Vec::with_capacity(e_shnum);
    for i in 0..e_shnum {
        let off = e_shoff + i * e_shentsize;
        if off + e_shentsize > data.len() {
            return Err(format!("{}: section header {} out of bounds", source_name, i));
        }
        sections.push(Elf64Section {
            name_idx: read_u32(data, off),
            name: String::new(), // resolved below from the section name strtab
            sh_type: read_u32(data, off + 4),
            flags: read_u64(data, off + 8),
            addr: read_u64(data, off + 16),
            offset: read_u64(data, off + 24),
            size: read_u64(data, off + 32),
            link: read_u32(data, off + 40),
            info: read_u32(data, off + 44),
            addralign: read_u64(data, off + 48),
            entsize: read_u64(data, off + 56),
        });
    }

    // Read section name string table. A missing or out-of-range .shstrtab is
    // tolerated: section names simply stay empty.
    if e_shstrndx < sections.len() {
        let shstrtab = &sections[e_shstrndx];
        let strtab_off = shstrtab.offset as usize;
        let strtab_size = shstrtab.size as usize;
        if strtab_off + strtab_size <= data.len() {
            let strtab_data = &data[strtab_off..strtab_off + strtab_size];
            for sec in &mut sections {
                sec.name = read_cstr(strtab_data, sec.name_idx as usize);
            }
        }
    }

    // Read section data. SHT_NOBITS (.bss-style) sections occupy no file bytes.
    let mut section_data = Vec::with_capacity(e_shnum);
    for sec in &sections {
        if sec.sh_type == SHT_NOBITS || sec.size == 0 {
            section_data.push(Vec::new());
        } else {
            let start = sec.offset as usize;
            let end = start + sec.size as usize;
            if end > data.len() {
                return Err(format!("{}: section '{}' data out of bounds", source_name, sec.name));
            }
            section_data.push(data[start..end].to_vec());
        }
    }

    // Find symbol table and its string table (sh_link names the strtab section).
    let mut symbols = Vec::new();
    for i in 0..sections.len() {
        if sections[i].sh_type == SHT_SYMTAB {
            let strtab_idx = sections[i].link as usize;
            let strtab_data = if strtab_idx < section_data.len() {
                &section_data[strtab_idx]
            } else {
                continue;
            };
            let sym_data = &section_data[i];
            let sym_count = sym_data.len() / 24; // sizeof(Elf64_Sym) = 24
            for j in 0..sym_count {
                let off = j * 24;
                if off + 24 > sym_data.len() {
                    break;
                }
                let name_idx = read_u32(sym_data, off);
                let mut name = read_cstr(strtab_data, name_idx as usize);
                // Strip @PLT suffix from symbol names. Some assemblers (including
                // our own in older versions) embed the @PLT modifier in the symbol
                // name instead of using R_X86_64_PLT32 relocation type. The linker
                // should resolve these against the base symbol name.
                if let Some(base) = name.strip_suffix("@PLT") {
                    name = base.to_string();
                }
                symbols.push(Elf64Symbol {
                    name_idx,
                    name,
                    info: sym_data[off + 4],
                    other: sym_data[off + 5],
                    shndx: read_u16(sym_data, off + 6),
                    value: read_u64(sym_data, off + 8),
                    size: read_u64(sym_data, off + 16),
                });
            }
            break; // only the first SHT_SYMTAB section is consumed
        }
    }

    // Parse relocations - index by the section they apply to (SHT_RELA's
    // sh_info field holds the target section index).
    let mut relocations = vec![Vec::new(); e_shnum];
    for i in 0..sections.len() {
        if sections[i].sh_type == SHT_RELA {
            let target_sec = sections[i].info as usize;
            let rela_data = &section_data[i];
            let rela_count = rela_data.len() / 24; // sizeof(Elf64_Rela) = 24
            let mut relas = Vec::with_capacity(rela_count);
            for j in 0..rela_count {
                let off = j * 24;
                if off + 24 > rela_data.len() {
                    break;
                }
                // r_info packs the symbol index (high 32 bits) and the
                // relocation type (low 32 bits).
                let r_info = read_u64(rela_data, off + 8);
                relas.push(Elf64Rela {
                    offset: read_u64(rela_data, off),
                    sym_idx: (r_info >> 32) as u32,
                    rela_type: (r_info & 0xffffffff) as u32,
                    addend: read_i64(rela_data, off + 16),
                });
            }
            if target_sec < relocations.len() {
                relocations[target_sec] = relas;
            }
        }
    }

    Ok(Elf64Object {
        sections,
        symbols,
        section_data,
        relocations,
        source_name: source_name.to_string(),
    })
}
use crate::backend::elf::{
    ELF_MAGIC, ELFCLASS64, ELFDATA2LSB, ET_DYN,
    SHT_DYNAMIC, SHT_DYNSYM, SHT_GNU_VERSYM, SHT_GNU_VERDEF,
    SHN_UNDEF, PT_DYNAMIC,
    DT_NULL, DT_SONAME, DT_SYMTAB, DT_STRTAB, DT_STRSZ,
    DT_GNU_HASH, DT_VERSYM,
    read_u16, read_u32, read_u64, read_i64, read_cstr,
};
use super::types::DynSymbol;

/// Extract dynamic symbols from a shared library (.so) file.
///
/// Reads the .dynsym section to find exported symbols. Used by x86 and RISC-V
/// linkers for dynamic linking resolution.
///
/// Version handling: `.gnu.version_d` (SHT_GNU_VERDEF) is parsed first to build
/// an index→name map, then each .dynsym entry is cross-checked against
/// `.gnu.version` (SHT_GNU_VERSYM). Hidden, non-default versions
/// (`symbol@VERSION`) are skipped entirely; default versions are exported with
/// their version string attached. Libraries without section headers fall back
/// to the PT_DYNAMIC-based path.
pub fn parse_shared_library_symbols(data: &[u8], lib_name: &str) -> Result<Vec<DynSymbol>, String> {
    // ── ELF identification: must be a 64-bit LE shared object ───────────
    if data.len() < 64 {
        return Err(format!("{}: file too small for ELF header", lib_name));
    }
    if data[0..4] != ELF_MAGIC {
        return Err(format!("{}: not an ELF file", lib_name));
    }
    if data[4] != ELFCLASS64 || data[5] != ELFDATA2LSB {
        return Err(format!("{}: not 64-bit little-endian ELF", lib_name));
    }

    let e_type = read_u16(data, 16);
    if e_type != ET_DYN {
        return Err(format!("{}: not a shared library (type={})", lib_name, e_type));
    }

    let e_shoff = read_u64(data, 40) as usize;
    let e_shentsize = read_u16(data, 58) as usize;
    let e_shnum = read_u16(data, 60) as usize;

    // Try section headers first (the standard approach)
    if e_shoff != 0 && e_shnum != 0 {
        // Compact header view: (sh_type, offset, size, link) is all we need.
        let mut sections = Vec::with_capacity(e_shnum);
        for i in 0..e_shnum {
            let off = e_shoff + i * e_shentsize;
            if off + e_shentsize > data.len() {
                break;
            }
            sections.push((
                read_u32(data, off + 4),  // sh_type
                read_u64(data, off + 24), // offset
                read_u64(data, off + 32), // size
                read_u32(data, off + 40), // link
            ));
        }

        // Locate .gnu.version (SHT_GNU_VERSYM) and .gnu.verdef (SHT_GNU_VERDEF) sections
        let mut versym_shdr: Option<(usize, usize)> = None;        // (offset, size)
        let mut verdef_shdr: Option<(usize, usize, usize)> = None; // (offset, size, link)
        for &(sh_type, offset, size, link) in &sections {
            if sh_type == SHT_GNU_VERSYM {
                versym_shdr = Some((offset as usize, size as usize));
            } else if sh_type == SHT_GNU_VERDEF {
                verdef_shdr = Some((offset as usize, size as usize, link as usize));
            }
        }

        // Parse version definitions to build index -> version string mapping
        let mut ver_names: std::collections::HashMap<u16, String> = std::collections::HashMap::new();
        if let Some((vd_off, vd_size, vd_link)) = verdef_shdr {
            // Get the string table for verdef (typically the dynstr)
            let vd_strtab = if vd_link < sections.len() {
                let (_, s_off, s_sz, _) = sections[vd_link];
                let s_off = s_off as usize;
                let s_sz = s_sz as usize;
                if s_off + s_sz <= data.len() { &data[s_off..s_off + s_sz] } else { &[] as &[u8] }
            } else {
                &[] as &[u8]
            };

            // Walk the Elf64_Verdef chain (each entry is 20 bytes, linked by
            // vd_next byte offsets).
            let mut pos = vd_off;
            let end = vd_off + vd_size;
            while pos < end && pos + 20 <= data.len() {
                let vd_ndx = read_u16(data, pos + 4);
                let vd_cnt = read_u16(data, pos + 6);
                let vd_aux = read_u32(data, pos + 12) as usize;
                let vd_next = read_u32(data, pos + 16) as usize;

                // First verdaux entry has the version name
                if vd_cnt > 0 {
                    let aux_pos = pos + vd_aux;
                    if aux_pos + 8 <= data.len() {
                        let vda_name = read_u32(data, aux_pos) as usize;
                        if vda_name < vd_strtab.len() {
                            let name = read_cstr(vd_strtab, vda_name);
                            ver_names.insert(vd_ndx, name);
                        }
                    }
                }

                if vd_next == 0 { break; }
                pos += vd_next;
            }
        }

        // Find .dynsym and its string table
        for i in 0..sections.len() {
            let (sh_type, offset, size, link) = sections[i];
            if sh_type == SHT_DYNSYM {
                let strtab_idx = link as usize;
                if strtab_idx >= sections.len() { continue; }
                let (_, str_off, str_size, _) = sections[strtab_idx];
                let str_off = str_off as usize;
                let str_size = str_size as usize;
                if str_off + str_size > data.len() { continue; }
                let strtab = &data[str_off..str_off + str_size];

                let sym_off = offset as usize;
                let sym_size = size as usize;
                if sym_off + sym_size > data.len() { continue; }
                let sym_data = &data[sym_off..sym_off + sym_size];
                let sym_count = sym_data.len() / 24; // sizeof(Elf64_Sym) = 24

                let mut symbols = Vec::new();
                // j starts at 1: index 0 is the mandatory null symbol.
                for j in 1..sym_count {
                    let off = j * 24;
                    if off + 24 > sym_data.len() { break; }
                    let name_idx = read_u32(sym_data, off) as usize;
                    let info = sym_data[off + 4];
                    let shndx = read_u16(sym_data, off + 6);
                    let value = read_u64(sym_data, off + 8);
                    let size = read_u64(sym_data, off + 16);

                    // Undefined entries are imports of this library, not exports.
                    if shndx == SHN_UNDEF { continue; }

                    // Check .gnu.version: if the hidden bit (0x8000) is set and
                    // the version index is >= 2, this is a non-default version
                    // (symbol@VERSION, not symbol@@VERSION). Such symbols should
                    // not be available for linking, matching GNU ld behavior.
                    if let Some((vs_off, vs_size)) = versym_shdr {
                        if vs_size >= sym_count * 2 && vs_off + vs_size <= data.len() {
                            let ver_entry = vs_off + j * 2;
                            let raw_ver = read_u16(data, ver_entry);
                            let hidden = raw_ver & 0x8000 != 0;
                            let ver_idx = raw_ver & 0x7fff;
                            if hidden && ver_idx >= 2 {
                                continue;
                            }
                        }
                    }

                    let name = read_cstr(strtab, name_idx);
                    if name.is_empty() { continue; }

                    // Look up version for this symbol from .gnu.version table.
                    // Indices 0 (local) and 1 (global) carry no version name.
                    let (version, is_default_ver) = if let Some((vs_off, _vs_size)) = versym_shdr {
                        let vs_entry = vs_off + j * 2;
                        if vs_entry + 2 <= data.len() {
                            let raw_ver = read_u16(data, vs_entry);
                            let hidden = raw_ver & 0x8000 != 0;
                            let ver_idx = raw_ver & 0x7fff;
                            if ver_idx >= 2 {
                                (ver_names.get(&ver_idx).cloned(), !hidden)
                            } else {
                                (None, !hidden)
                            }
                        } else {
                            (None, true)
                        }
                    } else {
                        (None, true)
                    };

                    symbols.push(DynSymbol { name, info, value, size, version, is_default_ver });
                }
                return Ok(symbols);
            }
        }
    }

    // Fallback: use PT_DYNAMIC program header to find DT_SYMTAB/DT_STRTAB.
    // This handles shared libraries without section headers (e.g., our own
    // emitted .so files, or stripped libraries).
    parse_shared_library_symbols_from_phdrs(data, lib_name)
}
/// Parse dynamic symbols using program headers (PT_DYNAMIC) instead of section headers.
///
/// When a shared library has no section headers (e_shoff == 0), we can still find
/// the dynamic symbol table by:
/// 1. Locating PT_DYNAMIC in the program header table
/// 2. Reading DT_SYMTAB, DT_STRTAB, DT_STRSZ from the dynamic section
/// 3. Determining symtab size from DT_GNU_HASH (number of symbols) or by
///    scanning until we hit the strtab address
fn parse_shared_library_symbols_from_phdrs(data: &[u8], lib_name: &str) -> Result<Vec<DynSymbol>, String> {
    // Program header table location (fixed Elf64_Ehdr offsets).
    let e_phoff = read_u64(data, 32) as usize;
    let e_phentsize = read_u16(data, 54) as usize;
    let e_phnum = read_u16(data, 56) as usize;

    if e_phoff == 0 || e_phnum == 0 {
        return Err(format!("{}: no program headers and no section headers", lib_name));
    }

    // Find PT_DYNAMIC
    let mut dyn_offset = 0usize;
    let mut dyn_size = 0usize;
    for i in 0..e_phnum {
        let ph = e_phoff + i * e_phentsize;
        if ph + e_phentsize > data.len() { break; }
        let p_type = read_u32(data, ph);
        if p_type == PT_DYNAMIC {
            dyn_offset = read_u64(data, ph + 8) as usize;  // p_offset
            dyn_size = read_u64(data, ph + 32) as usize;   // p_filesz
            break;
        }
    }

    if dyn_offset == 0 {
        return Err(format!("{}: no PT_DYNAMIC segment found", lib_name));
    }

    // Read dynamic entries to find DT_SYMTAB, DT_STRTAB, DT_STRSZ, DT_GNU_HASH, DT_VERSYM
    let mut symtab_addr: u64 = 0;
    let mut strtab_addr: u64 = 0;
    let mut strsz: u64 = 0;
    let mut gnu_hash_addr: u64 = 0;
    let mut versym_addr: u64 = 0;

    // Each Elf64_Dyn entry is 16 bytes: i64 tag + u64 value.
    let mut pos = dyn_offset;
    let dyn_end = dyn_offset + dyn_size;
    while pos + 16 <= dyn_end && pos + 16 <= data.len() {
        let tag = read_i64(data, pos);
        let val = read_u64(data, pos + 8);
        match tag {
            x if x == DT_NULL => break,
            x if x == DT_SYMTAB => symtab_addr = val,
            x if x == DT_STRTAB => strtab_addr = val,
            x if x == DT_STRSZ => strsz = val,
            x if x == DT_GNU_HASH => gnu_hash_addr = val,
            x if x == DT_VERSYM => versym_addr = val,
            _ => {}
        }
        pos += 16;
    }

    if symtab_addr == 0 || strtab_addr == 0 {
        return Err(format!("{}: missing DT_SYMTAB or DT_STRTAB in dynamic section", lib_name));
    }

    // For shared libraries with base address 0 (PIC), the DT_ values are
    // virtual addresses. We need to convert them to file offsets.
    // For our emitted .so files, vaddr == file offset (base_addr = 0 and
    // segments are identity-mapped). For system .so files loaded at higher
    // addresses, we'd need to use the PT_LOAD mappings. Since we primarily
    // need this for our own .so output, use identity mapping and also try
    // PT_LOAD-based translation.
    let symtab_file_offset = vaddr_to_file_offset(data, e_phoff, e_phentsize, e_phnum, symtab_addr);
    let strtab_file_offset = vaddr_to_file_offset(data, e_phoff, e_phentsize, e_phnum, strtab_addr);

    if strtab_file_offset + strsz as usize > data.len() {
        return Err(format!("{}: strtab extends beyond file", lib_name));
    }
    let strtab = &data[strtab_file_offset..strtab_file_offset + strsz as usize];

    // Determine number of dynamic symbols. We can get this from .gnu.hash
    // (the symoffset + number of hashed symbols), or by scanning symbols
    // until we reach the strtab address.
    let sym_count = if gnu_hash_addr != 0 {
        let gnu_hash_file_offset = vaddr_to_file_offset(data, e_phoff, e_phentsize, e_phnum, gnu_hash_addr);
        count_dynsyms_from_gnu_hash(data, gnu_hash_file_offset)
    } else {
        // Fallback: symtab ends where strtab begins (they're typically adjacent)
        let sym_size = if strtab_file_offset > symtab_file_offset {
            strtab_file_offset - symtab_file_offset
        } else {
            // Can't determine size; try a reasonable max (1024 symbols).
            // The per-symbol bounds check in the loop below keeps this safe.
            1024 * 24
        };
        sym_size / 24
    };

    // Resolve versym file offset if DT_VERSYM was found
    let versym_file_offset = if versym_addr != 0 {
        vaddr_to_file_offset(data, e_phoff, e_phentsize, e_phnum, versym_addr)
    } else {
        0
    };

    let mut symbols = Vec::new();
    // j starts at 1: index 0 is the mandatory null symbol.
    for j in 1..sym_count {
        let off = symtab_file_offset + j * 24;
        if off + 24 > data.len() { break; }
        let name_idx = read_u32(data, off) as usize;
        let info = data[off + 4];
        let shndx = read_u16(data, off + 6);
        let value = read_u64(data, off + 8);
        let size = read_u64(data, off + 16);

        // Undefined entries are imports of this library, not exports.
        if shndx == SHN_UNDEF { continue; }

        // Check versym: skip non-default (hidden) versioned symbols
        if versym_addr != 0 {
            let ver_entry = versym_file_offset + j * 2;
            if ver_entry + 2 <= data.len() {
                let raw_ver = read_u16(data, ver_entry);
                let hidden = raw_ver & 0x8000 != 0;
                let ver_idx = raw_ver & 0x7fff;
                if hidden && ver_idx >= 2 {
                    continue;
                }
            }
        }

        let name = read_cstr(strtab, name_idx);
        if name.is_empty() { continue; }

        // NOTE: this fallback path does not resolve version names (no verdef
        // section is available here), so version is always None.
        symbols.push(DynSymbol { name, info, value, size, version: None, is_default_ver: true });
    }

    Ok(symbols)
}
-pub(crate) fn vaddr_to_file_offset( - data: &[u8], e_phoff: usize, e_phentsize: usize, e_phnum: usize, vaddr: u64, -) -> usize { - use crate::backend::elf::PT_LOAD; - for i in 0..e_phnum { - let ph = e_phoff + i * e_phentsize; - if ph + e_phentsize > data.len() { break; } - let p_type = read_u32(data, ph); - if p_type != PT_LOAD { continue; } - let p_offset = read_u64(data, ph + 8); - let p_vaddr = read_u64(data, ph + 16); - let p_filesz = read_u64(data, ph + 32); - if vaddr >= p_vaddr && vaddr < p_vaddr + p_filesz { - return (p_offset + (vaddr - p_vaddr)) as usize; - } - } - // If no PT_LOAD matches, assume identity mapping (vaddr == file offset) - vaddr as usize -} - -/// Count the total number of dynamic symbols from a .gnu.hash section. -/// -/// The .gnu.hash header contains symoffset (first hashed symbol index). -/// We scan the hash chains to find the highest symbol index, then add 1. -fn count_dynsyms_from_gnu_hash(data: &[u8], offset: usize) -> usize { - if offset + 16 > data.len() { return 0; } - let nbuckets = read_u32(data, offset) as usize; - let symoffset = read_u32(data, offset + 4) as usize; - let bloom_size = read_u32(data, offset + 8) as usize; - - let buckets_off = offset + 16 + bloom_size * 8; - let chains_off = buckets_off + nbuckets * 4; - - if buckets_off + nbuckets * 4 > data.len() { return symoffset; } - - // Find the maximum bucket value (highest starting symbol index) - let mut max_sym = symoffset; - for i in 0..nbuckets { - let bucket_val = read_u32(data, buckets_off + i * 4) as usize; - if bucket_val >= max_sym { - // Walk the chain from this bucket to find the last symbol - let mut idx = bucket_val; - loop { - let chain_pos = chains_off + (idx - symoffset) * 4; - if chain_pos + 4 > data.len() { break; } - let chain_val = read_u32(data, chain_pos); - if idx + 1 > max_sym { max_sym = idx + 1; } - if chain_val & 1 != 0 { break; } // end of chain - idx += 1; - } - } - } - - max_sym -} - -/// Get the SONAME from a shared library's .dynamic 
/// Get the SONAME from a shared library's .dynamic section.
///
/// Tries section headers first, then falls back to program headers (PT_DYNAMIC)
/// for shared libraries that lack section headers (e.g., our own emitted .so files).
/// Returns `None` when the file is not ELF, has no dynamic section, or the
/// dynamic section carries no DT_SONAME entry.
pub fn parse_soname(data: &[u8]) -> Option<String> {
    if data.len() < 64 || data[0..4] != ELF_MAGIC {
        return None;
    }

    let e_shoff = read_u64(data, 40) as usize;
    let e_shentsize = read_u16(data, 58) as usize;
    let e_shnum = read_u16(data, 60) as usize;

    // Try section headers first
    if e_shoff != 0 && e_shnum != 0 {
        for i in 0..e_shnum {
            let off = e_shoff + i * e_shentsize;
            if off + 64 > data.len() { break; }
            let sh_type = read_u32(data, off + 4);
            if sh_type == SHT_DYNAMIC {
                let dyn_off = read_u64(data, off + 24) as usize;
                let dyn_size = read_u64(data, off + 32) as usize;
                // sh_link of SHT_DYNAMIC names its string table (.dynstr).
                let link = read_u32(data, off + 40) as usize;

                let str_sec_off = e_shoff + link * e_shentsize;
                if str_sec_off + 64 > data.len() { return None; }
                let str_off = read_u64(data, str_sec_off + 24) as usize;
                let str_size = read_u64(data, str_sec_off + 32) as usize;
                if str_off + str_size > data.len() { return None; }
                let strtab = &data[str_off..str_off + str_size];

                // Scan 16-byte Elf64_Dyn entries for DT_SONAME.
                let mut pos = dyn_off;
                while pos + 16 <= dyn_off + dyn_size && pos + 16 <= data.len() {
                    let tag = read_i64(data, pos);
                    let val = read_u64(data, pos + 8);
                    if tag == DT_NULL { break; }
                    if tag == DT_SONAME {
                        return Some(read_cstr(strtab, val as usize));
                    }
                    pos += 16;
                }
                return None;
            }
        }
        return None;
    }

    // Fallback: use program headers (PT_DYNAMIC) to find the dynamic section
    let e_phoff = read_u64(data, 32) as usize;
    let e_phentsize = read_u16(data, 54) as usize;
    let e_phnum = read_u16(data, 56) as usize;

    if e_phoff == 0 || e_phnum == 0 { return None; }

    // Find PT_DYNAMIC
    let mut dyn_file_offset = 0usize;
    let mut dyn_filesz = 0usize;
    for i in 0..e_phnum {
        let ph = e_phoff + i * e_phentsize;
        if ph + e_phentsize > data.len() { break; }
        let p_type = read_u32(data, ph);
        if p_type == PT_DYNAMIC {
            dyn_file_offset = read_u64(data, ph + 8) as usize;
            dyn_filesz = read_u64(data, ph + 32) as usize;
            break;
        }
    }

    if dyn_file_offset == 0 { return None; }

    // First pass: find DT_STRTAB and DT_SONAME offset
    let mut strtab_addr: u64 = 0;
    let mut strsz: u64 = 0;
    let mut soname_offset: Option<u64> = None;

    let mut pos = dyn_file_offset;
    let dyn_end = dyn_file_offset + dyn_filesz;
    while pos + 16 <= dyn_end && pos + 16 <= data.len() {
        let tag = read_i64(data, pos);
        let val = read_u64(data, pos + 8);
        match tag {
            x if x == DT_NULL => break,
            x if x == DT_STRTAB => strtab_addr = val,
            x if x == DT_STRSZ => strsz = val,
            x if x == DT_SONAME => soname_offset = Some(val),
            _ => {}
        }
        pos += 16;
    }

    if strtab_addr == 0 || soname_offset.is_none() { return None; }

    // DT_STRTAB holds a virtual address; translate to a file offset.
    let strtab_file_off = vaddr_to_file_offset(data, e_phoff, e_phentsize, e_phnum, strtab_addr);
    let name_off = soname_offset.unwrap() as usize;
    if strtab_file_off + name_off >= data.len() { return None; }
    if strsz > 0 && strtab_file_off + strsz as usize <= data.len() {
        let strtab = &data[strtab_file_off..strtab_file_off + strsz as usize];
        Some(read_cstr(strtab, name_off))
    } else {
        // Best effort: read from strtab_file_off + name_off
        Some(read_cstr(&data[strtab_file_off..], name_off))
    }
}
-/// When `prefer_static` is true, searches for `.a` before `.so`. -pub fn resolve_lib(name: &str, paths: &[String], prefer_static: bool) -> Option { - if let Some(exact) = name.strip_prefix(':') { - for dir in paths { - let p = format!("{}/{}", dir, exact); - if Path::new(&p).exists() { return Some(p); } - } - return None; - } - if prefer_static { - for dir in paths { - let a = format!("{}/lib{}.a", dir, name); - if Path::new(&a).exists() { return Some(a); } - let so = format!("{}/lib{}.so", dir, name); - if Path::new(&so).exists() { return Some(so); } - } - } else { - for dir in paths { - let so = format!("{}/lib{}.so", dir, name); - if Path::new(&so).exists() { return Some(so); } - let a = format!("{}/lib{}.a", dir, name); - if Path::new(&a).exists() { return Some(a); } - } - } - None -} diff --git a/src/backend/linker_common/section_map.rs b/src/backend/linker_common/section_map.rs deleted file mode 100644 index 17e2caa62e..0000000000 --- a/src/backend/linker_common/section_map.rs +++ /dev/null @@ -1,25 +0,0 @@ -//! Input-to-output section name mapping. -//! -//! Maps input section names like `.text.foo` to their standard output section -//! names like `.text`. Used by all linker backends during section merging. - -/// Map an input section name to the standard output section name. -/// -/// This is the shared implementation used by all linker backends. Input sections -/// like `.text.foo` are merged into `.text`, `.rodata.bar` into `.rodata`, etc. -/// RISC-V additionally maps `.sdata`/`.sbss` (via `map_section_name_riscv()`). 
-pub fn map_section_name(name: &str) -> &str { - if name.starts_with(".text.") || name == ".text" { return ".text"; } - if name.starts_with(".data.rel.ro") { return ".data.rel.ro"; } - if name.starts_with(".data.") || name == ".data" { return ".data"; } - if name.starts_with(".rodata.") || name == ".rodata" { return ".rodata"; } - if name.starts_with(".bss.") || name == ".bss" { return ".bss"; } - if name.starts_with(".init_array") { return ".init_array"; } - if name.starts_with(".fini_array") { return ".fini_array"; } - if name.starts_with(".tbss.") || name == ".tbss" { return ".tbss"; } - if name.starts_with(".tdata.") || name == ".tdata" { return ".tdata"; } - if name.starts_with(".gcc_except_table") { return ".gcc_except_table"; } - if name.starts_with(".eh_frame") { return ".eh_frame"; } - if name.starts_with(".note.") { return name; } - name -} diff --git a/src/backend/linker_common/symbols.rs b/src/backend/linker_common/symbols.rs deleted file mode 100644 index e6c8fff575..0000000000 --- a/src/backend/linker_common/symbols.rs +++ /dev/null @@ -1,147 +0,0 @@ -//! Shared linker data structures and linker-defined symbol tables. -//! -//! Contains `InputSection`, `OutputSection`, and the `GlobalSymbolOps` trait -//! shared across x86 and ARM 64-bit linkers. Also defines the set of -//! linker-provided symbols and `__start_`/`__stop_` resolution logic. - -use super::types::{Elf64Symbol, DynSymbol}; - -/// Reference to one input section placed within an output section. -pub struct InputSection { - pub object_idx: usize, - pub section_idx: usize, - pub output_offset: u64, - pub size: u64, -} - -/// A merged output section in the final executable or shared library. -pub struct OutputSection { - pub name: String, - pub sh_type: u32, - pub flags: u64, - pub alignment: u64, - pub inputs: Vec, - pub data: Vec, - pub addr: u64, - pub file_offset: u64, - pub mem_size: u64, -} - -/// Trait abstracting over backend-specific GlobalSymbol types. 
/// Trait abstracting over backend-specific GlobalSymbol types.
///
/// Provides the interface needed by shared linker functions: symbol registration,
/// section merging, dynamic symbol matching, and common symbol allocation.
/// Each backend implements this for its own GlobalSymbol struct.
pub trait GlobalSymbolOps: Clone {
    // ── State queries ───────────────────────────────────────────────────
    fn is_defined(&self) -> bool;
    fn is_dynamic(&self) -> bool;
    fn info(&self) -> u8;
    fn section_idx(&self) -> u16;
    fn value(&self) -> u64;
    fn size(&self) -> u64;
    // ── Constructors for the three resolution states of an object symbol ─
    fn new_defined(obj_idx: usize, sym: &Elf64Symbol) -> Self;
    fn new_common(obj_idx: usize, sym: &Elf64Symbol) -> Self;
    fn new_undefined(sym: &Elf64Symbol) -> Self;
    // Rebind a COMMON symbol to its allocated .bss slot during layout.
    fn set_common_bss(&mut self, bss_offset: u64);

    /// Create a GlobalSymbol representing a dynamic symbol resolved from a shared library.
    fn new_dynamic(dsym: &DynSymbol, soname: &str) -> Self;
}

// ── Linker-defined symbols ──────────────────────────────────────────────
//
// These symbols are provided by the linker during layout and should not be
// reported as undefined. The superset covers all architectures (x86, ARM,
// RISC-V, i686). Architecture-specific symbols (e.g., __global_pointer$ for
// RISC-V) are included; having extra entries is harmless.

/// Symbols that the linker defines during layout.
///
/// Used by `is_linker_defined_symbol()` and `resolve_dynamic_symbols_elf64()`
/// to avoid false "undefined symbol" errors and unnecessary shared library lookups.
pub const LINKER_DEFINED_SYMBOLS: &[&str] = &[
    "_GLOBAL_OFFSET_TABLE_",
    "__bss_start", "__bss_start__", "__BSS_END__",
    "_edata", "edata", "_end", "end", "__end", "__end__",
    "_etext", "etext",
    "__ehdr_start", "__executable_start",
    // Note: _start is intentionally excluded -- it comes from crt1.o, not the linker.
    // Suppressing it here would mask missing-CRT errors.
    "__dso_handle", "_DYNAMIC",
    "__data_start", "data_start", "__DATA_BEGIN__",
    "__SDATA_BEGIN__",
    "__init_array_start", "__init_array_end",
    "__fini_array_start", "__fini_array_end",
    "__preinit_array_start", "__preinit_array_end",
    "__rela_iplt_start", "__rela_iplt_end",
    "__rel_iplt_start", "__rel_iplt_end",
    "__global_pointer$", // RISC-V
    "_IO_stdin_used",
    "_init", "_fini",
    "___tls_get_addr", // i686 TLS
    "__tls_get_addr", // x86-64 TLS
    // Exception handling / unwinding (often weak, but may appear undefined)
    "_ITM_registerTMCloneTable", "_ITM_deregisterTMCloneTable",
    "__gcc_personality_v0", "_Unwind_Resume", "_Unwind_ForcedUnwind", "_Unwind_GetCFA",
    "__pthread_initialize_minimal", "_dl_rtld_map",
    "__GNU_EH_FRAME_HDR",
    "__getauxval",
    // Dynamic linker debug interface (provided by ld-linux*.so at runtime)
    "_r_debug", "_dl_debug_state", "_dl_mcount",
];
` / `__stop_
` pattern: when an undefined -/// symbol matches `__start_X` or `__stop_X` where `X` is a valid C identifier, -/// the linker will auto-generate it to point to the start/end of section `X`. -pub fn is_linker_defined_symbol(name: &str) -> bool { - if LINKER_DEFINED_SYMBOLS.contains(&name) { - return true; - } - // Recognize __start_ and __stop_ patterns (GNU ld feature). - // The section name must be a valid C identifier for this to apply. - // Note: GNU ld only resolves these when section X actually exists, but we - // accept the pattern here to suppress "undefined symbol" errors early. - // Actual resolution (in each backend) is guarded by section existence. - if let Some(suffix) = name.strip_prefix("__start_").or_else(|| name.strip_prefix("__stop_")) { - return is_valid_c_identifier_for_section(suffix); - } - false -} - -/// Check if a string is a valid C identifier (used for __start_/__stop_ section pattern). -/// Also used by RISC-V linker which has different section structures. -pub fn is_valid_c_identifier_for_section(s: &str) -> bool { - if s.is_empty() { - return false; - } - let mut chars = s.chars(); - let first = chars.next().unwrap(); - if !first.is_ascii_alphabetic() && first != '_' { - return false; - } - chars.all(|c| c.is_ascii_alphanumeric() || c == '_') -} - -/// Resolve `__start_
` and `__stop_
` symbols against the output sections. -/// -/// GNU ld auto-generates these symbols when there are undefined references to -/// `__start_X` or `__stop_X` where `X` is the name of an existing output section -/// that is also a valid C identifier. `__start_X` gets the address of the section's -/// start, `__stop_X` gets the address of the section's end (start + size). -/// -/// Returns a vector of (name, address) pairs for all resolved symbols. -pub fn resolve_start_stop_symbols( - output_sections: &[OutputSection], -) -> Vec<(String, u64)> { - let mut result = Vec::new(); - for sec in output_sections { - if is_valid_c_identifier_for_section(&sec.name) { - result.push((format!("__start_{}", sec.name), sec.addr)); - result.push((format!("__stop_{}", sec.name), sec.addr + sec.mem_size)); - } - } - result -} diff --git a/src/backend/linker_common/types.rs b/src/backend/linker_common/types.rs deleted file mode 100644 index ae8bebd0ad..0000000000 --- a/src/backend/linker_common/types.rs +++ /dev/null @@ -1,87 +0,0 @@ -//! ELF64 object file types shared across linker backends. -//! -//! These types are used by x86, ARM, and RISC-V linkers. The i686 linker uses -//! its own ELF32 types since field widths differ (u32 vs u64). - -use crate::backend::elf::SHN_UNDEF; -use crate::backend::elf::{STB_GLOBAL, STB_LOCAL, STB_WEAK}; - -/// Parsed ELF64 section header. -#[derive(Debug, Clone)] -#[allow(dead_code)] // All fields populated during parsing; not every backend reads every field -pub struct Elf64Section { - pub name_idx: u32, - pub name: String, - pub sh_type: u32, - pub flags: u64, - pub addr: u64, - pub offset: u64, - pub size: u64, - pub link: u32, - pub info: u32, - pub addralign: u64, - pub entsize: u64, -} - -/// Parsed ELF64 symbol. 
/// Parsed ELF64 symbol.
///
/// Field names mirror Elf64_Sym: `info` packs binding (high nibble) and
/// type (low nibble); `other` carries visibility in its low two bits;
/// `shndx` is the defining section index (SHN_UNDEF for undefined symbols).
#[derive(Debug, Clone)]
#[allow(dead_code)] // All fields populated during parsing; not every backend reads every field
pub struct Elf64Symbol {
    pub name_idx: u32,
    pub name: String,
    pub info: u8,
    pub other: u8,
    pub shndx: u16,
    pub value: u64,
    pub size: u64,
}

#[allow(dead_code)] // Convenience accessors; not all used by every backend yet
impl Elf64Symbol {
    /// STB_* binding: high nibble of st_info.
    pub fn binding(&self) -> u8 { self.info >> 4 }
    /// STT_* type: low nibble of st_info.
    pub fn sym_type(&self) -> u8 { self.info & 0xf }
    /// STV_* visibility: low two bits of st_other.
    pub fn visibility(&self) -> u8 { self.other & 0x3 }
    pub fn is_undefined(&self) -> bool { self.shndx == SHN_UNDEF }
    pub fn is_global(&self) -> bool { self.binding() == STB_GLOBAL }
    pub fn is_weak(&self) -> bool { self.binding() == STB_WEAK }
    pub fn is_local(&self) -> bool { self.binding() == STB_LOCAL }
}

/// Parsed ELF64 relocation with addend (RELA).
///
/// `sym_idx` and `rela_type` are the unpacked halves of r_info
/// (symbol index in the high 32 bits, relocation type in the low 32).
#[derive(Debug, Clone)]
pub struct Elf64Rela {
    pub offset: u64,
    pub sym_idx: u32,
    pub rela_type: u32,
    pub addend: i64,
}

/// Parsed ELF64 object file (.o).
#[derive(Debug)]
pub struct Elf64Object {
    pub sections: Vec<Elf64Section>,
    pub symbols: Vec<Elf64Symbol>,
    // Raw bytes per section; empty for SHT_NOBITS / zero-size sections.
    pub section_data: Vec<Vec<u8>>,
    /// Relocations indexed by the section they apply to.
    pub relocations: Vec<Vec<Elf64Rela>>,
    // Original file name, used to prefix diagnostics.
    pub source_name: String,
}

/// Dynamic symbol from a shared library (.so).
#[derive(Debug, Clone)]
pub struct DynSymbol {
    pub name: String,
    pub info: u8,
    pub value: u64,
    pub size: u64,
    /// GLIBC version string for this symbol (e.g. "GLIBC_2.3"), if any.
    pub version: Option<String>,
    /// Whether this is the default version (@@GLIBC_x.y vs @GLIBC_x.y).
    #[allow(dead_code)] // Populated during .so parsing; used by i686 linker's version preference logic
    pub is_default_ver: bool,
}

#[allow(dead_code)] // Convenience accessor; used by x86/ARM linkers via type alias
impl DynSymbol {
    /// STT_* type: low nibble of st_info.
    pub fn sym_type(&self) -> u8 { self.info & 0xf }
}

//! ELF64 binary emission helpers.
//!
//! Common functions for writing ELF64 section headers, program headers,
//! and performing alignment/padding. Used by x86, RISC-V, and ARM linkers.

/// Write a 64-byte ELF64 section header to the buffer.
///
/// Fields are appended in Elf64_Shdr order, little-endian.
pub fn write_elf64_shdr(
    elf: &mut Vec<u8>, name: u32, sh_type: u32, flags: u64,
    addr: u64, offset: u64, size: u64, link: u32, info: u32,
    align: u64, entsize: u64,
) {
    elf.extend_from_slice(&name.to_le_bytes());
    elf.extend_from_slice(&sh_type.to_le_bytes());
    elf.extend_from_slice(&flags.to_le_bytes());
    elf.extend_from_slice(&addr.to_le_bytes());
    elf.extend_from_slice(&offset.to_le_bytes());
    elf.extend_from_slice(&size.to_le_bytes());
    elf.extend_from_slice(&link.to_le_bytes());
    elf.extend_from_slice(&info.to_le_bytes());
    elf.extend_from_slice(&align.to_le_bytes());
    elf.extend_from_slice(&entsize.to_le_bytes());
}
-pub fn write_elf64_phdr( - elf: &mut Vec, p_type: u32, p_flags: u32, - offset: u64, vaddr: u64, paddr: u64, - filesz: u64, memsz: u64, p_align: u64, -) { - elf.extend_from_slice(&p_type.to_le_bytes()); - elf.extend_from_slice(&p_flags.to_le_bytes()); - elf.extend_from_slice(&offset.to_le_bytes()); - elf.extend_from_slice(&vaddr.to_le_bytes()); - elf.extend_from_slice(&paddr.to_le_bytes()); - elf.extend_from_slice(&filesz.to_le_bytes()); - elf.extend_from_slice(&memsz.to_le_bytes()); - elf.extend_from_slice(&p_align.to_le_bytes()); -} - -/// Write a 56-byte ELF64 program header at a specific offset (for backpatching). -pub fn write_elf64_phdr_at( - elf: &mut [u8], off: usize, p_type: u32, p_flags: u32, - offset: u64, vaddr: u64, paddr: u64, - filesz: u64, memsz: u64, p_align: u64, -) { - elf[off..off+4].copy_from_slice(&p_type.to_le_bytes()); - elf[off+4..off+8].copy_from_slice(&p_flags.to_le_bytes()); - elf[off+8..off+16].copy_from_slice(&offset.to_le_bytes()); - elf[off+16..off+24].copy_from_slice(&vaddr.to_le_bytes()); - elf[off+24..off+32].copy_from_slice(&paddr.to_le_bytes()); - elf[off+32..off+40].copy_from_slice(&filesz.to_le_bytes()); - elf[off+40..off+48].copy_from_slice(&memsz.to_le_bytes()); - elf[off+48..off+56].copy_from_slice(&p_align.to_le_bytes()); -} - -/// Align `val` up to the next multiple of `align` (power-of-two alignment). -pub fn align_up_64(val: u64, align: u64) -> u64 { - if align <= 1 { val } else { (val + align - 1) & !(align - 1) } -} - -/// Extend buffer with zero bytes to reach `target` length. -pub fn pad_to(buf: &mut Vec, target: usize) { - if buf.len() < target { buf.resize(target, 0); } -} diff --git a/src/backend/liveness.rs b/src/backend/liveness.rs deleted file mode 100644 index e2bc818c11..0000000000 --- a/src/backend/liveness.rs +++ /dev/null @@ -1,1211 +0,0 @@ -//! Liveness analysis for IR values. -//! -//! Computes live intervals for each IR value in an IrFunction. A live interval -//! 
represents the range [def_point, last_use_point] where a value is live and -//! needs to be preserved (either in a register or a stack slot). -//! -//! The analysis supports loops via backward dataflow iteration: -//! 1. First, assign sequential program points to all instructions and terminators. -//! 2. Run backward dataflow to compute live-in/live-out sets for each block. -//! This correctly handles values that are live across loop back-edges. -//! 3. Build intervals by taking the union of def/use points and live-through blocks. -//! -//! ## Performance -//! -//! The dataflow uses compact bitsets instead of hash sets for gen/kill/live_in/live_out. -//! Value IDs are remapped to a dense [0..N) range so bitsets are minimal size. -//! This eliminates per-iteration heap allocation and replaces hash-table operations -//! with fast word-level bitwise ops (union = OR, difference = AND-NOT, equality = ==). - -use crate::common::fx_hash::{FxHashMap, FxHashSet}; -use crate::common::types::IrType; -use crate::ir::reexports::{ - Instruction, - IrBinOp, - IrConst, - IrFunction, - Operand, - Terminator, - Value, -}; - -/// A live interval for an IR value: [start, end] in program point numbering. -/// start = the point where the value is defined -/// end = the last point where the value is used -#[derive(Debug, Clone, Copy)] -pub struct LiveInterval { - pub start: u32, - pub end: u32, - pub value_id: u32, -} - -/// Result of liveness analysis: maps value IDs to their live intervals. -pub struct LivenessResult { - pub intervals: Vec, - /// Program points that are Call or CallIndirect instructions. - /// Used by the register allocator to identify values that cross call boundaries. - pub call_points: Vec, - /// Loop nesting depth for each block (block_index -> depth). - /// Depth 0 = not in any loop. Depth 1 = in one loop. Depth 2 = nested, etc. - /// Used by the register allocator to weight uses inside loops more heavily. 
- pub block_loop_depth: Vec, -} - -// ── Compact bitset for dataflow ────────────────────────────────────────────── - -/// A compact bitset stored as a contiguous slice of u64 words. -/// Supports O(1) insert/contains and O(n/64) union/difference/equality. -#[derive(Clone)] -struct BitSet { - words: Vec, -} - -impl BitSet { - /// Create a new empty bitset that can hold indices [0..num_bits). - fn new(num_bits: usize) -> Self { - let num_words = num_bits.div_ceil(64); - Self { words: vec![0u64; num_words] } - } - - #[inline(always)] - fn insert(&mut self, idx: usize) { - let word = idx / 64; - let bit = idx % 64; - self.words[word] |= 1u64 << bit; - } - - #[inline(always)] - fn contains(&self, idx: usize) -> bool { - let word = idx / 64; - let bit = idx % 64; - (self.words[word] >> bit) & 1 != 0 - } - - /// self = self | other. Returns true if self changed. - fn union_with(&mut self, other: &BitSet) -> bool { - let mut changed = false; - for (w, o) in self.words.iter_mut().zip(other.words.iter()) { - let old = *w; - *w |= *o; - changed |= *w != old; - } - changed - } - - /// Computes: self = gen ∪ (out - kill) in one pass. Returns true if self changed. - fn assign_gen_union_out_minus_kill(&mut self, gen: &BitSet, out: &BitSet, kill: &BitSet) -> bool { - let mut changed = false; - for i in 0..self.words.len() { - let new_val = gen.words[i] | (out.words[i] & !kill.words[i]); - if new_val != self.words[i] { - self.words[i] = new_val; - changed = true; - } - } - changed - } - - /// Iterate over all set bits, calling f(bit_index) for each. - fn for_each_set_bit(&self, mut f: impl FnMut(usize)) { - for (word_idx, &word) in self.words.iter().enumerate() { - if word == 0 { continue; } - let base = word_idx * 64; - let mut w = word; - while w != 0 { - let tz = w.trailing_zeros() as usize; - f(base + tz); - w &= w - 1; // clear lowest set bit - } - } - } - - /// Clear all bits. 
- fn clear(&mut self) { - for w in &mut self.words { - *w = 0; - } - } -} - -/// Intermediate state built during Phase 1 (program point assignment and gen/kill). -struct ProgramPointState { - block_start_points: Vec, - block_end_points: Vec, - def_points: Vec, - last_use_points: Vec, - block_gen: Vec, - block_kill: Vec, - block_id_to_idx: FxHashMap, - setjmp_block_indices: Vec, - call_points: Vec, - num_points: u32, -} - -/// Compute live intervals for all non-alloca values in a function. -/// -/// Uses backward dataflow analysis to correctly handle loops: -/// - live_in[B] = gen[B] ∪ (live_out[B] - kill[B]) -/// - live_out[B] = ∪ live_in[S] for all successors S of B -/// -/// Values that are live-in to a block have their interval extended to cover -/// from the block's start point through the entire block. This correctly -/// extends intervals through loop back-edges. -pub fn compute_live_intervals(func: &IrFunction) -> LivenessResult { - let num_blocks = func.blocks.len(); - if num_blocks == 0 { - return LivenessResult { intervals: Vec::new(), call_points: Vec::new(), block_loop_depth: Vec::new() }; - } - - let alloca_set = collect_alloca_set(func); - let (value_ids, id_to_dense) = build_dense_value_map(func, &alloca_set); - - let num_values = value_ids.len(); - if num_values == 0 { - return LivenessResult { intervals: Vec::new(), call_points: Vec::new(), block_loop_depth: Vec::new() }; - } - - // Phase 1: Assign program points and build gen/kill sets. - let mut ps = assign_program_points(func, num_blocks, num_values, &alloca_set, &id_to_dense); - - // Phase 1b: Extend liveness of GEP base values for GEP folding. - extend_gep_base_liveness(func, &alloca_set, &id_to_dense, - &mut ps.last_use_points, &mut ps.block_gen); - - // Phase 1c: Extend liveness for F128 source pointers. - extend_f128_source_liveness(func, &alloca_set, &id_to_dense, - &mut ps.last_use_points, &mut ps.block_gen); - - // Phase 2: Build successor lists for the CFG. 
- let successors = build_successor_lists(func, num_blocks, &ps.block_id_to_idx); - - // Phase 2b: Compute loop nesting depth per block. - let block_loop_depth = compute_loop_depth(&successors, num_blocks); - - // Phase 3: Backward dataflow to compute live-in/live-out per block. - let (live_in, live_out) = run_backward_dataflow( - num_blocks, num_values, &successors, &ps.block_gen, &ps.block_kill, - ); - - // Phase 4: Extend intervals for values that are live-in or live-out of blocks. - extend_intervals_from_liveness( - num_blocks, &live_in, &live_out, - &ps.block_start_points, &ps.block_end_points, - &mut ps.def_points, &mut ps.last_use_points, - ); - - // Phase 4b: Handle setjmp/longjmp. - extend_intervals_for_setjmp( - &ps.setjmp_block_indices, ps.num_points, &live_in, &live_out, - &mut ps.last_use_points, - ); - - // Phase 5: Build and sort intervals. - let intervals = build_intervals(&value_ids, &ps.def_points, &ps.last_use_points); - - LivenessResult { - intervals, - call_points: ps.call_points, - block_loop_depth, - } -} - -/// Collect alloca values (not register-allocatable). -fn collect_alloca_set(func: &IrFunction) -> FxHashSet { - let mut alloca_set: FxHashSet = FxHashSet::default(); - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::Alloca { dest, .. } = inst { - alloca_set.insert(dest.0); - } - } - } - alloca_set -} - -/// Collect all non-alloca value IDs and build a dense remapping: -/// sparse value_id -> dense index in [0..num_values). 
-fn build_dense_value_map(func: &IrFunction, alloca_set: &FxHashSet) -> (Vec, FxHashMap) { - let mut value_ids: Vec = Vec::new(); - let mut seen: FxHashSet = FxHashSet::default(); - - let maybe_add = |id: u32, alloca_set: &FxHashSet, seen: &mut FxHashSet, value_ids: &mut Vec| { - if !alloca_set.contains(&id) && seen.insert(id) { - value_ids.push(id); - } - }; - - for block in &func.blocks { - for inst in &block.instructions { - if let Some(dest) = inst.dest() { - maybe_add(dest.0, alloca_set, &mut seen, &mut value_ids); - } - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { - maybe_add(v.0, alloca_set, &mut seen, &mut value_ids); - } - }); - for_each_value_use_in_instruction(inst, |v| { - maybe_add(v.0, alloca_set, &mut seen, &mut value_ids); - }); - } - for_each_operand_in_terminator(&block.terminator, |op| { - if let Operand::Value(v) = op { - maybe_add(v.0, alloca_set, &mut seen, &mut value_ids); - } - }); - } - - let mut id_to_dense: FxHashMap = FxHashMap::default(); - id_to_dense.reserve(value_ids.len()); - for (dense_idx, &vid) in value_ids.iter().enumerate() { - id_to_dense.insert(vid, dense_idx); - } - - (value_ids, id_to_dense) -} - -/// Phase 1: Assign sequential program points to all instructions/terminators -/// and build per-block gen/kill bitsets, def/use point arrays, call points, -/// and setjmp block indices. 
-fn assign_program_points( - func: &IrFunction, - num_blocks: usize, - num_values: usize, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, -) -> ProgramPointState { - let mut point: u32 = 0; - let mut block_start_points: Vec = Vec::with_capacity(num_blocks); - let mut block_end_points: Vec = Vec::with_capacity(num_blocks); - let mut def_points: Vec = vec![u32::MAX; num_values]; - let mut last_use_points: Vec = vec![u32::MAX; num_values]; - let mut block_gen: Vec = Vec::with_capacity(num_blocks); - let mut block_kill: Vec = Vec::with_capacity(num_blocks); - let mut block_id_to_idx: FxHashMap = FxHashMap::default(); - let mut setjmp_block_indices: Vec = Vec::new(); - let mut call_points: Vec = Vec::new(); - - for (block_idx, block) in func.blocks.iter().enumerate() { - block_id_to_idx.insert(block.label.0, block_idx); - let block_start = point; - block_start_points.push(block_start); - let mut gen = BitSet::new(num_values); - let mut kill = BitSet::new(num_values); - - for inst in &block.instructions { - if is_returns_twice_call(inst) { - setjmp_block_indices.push(block_idx); - } - - // Track call instruction program points for register allocation. - // InlineAsm instructions with register operands are treated as call - // points because they may clobber caller-saved registers (r8-r11 on - // x86). This ensures values whose live ranges span inline asm get - // callee-saved registers (which survive inline asm), while values NOT - // spanning inline asm can safely use caller-saved registers. - // - // Empty inline asm barriers (e.g., `asm volatile("" ::: "memory")`) - // are NOT call points since they don't use any GP registers. These - // are common in the kernel for memory barriers, preempt_disable/enable, - // etc. Treating them as call points would unnecessarily force values - // into callee-saved registers across simple barriers. - match inst { - Instruction::Call { .. } | Instruction::CallIndirect { .. 
} => { - call_points.push(point); - } - Instruction::InlineAsm { outputs, inputs, .. } => { - // Only treat as call point if the asm has register operands - // (outputs or inputs that bind to GP registers). - if !outputs.is_empty() || !inputs.is_empty() { - call_points.push(point); - } - } - // Memcpy uses rep movsb which clobbers rdi, rsi, rcx. - // VaArg/VaCopy/VaArgStruct clobber rdi/rsi for struct copy. - // VaStart is included conservatively for safety. - // Treat these as call points so caller-saved registers (including - // rdi/rsi) are not allocated across them. - Instruction::Memcpy { .. } - | Instruction::VaArg { .. } - | Instruction::VaStart { .. } - | Instruction::VaCopy { .. } - | Instruction::VaArgStruct { .. } => { - call_points.push(point); - } - // i128 div/rem emit implicit calls to __divti3/__udivti3/__modti3/__umodti3. - // These are BinOp instructions at the IR level but generate `call` at the - // assembly level, clobbering all caller-saved registers. We must treat them - // as call points so the register allocator doesn't assign caller-saved - // registers to values whose live ranges span these operations. - Instruction::BinOp { op, ty, .. } - if matches!(ty, IrType::I128 | IrType::U128) - && matches!(op, IrBinOp::SDiv | IrBinOp::UDiv | IrBinOp::SRem | IrBinOp::URem) => - { - call_points.push(point); - } - // i128 <-> float casts emit implicit calls to compiler-rt helpers - // (__floattidf/__fixdfti/etc.). Same reasoning as i128 div/rem above. - Instruction::Cast { from_ty, to_ty, .. 
} - if (matches!(from_ty, IrType::I128 | IrType::U128) && to_ty.is_float()) - || (from_ty.is_float() && matches!(to_ty, IrType::I128 | IrType::U128)) => - { - call_points.push(point); - } - _ => {} - } - - record_instruction_uses_dense(inst, point, alloca_set, id_to_dense, &mut last_use_points); - - // Record InlineAsm output definitions BEFORE gen collection so - // that promoted (non-alloca) outputs are in the kill set and won't - // be treated as upward-exposed uses. - // - // Only kill output values that are first defined here (promoted asm - // outputs). Output values already defined earlier (e.g., pointer - // values passed through for indirect stores like `"=a"(*ptr)`) are - // merely *used* by the InlineAsm — the asm reads the pointer from - // the slot and stores through it, but does not overwrite the pointer - // itself. Killing such values truncates their live interval, letting - // the slot packer reuse their slot too early, which corrupts the - // pointer on the next loop iteration. - if let Instruction::InlineAsm { outputs, .. 
} = inst { - for (_, out_val, _) in outputs { - if !alloca_set.contains(&out_val.0) { - if let Some(&dense) = id_to_dense.get(&out_val.0) { - if def_points[dense] == u32::MAX { - def_points[dense] = point; - kill.insert(dense); - } - } - } - } - } - - collect_instruction_gen_dense(inst, alloca_set, id_to_dense, &kill, &mut gen); - - if let Some(dest) = inst.dest() { - if !alloca_set.contains(&dest.0) { - if let Some(&dense) = id_to_dense.get(&dest.0) { - if def_points[dense] == u32::MAX { - def_points[dense] = point; - } - kill.insert(dense); - } - } - } - - point += 1; - } - - record_terminator_uses_dense(&block.terminator, point, alloca_set, id_to_dense, &mut last_use_points); - collect_terminator_gen_dense(&block.terminator, alloca_set, id_to_dense, &kill, &mut gen); - let block_end = point; - block_end_points.push(block_end); - point += 1; - - block_gen.push(gen); - block_kill.push(kill); - } - - ProgramPointState { - block_start_points, - block_end_points, - def_points, - last_use_points, - block_gen, - block_kill, - block_id_to_idx, - setjmp_block_indices, - call_points, - num_points: point, - } -} - -/// Phase 2: Build successor lists from block terminators and asm goto labels. -fn build_successor_lists( - func: &IrFunction, - num_blocks: usize, - block_id_to_idx: &FxHashMap, -) -> Vec> { - let mut successors: Vec> = vec![Vec::new(); num_blocks]; - for (idx, block) in func.blocks.iter().enumerate() { - for target_id in terminator_targets(&block.terminator) { - if let Some(&target_idx) = block_id_to_idx.get(&target_id) { - successors[idx].push(target_idx); - } - } - // InlineAsm goto_labels are implicit control flow edges. - for inst in &block.instructions { - if let Instruction::InlineAsm { goto_labels, .. 
} = inst { - for (_, label) in goto_labels { - if let Some(&target_idx) = block_id_to_idx.get(&label.0) { - if !successors[idx].contains(&target_idx) { - successors[idx].push(target_idx); - } - } - } - } - } - } - successors -} - -/// Phase 3: Backward dataflow to compute live-in/live-out per block. -/// live_in[B] = gen[B] ∪ (live_out[B] - kill[B]) -/// live_out[B] = ∪ live_in[S] for all successors S of B -fn run_backward_dataflow( - num_blocks: usize, - num_values: usize, - successors: &[Vec], - block_gen: &[BitSet], - block_kill: &[BitSet], -) -> (Vec, Vec) { - let mut live_in: Vec = (0..num_blocks).map(|_| BitSet::new(num_values)).collect(); - let mut live_out: Vec = (0..num_blocks).map(|_| BitSet::new(num_values)).collect(); - let mut tmp_out = BitSet::new(num_values); - - // Iterate until fixpoint (backward order converges faster). - // MAX_ITERATIONS is a safety bound for pathological irreducible control flow. - let mut changed = true; - let mut iteration = 0; - const MAX_ITERATIONS: u32 = 50; - while changed && iteration < MAX_ITERATIONS { - changed = false; - iteration += 1; - - for idx in (0..num_blocks).rev() { - tmp_out.clear(); - for &succ in &successors[idx] { - tmp_out.union_with(&live_in[succ]); - } - - if tmp_out.words != live_out[idx].words { - live_out[idx].words.copy_from_slice(&tmp_out.words); - changed = true; - } - - let in_changed = live_in[idx].assign_gen_union_out_minus_kill( - &block_gen[idx], &live_out[idx], &block_kill[idx] - ); - changed |= in_changed; - } - } - - (live_in, live_out) -} - -/// Phase 4: Extend intervals for values that are live-in or live-out of blocks. -/// A value live-in to a block has its interval cover the entire block. 
-fn extend_intervals_from_liveness( - num_blocks: usize, - live_in: &[BitSet], - live_out: &[BitSet], - block_start_points: &[u32], - block_end_points: &[u32], - def_points: &mut [u32], - last_use_points: &mut [u32], -) { - for idx in 0..num_blocks { - let start = block_start_points[idx]; - let end = block_end_points[idx]; - - live_in[idx].for_each_set_bit(|dense_idx| { - let def_entry = &mut def_points[dense_idx]; - if *def_entry == u32::MAX || start < *def_entry { - *def_entry = start; - } - let entry = &mut last_use_points[dense_idx]; - if *entry == u32::MAX { - *entry = start; - } - if end > *entry { - *entry = end; - } - }); - - live_out[idx].for_each_set_bit(|dense_idx| { - let def_entry = &mut def_points[dense_idx]; - if *def_entry == u32::MAX || start < *def_entry { - *def_entry = start; - } - let entry = &mut last_use_points[dense_idx]; - if *entry == u32::MAX { - *entry = end; - } - if end > *entry { - *entry = end; - } - }); - } -} - -/// Phase 4b: Handle setjmp/longjmp — extend intervals for values live at -/// setjmp call points to the end of the function, preventing slot reuse. -fn extend_intervals_for_setjmp( - setjmp_block_indices: &[usize], - num_points: u32, - live_in: &[BitSet], - live_out: &[BitSet], - last_use_points: &mut [u32], -) { - if setjmp_block_indices.is_empty() { - return; - } - let func_end = num_points.saturating_sub(1); - for &sjb in setjmp_block_indices { - live_in[sjb].for_each_set_bit(|dense_idx| { - let entry = &mut last_use_points[dense_idx]; - if *entry == u32::MAX || func_end > *entry { - *entry = func_end; - } - }); - live_out[sjb].for_each_set_bit(|dense_idx| { - let entry = &mut last_use_points[dense_idx]; - if *entry == u32::MAX || func_end > *entry { - *entry = func_end; - } - }); - } -} - -/// Phase 5: Build sorted live intervals from def/use point arrays. 
-fn build_intervals(value_ids: &[u32], def_points: &[u32], last_use_points: &[u32]) -> Vec { - let mut intervals: Vec = Vec::new(); - for (dense_idx, &vid) in value_ids.iter().enumerate() { - let start = def_points[dense_idx]; - if start == u32::MAX { continue; } - let end = last_use_points[dense_idx]; - let end = if end == u32::MAX { start } else { end.max(start) }; - intervals.push(LiveInterval { start, end, value_id: vid }); - } - intervals.sort_unstable_by_key(|iv| iv.start); - intervals -} - -/// Extend liveness of GEP base values so that their registers remain valid -/// at Load/Store use points where the GEP offset can be folded into the -/// addressing mode. -/// -/// For each GEP `%gep = gep %base, const_offset` whose result is only used -/// as a Load/Store ptr operand: -/// - Find all Load/Store instructions that use %gep as their ptr -/// - Record %base as "used" at those instruction program points -/// - Update the gen bitset for the block containing the Load/Store -/// -/// This ensures the register allocator keeps %base alive through the folded -/// Load/Store, enabling safe `offset(%base_reg)` addressing at codegen time. -fn extend_gep_base_liveness( - func: &IrFunction, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, - last_use_points: &mut [u32], - block_gen: &mut [BitSet], -) { - // Phase A: Identify foldable GEPs with non-alloca bases. - // Same criteria as build_gep_fold_map in generation.rs. - let mut gep_info: FxHashMap = FxHashMap::default(); // gep_dest_id -> (base_id, offset) - - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::GetElementPtr { dest, base, offset: Operand::Const(c), .. 
} = inst { - // Skip alloca bases (already handled by existing fold logic) - if alloca_set.contains(&base.0) { - continue; - } - let offset_val = match c { - IrConst::I64(n) => *n, - IrConst::I32(n) => *n as i64, - IrConst::I16(n) => *n as i64, - IrConst::I8(n) => *n as i64, - _ => continue, - }; - if offset_val >= i32::MIN as i64 && offset_val <= i32::MAX as i64 { - gep_info.insert(dest.0, (base.0, offset_val)); - } - } - } - } - - if gep_info.is_empty() { - return; - } - - // Phase B: Verify each GEP dest is only used as Load/Store ptr operand. - // If used elsewhere, remove from the map (not foldable). - let mut non_foldable: FxHashSet = FxHashSet::default(); - - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::Load { ptr, ty, .. } => { - // Load.ptr is foldable unless i128 - if matches!(ty, IrType::I128 | IrType::U128) - && gep_info.contains_key(&ptr.0) { - non_foldable.insert(ptr.0); - } - } - Instruction::Store { val, ptr, ty, .. } => { - // Store.val is NOT foldable; Store.ptr is (unless i128) - if let Operand::Value(v) = val { - if gep_info.contains_key(&v.0) { - non_foldable.insert(v.0); - } - } - if matches!(ty, IrType::I128 | IrType::U128) - && gep_info.contains_key(&ptr.0) { - non_foldable.insert(ptr.0); - } - } - _ => { - // Any other use invalidates folding - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { - if gep_info.contains_key(&v.0) { - non_foldable.insert(v.0); - } - } - }); - for_each_value_use_in_instruction(inst, |v| { - if gep_info.contains_key(&v.0) { - non_foldable.insert(v.0); - } - }); - } - } - } - for_each_operand_in_terminator(&block.terminator, |op| { - if let Operand::Value(v) = op { - if gep_info.contains_key(&v.0) { - non_foldable.insert(v.0); - } - } - }); - } - - for id in &non_foldable { - gep_info.remove(id); - } - - if gep_info.is_empty() { - return; - } - - // Phase C: Extend base liveness to Load/Store points that use foldable GEP results. 
- let mut block_point: u32 = 0; - for (bi, block) in func.blocks.iter().enumerate() { - for inst in &block.instructions { - match inst { - Instruction::Load { .. } | Instruction::Store { .. } => { - let ptr_id = match inst { - Instruction::Load { ptr, .. } => ptr.0, - Instruction::Store { ptr, .. } => ptr.0, - _ => unreachable!("GEP analysis matched non-Load/Store instruction"), - }; - if let Some(&(base_id, _offset)) = gep_info.get(&ptr_id) { - // Extend base's last_use to this program point - if !alloca_set.contains(&base_id) { - if let Some(&dense) = id_to_dense.get(&base_id) { - let entry = &mut last_use_points[dense]; - if *entry == u32::MAX || block_point > *entry { - *entry = block_point; - } - // Also add to block's gen set (the base is "used" here) - block_gen[bi].insert(dense); - } - } - } - } - _ => {} - } - block_point += 1; - } - // Account for terminator point - block_point += 1; - } -} - -/// Extend liveness for F128 load source pointers. -/// -/// When an F128 value is loaded from memory, the codegen records which pointer -/// was used (via `track_f128_load` in state.rs). Later, during Call emission, -/// `emit_f128_operand_to_a0_a1` reads the pointer back from its stack slot to -/// reload the full 128-bit value. This creates an implicit dependency: the -/// pointer must remain live until the last use of the F128 dest value. -/// -/// Without this extension, the Tier 2 liveness analysis considers the pointer -/// dead after the Load instruction, allowing the register allocator (and -/// subsequently the Tier 3 slot allocator) to reuse its slot. If another value -/// is placed in that slot before the Call, the pointer is corrupted and the -/// Call dereferences garbage (typically causing SIGSEGV). -fn extend_f128_source_liveness( - func: &IrFunction, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, - last_use_points: &mut [u32], - block_gen: &mut [BitSet], -) { - // Collect (ptr_id, dest_id) pairs for F128 loads with non-alloca pointers. 
- let mut f128_loads: Vec<(u32, u32)> = Vec::new(); - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::Load { dest, ptr, ty, .. } = inst { - if *ty == IrType::F128 && !alloca_set.contains(&ptr.0) { - f128_loads.push((ptr.0, dest.0)); - } - } - } - } - - if f128_loads.is_empty() { - return; - } - - // Extend each pointer's last_use_point to match its dest's last_use_point. - for &(ptr_id, dest_id) in &f128_loads { - let dest_dense = id_to_dense.get(&dest_id).copied(); - let ptr_dense = id_to_dense.get(&ptr_id).copied(); - if let (Some(dd), Some(pd)) = (dest_dense, ptr_dense) { - let dest_last = last_use_points[dd]; - if dest_last != u32::MAX { - let ptr_entry = &mut last_use_points[pd]; - if *ptr_entry == u32::MAX || dest_last > *ptr_entry { - *ptr_entry = dest_last; - } - } - } - } - - // Update gen sets so backward dataflow propagation keeps the pointer live - // in predecessor blocks when the dest value is used in a successor block. - for (bi, block) in func.blocks.iter().enumerate() { - for inst in &block.instructions { - let mut check_use = |vid: u32| { - for &(ptr_id, dest_id) in &f128_loads { - if vid == dest_id { - if let Some(&pd) = id_to_dense.get(&ptr_id) { - block_gen[bi].insert(pd); - } - } - } - }; - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { - check_use(v.0); - } - }); - for_each_value_use_in_instruction(inst, |v| { - check_use(v.0); - }); - } - for_each_operand_in_terminator(&block.terminator, |op| { - if let Operand::Value(v) = op { - for &(ptr_id, dest_id) in &f128_loads { - if v.0 == dest_id { - if let Some(&pd) = id_to_dense.get(&ptr_id) { - block_gen[bi].insert(pd); - } - } - } - } - }); - } -} - -/// Record uses of operands in an instruction (dense index version). 
-fn record_instruction_uses_dense( - inst: &Instruction, - point: u32, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, - last_use: &mut [u32], -) { - let mut record = |vid: u32| { - if !alloca_set.contains(&vid) { - if let Some(&dense) = id_to_dense.get(&vid) { - let entry = &mut last_use[dense]; - if *entry == u32::MAX || point > *entry { - *entry = point; - } - } - } - }; - - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { - record(v.0); - } - }); - - for_each_value_use_in_instruction(inst, |v| { - record(v.0); - }); -} - -/// Record uses in a terminator (dense index version). -fn record_terminator_uses_dense( - term: &Terminator, - point: u32, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, - last_use: &mut [u32], -) { - for_each_operand_in_terminator(term, |op| { - if let Operand::Value(v) = op { - if !alloca_set.contains(&v.0) { - if let Some(&dense) = id_to_dense.get(&v.0) { - let entry = &mut last_use[dense]; - if *entry == u32::MAX || point > *entry { - *entry = point; - } - } - } - } - }); -} - -/// Collect gen set for a block's instruction (dense bitset version). -fn collect_instruction_gen_dense( - inst: &Instruction, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, - kill: &BitSet, - gen: &mut BitSet, -) { - let mut add_use = |vid: u32| { - if !alloca_set.contains(&vid) { - if let Some(&dense) = id_to_dense.get(&vid) { - if !kill.contains(dense) { - gen.insert(dense); - } - } - } - }; - - for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { - add_use(v.0); - } - }); - - for_each_value_use_in_instruction(inst, |v| { - add_use(v.0); - }); -} - -/// Collect gen set for a terminator (dense bitset version). 
-fn collect_terminator_gen_dense( - term: &Terminator, - alloca_set: &FxHashSet, - id_to_dense: &FxHashMap, - kill: &BitSet, - gen: &mut BitSet, -) { - for_each_operand_in_terminator(term, |op| { - if let Operand::Value(v) = op { - if !alloca_set.contains(&v.0) { - if let Some(&dense) = id_to_dense.get(&v.0) { - if !kill.contains(dense) { - gen.insert(dense); - } - } - } - } - }); -} - -/// Get successor block IDs from a terminator. -fn terminator_targets(term: &Terminator) -> Vec { - match term { - Terminator::Branch(target) => vec![target.0], - Terminator::CondBranch { true_label, false_label, .. } => { - vec![true_label.0, false_label.0] - } - Terminator::IndirectBranch { possible_targets, .. } => { - possible_targets.iter().map(|t| t.0).collect() - } - Terminator::Switch { cases, default, .. } => { - let mut targets = vec![default.0]; - for (_, label) in cases { - targets.push(label.0); - } - targets - } - _ => vec![], - } -} - -/// Return true if the instruction is a call to setjmp, _setjmp, sigsetjmp, or __sigsetjmp. -/// These functions "return twice": once normally (returning 0) and again when longjmp is called. -/// Values live at the call point must have their intervals extended to prevent stack slot reuse. -fn is_returns_twice_call(inst: &Instruction) -> bool { - if let Instruction::Call { func, .. } = inst { - matches!(func.as_str(), "setjmp" | "_setjmp" | "sigsetjmp" | "__sigsetjmp") - } else { - false - } -} - -/// Iterate over all Operand references in an instruction. -/// This is the single canonical source of truth for instruction operand traversal. -/// All code that needs to enumerate operands (liveness, use-counting, GEP fold -/// verification) should call this rather than hand-rolling its own match. -pub(super) fn for_each_operand_in_instruction(inst: &Instruction, mut f: impl FnMut(&Operand)) { - match inst { - Instruction::Alloca { .. } => {} - Instruction::DynAlloca { size, .. } => f(size), - Instruction::Store { val, .. 
} => f(val), - Instruction::Load { .. } => {} - Instruction::BinOp { lhs, rhs, .. } => { f(lhs); f(rhs); } - Instruction::UnaryOp { src, .. } => f(src), - Instruction::Cmp { lhs, rhs, .. } => { f(lhs); f(rhs); } - Instruction::Call { info, .. } => { for a in &info.args { f(a); } } - Instruction::CallIndirect { func_ptr, info } => { f(func_ptr); for a in &info.args { f(a); } } - Instruction::GetElementPtr { offset, .. } => f(offset), - Instruction::Cast { src, .. } => f(src), - Instruction::Copy { src, .. } => f(src), - Instruction::GlobalAddr { .. } => {} - Instruction::Memcpy { .. } => {} - Instruction::VaArg { .. } => {} - Instruction::VaStart { .. } => {} - Instruction::VaEnd { .. } => {} - Instruction::VaCopy { .. } => {} - Instruction::VaArgStruct { .. } => {} - Instruction::AtomicRmw { ptr, val, .. } => { f(ptr); f(val); } - Instruction::AtomicCmpxchg { ptr, expected, desired, .. } => { f(ptr); f(expected); f(desired); } - Instruction::AtomicLoad { ptr, .. } => f(ptr), - Instruction::AtomicStore { ptr, val, .. } => { f(ptr); f(val); } - Instruction::Fence { .. } => {} - Instruction::Phi { incoming, .. } => { for (op, _) in incoming { f(op); } } - Instruction::LabelAddr { .. } => {} - Instruction::GetReturnF64Second { .. } => {} - Instruction::SetReturnF64Second { src } => f(src), - Instruction::GetReturnF32Second { .. } => {} - Instruction::SetReturnF32Second { src } => f(src), - Instruction::GetReturnF128Second { .. } => {}, - Instruction::SetReturnF128Second { src } => f(src), - Instruction::InlineAsm { inputs, .. } => { - for (_, op, _) in inputs { f(op); } - } - Instruction::Intrinsic { args, .. } => { for a in args { f(a); } } - Instruction::Select { cond, true_val, false_val, .. } => { f(cond); f(true_val); f(false_val); } - Instruction::StackSave { .. } => {} - Instruction::StackRestore { .. } => {} - Instruction::ParamRef { .. } => {} - } -} - -/// Iterate over Value references (non-Operand) used in an instruction. 
-/// These are pointer/base values used directly (not wrapped in Operand), -/// e.g., the `ptr` in Store/Load, `base` in GEP, `dest`/`src` in Memcpy. -/// Canonical traversal — shared by liveness, use-counting, and GEP fold analysis. -pub(super) fn for_each_value_use_in_instruction(inst: &Instruction, mut f: impl FnMut(&Value)) { - match inst { - Instruction::Store { ptr, .. } => f(ptr), - Instruction::Load { ptr, .. } => f(ptr), - Instruction::GetElementPtr { base, .. } => f(base), - Instruction::Memcpy { dest, src, .. } => { f(dest); f(src); } - Instruction::VaArg { va_list_ptr, .. } => f(va_list_ptr), - Instruction::VaStart { va_list_ptr } => f(va_list_ptr), - Instruction::VaEnd { va_list_ptr } => f(va_list_ptr), - Instruction::VaCopy { dest_ptr, src_ptr } => { f(dest_ptr); f(src_ptr); } - Instruction::VaArgStruct { dest_ptr, va_list_ptr, .. } => { f(dest_ptr); f(va_list_ptr); } - Instruction::InlineAsm { outputs, .. } => { - for (_, v, _) in outputs { f(v); } - } - Instruction::Intrinsic { dest_ptr: Some(dp), .. } => { - f(dp); - } - Instruction::StackRestore { ptr } => f(ptr), - _ => {} - } -} - -/// Iterate over all Operand references in a terminator. -/// Canonical traversal — shared by liveness, use-counting, and GEP fold analysis. -pub(super) fn for_each_operand_in_terminator(term: &Terminator, mut f: impl FnMut(&Operand)) { - match term { - Terminator::Return(Some(op)) => f(op), - Terminator::CondBranch { cond, .. } => f(cond), - Terminator::IndirectBranch { target, .. } => f(target), - Terminator::Switch { val, .. } => f(val), - _ => {} - } -} - -/// Compute the loop nesting depth for each block in the CFG. -/// -/// Uses DFS-based back-edge detection: an edge src -> dst where dst is an -/// ancestor in the DFS tree is a back edge defining a natural loop. For each -/// back edge (src -> header), all blocks on any path from header to src form -/// the loop body. The depth of a block is the number of loop bodies it belongs to. 
-/// -/// This is used by the register allocator to weight uses inside loops more -/// heavily, so that inner-loop temporaries get priority for register allocation. -fn compute_loop_depth(successors: &[Vec], num_blocks: usize) -> Vec { - if num_blocks == 0 { - return Vec::new(); - } - - let mut depth = vec![0u32; num_blocks]; - - // Build predecessor lists from successor lists. - let mut predecessors: Vec> = vec![Vec::new(); num_blocks]; - for (src, succs) in successors.iter().enumerate() { - for &dst in succs { - if dst < num_blocks { - predecessors[dst].push(src); - } - } - } - - // DFS to classify edges. An edge src -> dst is a back edge if dst is an - // ancestor of src in the DFS tree (i.e., dst was visited but not finished). - // State: 0 = unvisited, 1 = in-progress (on stack), 2 = finished. - let mut state = vec![0u8; num_blocks]; - let mut back_edges: Vec<(usize, usize)> = Vec::new(); // (src, header) - - // Iterative DFS to avoid stack overflow on deeply nested CFGs. - let mut stack: Vec<(usize, usize)> = Vec::new(); // (block, successor_index) - state[0] = 1; // Mark entry block as in-progress - stack.push((0, 0)); - - while let Some(&mut (block, ref mut succ_idx)) = stack.last_mut() { - if *succ_idx < successors[block].len() { - let next = successors[block][*succ_idx]; - *succ_idx += 1; - if next < num_blocks { - match state[next] { - 0 => { - // Unvisited: push to stack - state[next] = 1; - stack.push((next, 0)); - } - 1 => { - // Back edge: next is an ancestor (in-progress) - back_edges.push((block, next)); - } - _ => { - // Cross or forward edge: ignore - } - } - } - } else { - // All successors processed: mark as finished - state[block] = 2; - stack.pop(); - } - } - - // For each back edge (src -> header), find the natural loop body. - // The loop body consists of all blocks that can reach `src` without going - // through `header`, plus `header` itself. 
We compute this by a reverse - // BFS/DFS from `src` following predecessor edges, stopping at `header`. - for &(tail, header) in &back_edges { - // All blocks in the loop body get +1 depth - depth[header] += 1; - if tail != header { - // BFS backwards from tail, stopping at header - let mut worklist = vec![tail]; - let mut visited = vec![false; num_blocks]; - visited[header] = true; // Don't go past header - visited[tail] = true; - depth[tail] += 1; - - while let Some(b) = worklist.pop() { - for &pred in &predecessors[b] { - if pred < num_blocks && !visited[pred] { - visited[pred] = true; - depth[pred] += 1; - worklist.push(pred); - } - } - } - } - } - - depth -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::common::types::IrType; - use crate::ir::reexports::{BasicBlock, BlockId, IrBinOp}; - - /// Verify that InlineAsm with register operands is treated as a call point. - /// This is critical for register allocation: values spanning inline asm with - /// register constraints must get callee-saved registers, since inline asm may - /// clobber caller-saved registers (r8-r11 on x86). 
- #[test] - fn test_inline_asm_with_operands_is_call_point() { - let mut func = IrFunction::new("test".to_string(), IrType::I32, vec![], false); - func.blocks.push(BasicBlock { - label: BlockId(0), - instructions: vec![ - Instruction::BinOp { - dest: Value(0), op: IrBinOp::Add, - lhs: Operand::Const(IrConst::I32(1)), - rhs: Operand::Const(IrConst::I32(2)), - ty: IrType::I32, - }, - // Inline asm with an output register constraint - Instruction::InlineAsm { - template: "nop".to_string(), - outputs: vec![("=r".to_string(), Value(1), Some("out".to_string()))], - inputs: vec![], - clobbers: vec![], - operand_types: vec![IrType::I32], - goto_labels: vec![], - input_symbols: vec![], - seg_overrides: vec![], - }, - ], - terminator: Terminator::Return(Some(Operand::Value(Value(0)))), - source_spans: Vec::new(), - }); - func.next_value_id = 2; - - let result = compute_live_intervals(&func); - // The InlineAsm instruction should appear as a call point - assert!(!result.call_points.is_empty(), - "InlineAsm with register operands should be a call point"); - } - - /// Verify that empty inline asm barriers (no inputs/outputs) are NOT call points. - /// Memory barriers like `asm volatile("" ::: "memory")` don't use GP registers - /// and should not force values into callee-saved registers. 
- #[test] - fn test_empty_inline_asm_barrier_not_call_point() { - let mut func = IrFunction::new("test".to_string(), IrType::I32, vec![], false); - func.blocks.push(BasicBlock { - label: BlockId(0), - instructions: vec![ - Instruction::BinOp { - dest: Value(0), op: IrBinOp::Add, - lhs: Operand::Const(IrConst::I32(1)), - rhs: Operand::Const(IrConst::I32(2)), - ty: IrType::I32, - }, - // Empty inline asm barrier - no outputs or inputs - Instruction::InlineAsm { - template: String::new(), - outputs: vec![], - inputs: vec![], - clobbers: vec!["memory".to_string()], - operand_types: vec![], - goto_labels: vec![], - input_symbols: vec![], - seg_overrides: vec![], - }, - ], - terminator: Terminator::Return(Some(Operand::Value(Value(0)))), - source_spans: Vec::new(), - }); - func.next_value_id = 1; - - let result = compute_live_intervals(&func); - // Call points should only contain the calls, not the empty barrier - assert!(result.call_points.is_empty(), - "Empty inline asm barriers should NOT be call points"); - } -} diff --git a/src/backend/mod.rs b/src/backend/mod.rs deleted file mode 100644 index de4eb30f4b..0000000000 --- a/src/backend/mod.rs +++ /dev/null @@ -1,387 +0,0 @@ -pub(crate) mod asm_expr; // Shared assembly expression evaluator (arithmetic, bitwise, parens) -pub(crate) mod asm_preprocess; // Shared GAS preprocessing: comments, macros, rept, conditionals -pub(crate) mod common; -#[allow(dead_code)] // Defines ELF standard constants/helpers; not all used by every backend -pub(crate) mod elf; -pub(crate) mod elf_writer_common; // Shared x86/i686 assembler ELF writer -#[cfg_attr(feature = "gcc_linker", allow(dead_code))] // Built-in linker code unused when gcc handles linking -pub(crate) mod linker_common; -pub(crate) mod peephole_common; // Shared peephole optimizer utilities (word matching, LineStore) - -// Shared codegen framework, split into focused modules: -pub(crate) mod state; // CodegenState, StackSlot, SlotAddr -pub(crate) mod traits; // ArchCodegen 
trait with default implementations -pub(crate) mod generation; // Module/function/instruction dispatch -pub(crate) mod stack_layout; // Stack layout: slot assignment, alloca coalescing, regalloc helpers -pub(crate) mod call_abi; // Unified ABI classification: call args + callee params, stack computation -pub(crate) mod cast; // Cast and float operation classification -pub(crate) mod f128_softfloat; // Shared F128 soft-float orchestration (ARM + RISC-V) -pub(crate) mod inline_asm; // InlineAsmEmitter trait and shared framework -pub(crate) mod x86_common; // Shared x86/i686 register names, condition codes, asm template parsing - -// Register allocation and liveness analysis -pub(crate) mod liveness; // Live interval computation -pub(crate) mod regalloc; // Linear scan register allocator - - -pub(crate) mod x86; -pub(crate) mod i686; -pub(crate) mod arm; -pub(crate) mod riscv; - -use crate::ir::reexports::IrModule; - -/// Options that control code generation, parsed from CLI flags. -#[derive(Debug, Clone, Default)] -pub(crate) struct CodegenOptions { - /// Whether to generate position-independent code (-fPIC/-fpic) - pub(crate) pic: bool, - /// Whether to replace `ret` with `jmp __x86_return_thunk` (-mfunction-return=thunk-extern) - pub(crate) function_return_thunk: bool, - /// Whether to replace indirect calls/jumps with retpoline thunks (-mindirect-branch=thunk-extern) - pub(crate) indirect_branch_thunk: bool, - /// Patchable function entry: (total_nops, nops_before_entry). - /// -fpatchable-function-entry=N[,M] emits NOP padding around function entry points - /// and records them in __patchable_function_entries for runtime patching (ftrace). - pub(crate) patchable_function_entry: Option<(u32, u32)>, - /// Whether to emit endbr64 at function entry points (-fcf-protection=branch). - /// Required for Intel CET/IBT (Indirect Branch Tracking). - pub(crate) cf_protection_branch: bool, - /// Whether SSE is disabled (-mno-sse). 
When true, the x86 codegen avoids - /// SSE/XMM instructions in variadic prologues (XMM register saving) and - /// va_start sets fp_offset to overflow so va_arg never uses XMM regs. - /// TODO: Full -mno-sse support would also need to avoid SSE in float - /// operations, casts, and other FP codegen paths. Currently only the - /// variadic ABI path is gated, which is sufficient for the Linux kernel. - pub(crate) no_sse: bool, - /// Whether to use only general-purpose registers (-mgeneral-regs-only). - /// On AArch64, this prevents FP/SIMD register usage in variadic function - /// prologues (no q0-q7 saves) and sets __vr_offs=0 in va_start. - /// The Linux kernel uses this to avoid touching NEON/FP state. - /// TODO: Full -mgeneral-regs-only support would also need to avoid NEON/FP in - /// popcount, byte-swap, float casts, and other FP codegen paths. Currently only - /// the variadic ABI path is gated, which is sufficient for the Linux kernel - /// (kernel code doesn't use floats or popcount builtins in hot paths). - pub(crate) general_regs_only: bool, - /// Whether to use the kernel code model (-mcmodel=kernel). All symbols - /// are assumed to be in the negative 2GB of the virtual address space. - /// Uses absolute sign-extended 32-bit addressing (movq $symbol) for - /// global address references, producing R_X86_64_32S relocations. - pub(crate) code_model_kernel: bool, - /// Whether to disable jump table emission for switch statements (-fno-jump-tables). - /// When true, all switch statements use compare-and-branch chains instead of - /// indirect jumps through a jump table. Required by the Linux kernel when building - /// with retpoline (-mindirect-branch=thunk-extern) to avoid indirect jumps that - /// objtool would reject. - pub(crate) no_jump_tables: bool, - /// Whether to suppress linker relaxation (-mno-relax, RISC-V only). 
- /// When true, the codegen emits `.option norelax` at the top of the - /// assembly output, which prevents the GNU assembler from generating - /// R_RISCV_RELAX relocation entries. This is required for the Linux - /// kernel's EFI stub, which uses -fpic -mno-relax to ensure no - /// absolute symbol references are introduced by linker relaxation. - pub(crate) no_relax: bool, - /// Whether to emit debug info (.file/.loc directives) when compiling with -g. - /// When true, the codegen emits DWARF line number directives based on - /// source_spans attached to each IR instruction during lowering. - pub(crate) debug_info: bool, - /// Whether to place each function in its own ELF section (-ffunction-sections). - /// When true, each function is emitted into `.text.funcname` instead of `.text`. - /// This enables the linker's `--gc-sections` to discard unreferenced functions. - pub(crate) function_sections: bool, - /// Whether to place each data object in its own ELF section (-fdata-sections). - /// When true, each global variable is emitted into its own section - /// (e.g., `.data.varname`, `.rodata.varname`, `.bss.varname`). - /// This enables the linker's `--gc-sections` to discard unreferenced data. - pub(crate) data_sections: bool, - /// Whether to prepend `.code16gcc` to the assembly output (-m16). - /// When true, the GNU assembler treats the 32-bit instructions as code - /// that will run in 16-bit real mode, adding operand/address-size override - /// prefixes as needed. Used by the Linux kernel boot code. - pub(crate) code16gcc: bool, - /// Number of integer arguments passed in registers (i686 only, -mregparm=N). - /// 0 = standard cdecl (all args on stack), 1-3 = pass first N integer args - /// in EAX, EDX, ECX respectively. Used by the Linux kernel boot code - /// (-mregparm=3) to reduce code size in 16-bit real mode. - pub(crate) regparm: u8, - /// Whether to omit the frame pointer (-fomit-frame-pointer). 
- /// When true, functions do not set up EBP as a frame pointer, freeing it - /// as a general register and saving prologue/epilogue instructions. - /// Used by the Linux kernel boot code to reduce code size. - pub(crate) omit_frame_pointer: bool, - /// Whether to emit CFI directives (.cfi_startproc, .cfi_endproc, etc.) - /// for generating .eh_frame unwind tables. Enabled by default (like GCC). - /// Disabled by -fno-asynchronous-unwind-tables or -fno-unwind-tables. - /// Many programs (LuaJIT, libunwind users) require .eh_frame for exception - /// handling and stack unwinding. - pub(crate) emit_cfi: bool, -} - -/// Target architecture. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Target { - X86_64, - I686, - Aarch64, - Riscv64, -} - -impl Target { - /// Return the GCC-style target triple for this architecture. - /// Used by configure scripts (via -dumpmachine) to detect the target. - pub fn triple(&self) -> &'static str { - match self { - Target::X86_64 => "x86_64-linux-gnu", - Target::I686 => "i686-linux-gnu", - Target::Aarch64 => "aarch64-linux-gnu", - Target::Riscv64 => "riscv64-linux-gnu", - } - } - - /// Return the dynamic linker path for this target. - pub(crate) fn dynamic_linker(&self) -> &'static str { - match self { - Target::X86_64 => "/lib64/ld-linux-x86-64.so.2", - Target::I686 => "/lib/ld-linux.so.2", - Target::Aarch64 => "/lib/ld-linux-aarch64.so.1", - Target::Riscv64 => "/lib/ld-linux-riscv64-lp64d.so.1", - } - } - - /// Return the implicit library search directories for this target. - /// This is used by the driver to emit `LIBRARY_PATH=...` during verbose - /// linking, which CMake parses to discover implicit link directories - /// (needed for `find_library()` to locate libraries like libm in - /// multiarch paths like /usr/lib/x86_64-linux-gnu/). 
- pub(crate) fn implicit_library_paths(&self) -> String { - let triple = self.triple(); - - // GCC lib base paths and versions to probe - let gcc_bases: &[&str] = match self { - Target::X86_64 => &[ - "/usr/lib/gcc/x86_64-linux-gnu", - "/usr/lib/gcc/x86_64-redhat-linux", - "/usr/lib64/gcc/x86_64-linux-gnu", - ], - Target::I686 => &[ - "/usr/lib/gcc-cross/i686-linux-gnu", - "/usr/lib/gcc/i686-linux-gnu", - "/usr/lib/gcc/i386-linux-gnu", - ], - Target::Aarch64 => &[ - "/usr/lib/gcc-cross/aarch64-linux-gnu", - "/usr/lib/gcc/aarch64-linux-gnu", - ], - Target::Riscv64 => &[ - "/usr/lib/gcc-cross/riscv64-linux-gnu", - "/usr/lib/gcc/riscv64-linux-gnu", - ], - }; - let gcc_versions: &[&str] = &["14", "13", "12", "11", "10", "9", "8", "7"]; - - let mut paths: Vec = Vec::new(); - - // Find GCC lib dir (contains crtbegin.o) - 'outer: for base in gcc_bases { - for ver in gcc_versions { - let dir = format!("{}/{}", base, ver); - if std::path::Path::new(&format!("{}/crtbegin.o", dir)).exists() { - paths.push(dir); - break 'outer; - } - } - } - - // Multiarch lib dirs - let lib_dir = format!("/usr/lib/{}", triple); - if std::path::Path::new(&lib_dir).exists() { - paths.push(lib_dir); - } - let lib_alt = format!("/lib/{}", triple); - if std::path::Path::new(&lib_alt).exists() { - paths.push(lib_alt); - } - - // Cross-compiler lib dirs - let cross_lib = format!("/usr/{}/lib", triple); - if std::path::Path::new(&cross_lib).exists() { - paths.push(cross_lib); - } - - // Generic fallback dirs - for dir in &["/usr/lib", "/lib"] { - if std::path::Path::new(dir).exists() { - paths.push(dir.to_string()); - } - } - - paths.join(":") - } - - /// Whether this target uses 32-bit pointers (ILP32 data model). - pub(crate) fn is_32bit(&self) -> bool { - matches!(self, Target::I686) - } - - /// Pointer size in bytes for this target. - pub(crate) fn ptr_size(&self) -> usize { - if self.is_32bit() { 4 } else { 8 } - } - - /// Get the assembler config for this target. 
- /// Only used when the `gcc_assembler` feature is enabled for GCC fallback. - #[cfg_attr(not(feature = "gcc_assembler"), allow(dead_code))] - pub(crate) fn assembler_config(&self) -> common::AssemblerConfig { - match self { - Target::X86_64 => common::AssemblerConfig { - command: "gcc", - extra_args: &[], - }, - Target::I686 => common::AssemblerConfig { - command: "i686-linux-gnu-gcc", - extra_args: &["-m32"], - }, - Target::Aarch64 => common::AssemblerConfig { - command: "aarch64-linux-gnu-gcc", - extra_args: &["-march=armv8-a+crc+crypto"], - }, - Target::Riscv64 => common::AssemblerConfig { - command: "riscv64-linux-gnu-gcc", - extra_args: &["-march=rv64gc", "-mabi=lp64d"], - }, - } - } - - /// Get the linker config for this target. - pub(crate) fn linker_config(&self) -> common::LinkerConfig { - // ELF e_machine constants (from elf.h): - // EM_386 = 3, EM_AARCH64 = 183, EM_X86_64 = 62, EM_RISCV = 243 - match self { - Target::X86_64 => common::LinkerConfig { - command: "gcc", - extra_args: &["-no-pie"], - expected_elf_machine: 62, // EM_X86_64 - arch_name: "x86-64", - }, - Target::I686 => common::LinkerConfig { - command: "i686-linux-gnu-gcc", - extra_args: &["-m32", "-no-pie"], - expected_elf_machine: 3, // EM_386 - arch_name: "i686", - }, - Target::Aarch64 => common::LinkerConfig { - command: "aarch64-linux-gnu-gcc", - // Use -no-pie to match non-PIC code generation. The previous - // default of -static prevented dlopen() of shared libraries - // at runtime, breaking postgres extension loading. The unit - // test harness passes -static explicitly for QEMU user-mode. - extra_args: &["-no-pie"], - expected_elf_machine: 183, // EM_AARCH64 - arch_name: "aarch64", - }, - Target::Riscv64 => common::LinkerConfig { - command: "riscv64-linux-gnu-gcc", - extra_args: &["-no-pie"], - expected_elf_machine: 243, // EM_RISCV - arch_name: "riscv64", - }, - } - } - - /// Generate assembly with full codegen options and optional source manager for debug info. 
- /// When `source_mgr` is provided and `opts.debug_info` is true, the codegen emits - /// .file/.loc directives for DWARF line number information. - pub(crate) fn generate_assembly_with_opts_and_debug( - &self, - module: &IrModule, - opts: &CodegenOptions, - source_mgr: Option<&crate::common::source::SourceManager>, - ) -> String { - match self { - Target::X86_64 => { - let mut cg = x86::X86Codegen::new(); - cg.apply_options(opts); - cg.state.function_sections = opts.function_sections; - cg.state.data_sections = opts.data_sections; - let raw = generation::generate_module_with_debug(&mut cg, module, opts.debug_info, source_mgr); - x86::codegen::peephole::peephole_optimize(raw) - } - Target::I686 => { - let mut cg = i686::I686Codegen::new(); - cg.apply_options(opts); - cg.state.function_sections = opts.function_sections; - cg.state.data_sections = opts.data_sections; - let raw = generation::generate_module_with_debug(&mut cg, module, opts.debug_info, source_mgr); - let optimized = i686::codegen::peephole::peephole_optimize(raw); - if opts.code16gcc { - format!(".code16gcc\n{}", optimized) - } else { - optimized - } - } - Target::Aarch64 => { - let mut cg = arm::ArmCodegen::new(); - cg.apply_options(opts); - cg.state.function_sections = opts.function_sections; - cg.state.data_sections = opts.data_sections; - let raw = generation::generate_module_with_debug(&mut cg, module, opts.debug_info, source_mgr); - arm::codegen::peephole::peephole_optimize(raw) - } - Target::Riscv64 => { - let mut cg = riscv::RiscvCodegen::new(); - cg.apply_options(opts); - cg.state.function_sections = opts.function_sections; - cg.state.data_sections = opts.data_sections; - cg.emit_pre_directives(); - let raw = generation::generate_module_with_debug(&mut cg, module, opts.debug_info, source_mgr); - riscv::codegen::peephole::peephole_optimize(raw) - } - } - } - - /// Assemble text to object file with dynamic extra arguments. - /// Used to pass through -mabi= and -march= flags from the CLI. 
- /// - /// When the `gcc_assembler` Cargo feature is enabled, uses GCC for assembling - /// (with a warning). When disabled (default), uses the built-in assembler. - pub(crate) fn assemble_with_extra(&self, asm_text: &str, output_path: &str, extra_args: &[String]) -> Result<(), String> { - // When gcc_assembler feature is enabled, use GCC for assembling - #[cfg(feature = "gcc_assembler")] - { - common::assemble_with_extra(&self.assembler_config(), asm_text, output_path, extra_args) - } - - // Default (gcc_assembler disabled): use the built-in assembler - #[cfg(not(feature = "gcc_assembler"))] - { - // Handle -Wa,--version: print GNU-compatible version string for - // kernel build system's as-version.sh probe. - if extra_args.iter().any(|a| a == "--version") { - println!("GNU assembler (Claude's C Compiler built-in) 2.42"); - return Ok(()); - } - - match self { - Target::Aarch64 => arm::assembler::assemble(asm_text, output_path), - Target::X86_64 => x86::assembler::assemble(asm_text, output_path), - Target::Riscv64 => riscv::assembler::assemble_with_args(asm_text, output_path, extra_args), - Target::I686 => i686::assembler::assemble(asm_text, output_path), - } - } - } - - /// Link object files into executable. - pub(crate) fn link(&self, object_files: &[&str], output_path: &str) -> Result<(), String> { - self.link_with_args(object_files, output_path, &[]) - } - - /// Link object files with additional user-provided linker args. - /// - /// By default, uses the built-in native linker for all architectures. - /// When the `gcc_linker` Cargo feature is enabled, GCC can be used as a - /// fallback for operations the built-in linker doesn't support (e.g., - /// -shared, -r). 
- pub(crate) fn link_with_args(&self, object_files: &[&str], output_path: &str, user_args: &[String]) -> Result<(), String> { - common::link_with_args(&self.linker_config(), object_files, output_path, user_args) - } -} diff --git a/src/backend/peephole_common.rs b/src/backend/peephole_common.rs deleted file mode 100644 index 086dcc701e..0000000000 --- a/src/backend/peephole_common.rs +++ /dev/null @@ -1,217 +0,0 @@ -//! Shared peephole optimizer utilities used by multiple backends. -//! -//! Several peephole passes across ARM, RISC-V, x86, and i686 perform the same -//! text-level operations on assembly lines: whole-word matching, register -//! replacement in source operands, and a compact line store that avoids -//! per-line `String` allocation. This module extracts those shared building -//! blocks so each backend can focus on arch-specific patterns. - -// ── Word-boundary matching helpers ─────────────────────────────────────── -// -// Assembly register names like "x1" must not match inside "x11", "x10", or -// symbol names like "main.x1.0". These functions treat alphanumeric, `.`, -// and `_` as word characters (common in ELF symbol names and GAS labels). - -/// Returns `true` if `b` is a "word character" for register/symbol matching: -/// ASCII alphanumeric, `.`, or `_`. -#[inline] -pub(crate) fn is_ident_char(b: u8) -> bool { - b.is_ascii_alphanumeric() || b == b'.' || b == b'_' -} - -/// Replace every whole-word occurrence of `old` with `new` in `text`. -/// -/// A word boundary is a position where the adjacent character is not an -/// identifier char. This prevents "x1" from matching inside "x11" or -/// "main.x1.0". 
-pub(crate) fn replace_whole_word(text: &str, old: &str, new: &str) -> String { - let bytes = text.as_bytes(); - let old_bytes = old.as_bytes(); - let old_len = old_bytes.len(); - let text_len = bytes.len(); - let mut result = String::with_capacity(text.len()); - let mut i = 0; - - while i < text_len { - if i + old_len <= text_len && &bytes[i..i + old_len] == old_bytes { - let before_ok = i == 0 || !is_ident_char(bytes[i - 1]); - let after_ok = i + old_len >= text_len || !is_ident_char(bytes[i + old_len]); - if before_ok && after_ok { - result.push_str(new); - i += old_len; - continue; - } - } - result.push(bytes[i] as char); - i += 1; - } - result -} - -/// Returns `true` if `text` contains `word` at a word boundary. -pub(crate) fn has_whole_word(text: &str, word: &str) -> bool { - let bytes = text.as_bytes(); - let word_bytes = word.as_bytes(); - let word_len = word_bytes.len(); - let text_len = bytes.len(); - - let mut i = 0; - while i + word_len <= text_len { - if &bytes[i..i + word_len] == word_bytes { - let before_ok = i == 0 || !is_ident_char(bytes[i - 1]); - let after_ok = i + word_len >= text_len || !is_ident_char(bytes[i + word_len]); - if before_ok && after_ok { - return true; - } - } - i += 1; - } - false -} - -/// Replace a register name in the *source* operand positions of an instruction. -/// -/// Given an assembly line like `" add x0, x1, x2"`, this replaces `old_reg` -/// with `new_reg` only in the operands *after* the first comma (i.e. the -/// source operands, not the destination). Returns `None` if no replacement -/// was made. Preserves leading whitespace. 
-pub(crate) fn replace_source_reg_in_instruction( - line: &str, - old_reg: &str, - new_reg: &str, -) -> Option { - let trimmed = line.trim(); - - // Find the first space to separate mnemonic from operands - let space_pos = trimmed.find(' ')?; - let args_start = space_pos + 1; - let args = &trimmed[args_start..]; - - // Find first comma -- everything after it is source operands - let comma_pos = args.find(',')?; - let after_first_arg = &args[comma_pos..]; - - // Only replace in the source part (after the first comma) - let new_suffix = replace_whole_word(after_first_arg, old_reg, new_reg); - if new_suffix == after_first_arg { - return None; - } - - // Build the new line - let prefix = &trimmed[..args_start + comma_pos]; - let new_trimmed = format!("{}{}", prefix, new_suffix); - - // Preserve leading whitespace - let leading = line.len() - line.trim_start().len(); - let leading_ws = &line[..leading]; - Some(format!("{}{}", leading_ws, new_trimmed)) -} - -// ── LineStore: compact assembly line storage ───────────────────────────── -// -// During peephole optimization we repeatedly access and occasionally replace -// individual assembly lines. Storing each line as its own `String` is -// wasteful (24 bytes overhead per line on 64-bit). `LineStore` keeps the -// original text as a single `String` and records byte-offset ranges for each -// line. Replaced lines go into a small side buffer. This typically reduces -// memory traffic significantly for large functions. - -/// Compact (start, len) entry for one line. 8 bytes vs. 24 for String. -/// When `len == u32::MAX`, the line has been replaced and `start` is the -/// index into the `replacements` vector. -#[derive(Clone, Copy)] -pub(crate) struct LineEntry { - start: u32, - len: u32, -} - -/// Compact storage for assembly lines that avoids per-line allocation. -pub(crate) struct LineStore { - /// The original assembly text (kept alive for the duration of optimization). - original: String, - /// One entry per line. 
- entries: Vec, - /// Side buffer for lines that have been replaced by optimization passes. - replacements: Vec, -} - -impl LineStore { - /// Build a `LineStore` from an assembly string. - pub(crate) fn new(asm: String) -> Self { - let bytes = asm.as_bytes(); - let total_len = bytes.len(); - let estimated_lines = total_len / 20 + 1; - let mut entries = Vec::with_capacity(estimated_lines); - - let mut start = 0usize; - let mut i = 0; - while i < total_len { - if bytes[i] == b'\n' { - entries.push(LineEntry { - start: start as u32, - len: (i - start) as u32, - }); - start = i + 1; - } - i += 1; - } - // Handle last line (no trailing newline) - if start <= total_len { - let remaining = total_len - start; - if remaining > 0 || entries.is_empty() { - entries.push(LineEntry { - start: start as u32, - len: remaining as u32, - }); - } - } - - LineStore { - original: asm, - entries, - replacements: Vec::new(), - } - } - - /// Get the text of line `idx`. - #[inline] - pub(crate) fn get(&self, idx: usize) -> &str { - let e = &self.entries[idx]; - if e.len == u32::MAX { - &self.replacements[e.start as usize] - } else { - let start = e.start as usize; - let end = start + e.len as usize; - &self.original[start..end] - } - } - - /// Number of lines. - #[inline] - pub(crate) fn len(&self) -> usize { - self.entries.len() - } - - /// Replace a line with new text. - pub(crate) fn replace(&mut self, idx: usize, new_text: String) { - let rep_idx = self.replacements.len(); - self.replacements.push(new_text); - self.entries[idx] = LineEntry { - start: rep_idx as u32, - len: u32::MAX, - }; - } - - /// Build the final output string, skipping lines where `skip(i)` returns true. 
- pub(crate) fn build_result(&self, skip: impl Fn(usize) -> bool) -> String { - let mut result = String::with_capacity(self.original.len()); - for i in 0..self.entries.len() { - if !skip(i) { - result.push_str(self.get(i)); - result.push('\n'); - } - } - result - } -} - diff --git a/src/backend/regalloc.rs b/src/backend/regalloc.rs deleted file mode 100644 index 574e1e8564..0000000000 --- a/src/backend/regalloc.rs +++ /dev/null @@ -1,573 +0,0 @@ -//! Linear scan register allocator. -//! -//! Assigns physical registers to IR values based on their live intervals. -//! Values with the longest live ranges and most uses get priority for register -//! assignment. Values that don't fit in available registers remain on the stack. -//! -//! Three-phase allocation: -//! 1. **Callee-saved registers** (x86: rbx, r12-r15; ARM: x20-x28; RISC-V: s1, s7-s11): -//! Assigned to values whose live ranges span function calls. These registers -//! are preserved across calls by the ABI, so no save/restore is needed at call -//! sites (but prologue/epilogue must save them). -//! -//! 2. **Caller-saved registers** (x86: r11, r10, r8, r9; ARM: x13, x14): -//! Assigned to values whose live ranges do NOT span any function call. These -//! registers are destroyed by calls, so they can only hold values between calls. -//! No prologue/epilogue save/restore is needed since we never assign them to -//! values that cross call boundaries. -//! -//! 3. **Callee-saved spillover**: After phases 1 and 2, any remaining callee-saved -//! registers are assigned to the highest-priority non-call-spanning values that -//! didn't fit in the caller-saved pool. This is critical for call-free hot loops -//! (e.g., hash functions, matrix multiply, sorting) where all values compete for -//! only a few caller-saved registers. The one-time prologue/epilogue save/restore -//! cost is amortized over many loop iterations. 
- -use crate::common::fx_hash::{FxHashMap, FxHashSet}; -use crate::common::types::IrType; -use crate::ir::reexports::{ - Instruction, - IrConst, - IrFunction, - Operand, -}; -use super::liveness::{LiveInterval, LivenessResult, compute_live_intervals, for_each_operand_in_instruction, for_each_operand_in_terminator}; - -/// A physical register assignment. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct PhysReg(pub u8); - -/// Result of register allocation for a function. -pub struct RegAllocResult { - /// Map from value ID -> assigned physical register. - pub assignments: FxHashMap, - /// Set of physical registers actually used (for prologue/epilogue save/restore). - pub used_regs: Vec, - /// The liveness analysis computed during register allocation, if any. - /// Cached here so that calculate_stack_space_common can reuse it for - /// Tier 2 liveness-based stack slot packing, avoiding a redundant - /// O(blocks * values * iterations) dataflow computation. - /// None when no registers were available (empty available_regs). - pub liveness: Option, -} - -/// Configuration for the register allocator. -pub struct RegAllocConfig { - /// Available callee-saved registers for allocation (e.g., s1-s11 for RISC-V). - pub available_regs: Vec, - /// Available caller-saved registers for allocation. - /// These are assigned to values whose live ranges do NOT span any call. - /// Since they don't cross calls, no prologue/epilogue save/restore is needed. - /// Examples: x86 r11, r10, r8, r9. - pub caller_saved_regs: Vec, - /// Whether to allow inline asm operands to be register-allocated. - /// Only enable this when the backend's asm emitter checks reg_assignments - /// before falling back to stack access. Currently only RISC-V does this. - pub allow_inline_asm_regalloc: bool, -} - -/// Run the linear scan register allocator on a function. -/// -/// Strategy: We assign callee-saved registers to values with the longest -/// live intervals. 
This is a simplified linear scan that doesn't split -/// intervals — values either get a register for their entire lifetime or -/// remain on the stack. -/// -/// We avoid allocating registers to: -/// - Alloca values (they represent stack addresses) -/// - i128/float values (they need special register paths) -/// - Values used only once right after definition (no benefit from register) -pub fn allocate_registers( - func: &IrFunction, - config: &RegAllocConfig, -) -> RegAllocResult { - if config.available_regs.is_empty() && config.caller_saved_regs.is_empty() { - return RegAllocResult { - assignments: FxHashMap::default(), - used_regs: Vec::new(), - liveness: None, - }; - } - - // Note: Register allocation is now enabled for functions with atomics. - // Atomic operations in all backends (x86, ARM, RISC-V) access their operands - // exclusively through regalloc-aware helpers (operand_to_rax/x0/t0 and - // store_rax_to/x0_to/t0_to), so register-allocated values work correctly. - // The atomic pointer operands are individually excluded from register - // allocation eligibility below since they need stable stack addresses - // for the memory access instructions. - - // On 32-bit targets, I64/U64 values need two registers (eax:edx) and cannot - // be allocated to a single callee-saved register. Exclude them from eligibility. - let is_32bit = crate::common::types::target_is_32bit(); - - // Liveness analysis now uses backward dataflow iteration to correctly - // handle loops (values live across back-edges have their intervals extended). - let liveness = compute_live_intervals(func); - - // Count uses per value for prioritization, weighted by loop depth. - // - // Uses inside loops are weighted more heavily because they execute more - // frequently. A use inside a loop at depth D contributes 10^D to the - // weighted use count (so a use in a singly-nested loop counts 10x, doubly- - // nested counts 100x, etc.). 
This ensures inner-loop temporaries get - // priority for register allocation over values in straight-line code, - // which is critical for performance in compute-heavy loops like zlib's - // deflate_slow, longest_match, and slide_hash. - let mut use_count: FxHashMap = FxHashMap::default(); - - // Precompute per-block loop weight: 10^depth, capped to avoid overflow. - let block_loop_weight: Vec = liveness.block_loop_depth.iter() - .map(|&d| { - match d { - 0 => 1, - 1 => 10, - 2 => 100, - 3 => 1000, - _ => 10_000, // cap at 10K for very deep nesting - } - }) - .collect(); - - // Collect values whose types don't fit in a single GPR. - let non_gpr_values = collect_non_gpr_values(func, is_32bit); - - // Helper closure to check if a type is unsuitable for GPR allocation - let is_non_gpr_type = |ty: &IrType| -> bool { - ty.is_float() || ty.is_long_double() - || matches!(ty, IrType::I128 | IrType::U128) - || (is_32bit && matches!(ty, IrType::I64 | IrType::U64)) - }; - - // Use a whitelist approach: only allocate registers for values produced - // by simple, well-understood instructions that store results via the - // standard accumulator path (e.g., store_rax_to on x86, store_t0_to on RISC-V). - let mut eligible: FxHashSet = FxHashSet::default(); - - for (block_idx, block) in func.blocks.iter().enumerate() { - // Get the loop weight for this block (default 1 if no loop info available). - let weight: u64 = if block_idx < block_loop_weight.len() { - block_loop_weight[block_idx] - } else { - 1 - }; - - for inst in &block.instructions { - // Values eligible for register allocation: those stored via the - // standard accumulator path (store_rax_to on x86, store_t0_to on RISC-V). - // Exclude float and i128 types since they use different register paths. - match inst { - Instruction::BinOp { dest, ty, .. } - | Instruction::UnaryOp { dest, ty, .. } => { - if !is_non_gpr_type(ty) { - eligible.insert(dest.0); - } - } - Instruction::Cmp { dest, .. 
} => { - eligible.insert(dest.0); - } - Instruction::Cast { dest, to_ty, from_ty, .. } => { - if !is_non_gpr_type(to_ty) && !is_non_gpr_type(from_ty) { - eligible.insert(dest.0); - } - } - Instruction::Load { dest, ty, .. } => { - if !is_non_gpr_type(ty) { - eligible.insert(dest.0); - } - } - Instruction::GetElementPtr { dest, .. } => { - eligible.insert(dest.0); - } - Instruction::Copy { dest, src: _ } => { - // Copy instructions are eligible unless the source produces a - // non-GPR value (float, i128, or i64 on 32-bit). We check both - // constant types and propagated non-GPR status from Value sources. - if !non_gpr_values.contains(&dest.0) { - eligible.insert(dest.0); - } - } - // Call results are eligible for callee-saved register allocation. - // The result arrives in the accumulator (rax on x86, x0 on ARM, a0 on - // RISC-V), and emit_call_store_result calls emit_store_result which - // uses store_rax_to/store_t0_to — both of which are register-aware - // and will emit a reg-to-reg move (e.g., movq %rax, %rbx) instead of - // a stack spill. - Instruction::Call { info, .. } - | Instruction::CallIndirect { info, .. } => { - if let Some(dest) = info.dest { - if !is_non_gpr_type(&info.return_type) { - eligible.insert(dest.0); - } - } - } - Instruction::Select { dest, ty, .. } => { - if !is_non_gpr_type(ty) { - eligible.insert(dest.0); - } - } - Instruction::GlobalAddr { dest, .. } | Instruction::LabelAddr { dest, .. } => { - eligible.insert(dest.0); - } - // Atomic operations store their results via store_rax_to/store_t0_to. - Instruction::AtomicLoad { dest, ty, .. } - | Instruction::AtomicRmw { dest, ty, .. } - | Instruction::AtomicCmpxchg { dest, ty, .. } => { - if !is_non_gpr_type(ty) { - eligible.insert(dest.0); - } - } - Instruction::ParamRef { dest, ty, .. } => { - if !is_non_gpr_type(ty) { - eligible.insert(dest.0); - } - } - _ => {} - } - - // Count uses of operands, weighted by loop depth of the containing block. 
- for_each_operand_in_instruction(inst, |op| { - if let Operand::Value(v) = op { - *use_count.entry(v.0).or_insert(0) += weight; - } - }); - } - for_each_operand_in_terminator(&block.terminator, |op| { - if let Operand::Value(v) = op { - *use_count.entry(v.0).or_insert(0) += weight; - } - }); - } - - // Exclude values used as pointers in instructions whose codegen paths use - // resolve_slot_addr() directly (not register-aware). - remove_ineligible_operands(func, &mut eligible, config); - - let call_points = &liveness.call_points; - - // Phase 1: Callee-saved registers for call-spanning values. - let candidates = build_sorted_candidates( - &liveness, &eligible, &FxHashMap::default(), call_points, &use_count, Some(true), - ); - - let num_regs = config.available_regs.len(); - let mut reg_free_until: Vec = vec![0; num_regs]; - let mut assignments: FxHashMap = FxHashMap::default(); - let mut used_regs_set: FxHashSet = FxHashSet::default(); - - for interval in &candidates { - if let Some(reg_idx) = find_best_callee_reg(®_free_until, interval.start, &config.available_regs, &used_regs_set) { - reg_free_until[reg_idx] = interval.end + 1; - assignments.insert(interval.value_id, config.available_regs[reg_idx]); - used_regs_set.insert(config.available_regs[reg_idx].0); - } - } - - let mut used_regs: Vec = used_regs_set.iter().map(|&r| PhysReg(r)).collect(); - used_regs.sort_by_key(|r| r.0); - - // Phase 2: Caller-saved registers for non-call-spanning values. 
- if !config.caller_saved_regs.is_empty() { - let caller_candidates = build_sorted_candidates( - &liveness, &eligible, &assignments, call_points, &use_count, Some(false), - ); - - let num_caller_regs = config.caller_saved_regs.len(); - let mut caller_free_until: Vec = vec![0; num_caller_regs]; - - for interval in &caller_candidates { - let mut best: Option = None; - let mut best_free_time: u32 = u32::MAX; - - for (i, &free_until) in caller_free_until.iter().enumerate() { - if free_until <= interval.start - && (best.is_none() || free_until < best_free_time) { - best = Some(i); - best_free_time = free_until; - } - } - - if let Some(reg_idx) = best { - caller_free_until[reg_idx] = interval.end + 1; - assignments.insert(interval.value_id, config.caller_saved_regs[reg_idx]); - } - } - } - - // Phase 3: Callee-saved spillover for non-call-spanning values. - // - // After Phases 1 and 2, there may be high-priority values in call-free loops - // that didn't get a register because the caller-saved pool overflowed. Assign - // remaining callee-saved registers to these overflow values. - { - let spillover_candidates = build_sorted_candidates( - &liveness, &eligible, &assignments, call_points, &use_count, Some(false), - ); - - for interval in &spillover_candidates { - if let Some(reg_idx) = find_best_callee_reg(®_free_until, interval.start, &config.available_regs, &used_regs_set) { - reg_free_until[reg_idx] = interval.end + 1; - assignments.insert(interval.value_id, config.available_regs[reg_idx]); - used_regs_set.insert(config.available_regs[reg_idx].0); - } - } - - used_regs = used_regs_set.iter().map(|&r| PhysReg(r)).collect(); - used_regs.sort_by_key(|r| r.0); - } - - RegAllocResult { - assignments, - used_regs, - liveness: Some(liveness), - } -} - -/// Collect values whose types don't fit in a single GPR (floats, i128, and -/// on 32-bit targets: i64/u64). Copy instructions that chain from these -/// values must also be excluded via fixpoint propagation. 
-fn collect_non_gpr_values(func: &IrFunction, is_32bit: bool) -> FxHashSet { - let is_non_gpr_type = |ty: &IrType| -> bool { - ty.is_float() || ty.is_long_double() - || matches!(ty, IrType::I128 | IrType::U128) - || (is_32bit && matches!(ty, IrType::I64 | IrType::U64)) - }; - - let mut non_gpr_values: FxHashSet = FxHashSet::default(); - - // First pass: collect non-GPR values from typed instructions - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::BinOp { dest, ty, .. } - | Instruction::UnaryOp { dest, ty, .. } => { - if is_non_gpr_type(ty) { - non_gpr_values.insert(dest.0); - } - } - Instruction::Cast { dest, to_ty, from_ty, .. } => { - if is_non_gpr_type(to_ty) || is_non_gpr_type(from_ty) { - non_gpr_values.insert(dest.0); - } - } - Instruction::Load { dest, ty, .. } => { - if is_non_gpr_type(ty) { - non_gpr_values.insert(dest.0); - } - } - Instruction::Call { info, .. } - | Instruction::CallIndirect { info, .. } => { - if let Some(dest) = info.dest { - if is_non_gpr_type(&info.return_type) { - non_gpr_values.insert(dest.0); - } - } - } - Instruction::Select { dest, ty, .. } => { - if is_non_gpr_type(ty) { - non_gpr_values.insert(dest.0); - } - } - Instruction::AtomicLoad { dest, ty, .. } - | Instruction::AtomicRmw { dest, ty, .. } - | Instruction::AtomicCmpxchg { dest, ty, .. } => { - if is_non_gpr_type(ty) { - non_gpr_values.insert(dest.0); - } - } - _ => {} - } - } - } - - // Propagate non-GPR status through Copy chains: if a Copy's source is a - // non-GPR value, the dest is also non-GPR. Iterate until fixpoint since - // Copies can chain (Copy a->b, Copy b->c). 
- loop { - let mut changed = false; - for block in &func.blocks { - for inst in &block.instructions { - if let Instruction::Copy { dest, src } = inst { - if non_gpr_values.contains(&dest.0) { - continue; - } - let src_is_non_gpr = match src { - Operand::Value(v) => non_gpr_values.contains(&v.0), - Operand::Const(IrConst::F32(_)) | Operand::Const(IrConst::F64(_)) - | Operand::Const(IrConst::LongDouble(..)) - | Operand::Const(IrConst::I128(_)) => true, - Operand::Const(IrConst::I64(_)) if is_32bit => true, - _ => false, - }; - if src_is_non_gpr { - non_gpr_values.insert(dest.0); - changed = true; - } - } - } - } - if !changed { - break; - } - } - - non_gpr_values -} - -/// Remove values from the eligible set that are used as operands in instructions -/// whose codegen paths use resolve_slot_addr() directly (not register-aware). -/// This includes CallIndirect func pointers, Memcpy pointers, va_arg pointers, -/// atomic pointers, StackRestore, and InlineAsm operands. -fn remove_ineligible_operands(func: &IrFunction, eligible: &mut FxHashSet, config: &RegAllocConfig) { - for block in &func.blocks { - for inst in &block.instructions { - match inst { - Instruction::CallIndirect { func_ptr: Operand::Value(v), .. } => { - eligible.remove(&v.0); - } - Instruction::Memcpy { dest, src, .. } => { - eligible.remove(&dest.0); - eligible.remove(&src.0); - } - Instruction::VaArg { va_list_ptr, .. } => { - eligible.remove(&va_list_ptr.0); - } - Instruction::VaStart { va_list_ptr } => { - eligible.remove(&va_list_ptr.0); - } - Instruction::VaEnd { va_list_ptr } => { - eligible.remove(&va_list_ptr.0); - } - Instruction::VaCopy { dest_ptr, src_ptr } => { - eligible.remove(&dest_ptr.0); - eligible.remove(&src_ptr.0); - } - Instruction::VaArgStruct { dest_ptr, va_list_ptr, .. } => { - eligible.remove(&dest_ptr.0); - eligible.remove(&va_list_ptr.0); - } - Instruction::AtomicRmw { ptr: Operand::Value(v), .. 
} => { - eligible.remove(&v.0); - } - Instruction::AtomicCmpxchg { ptr: Operand::Value(v), .. } => { - eligible.remove(&v.0); - } - Instruction::AtomicLoad { ptr: Operand::Value(v), .. } => { - eligible.remove(&v.0); - } - Instruction::AtomicStore { ptr: Operand::Value(v), .. } => { - eligible.remove(&v.0); - } - Instruction::StackRestore { ptr } => { - eligible.remove(&ptr.0); - } - Instruction::InlineAsm { outputs, inputs, .. } => { - if !config.allow_inline_asm_regalloc { - // Inline asm operands are accessed via stack slots - // in codegen. Exclude them from register allocation - // unless the backend's asm emitter checks reg_assignments. - for (_, val, _) in outputs { - eligible.remove(&val.0); - } - for (_, op, _) in inputs { - if let Operand::Value(v) = op { - eligible.remove(&v.0); - } - } - } - // When allow_inline_asm_regalloc is true (RISC-V), the - // asm emitter checks reg_assignments before falling back - // to stack slot access. - } - _ => {} - } - } - } -} - -/// Check whether a live interval spans any function call point. -/// Uses binary search since call_points is sorted by program point. -fn spans_any_call(iv: &LiveInterval, call_points: &[u32]) -> bool { - let start_idx = call_points.partition_point(|&cp| cp < iv.start); - start_idx < call_points.len() && call_points[start_idx] <= iv.end -} - -/// Build a sorted list of allocation candidates from live intervals. -/// -/// Filters by eligibility, minimum span length, and call-spanning behavior: -/// - `spans_call == Some(true)`: only intervals that span a call -/// - `spans_call == Some(false)`: only intervals that do NOT span a call -/// - `spans_call == None`: all eligible intervals -/// -/// Results are sorted by weighted use count (descending), with interval length -/// as tiebreaker. 
-fn build_sorted_candidates<'a>( - liveness: &'a LivenessResult, - eligible: &FxHashSet, - already_assigned: &FxHashMap, - call_points: &[u32], - use_count: &FxHashMap, - spans_call: Option, -) -> Vec<&'a LiveInterval> { - let mut candidates: Vec<&LiveInterval> = liveness.intervals.iter() - .filter(|iv| eligible.contains(&iv.value_id)) - .filter(|iv| !already_assigned.contains_key(&iv.value_id)) - .filter(|iv| iv.end > iv.start) - .filter(|iv| match spans_call { - Some(true) => spans_any_call(iv, call_points), - Some(false) => !spans_any_call(iv, call_points), - None => true, - }) - .collect(); - - candidates.sort_by(|a, b| { - let score_a = use_count.get(&a.value_id).copied().unwrap_or(1); - let score_b = use_count.get(&b.value_id).copied().unwrap_or(1); - score_b.cmp(&score_a) - .then_with(|| { - let len_a = (a.end - a.start) as u64; - let len_b = (b.end - b.start) as u64; - len_b.cmp(&len_a) - }) - }); - - candidates -} - -/// Find the best callee-saved register for an interval, preferring registers -/// that are already in use (to minimize prologue/epilogue save/restore cost). -/// -/// Returns the index into `available_regs` of the chosen register, or None -/// if no register is free at the interval's start point. -fn find_best_callee_reg( - reg_free_until: &[u32], - interval_start: u32, - available_regs: &[PhysReg], - used_regs_set: &FxHashSet, -) -> Option { - let mut best_already_used: Option = None; - let mut best_already_used_free_time: u32 = u32::MAX; - let mut best_new: Option = None; - let mut best_new_free_time: u32 = u32::MAX; - - for (i, &free_until) in reg_free_until.iter().enumerate() { - if free_until <= interval_start { - let reg_id = available_regs[i].0; - if used_regs_set.contains(®_id) { - // Already saved/restored — reusing costs nothing extra. 
- if best_already_used.is_none() || free_until < best_already_used_free_time { - best_already_used = Some(i); - best_already_used_free_time = free_until; - } - } else { - // Would introduce a new callee-saved register. - if best_new.is_none() || free_until < best_new_free_time { - best_new = Some(i); - best_new_free_time = free_until; - } - } - } - } - - best_already_used.or(best_new) -} diff --git a/src/backend/riscv/README.md b/src/backend/riscv/README.md deleted file mode 100644 index 09afe18af1..0000000000 --- a/src/backend/riscv/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# RISC-V 64-bit Backend - -The RISC-V backend targets RV64GC (RV64IMAFDC) with the LP64D calling -convention. It covers the full pipeline from IR to ELF executable: code -generation (instruction selection, register allocation, peephole optimization), -a builtin assembler (RV assembly syntax parser, instruction encoder, ELF object -writer), and a builtin linker (static and dynamic linking, shared library -output, TLS support). 
- -## Directory Structure - -``` -riscv/ - codegen/ Code generation and peephole optimizer - assembler/ Builtin RV64 assembler (parser, encoder, ELF writer) - linker/ Builtin RV64 linker (static/dynamic linking, TLS) -``` - -## Sub-Module Documentation - -| Module | README | -|--------|--------| -| Code generation | [`codegen/README.md`](codegen/README.md) | -| Assembler | [`assembler/README.md`](assembler/README.md) | -| Linker | [`linker/README.md`](linker/README.md) | - -## Key Characteristics - -- **ABI**: LP64D -- 8 GP argument registers, 8 FP argument registers, - hardware float struct classification, I128/F128 in GP register pairs -- **Accumulator model**: Values flow through `t0`; up to 11 callee-saved - registers (`s1`, `s2`-`s11`) available for allocation -- **F128 (long double)**: IEEE binary128 via soft-float library calls through - GP register pairs (`a0:a1`) -- **Software SIMD**: SSE-equivalent 128-bit vector operations emulated with - scalar RISC-V instructions -- **Software builtins**: CLZ, CTZ, BSWAP, POPCOUNT implemented in software - (no Zbb extension dependency) -- **Atomics**: AMO instructions for word/doubleword; LR/SC with bit masking - for sub-word atomics -- **Assembler**: RV assembly syntax, all RV64IMAFDC instructions, macro and - conditional preprocessor -- **Linker**: Static and dynamic linking, shared library (`.so`) output, - PLT/GOT generation, TLS (LE, IE, GD→LE relaxation for static binaries) diff --git a/src/backend/riscv/asm_stub.sh b/src/backend/riscv/asm_stub.sh deleted file mode 100755 index d72b3d979a..0000000000 --- a/src/backend/riscv/asm_stub.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -# Placeholder assembler for RISC-V 64 backend. -# Set MY_ASM to point to this script to test the custom assembler integration. -# TODO: Replace this stub with a real assembler implementation. -echo "ERROR: RISC-V 64 custom assembler stub called but not yet implemented." 
>&2 -echo "Arguments: $@" >&2 -exit 1 diff --git a/src/backend/riscv/assembler/README.md b/src/backend/riscv/assembler/README.md deleted file mode 100644 index 395950f1ce..0000000000 --- a/src/backend/riscv/assembler/README.md +++ /dev/null @@ -1,525 +0,0 @@ -# RISC-V 64-bit Assembler -- Design Document - -## Overview - -This module implements a complete, self-contained assembler for the RISC-V 64-bit -ISA (RV64GC). It translates textual assembly source -- as emitted by the compiler's -code-generation backend -- into relocatable ELF object files (`.o`). The assembler -is designed to be invoked in-process (no fork/exec of an external tool), which -removes a hard dependency on a host `as` binary and dramatically improves -compilation latency on cross-compilation setups. - -The assembler supports the full RV64I base integer ISA, the M (multiply/divide), -A (atomics), F (single-precision float), D (double-precision float), -C (compressed 16-bit), Zbb (basic bit manipulation), and V (vector) standard extensions, -plus Zvksh/Zvksed vector crypto. It handles all standard assembler directives, -pseudo-instructions, relocation modifiers, numeric local labels, macro expansion, -and conditional assembly. - -### Capabilities at a glance - -- Full RV64IMAFDCV + Zbb + Zvksh/Zvksed instruction encoding -- 40+ pseudo-instructions (li, la, call, tail, mv, not, negw, seqz, ...) -- All standard assembler directives (.text, .data, .globl, .align, .byte, ...) -- Preprocessor: `.macro/.endm`, `.rept/.irp/.endr`, `.if/.else/.endif` -- Relocation modifier parsing (%pcrel_hi, %pcrel_lo, %hi, %lo, %tprel_*, %got_pcrel_hi, ...) 
-- RV64C compression pass (currently disabled; linker relaxation preferred) -- Numeric local labels (1:, 1b, 1f) with forward/backward reference resolution -- Correct ELF object file emission with .symtab, .strtab, .rela.* sections - -## Architecture / Pipeline - -``` - Assembly source (text) - | - v - +----------------------------+ - | Preprocessor | - | (asm_preprocess.rs, | - | shared across backends) | - | - Strip C-style comments | - | - Expand .macro/.endm | - | - Expand .rept/.irp/.endr | - +----------------------------+ - | - v - +----------------------------+ - | Parser (parser.rs) | - | - Tokenize lines | - | - Parse operands | - | - Recognize directives | - | - Evaluate .if/.else/.endif| - +----------------------------+ - | - Vec - | - v - +----------------------------+ - | Encoder (encoder/) | - | - Map mnemonic to ISA | - | - Encode R/I/S/B/U/J | - | - Expand pseudos | - | - Emit relocations | - +----------------------------+ - | - EncodeResult + Relocation - | - v - +----------------------------+ - | ELF Writer (elf_writer.rs) | - | - Section management | - | - Label/symbol tracking | - | - Directive execution | - | - Branch reloc resolution | - | - ELF64 serialization | - +----------------------------+ - | - v - Relocatable ELF .o file -``` - -Note: A compression pass (`compress.rs`) exists and can rewrite eligible 32-bit -instructions to 16-bit RVC equivalents, but it is currently disabled because -the linker handles relaxation via `R_RISCV_RELAX` and running our own -compression would conflict with the linker's relaxation pass. 
- -## File Inventory - -| File | Lines | Role | -|-----------------|--------|---------------------------------------------------------| -| `mod.rs` | ~100 | Public `assemble_with_args()` entry point; orchestrates parser → ELF writer pipeline, handles `-mabi=` flag for float ABI selection and `-march=` for RV32/RV64 and RVC detection | -| `parser.rs` | ~1060 | Line tokenizer and operand parser; splits assembly text into `AsmStatement` records, evaluates `.if/.else/.endif` conditionals | -| `encoder/` | ~2670 | Instruction encoder (split into focused submodules, see below) | -| `compress.rs` | ~850 | Post-encoding RV64C compression pass; rewrites eligible 32-bit instructions to 16-bit compressed equivalents (currently disabled) | -| `elf_writer.rs` | ~1410 | ELF object file builder; composes with `ElfWriterBase` (from shared `elf` module) for section/symbol management, adds RISC-V-specific pcrel_hi/lo pairing, branch resolution, numeric label handling, and ELF serialization | - -### Encoder Submodules (`encoder/`) - -The instruction encoder is organized as a directory of focused submodules: - -| File | Lines | Role | -|------|-------|------| -| `mod.rs` | ~926 | `EncodeResult`/`RelocType`/`Relocation` types, register encoding (`encode_reg`), format encoders (`encode_r/i/s/b/u/j`), opcode constants, `encode_instruction()` dispatch | -| `base.rs` | ~320 | RV64I base integer instructions (R/I/S/B/U/J-type) plus Zbb bit-manipulation extension | -| `atomics.rs` | ~106 | A-extension: LR/SC (load-reserved/store-conditional), AMO (atomic memory operations) | -| `system.rs` | ~115 | System instructions: ECALL, EBREAK, FENCE, FENCE.I, CSR read/write/set/clear | -| `float.rs` | ~191 | F/D floating-point extensions: arithmetic, comparisons, conversions, FMA, sign injection, classify, load/store | -| `pseudo.rs` | ~608 | Pseudo-instruction expansion: LI (large immediates), LA/LLA, CALL/TAIL, branch aliases, CSR shorthands, FP move/abs/neg, relocation modifier parsing | -| 
`compressed.rs` | ~196 | RVC compressed 16-bit instructions and `.insn` raw instruction directive | -| `vector.rs` | ~210 | RVV vector extension and Zvksh/Zvksed vector crypto instructions | - -## Key Data Structures - -### `ElfWriter` (elf_writer.rs) - -The ELF writer composes with `ElfWriterBase` (from the shared `elf` module) for -common infrastructure and adds RISC-V-specific logic. - -``` -ElfWriter { - pub base: ElfWriterBase, // Shared: sections, labels, symbols, directives - pending_branch_relocs: Vec, // unresolved local branches - pcrel_hi_counter: u32, // counter for synthetic .Lpcrel_hi labels - numeric_labels: HashMap>, // "1" -> [(sec, off), ...] - deferred_exprs: Vec, // forward label data expressions (.word sym - .) - elf_flags: u32, // ELF e_flags (default: RVC | FLOAT_ABI_DOUBLE) - elf_class: u8, // ELFCLASS64 (default) or ELFCLASS32 for RV32 - no_relax: bool, // suppress R_RISCV_RELAX (via .option norelax) -} -``` - -The `ElfWriterBase` (defined in `elf/writer_base.rs`) holds all shared state: -`current_section`, `sections` (as `HashMap`), -`section_order`, `labels`, `global_symbols`, `weak_symbols`, `symbol_types`, -`symbol_sizes`, `symbol_visibility`, `aliases`, and section push/pop stacks. -It provides shared methods for section management, directive processing, data -emission, and ELF serialization. It is also used by the ARM assembler. - -### `ObjSection` (elf/object_writer.rs) - -Represents a single ELF section being built (shared across ARM and RISC-V). - -``` -ObjSection { - name: String, // section name - data: Vec, // accumulated bytes - sh_type: u32, // SHT_PROGBITS, SHT_NOBITS, ... - sh_flags: u64, // SHF_ALLOC | SHF_WRITE | SHF_EXECINSTR - sh_addralign: u64, // required alignment - relocs: Vec, // pending relocations for this section -} -``` - -### `AsmStatement` (parser.rs) - -The parser's output. Each source line produces one `AsmStatement`. 
- -``` -AsmStatement::Label(String) // label definition: "name:" -AsmStatement::Directive(Directive) // typed assembler directive -AsmStatement::Instruction { mnemonic, operands, raw_operands } // RISC-V instruction -AsmStatement::Empty // blank line / comment-only -``` - -The `Directive` enum has ~30 variants covering all recognized directives -(section switches, data emission, symbol attributes, alignment, etc.). - -### `Operand` (parser.rs) - -A tagged union covering every operand form the parser recognizes. - -``` -Operand::Reg(String) // register: "x5", "a0", "sp", "fa3" -Operand::Imm(i64) // immediate: 42, -7, 0xff -Operand::Symbol(String) // symbol reference: "printf", ".LC0" -Operand::SymbolOffset(String, i64) // symbol + addend: "sym+8" -Operand::Mem { base, offset } // memory: 8(sp), 0(a0) -Operand::MemSymbol { base, symbol, modifier } // memory with reloc: %lo(sym)(a0) -Operand::Label(String) // branch target label -Operand::FenceArg(String) // fence operand: "iorw" -Operand::Csr(String) // CSR register name or number -Operand::RoundingMode(String) // rne, rtz, rdn, rup, rmm, dyn -``` - -### `EncodeResult` (encoder/mod.rs) - -The encoder returns one of several result variants depending on the instruction -class. 
- -``` -EncodeResult::Word(u32) // single 32-bit instruction -EncodeResult::Half(u16) // 16-bit compressed instruction -EncodeResult::Words(Vec) // multi-word sequence (e.g., li with large imm) -EncodeResult::WordWithReloc { word, reloc } // instruction + relocation -EncodeResult::WordsWithRelocs(Vec<(u32, Option)>) // multi-word + relocs (e.g., call = auipc+jalr) -EncodeResult::Skip // pseudo handled elsewhere (no output) -``` - -### `Relocation` / `RelocType` (encoder/mod.rs) - -``` -Relocation { - reloc_type: RelocType, // semantic relocation kind - symbol: String, // target symbol name - addend: i64, // constant addend -} - -RelocType::Branch -> R_RISCV_BRANCH (B-type, 12-bit PC-rel) -RelocType::Jal -> R_RISCV_JAL (J-type, 20-bit PC-rel) -RelocType::CallPlt -> R_RISCV_CALL_PLT (AUIPC+JALR pair) -RelocType::PcrelHi20 -> R_RISCV_PCREL_HI20 -RelocType::PcrelLo12I -> R_RISCV_PCREL_LO12_I -RelocType::PcrelLo12S -> R_RISCV_PCREL_LO12_S -RelocType::Hi20 -> R_RISCV_HI20 -RelocType::Lo12I -> R_RISCV_LO12_I -RelocType::Lo12S -> R_RISCV_LO12_S -RelocType::TprelHi20 -> R_RISCV_TPREL_HI20 -RelocType::TprelLo12I -> R_RISCV_TPREL_LO12_I -RelocType::TprelLo12S -> R_RISCV_TPREL_LO12_S -RelocType::TprelAdd -> R_RISCV_TPREL_ADD -RelocType::GotHi20 -> R_RISCV_GOT_HI20 -RelocType::TlsGdHi20 -> R_RISCV_TLS_GD_HI20 -RelocType::TlsGotHi20 -> R_RISCV_TLS_GOT_HI20 -RelocType::Abs32 -> R_RISCV_32 -RelocType::Abs64 -> R_RISCV_64 -RelocType::Add16 -> R_RISCV_ADD16 (16-bit symbol difference, e.g. .2byte) -RelocType::Sub16 -> R_RISCV_SUB16 -RelocType::Add32 -> R_RISCV_ADD32 (32-bit symbol difference, e.g. .4byte) -RelocType::Sub32 -> R_RISCV_SUB32 -RelocType::Add64 -> R_RISCV_ADD64 (64-bit symbol difference, e.g. .8byte) -RelocType::Sub64 -> R_RISCV_SUB64 -``` - -## Processing Algorithm Step by Step - -### Step 1: Preprocessing (`asm_preprocess.rs`) - -Before parsing, the shared `asm_preprocess` module runs several expansion passes -over the raw assembly text: - -1. 
Strips C-style block comments (`/* ... */`) and line comments. -2. Expands `.macro` / `.endm` definitions and invocations. -3. Expands `.rept` / `.irp` / `.endr` repetition blocks. - -These passes are shared across all assembler backends (RISC-V, ARM, x86). - -### Step 2: Parsing (`parser.rs`) - -The parser processes the preprocessed source line by line. For each line it: - -1. Detects a leading label (any identifier followed by `:`). Numeric labels - such as `1:` are recognized as re-definable local labels. -2. Identifies the mnemonic -- either an instruction name (`add`, `ld`, `beq`) - or a directive (`.text`, `.globl`, `.quad`). -3. Evaluates `.if` / `.else` / `.endif` conditional assembly blocks, skipping - lines inside false conditions. -4. Parses the operand list, recognizing: - - Integer and FP register names (x0-x31, a0-a7, s0-s11, t0-t6, f0-f31, - fa0-fa7, fs0-fs11, ft0-ft11), including ABI aliases. - - Immediates in decimal, hex (`0x`), octal (`0`), and binary (`0b`). - - Memory references in the form `offset(base)`. - - Relocation modifiers: `%pcrel_hi(sym)`, `%lo(sym)`, `%tprel_add(sym)`, etc. - - Bare symbol references and label references (including numeric `1b`, `1f`). - - CSR names, fence arguments, and FP rounding modes. 
- -### Step 3: Directive Execution (`elf_writer.rs`) - -The ELF writer iterates over parsed statements and handles directives inline: - -| Directive | Effect | -|------------------------|--------------------------------------------------| -| `.text` / `.data` / `.bss` / `.rodata` / `.section` | Switch or create a section | -| `.globl` / `.global` | Mark symbol as globally visible | -| `.local` | Mark symbol as local binding | -| `.weak` | Mark symbol as weak binding | -| `.hidden` / `.protected` / `.internal` | Set symbol visibility | -| `.type` | Set symbol type (function/object/tls) | -| `.size` | Set symbol size | -| `.byte` / `.short` / `.half` / `.word` / `.long` / `.quad` / `.8byte` | Emit literal data | -| `.zero` / `.space` | Emit N zero bytes (with optional fill value) | -| `.string` / `.asciz` / `.ascii` | Emit string data (with/without NUL) | -| `.align` / `.balign` / `.p2align` | Pad to alignment boundary | -| `.equ` / `.set` | Define a symbol with a constant value | -| `.comm` | Reserve common storage | -| `.pushsection` / `.popsection` / `.previous` | Push/pop section stack | -| `.option push/pop/rvc/norvc/norelax` | Control RVC compression and relaxation | -| `.insn` | Emit a raw instruction encoding | -| `.incbin` | Include binary file contents | -| `.attribute` / `.file` / `.ident` | Metadata / ignored | -| `.cfi_*` | Silently consumed (CFI info not emitted) | -| `.addrsig` / `.addrsig_sym` | Address-significance (silently consumed) | - -### Step 4: Instruction Encoding (`encoder/`) - -For each instruction mnemonic, the encoder: - -1. Looks up the mnemonic in a master dispatch table. The table covers: - - **R-type**: add, sub, sll, slt, sltu, xor, srl, sra, or, and, mul, div, - rem, addw, subw, sllw, srlw, sraw, mulw, divw, remw, plus all - variants (unsigned, word-width). - - **I-type**: addi, slti, sltiu, xori, ori, andi, slli, srli, srai, - addiw, slliw, srliw, sraiw, lb/lh/lw/ld/lbu/lhu/lwu, jalr. - - **S-type**: sb, sh, sw, sd. 
- - **B-type**: beq, bne, blt, bge, bltu, bgeu. - - **U-type**: lui, auipc. - - **J-type**: jal. - - **Atomics (A-extension)**: lr.w/d, sc.w/d, amo{swap,add,and,or,xor,min,max,minu,maxu}.w/d - - **Floating-point (F/D)**: fadd/fsub/fmul/fdiv/fsqrt, fmin/fmax, - fcvt (all int/float conversions), fmv.x.w/d, fmv.w.x/d.x, - fmadd/fmsub/fnmadd/fnmsub, feq/flt/fle, fclass, flw/fld/fsw/fsd, - fsgnj/fsgnjn/fsgnjx. - - **System**: ecall, ebreak, fence, fence.i, csrr/csrw/csrs/csrc and - their immediate variants (csrwi, csrsi, csrci). - -2. Expands pseudo-instructions into their real instruction sequences: - - | Pseudo | Expansion | - |----------------|------------------------------------------------------| - | `li rd, imm` | `lui + addi(w)` or single `addi`, up to 3-instruction sequences for 64-bit constants | - | `mv rd, rs` | `add rd, x0, rs` (uses ADD form for RV64C eligibility)| - | `not rd, rs` | `xori rd, rs, -1` | - | `neg rd, rs` | `sub rd, x0, rs` | - | `negw rd, rs` | `subw rd, x0, rs` | - | `sext.w rd, rs`| `addiw rd, rs, 0` | - | `seqz rd, rs` | `sltiu rd, rs, 1` | - | `snez rd, rs` | `sltu rd, x0, rs` | - | `sltz rd, rs` | `slt rd, rs, x0` | - | `sgtz rd, rs` | `slt rd, x0, rs` | - | `beqz/bnez` | `beq/bne rs, x0, label` | - | `blez/bgez/...`| Corresponding `bge`/`blt` with x0 | - | `bgt/ble/bgtu/bleu` | Swapped-operand `blt`/`bge` variants | - | `j label` | `jal x0, label` | - | `jr rs` | `jalr x0, 0(rs)` | - | `ret` | `jalr x0, 0(ra)` | - | `call sym` | `auipc ra, %pcrel_hi(sym)` + `jalr ra, %pcrel_lo(sym)(ra)` | - | `tail sym` | `auipc t1, %pcrel_hi(sym)` + `jalr x0, %pcrel_lo(sym)(t1)` | - | `la rd, sym` | `auipc rd, ...` + `addi rd, rd, ...` (pcrel pair) | - | `lla rd, sym` | Same as `la` (non-PIC) | - | `nop` | `addi x0, x0, 0` | - | `fmv.s/d` | `fsgnj.s/d rd, rs, rs` | - | `fabs.s/d` | `fsgnjx.s/d rd, rs, rs` | - | `fneg.s/d` | `fsgnjn.s/d rd, rs, rs` | - | `rdcycle/rdtime/rdinstret` | `csrrs rd, csr, x0` | - | `csrr/csrw/csrs/csrc` | Expanded 
`csrrs`/`csrrw`/`csrrc` forms | - -3. Produces an `EncodeResult` containing the machine code bytes and any - relocations required for symbol references. - -The encoding functions for each instruction format follow the RISC-V ISA -specification exactly: - -``` -R-type: [funct7 | rs2 | rs1 | funct3 | rd | opcode] -I-type: [ imm[11:0] | rs1 | funct3 | rd | opcode] -S-type: [imm[11:5]| rs2 | rs1 | funct3 | imm[4:0] | opcode] -B-type: [imm[12|10:5] | rs2 | rs1 | funct3 | imm[4:1|11] | opcode] -U-type: [ imm[31:12] | rd | opcode] -J-type: [imm[20|10:1|11|19:12] | rd | opcode] -``` - -### Step 5: Section Data Accumulation (`elf_writer.rs`) - -As instructions are encoded, the ELF writer appends the resulting bytes to the -current section's data buffer. For instructions with relocations: - -- **Intra-section branches** (same section, label already defined or to be - defined): recorded as `PendingReloc` entries for later resolution. -- **External symbol references**: recorded as `ObjReloc` entries in the - section's relocation list, to be emitted as `.rela.*` sections. - -For multi-word expansions (e.g., `call` emitting AUIPC+JALR), the assembler -generates synthetic labels (`.Lpcrel_hiN`) so that `%pcrel_lo` relocations can -reference the AUIPC's PC, as required by the RISC-V ABI. - -### Step 6: Local Branch Resolution (`elf_writer.rs` -- `resolve_local_branches`) - -Before ELF emission, the assembler resolves all pending intra-section branch -relocations: - -1. For each `PendingReloc`, it looks up the target label in the label table. -2. 
If the target is in the same section, it computes the PC-relative offset - and patches the instruction word directly in the section data buffer, - encoding the offset into the appropriate bit fields for the relocation type: - - **R_RISCV_BRANCH** (B-type): 12-bit signed offset, bit-scattered - - **R_RISCV_JAL** (J-type): 20-bit signed offset, bit-scattered - - **R_RISCV_CALL_PLT**: patches both AUIPC (hi20) and JALR (lo12) - - **R_RISCV_PCREL_HI20**: patches AUIPC upper 20 bits - - **R_RISCV_PCREL_LO12_I/S**: patches load/store lower 12 bits -3. If the target is in a different section or undefined, the relocation is - promoted to an external ELF relocation for the linker to resolve. - -### Step 7: ELF Object Emission (`elf_writer.rs`) - -The final step serializes the assembled state into a conformant ELF64 relocatable -object file. The layout is: - -``` -+----------------------------------+ offset 0 -| ELF Header (64 bytes) | -| - e_machine = EM_RISCV (243) | -| - e_flags = FLOAT_ABI_DOUBLE | -| | RVC | -+----------------------------------+ -| Section data | -| (.text, .data, .rodata, .bss, | -| .sdata, .init_array, etc.) 
| -| (each aligned per sh_addralign) | -+----------------------------------+ -| .rela.text (relocation entries) | -| .rela.data | -| (24 bytes per entry: ELF64_Rela)| -+----------------------------------+ -| .symtab (symbol table) | -| (24 bytes per entry: ELF64_Sym) | -| Ordering: NULL, section syms, | -| local syms, global | -+----------------------------------+ -| .strtab (symbol string table) | -+----------------------------------+ -| .shstrtab (section name strings)| -+----------------------------------+ -| Section header table | -| (64 bytes per header: Elf64_Shdr| -| NULL, content sections, | -| .rela.*, .symtab, .strtab, | -| .shstrtab) | -+----------------------------------+ -``` - -The writer performs several bookkeeping tasks: - -- **Symbol table construction**: Local labels (`.L*`) are included only if - they are referenced by a relocation (e.g., synthetic `%pcrel_lo` labels). - Section symbols are emitted for every content section. The `sh_info` field - of `.symtab` is set to the index of the first global symbol, per ELF spec. - -- **Relocation entries**: Each `ObjReloc` is serialized as an `Elf64_Rela` - entry (offset, r_info = symbol_index << 32 | type, addend). A companion - `R_RISCV_RELAX` relocation is emitted alongside `PCREL_HI20`, `CALL_PLT`, - `TPREL_*`, and `GOT_HI20` relocations to allow the linker to perform - relaxation optimizations (unless suppressed by `.option norelax`). - Additionally, `R_RISCV_ALIGN` relocations are emitted at `.align`, - `.balign`, and `.p2align` directives when relaxation is enabled and the - current section is executable (`SHF_EXECINSTR`). These mark NOP padding - regions so the linker can re-pad after relaxation shrinks preceding - instructions, maintaining correct alignment. Data sections use static - zero-padding for alignment without relocations. - -- **ELF flags**: Default is `EF_RISCV_FLOAT_ABI_DOUBLE | EF_RISCV_RVC` (0x05). 
- The float ABI can be overridden via the `-mabi=` flag passed to - `assemble_with_args()` (lp64/lp64f/lp64d/lp64q). - -## Key Design Decisions and Trade-offs - -### 1. Post-encoding compression vs. direct compressed emission - -The assembler first encodes all instructions as 32-bit words. A separate -compression pass (`compress.rs`) can then scan for eligible instructions and -rewrite them as 16-bit RVC equivalents. This two-phase approach is simpler than -trying to emit compressed instructions inline during encoding, because: - -- The compressor can examine each fully-formed 32-bit encoding and make a - binary yes/no decision. The encoder does not need to be aware of RVC - constraints at all. -- Relocation offset adjustment is localized to a single pass rather than - being spread throughout the encoder. -- The approach is trivially correct: removing the compression pass produces - a valid (if larger) object file. - -**Current status:** The compression pass is disabled. The linker handles -relaxation via `R_RISCV_RELAX` hints, and running assembler-side compression -would change code layout in ways the linker's relaxation pass doesn't expect. -The compressor code is retained for potential future use. - -### 2. Eager local branch resolution - -Branches to labels within the same section are resolved immediately in the -assembler (before ELF emission), rather than being emitted as relocations for -the linker. This reduces the number of relocations the linker must process and -produces smaller object files. The linker only sees cross-section and -cross-module symbol references. - -### 3. Synthetic labels for PCREL_LO12 - -The RISC-V ABI requires that `%pcrel_lo` relocations reference the *AUIPC -instruction's address*, not the symbol directly. The assembler generates -synthetic labels (`.Lpcrel_hiN`) at each AUIPC site and makes the corresponding -LO12 relocation reference that label. 
The `build_symbol_table` pass in the -ELF writer ensures these synthetic labels appear in `.symtab` whenever they -are referenced by a `.rela.*` entry. - -### 4. Numeric local labels - -Numeric labels (`1:`, `2:`, etc.) can be redefined multiple times. Forward -references (`1f`) resolve to the *next* definition; backward references (`1b`) -resolve to the *most recent* definition. During preprocessing, all numeric label -references are rewritten to unique synthetic names (`.Lnum_N_I`) so that -the rest of the pipeline can treat them as ordinary labels. - -### 5. In-process execution - -The assembler runs entirely in-process, sharing the compiler's address space. -There is no serialization to text and back, no fork/exec of a system assembler. -This means: - -- No dependency on a host RISC-V cross-assembler being installed. -- Faster compilation: no process spawning overhead. -- The compiler controls the exact assembly dialect and can rely on features - without worrying about toolchain version skew. - -### 6. Shared infrastructure - -The `ElfWriterBase` (in the `elf` module) and `asm_preprocess` module are shared between -the RISC-V and ARM assembler backends. This avoids duplicating section management, -symbol table construction, ELF serialization, macro expansion, and repetition -block handling. Each backend composes with the shared base and adds -architecture-specific instruction encoding, branch resolution, and relocation -logic. - -### 7. No linker relaxation in the assembler - -The assembler emits `R_RISCV_RELAX` hints alongside eligible relocations but -does not perform any relaxation itself. Relaxation (e.g., converting a -`lui+addi` pair to a single `addi` when the symbol is close to GP) is -intentionally left to the linker, which has full address layout information. -The assembler's job is to produce conservative, correct encodings. 
diff --git a/src/backend/riscv/assembler/compress.rs b/src/backend/riscv/assembler/compress.rs deleted file mode 100644 index b6b4e84979..0000000000 --- a/src/backend/riscv/assembler/compress.rs +++ /dev/null @@ -1,848 +0,0 @@ -//! RV64C compressed instruction support. -//! -//! Implements the RISC-V C (compressed) extension for RV64. After the assembler -//! encodes all instructions as 32-bit words, this module attempts to compress -//! eligible instructions into 16-bit (2-byte) equivalents, matching GCC's -//! default behavior for RV64GC targets. -//! -//! The compression pass runs after instruction encoding but before local branch -//! resolution, so that branch offsets are computed against the final (compressed) -//! layout. - -/// Try to compress a 32-bit RISC-V instruction into a 16-bit RV64C instruction. -/// -/// Returns `Some(halfword)` if the instruction can be compressed, `None` otherwise. -/// This only handles instructions without relocations; instructions with pending -/// relocations are not candidates for compression. -// Binary literals use groupings matching RISC-V compressed instruction format fields. 
-#[allow(clippy::unusual_byte_groupings)] -pub fn try_compress_rv64(word: u32) -> Option { - let opcode = word & 0x7F; - let rd = (word >> 7) & 0x1F; - let funct3 = (word >> 12) & 0x7; - let rs1 = (word >> 15) & 0x1F; - let rs2 = (word >> 20) & 0x1F; - let funct7 = (word >> 25) & 0x7F; - - match opcode { - // ── LUI (opcode 0110111) ── - 0b0110111 => { - // C.LUI: lui rd, imm where rd != {x0, x2} - // imm[17:12] from word[31:12] - if rd == 0 || rd == 2 { return None; } - let imm20 = (word >> 12) as i32; - // Sign-extend the 20-bit value - let imm20 = (imm20 << 12) >> 12; // sign-extend from 20 bits - // C.LUI uses imm[17:12], so the actual nzimm is bits 17:12 - // which is imm20[5:0] (since LUI loads imm into [31:12]) - // C.LUI constraint: nzimm != 0, and it's sign-extended from 6 bits - let nzimm = imm20; // this is the full 20-bit value - // C.LUI stores bits [17:12] = nzimm[5:0], sign-extended from bit 17 - // So nzimm must fit in signed 6-bit range: -32..31 (but not 0) - if nzimm == 0 { return None; } - if !(-32..=31).contains(&nzimm) { return None; } - let nzimm = nzimm as u32; - let bit17 = (nzimm >> 5) & 1; - let bits16_12 = nzimm & 0x1F; - Some((0b011_0_00000_00000_01 | (bit17 << 12) | (rd << 7) | (bits16_12 << 2)) as u16) - } - - // ── ADDI (opcode 0010011, funct3=000) ── - 0b0010011 if funct3 == 0b000 => { - let imm = (word as i32) >> 20; // sign-extended imm[11:0] - - if rd == 0 && rs1 == 0 && imm == 0 { - // C.NOP - Some(0b000_0_00000_00000_01) - } else if rd == rs1 && rd != 0 && imm != 0 && (-32..=31).contains(&imm) { - // C.ADDI: addi rd, rd, nzimm (including sp) - // GCC prefers C.ADDI over C.ADDI16SP when both apply - let nzimm = imm as u32; - let bit5 = (nzimm >> 5) & 1; - let bits4_0 = nzimm & 0x1F; - Some((0b000_0_00000_00000_01 - | (bit5 << 12) - | (rd << 7) - | (bits4_0 << 2)) as u16) - } else if rd == 2 && rs1 == 2 && imm != 0 && (imm % 16) == 0 - && (-512..=496).contains(&imm) { - // C.ADDI16SP: addi x2, x2, imm (for larger imm not fitting 
C.ADDI) - let uimm = imm as u32; - let bit9 = (uimm >> 9) & 1; - let bit4 = (uimm >> 4) & 1; - let bit6 = (uimm >> 6) & 1; - let bits8_7 = (uimm >> 7) & 0x3; - let bit5 = (uimm >> 5) & 1; - Some((0b011_0_00010_00000_01 - | (bit9 << 12) - | (bit4 << 6) - | (bit6 << 5) - | (bits8_7 << 3) - | (bit5 << 2)) as u16) - } else if rs1 == 2 && rd != 0 && rd != 2 - && is_creg(rd) && imm > 0 && (imm % 4) == 0 && imm <= 1020 { - // C.ADDI4SPN: addi rd', x2, uimm - let uimm = imm as u32; - let rd_prime = creg_num(rd); - let bits5_4 = (uimm >> 4) & 0x3; - let bits9_6 = (uimm >> 6) & 0xF; - let bit2 = (uimm >> 2) & 1; - let bit3 = (uimm >> 3) & 1; - Some(((bits5_4 << 11) - | (bits9_6 << 7) - | (bit2 << 6) - | (bit3 << 5) - | (rd_prime << 2)) as u16) - } else if rs1 == 0 && rd != 0 { - // C.LI: addi rd, x0, imm (li rd, imm) - if !(-32..=31).contains(&imm) { return None; } - let imm_u = imm as u32; - let bit5 = (imm_u >> 5) & 1; - let bits4_0 = imm_u & 0x1F; - Some((0b010_0_00000_00000_01 - | (bit5 << 12) - | (rd << 7) - | (bits4_0 << 2)) as u16) - } else { - None - } - } - - // ── ADDIW (opcode 0011011, funct3=000) ── - 0b0011011 if funct3 == 0b000 => { - let imm = (word as i32) >> 20; - if rd == rs1 && rd != 0 { - // C.ADDIW: addiw rd, rd, imm (imm can be 0 for sext.w) - if !(-32..=31).contains(&imm) { return None; } - let imm_u = imm as u32; - let bit5 = (imm_u >> 5) & 1; - let bits4_0 = imm_u & 0x1F; - Some((0b001_0_00000_00000_01 - | (bit5 << 12) - | (rd << 7) - | (bits4_0 << 2)) as u16) - } else if rs1 == 0 && rd != 0 { - // addiw rd, x0, imm => also C.ADDIW if we treat rd as rd/rs1 - // Actually this would be `sext.w rd` which is `addiw rd, rd, 0` - // This pattern doesn't match C.ADDIW since rd != rs1 - None - } else { - None - } - } - - // ── SLLI (opcode 0010011, funct3=001) ── - 0b0010011 if funct3 == 0b001 => { - // slli rd, rs1, shamt (funct7 encodes high bit of shamt for RV64) - let shamt = (word >> 20) & 0x3F; // 6-bit shift for RV64 - if rd == rs1 && rd != 0 && shamt 
!= 0 { - // C.SLLI: slli rd, rd, shamt - let bit5 = (shamt >> 5) & 1; - let bits4_0 = shamt & 0x1F; - Some((0b000_0_00000_00000_10 - | (bit5 << 12) - | (rd << 7) - | (bits4_0 << 2)) as u16) - } else { - None - } - } - - // ── SRLI/SRAI (opcode 0010011, funct3=101) ── - 0b0010011 if funct3 == 0b101 => { - let shamt = (word >> 20) & 0x3F; - let is_srai = (funct7 & 0x20) != 0; - if rd == rs1 && is_creg(rd) && shamt != 0 { - let rd_prime = creg_num(rd); - let bit5 = (shamt >> 5) & 1; - let bits4_0 = shamt & 0x1F; - if is_srai { - // C.SRAI - Some((0b100_0_01_000_00000_01 - | (bit5 << 12) - | (rd_prime << 7) - | (bits4_0 << 2)) as u16) - } else { - // C.SRLI - Some((0b100_0_00_000_00000_01 - | (bit5 << 12) - | (rd_prime << 7) - | (bits4_0 << 2)) as u16) - } - } else { - None - } - } - - // ── ANDI (opcode 0010011, funct3=111) ── - 0b0010011 if funct3 == 0b111 => { - let imm = (word as i32) >> 20; - if rd == rs1 && is_creg(rd) { - if !(-32..=31).contains(&imm) { return None; } - let rd_prime = creg_num(rd); - let imm_u = imm as u32; - let bit5 = (imm_u >> 5) & 1; - let bits4_0 = imm_u & 0x1F; - // C.ANDI - Some((0b100_0_10_000_00000_01 - | (bit5 << 12) - | (rd_prime << 7) - | (bits4_0 << 2)) as u16) - } else { - None - } - } - - // ── ADD/SUB/AND/OR/XOR/ADDW/SUBW (opcode 0110011, R-type) ── - 0b0110011 => { - match (funct7, funct3) { - // ADD - (0b0000000, 0b000) => { - if rd == rs1 && rd != 0 && rs2 != 0 { - // C.ADD: add rd, rd, rs2 - Some((0b100_1_00000_00000_10 - | (rd << 7) - | (rs2 << 2)) as u16) - } else if rs1 == 0 && rd != 0 && rs2 != 0 { - // C.MV: add rd, x0, rs2 (mv rd, rs2) - Some((0b100_0_00000_00000_10 - | (rd << 7) - | (rs2 << 2)) as u16) - } else { - None - } - } - // SUB - (0b0100000, 0b000) => { - if rd == rs1 && is_creg(rd) && is_creg(rs2) { - let rd_prime = creg_num(rd); - let rs2_prime = creg_num(rs2); - // C.SUB - Some((0b100_0_11_000_00_000_01 - | (rd_prime << 7) - | (rs2_prime << 2)) as u16) - } else { - None - } - } - // XOR - (0b0000000, 0b100) 
=> { - if rd == rs1 && is_creg(rd) && is_creg(rs2) { - let rd_prime = creg_num(rd); - let rs2_prime = creg_num(rs2); - // C.XOR - Some((0b100_0_11_000_01_000_01 - | (rd_prime << 7) - | (rs2_prime << 2)) as u16) - } else { - None - } - } - // OR - (0b0000000, 0b110) => { - if rd == rs1 && is_creg(rd) && is_creg(rs2) { - let rd_prime = creg_num(rd); - let rs2_prime = creg_num(rs2); - // C.OR - Some((0b100_0_11_000_10_000_01 - | (rd_prime << 7) - | (rs2_prime << 2)) as u16) - } else { - None - } - } - // AND - (0b0000000, 0b111) => { - if rd == rs1 && is_creg(rd) && is_creg(rs2) { - let rd_prime = creg_num(rd); - let rs2_prime = creg_num(rs2); - // C.AND - Some((0b100_0_11_000_11_000_01 - | (rd_prime << 7) - | (rs2_prime << 2)) as u16) - } else { - None - } - } - _ => None, - } - } - - // ── ADDW/SUBW (opcode 0111011, R-type word ops) ── - 0b0111011 => { - match (funct7, funct3) { - // ADDW - (0b0000000, 0b000) => { - if rd == rs1 && is_creg(rd) && is_creg(rs2) { - let rd_prime = creg_num(rd); - let rs2_prime = creg_num(rs2); - // C.ADDW - Some((0b100_1_11_000_01_000_01 - | (rd_prime << 7) - | (rs2_prime << 2)) as u16) - } else { - None - } - } - // SUBW - (0b0100000, 0b000) => { - if rd == rs1 && is_creg(rd) && is_creg(rs2) { - let rd_prime = creg_num(rd); - let rs2_prime = creg_num(rs2); - // C.SUBW - Some((0b100_1_11_000_00_000_01 - | (rd_prime << 7) - | (rs2_prime << 2)) as u16) - } else { - None - } - } - _ => None, - } - } - - // ── LD (opcode 0000011, funct3=011) ── - 0b0000011 if funct3 == 0b011 => { - let imm = (word as i32) >> 20; - if rs1 == 2 && rd != 0 { - // C.LDSP: ld rd, offset(sp) - offset must be multiple of 8, 0..504 - let offset = imm; - if (0..=504).contains(&offset) && (offset % 8) == 0 { - let uoff = offset as u32; - // Encoding: 011 | uimm[5] | rd | uimm[4:3|8:6] | 10 - let bit5 = (uoff >> 5) & 1; - let bits4_3 = (uoff >> 3) & 0x3; - let bits8_6 = (uoff >> 6) & 0x7; - Some((0b011_0_00000_00000_10 - | (bit5 << 12) - | (rd << 7) - | (bits4_3 << 
5) - | (bits8_6 << 2)) as u16) - } else { - None - } - } else if is_creg(rs1) && is_creg(rd) { - // C.LD: ld rd', offset(rs1') - offset must be multiple of 8, 0..248 - let offset = imm; - if (0..=248).contains(&offset) && (offset % 8) == 0 { - let uoff = offset as u32; - let rs1_prime = creg_num(rs1); - let rd_prime = creg_num(rd); - // Encoding: 011 | uimm[5:3] | rs1' | uimm[7:6] | rd' | 00 - let bits5_3 = (uoff >> 3) & 0x7; - let bits7_6 = (uoff >> 6) & 0x3; - Some((0b011_000_000_00_000_00 - | (bits5_3 << 10) - | (rs1_prime << 7) - | (bits7_6 << 5) - | (rd_prime << 2)) as u16) - } else { - None - } - } else { - None - } - } - - // ── LW (opcode 0000011, funct3=010) ── - 0b0000011 if funct3 == 0b010 => { - let imm = (word as i32) >> 20; - if rs1 == 2 && rd != 0 { - // C.LWSP: lw rd, offset(sp) - offset must be multiple of 4, 0..252 - let offset = imm; - if (0..=252).contains(&offset) && (offset % 4) == 0 { - let uoff = offset as u32; - // Encoding: 010 | uimm[5] | rd | uimm[4:2|7:6] | 10 - let bit5 = (uoff >> 5) & 1; - let bits4_2 = (uoff >> 2) & 0x7; - let bits7_6 = (uoff >> 6) & 0x3; - Some((0b010_0_00000_00000_10 - | (bit5 << 12) - | (rd << 7) - | (bits4_2 << 4) - | (bits7_6 << 2)) as u16) - } else { - None - } - } else if is_creg(rs1) && is_creg(rd) { - // C.LW: lw rd', offset(rs1') - offset must be multiple of 4, 0..124 - let offset = imm; - if (0..=124).contains(&offset) && (offset % 4) == 0 { - let uoff = offset as u32; - let rs1_prime = creg_num(rs1); - let rd_prime = creg_num(rd); - // Encoding: 010 | uimm[5:3] | rs1' | uimm[2|6] | rd' | 00 - let bits5_3 = (uoff >> 3) & 0x7; - let bit2 = (uoff >> 2) & 1; - let bit6 = (uoff >> 6) & 1; - Some((0b010_000_000_00_000_00 - | (bits5_3 << 10) - | (rs1_prime << 7) - | (bit2 << 6) - | (bit6 << 5) - | (rd_prime << 2)) as u16) - } else { - None - } - } else { - None - } - } - - // ── SD (opcode 0100011, funct3=011) ── - 0b0100011 if funct3 == 0b011 => { - // S-type: imm[11:5] = funct7, imm[4:0] = rd field - let 
imm11_5 = (word >> 25) as i32; - let imm4_0 = ((word >> 7) & 0x1F) as i32; - let imm = (imm11_5 << 5) | imm4_0; - let imm = (imm << 20) >> 20; // sign-extend from 12 bits - - if rs1 == 2 { - // C.SDSP: sd rs2, offset(sp) - offset must be multiple of 8, 0..504 - if (0..=504).contains(&imm) && (imm % 8) == 0 { - let uoff = imm as u32; - // Encoding: 111 | uimm[5:3|8:6] | rs2 | 10 - let bits5_3 = (uoff >> 3) & 0x7; - let bits8_6 = (uoff >> 6) & 0x7; - Some((0b111_000000_00000_10 - | (bits5_3 << 10) - | (bits8_6 << 7) - | (rs2 << 2)) as u16) - } else { - None - } - } else if is_creg(rs1) && is_creg(rs2) { - // C.SD: sd rs2', offset(rs1') - offset must be multiple of 8, 0..248 - if (0..=248).contains(&imm) && (imm % 8) == 0 { - let uoff = imm as u32; - let rs1_prime = creg_num(rs1); - let rs2_prime = creg_num(rs2); - // Encoding: 111 | uimm[5:3] | rs1' | uimm[7:6] | rs2' | 00 - let bits5_3 = (uoff >> 3) & 0x7; - let bits7_6 = (uoff >> 6) & 0x3; - Some((0b111_000_000_00_000_00 - | (bits5_3 << 10) - | (rs1_prime << 7) - | (bits7_6 << 5) - | (rs2_prime << 2)) as u16) - } else { - None - } - } else { - None - } - } - - // ── SW (opcode 0100011, funct3=010) ── - 0b0100011 if funct3 == 0b010 => { - let imm11_5 = (word >> 25) as i32; - let imm4_0 = ((word >> 7) & 0x1F) as i32; - let imm = (imm11_5 << 5) | imm4_0; - let imm = (imm << 20) >> 20; - - if rs1 == 2 { - // C.SWSP: sw rs2, offset(sp) - offset must be multiple of 4, 0..252 - if (0..=252).contains(&imm) && (imm % 4) == 0 { - let uoff = imm as u32; - // Encoding: 110 | uimm[5:2|7:6] | rs2 | 10 - let bits5_2 = (uoff >> 2) & 0xF; - let bits7_6 = (uoff >> 6) & 0x3; - Some((0b110_000000_00000_10 - | (bits5_2 << 9) - | (bits7_6 << 7) - | (rs2 << 2)) as u16) - } else { - None - } - } else if is_creg(rs1) && is_creg(rs2) { - // C.SW: sw rs2', offset(rs1') - offset must be multiple of 4, 0..124 - if (0..=124).contains(&imm) && (imm % 4) == 0 { - let uoff = imm as u32; - let rs1_prime = creg_num(rs1); - let rs2_prime = 
creg_num(rs2); - // Encoding: 110 | uimm[5:3] | rs1' | uimm[2|6] | rs2' | 00 - let bits5_3 = (uoff >> 3) & 0x7; - let bit2 = (uoff >> 2) & 1; - let bit6 = (uoff >> 6) & 1; - Some((0b110_000_000_00_000_00 - | (bits5_3 << 10) - | (rs1_prime << 7) - | (bit2 << 6) - | (bit6 << 5) - | (rs2_prime << 2)) as u16) - } else { - None - } - } else { - None - } - } - - // ── FLD (opcode 0000111, funct3=011) ── - 0b0000111 if funct3 == 0b011 => { - let imm = (word as i32) >> 20; - if rs1 == 2 { - // C.FLDSP: fld rd, offset(sp) - offset must be multiple of 8, 0..504 - let offset = imm; - if (0..=504).contains(&offset) && (offset % 8) == 0 { - let uoff = offset as u32; - let bit5 = (uoff >> 5) & 1; - let bits4_3 = (uoff >> 3) & 0x3; - let bits8_6 = (uoff >> 6) & 0x7; - Some((0b001_0_00000_00000_10 - | (bit5 << 12) - | (rd << 7) - | (bits4_3 << 5) - | (bits8_6 << 2)) as u16) - } else { - None - } - } else if is_creg(rs1) && is_creg(rd) { - // C.FLD: fld rd', offset(rs1') - offset must be multiple of 8, 0..248 - let offset = imm; - if (0..=248).contains(&offset) && (offset % 8) == 0 { - let uoff = offset as u32; - let rs1_prime = creg_num(rs1); - let rd_prime = creg_num(rd); - let bits5_3 = (uoff >> 3) & 0x7; - let bits7_6 = (uoff >> 6) & 0x3; - Some((0b001_000_000_00_000_00 - | (bits5_3 << 10) - | (rs1_prime << 7) - | (bits7_6 << 5) - | (rd_prime << 2)) as u16) - } else { - None - } - } else { - None - } - } - - // ── FSD (opcode 0100111, funct3=011) ── - 0b0100111 if funct3 == 0b011 => { - let imm11_5 = (word >> 25) as i32; - let imm4_0 = ((word >> 7) & 0x1F) as i32; - let imm = (imm11_5 << 5) | imm4_0; - let imm = (imm << 20) >> 20; - - if rs1 == 2 { - // C.FSDSP: fsd rs2, offset(sp) - offset must be multiple of 8, 0..504 - if (0..=504).contains(&imm) && (imm % 8) == 0 { - let uoff = imm as u32; - let bits5_3 = (uoff >> 3) & 0x7; - let bits8_6 = (uoff >> 6) & 0x7; - Some((0b101_000000_00000_10 - | (bits5_3 << 10) - | (bits8_6 << 7) - | (rs2 << 2)) as u16) - } else { - None - } - 
} else if is_creg(rs1) && is_creg(rs2) { - // C.FSD: fsd rs2', offset(rs1') - offset must be multiple of 8, 0..248 - if (0..=248).contains(&imm) && (imm % 8) == 0 { - let uoff = imm as u32; - let rs1_prime = creg_num(rs1); - let rs2_prime = creg_num(rs2); - let bits5_3 = (uoff >> 3) & 0x7; - let bits7_6 = (uoff >> 6) & 0x3; - Some((0b101_000_000_00_000_00 - | (bits5_3 << 10) - | (rs1_prime << 7) - | (bits7_6 << 5) - | (rs2_prime << 2)) as u16) - } else { - None - } - } else { - None - } - } - - // ── JALR (opcode 1100111) ── - 0b1100111 => { - let imm = (word as i32) >> 20; - if imm == 0 && rs2 == 0 { - if rd == 0 && rs1 != 0 { - // C.JR: jalr x0, 0(rs1) - Some((0b100_0_00000_00000_10 - | (rs1 << 7)) as u16) - } else if rd == 1 && rs1 != 0 { - // C.JALR: jalr x1, 0(rs1) - Some((0b100_1_00000_00000_10 - | (rs1 << 7)) as u16) - } else { - None - } - } else { - None - } - } - - // ── JAL (opcode 1101111) ── - 0b1101111 => { - // C.J: jal x0, offset (only if rd=x0) - // Not compressing JAL here because it has relocations typically - // and the offset range is limited to +-2KiB for C.J - None - } - - // ── BEQ/BNE (opcode 1100011) ── - 0b1100011 => { - // C.BEQZ/C.BNEZ: beq/bne rs1', x0, offset - // Not compressing branches here because they typically have pending - // relocations for local labels, and we'd need to change the relocation - // type. This would be handled separately in a more advanced compression pass. - None - } - - // ── EBREAK (opcode 1110011) ── - 0b1110011 => { - if word == 0x00100073 { - // C.EBREAK - Some(0b100_1_00000_00000_10) - } else { - None - } - } - - _ => None, - } -} - -/// Check if a register number is in the "compressed" range (x8-x15). -/// These are the registers that can be encoded in 3-bit fields. -#[inline] -fn is_creg(reg: u32) -> bool { - (8..=15).contains(®) -} - -/// Convert a full register number (x8-x15) to its 3-bit compressed encoding (0-7). 
-#[inline] -fn creg_num(reg: u32) -> u32 { - debug_assert!(is_creg(reg), "register x{} is not in compressed range", reg); - reg - 8 -} - -/// Compress instructions in a section's data buffer. -/// -/// Takes the section data (which contains 32-bit instructions) and produces -/// a compressed version where eligible instructions are replaced with 16-bit -/// equivalents. Returns the new data and an offset mapping (old_offset -> new_offset) -/// for each 4-byte boundary in the original data. -/// -/// `reloc_offsets` is the set of offsets that have relocations; instructions at -/// those offsets should NOT be compressed because relocations assume 4-byte -/// instruction size. -pub fn compress_section( - data: &[u8], - reloc_offsets: &std::collections::HashSet, -) -> (Vec, Vec<(u64, u64)>) { - let mut new_data = Vec::with_capacity(data.len()); - let mut offset_map = Vec::new(); // (old_offset, new_offset) - - let mut pos = 0; - while pos < data.len() { - let old_offset = pos as u64; - let new_offset = new_data.len() as u64; - offset_map.push((old_offset, new_offset)); - - // Check if this is already a compressed (2-byte) instruction. - // In RISC-V, bits [1:0] != 0b11 indicates a 16-bit instruction. 
- if pos + 2 <= data.len() { - let low_byte = data[pos]; - if (low_byte & 0x03) != 0x03 { - // Already a 2-byte compressed instruction — pass through as-is - new_data.extend_from_slice(&data[pos..pos + 2]); - pos += 2; - continue; - } - } - - // Need 4 bytes for a full-width instruction - if pos + 4 > data.len() { - // Trailing bytes — copy them directly - new_data.extend_from_slice(&data[pos..]); - pos = data.len(); - break; - } - - // Don't compress instructions that have relocations - if reloc_offsets.contains(&old_offset) { - new_data.extend_from_slice(&data[pos..pos + 4]); - pos += 4; - continue; - } - - let word = u32::from_le_bytes([ - data[pos], data[pos + 1], data[pos + 2], data[pos + 3], - ]); - - if let Some(halfword) = try_compress_rv64(word) { - new_data.extend_from_slice(&halfword.to_le_bytes()); - } else { - new_data.extend_from_slice(&data[pos..pos + 4]); - } - - pos += 4; - } - - // Copy any trailing bytes (shouldn't happen for well-formed code) - if pos < data.len() { - let old_offset = pos as u64; - let new_offset = new_data.len() as u64; - offset_map.push((old_offset, new_offset)); - new_data.extend_from_slice(&data[pos..]); - } - - (new_data, offset_map) -} - -/// Remap an offset from old to new using the offset map. -/// If the exact offset is in the map, returns the new offset. -/// Otherwise, interpolates based on the nearest lower entry. 
-pub fn remap_offset(offset: u64, offset_map: &[(u64, u64)]) -> u64 { - // Binary search for the largest old_offset <= offset - match offset_map.binary_search_by_key(&offset, |&(old, _)| old) { - Ok(idx) => offset_map[idx].1, - Err(0) => offset, // Before any mapped offset - Err(idx) => { - // offset is between offset_map[idx-1] and offset_map[idx] - let (prev_old, prev_new) = offset_map[idx - 1]; - let delta = offset - prev_old; - prev_new + delta - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_compress_addi_sp() { - // addi sp, sp, -48 = 0xfd010113 - let result = try_compress_rv64(0xfd010113); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0x7179, "c.addi sp, -48 should be 0x7179, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_sd_ra_sp() { - // sd ra, 40(sp) = 0x02113423 - let result = try_compress_rv64(0x02113423); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0xf406, "c.sdsp ra, 40(sp) should be 0xf406, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_sd_s0_sp() { - // sd s0, 32(sp) = 0x02813023 - let result = try_compress_rv64(0x02813023); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0xf022, "c.sdsp s0, 32(sp) should be 0xf022, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_addi_s0_sp_48() { - // addi s0, sp, 48 = 0x03010413 - // This is addi rd=s0(x8), rs1=sp(x2), imm=48 - // rd != rs1, and rs1=x2, rd=x8 which is in creg range - // This should be C.ADDI4SPN: addi rd', x2, uimm (scaled x4) - let result = try_compress_rv64(0x03010413); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0x1800, "c.addi4spn s0, sp, 48 should be 0x1800, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_mv_t1_t0() { - // mv t1, t0 is now encoded as add t1, x0, t0 (0x00500333) for C.MV compatibility. 
- // C.MV t1, t0: 100_0_00110_00101_10 = 0x8316 - // add x6, x0, x5 = 0x00500333 - let result = try_compress_rv64(0x00500333); - assert!(result.is_some()); - assert_eq!(result.unwrap(), 0x8316, "c.mv t1, t0 should be 0x8316"); - } - - #[test] - fn test_compress_ld_ra_sp() { - // ld ra, 40(sp) = 0x02813083 - let result = try_compress_rv64(0x02813083); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0x70a2, "c.ldsp ra, 40(sp) should be 0x70a2, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_ld_s0_sp() { - // ld s0, 32(sp) = 0x02013403 - let result = try_compress_rv64(0x02013403); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0x7402, "c.ldsp s0, 32(sp) should be 0x7402, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_addi_sp_48() { - // addi sp, sp, 48 = 0x03010113 - let result = try_compress_rv64(0x03010113); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0x6145, "c.addi16sp 48 should be 0x6145, got 0x{:04x}", hw); - } - - #[test] - fn test_compress_ret() { - // ret = jalr x0, 0(x1) = 0x00008067 - let result = try_compress_rv64(0x00008067); - assert!(result.is_some()); - let hw = result.unwrap(); - assert_eq!(hw, 0x8082, "c.jr ra (ret) should be 0x8082, got 0x{:04x}", hw); - } - - #[test] - fn test_no_compress_with_reloc() { - // Instructions with relocations shouldn't be compressed - // (This is handled by compress_section, not try_compress_rv64) - } - - #[test] - fn test_compress_ebreak() { - let result = try_compress_rv64(0x00100073); - assert_eq!(result, Some(0x9002)); - } - - #[test] - fn test_compress_nop() { - // nop = addi x0, x0, 0 - let result = try_compress_rv64(0x00000013); - assert_eq!(result, Some(0x0001)); - } - - #[test] - fn test_compress_li() { - // li a0, 5 = addi a0, x0, 5 = 0x00500513 - let result = try_compress_rv64(0x00500513); - assert!(result.is_some()); - // C.LI a0, 5: 010_0_01010_00101_01 = 0x4515 (approximation, verify below) - let hw = 
result.unwrap(); - // a0 = x10, imm = 5 - // 010 | 0 | 01010 | 00101 | 01 - let expected: u16 = 0b0100_0101_0001_0101; - assert_eq!(hw, expected, "c.li a0, 5 = 0x{:04x}, expected 0x{:04x}", hw, expected); - } - - #[test] - fn test_compress_slli() { - // slli a0, a0, 3 = 0x00351513 - // a0 = x10, shamt = 3 - let word = 0x00351513; - let result = try_compress_rv64(word); - assert!(result.is_some()); - } -} diff --git a/src/backend/riscv/assembler/elf_writer.rs b/src/backend/riscv/assembler/elf_writer.rs deleted file mode 100644 index 701009591c..0000000000 --- a/src/backend/riscv/assembler/elf_writer.rs +++ /dev/null @@ -1,1422 +0,0 @@ -//! ELF object file writer for RISC-V. -//! -//! Takes parsed assembly statements and produces an ELF .o (relocatable) file -//! with proper sections, symbols, and relocations for RISC-V ELF (32 or 64-bit). -//! -//! Uses `ElfWriterBase` from `elf.rs` for shared section/symbol/relocation -//! management, directive processing, and ELF serialization. This file only -//! contains RISC-V-specific logic: instruction encoding dispatch, pcrel_hi/lo -//! pairing, RV64C compression, GNU numeric labels, and branch resolution. - -// ELF writer helpers; some section/relocation utilities defined for completeness. 
-#![allow(dead_code)] - -use std::collections::{HashMap, HashSet}; -use super::parser::{AsmStatement, Operand, Directive, DataValue, SymbolType, Visibility, SizeExpr}; -use super::encoder::{encode_instruction, encode_insn_directive, EncodeResult, RelocType}; -use super::compress; -use crate::backend::elf::{ - self, - SHF_ALLOC, SHF_EXECINSTR, SHF_WRITE, - SHT_PROGBITS, SHT_NOBITS, - STT_NOTYPE, STT_OBJECT, STT_FUNC, STT_TLS, - STV_HIDDEN, STV_PROTECTED, STV_INTERNAL, - ELFCLASS64, EM_RISCV, - ElfWriterBase, ObjReloc, -}; - -// ELF flags for RISC-V -pub(super) const EF_RISCV_RVC: u32 = 0x1; -const EF_RISCV_FLOAT_ABI_SOFT: u32 = 0x0; -pub(super) const EF_RISCV_FLOAT_ABI_SINGLE: u32 = 0x2; -pub(super) const EF_RISCV_FLOAT_ABI_DOUBLE: u32 = 0x4; -pub(super) const EF_RISCV_FLOAT_ABI_QUAD: u32 = 0x6; - -/// RISC-V NOP instruction: `addi x0, x0, 0` = 0x00000013 in little-endian -const RISCV_NOP: [u8; 4] = [0x13, 0x00, 0x00, 0x00]; - -/// The ELF writer for RISC-V. -/// -/// Composes with `ElfWriterBase` for shared infrastructure and adds -/// RISC-V-specific pcrel_hi/lo pairing, RV64C compression, GNU numeric -/// label resolution, and RISC-V branch/call relocation patching. -pub struct ElfWriter { - /// Shared ELF writer state (sections, symbols, labels, directives) - pub base: ElfWriterBase, - /// Pending relocations that reference local labels (resolved after all labels are known) - pending_branch_relocs: Vec, - /// Counter for generating synthetic pcrel_hi labels - pcrel_hi_counter: u32, - /// GNU numeric labels: e.g., "1" -> [(section, offset), ...] in definition order. - numeric_labels: HashMap>, - /// Deferred data expressions that reference forward labels (resolved after all statements). - deferred_exprs: Vec, - /// ELF e_flags to use (default: RVC + double-float ABI) - elf_flags: u32, - /// ELF class: ELFCLASS64 (default) or ELFCLASS32 for RV32 targets. - elf_class: u8, - /// When true, don't emit R_RISCV_RELAX relocations (set by `.option norelax`). 
- no_relax: bool, - /// Stack for `.option push`/`.option pop` to save/restore the `no_relax` state. - option_stack: Vec, -} - -/// A data expression that couldn't be evaluated immediately (e.g., forward label reference) -/// and must be resolved after all statements are processed. -struct DeferredExpr { - section: String, - offset: u64, - size: usize, - expr: String, -} - -struct PendingReloc { - section: String, - offset: u64, - reloc_type: u32, - symbol: String, - addend: i64, - /// For pcrel_lo12 relocations resolved locally: the offset of the - /// corresponding auipc (pcrel_hi) instruction. - pcrel_hi_offset: Option, -} - -/// Check if a label name is a GNU numeric label (e.g., "1", "42"). -fn is_numeric_label(name: &str) -> bool { - !name.is_empty() && name.chars().all(|c| c.is_ascii_digit()) -} - -/// Check if a symbol reference is a GNU numeric label reference (e.g., "1b", "1f", "42b"). -/// Returns Some((label_name, is_backward)) if it is, None otherwise. -fn parse_numeric_label_ref(symbol: &str) -> Option<(&str, bool)> { - if symbol.len() < 2 { - return None; - } - let last_char = symbol.as_bytes()[symbol.len() - 1]; - let is_backward = last_char == b'b' || last_char == b'B'; - let is_forward = last_char == b'f' || last_char == b'F'; - if !is_backward && !is_forward { - return None; - } - let label_part = &symbol[..symbol.len() - 1]; - if label_part.is_empty() || !label_part.chars().all(|c| c.is_ascii_digit()) { - return None; - } - Some((label_part, is_backward)) -} - -/// Pre-process assembly statements to resolve GNU numeric label references. -/// Numeric labels like `1:` can be defined multiple times. References like `1b` -/// (backward) and `1f` (forward) must resolve to the nearest matching definition. 
-fn resolve_numeric_label_refs(statements: &[AsmStatement]) -> Vec { - // First pass: collect all numeric label definition positions - let mut label_defs: HashMap> = HashMap::new(); - let mut instance_counter: HashMap = HashMap::new(); - - for (i, stmt) in statements.iter().enumerate() { - if let AsmStatement::Label(name) = stmt { - if is_numeric_label(name) { - let instance = instance_counter.entry(name.clone()).or_insert(0); - label_defs.entry(name.clone()).or_default().push((i, *instance)); - *instance += 1; - } - } - } - - if label_defs.is_empty() { - return statements.to_vec(); - } - - // Second pass: rewrite labels and references - let mut result = Vec::with_capacity(statements.len()); - let mut dot_counter: usize = 0; - - for (i, stmt) in statements.iter().enumerate() { - match stmt { - AsmStatement::Label(name) if is_numeric_label(name) => { - if let Some(defs) = label_defs.get(name) { - for &(def_idx, inst_id) in defs { - if def_idx == i { - let new_name = format!(".Lnum_{}_{}", name, inst_id); - result.push(AsmStatement::Label(new_name)); - break; - } - } - } else { - result.push(stmt.clone()); - } - } - AsmStatement::Instruction { mnemonic, operands, raw_operands } => { - let new_operands: Vec = operands.iter().map(|op| { - rewrite_numeric_ref_in_operand(op, i, &label_defs) - }).collect(); - result.push(AsmStatement::Instruction { - mnemonic: mnemonic.clone(), - operands: new_operands, - raw_operands: raw_operands.clone(), - }); - } - AsmStatement::Directive(dir) => { - let new_dir = rewrite_numeric_refs_in_directive(dir, i, &label_defs, &mut dot_counter); - // If the directive references '.', a synthetic label was inserted - // before the directive. Check and handle. - for d in new_dir { - result.push(d); - } - } - _ => result.push(stmt.clone()), - } - } - - result -} - -/// Rewrite a numeric label reference in an operand to a synthetic label name. 
-fn rewrite_numeric_ref_in_operand( - op: &Operand, - stmt_idx: usize, - label_defs: &HashMap>, -) -> Operand { - match op { - Operand::Symbol(s) => { - if let Some(new_name) = resolve_numeric_ref_name(s, stmt_idx, label_defs) { - Operand::Symbol(new_name) - } else { - op.clone() - } - } - Operand::Label(s) => { - if let Some(new_name) = resolve_numeric_ref_name(s, stmt_idx, label_defs) { - Operand::Label(new_name) - } else { - op.clone() - } - } - Operand::SymbolOffset(s, off) => { - if let Some(new_name) = resolve_numeric_ref_name(s, stmt_idx, label_defs) { - Operand::SymbolOffset(new_name, *off) - } else { - op.clone() - } - } - _ => op.clone(), - } -} - -/// Resolve a numeric label reference like "1b" or "2f" to a synthetic label name. -fn resolve_numeric_ref_name( - symbol: &str, - stmt_idx: usize, - label_defs: &HashMap>, -) -> Option { - let (label_name, is_backward) = parse_numeric_label_ref(symbol)?; - let defs = label_defs.get(label_name)?; - - if is_backward { - let mut best: Option = None; - for &(def_idx, inst_id) in defs { - if def_idx < stmt_idx { - best = Some(inst_id); - } - } - best.map(|inst_id| format!(".Lnum_{}_{}", label_name, inst_id)) - } else { - for &(def_idx, inst_id) in defs { - if def_idx > stmt_idx { - return Some(format!(".Lnum_{}_{}", label_name, inst_id)); - } - } - None - } -} - -/// Rewrite a symbol name that may be a numeric label ref or `.` (current position). -/// If it's `.`, a synthetic label is generated and `needs_dot_label` is set. -fn rewrite_symbol_name( - name: &str, - stmt_idx: usize, - label_defs: &HashMap>, - dot_counter: &mut usize, - needs_dot_label: &mut Option, -) -> String { - if name == "." { - let label = format!(".Ldot_{}", *dot_counter); - *dot_counter += 1; - *needs_dot_label = Some(label.clone()); - label - } else if let Some(resolved) = resolve_numeric_ref_name(name, stmt_idx, label_defs) { - resolved - } else { - name.to_string() - } -} - -/// Decompose a symbol name that may contain an embedded addend. 
-/// -/// For example, `"cgroup_bpf_enabled_key+144"` -> `("cgroup_bpf_enabled_key", 144)`. -/// If there is no embedded addend, returns `(name, 0)`. -/// -/// This is needed because inline asm operand substitution can produce symbol -/// references like `sym+offset` as a single string, but ELF relocations must -/// reference the base symbol with a numeric addend in the RELA entry. -fn decompose_symbol_addend(name: &str) -> (String, i64) { - // Split on the last `+` or `-` if the suffix is a plain integer. - // Names without arithmetic (e.g. `.Ldot_2`, `my_func`) pass through as-is. - if let Some(plus_pos) = name.rfind('+') { - let base = &name[..plus_pos]; - let offset_str = name[plus_pos + 1..].trim(); - if !base.is_empty() && !offset_str.is_empty() { - if let Ok(offset) = offset_str.parse::() { - return (base.to_string(), offset); - } - } - } else if let Some(minus_pos) = name.rfind('-') { - // Only if it's not the first character (not a negative number) - if minus_pos > 0 { - let base = &name[..minus_pos]; - let offset_str = &name[minus_pos..]; // includes the '-' - if !base.is_empty() { - if let Ok(offset) = offset_str.parse::() { - return (base.to_string(), offset); - } - } - } - } - (name.to_string(), 0) -} - -/// Rewrite numeric label refs and `.` in a DataValue. 
-fn rewrite_data_value( - dv: &DataValue, - stmt_idx: usize, - label_defs: &HashMap>, - dot_counter: &mut usize, - dot_labels: &mut Vec, -) -> DataValue { - match dv { - DataValue::SymbolDiff { sym_a, sym_b, addend } => { - let mut needs_dot = None; - let new_a = rewrite_symbol_name(sym_a, stmt_idx, label_defs, dot_counter, &mut needs_dot); - if let Some(l) = needs_dot.take() { dot_labels.push(l); } - let new_b = rewrite_symbol_name(sym_b, stmt_idx, label_defs, dot_counter, &mut needs_dot); - if let Some(l) = needs_dot.take() { dot_labels.push(l); } - DataValue::SymbolDiff { sym_a: new_a, sym_b: new_b, addend: *addend } - } - DataValue::Symbol { name, addend } => { - let mut needs_dot = None; - let new_name = rewrite_symbol_name(name, stmt_idx, label_defs, dot_counter, &mut needs_dot); - if let Some(l) = needs_dot.take() { dot_labels.push(l); } - DataValue::Symbol { name: new_name, addend: *addend } - } - DataValue::Expression(expr) => { - // Rewrite numeric refs and '.' in expression strings - let resolved = rewrite_expr_numeric_refs(expr, stmt_idx, label_defs, dot_counter, dot_labels); - DataValue::Expression(resolved) - } - other => other.clone(), - } -} - -/// Rewrite numeric label refs and '.' inside a raw expression string. -fn rewrite_expr_numeric_refs( - expr: &str, - stmt_idx: usize, - label_defs: &HashMap>, - dot_counter: &mut usize, - dot_labels: &mut Vec, -) -> String { - // Replace standalone '.' that represents current position - // We need to be careful: '.' could appear in symbol names like '.Lfoo' - // The pattern we're looking for is '.' surrounded by operators or parens - let mut result = String::with_capacity(expr.len()); - let bytes = expr.as_bytes(); - let len = bytes.len(); - let mut i = 0; - while i < len { - if bytes[i] == b'.' { - // Check if this is a standalone '.' 
(current position) - let prev_is_sep = i == 0 || matches!(bytes[i-1], b' ' | b'(' | b')' | b'+' | b'-' | b'*' | b'/' | b'|' | b'&' | b'^' | b',' | b'~'); - let next_is_sep = i + 1 >= len || matches!(bytes[i+1], b' ' | b'(' | b')' | b'+' | b'-' | b'*' | b'/' | b'|' | b'&' | b'^' | b',' | b'~'); - if prev_is_sep && next_is_sep { - let label = format!(".Ldot_{}", *dot_counter); - *dot_counter += 1; - dot_labels.push(label.clone()); - result.push_str(&label); - i += 1; - continue; - } - } - // Try to match a numeric label reference (digits followed by 'f' or 'b') - if bytes[i].is_ascii_digit() { - let start = i; - while i < len && bytes[i].is_ascii_digit() { - i += 1; - } - if i < len && (bytes[i] == b'f' || bytes[i] == b'F' || bytes[i] == b'b' || bytes[i] == b'B') { - let next_after = if i + 1 < len { bytes[i + 1] } else { b' ' }; - // Must not be followed by an alphanumeric (to avoid matching hex or identifiers) - if !next_after.is_ascii_alphanumeric() && next_after != b'_' { - let ref_str = &expr[start..=i]; - if let Some(resolved) = resolve_numeric_ref_name(ref_str, stmt_idx, label_defs) { - result.push_str(&resolved); - i += 1; - continue; - } - } - } - // Not a numeric ref, push digits as-is - result.push_str(&expr[start..i]); - continue; - } - result.push(bytes[i] as char); - i += 1; - } - result -} - -/// Rewrite a list of DataValues, collecting any dot labels needed. -fn rewrite_data_values( - values: &[DataValue], - stmt_idx: usize, - label_defs: &HashMap>, - dot_counter: &mut usize, - dot_labels: &mut Vec, -) -> Vec { - values.iter().map(|dv| rewrite_data_value(dv, stmt_idx, label_defs, dot_counter, dot_labels)).collect() -} - -/// Rewrite numeric label references and '.' in a directive. -/// Returns a list of statements: possibly a synthetic label before the directive. 
-fn rewrite_numeric_refs_in_directive( - dir: &Directive, - stmt_idx: usize, - label_defs: &HashMap>, - dot_counter: &mut usize, -) -> Vec { - let mut dot_labels: Vec = Vec::new(); - let new_dir = match dir { - Directive::Byte(vals) => { - let new_vals = rewrite_data_values(vals, stmt_idx, label_defs, dot_counter, &mut dot_labels); - Directive::Byte(new_vals) - } - Directive::Short(vals) => { - let new_vals = rewrite_data_values(vals, stmt_idx, label_defs, dot_counter, &mut dot_labels); - Directive::Short(new_vals) - } - Directive::Long(vals) => { - let new_vals = rewrite_data_values(vals, stmt_idx, label_defs, dot_counter, &mut dot_labels); - Directive::Long(new_vals) - } - Directive::Quad(vals) => { - let new_vals = rewrite_data_values(vals, stmt_idx, label_defs, dot_counter, &mut dot_labels); - Directive::Quad(new_vals) - } - _ => dir.clone(), - }; - let mut stmts = Vec::new(); - // Insert synthetic dot labels before the directive - for label in dot_labels { - stmts.push(AsmStatement::Label(label)); - } - stmts.push(AsmStatement::Directive(new_dir)); - stmts -} - -impl ElfWriter { - pub fn new() -> Self { - Self { - base: ElfWriterBase::new(RISCV_NOP, 2), - pending_branch_relocs: Vec::new(), - pcrel_hi_counter: 0, - numeric_labels: HashMap::new(), - deferred_exprs: Vec::new(), - elf_flags: EF_RISCV_FLOAT_ABI_DOUBLE | EF_RISCV_RVC, - elf_class: ELFCLASS64, - no_relax: false, - option_stack: Vec::new(), - } - } - - /// Set the ELF e_flags (e.g., to change float ABI from the default double-float). - pub fn set_elf_flags(&mut self, flags: u32) { - self.elf_flags = flags; - } - - /// Set the ELF class (ELFCLASS32 or ELFCLASS64). - pub fn set_elf_class(&mut self, class: u8) { - self.elf_class = class; - } - - // R_RISCV_RELAX relocations are emitted alongside CALL_PLT, BRANCH, and - // JAL relocations so the linker can perform relaxation (shortening - // auipc+jalr to jal, etc.). 
This is required because linker relaxation - // changes code layout, which would invalidate any locally-resolved offsets. - - /// R_RISCV_RELAX ELF relocation type - const R_RISCV_RELAX: u32 = 51; - - /// R_RISCV_ALIGN ELF relocation type - marks alignment padding that the - /// linker may need to adjust during relaxation. - const R_RISCV_ALIGN: u32 = 43; - - /// Emit alignment padding with an R_RISCV_ALIGN relocation in executable - /// sections (when relaxation is enabled). The linker needs these to know - /// where alignment padding exists so it can re-align after relaxation - /// changes code sizes. - fn emit_align_with_reloc(&mut self, align_bytes: u64) { - if align_bytes <= 1 { - return; - } - let offset_before = self.base.current_offset(); - self.base.align_to(align_bytes); - let offset_after = self.base.current_offset(); - let padding = offset_after - offset_before; - if padding > 0 && !self.no_relax { - // Only emit R_RISCV_ALIGN in executable sections where linker - // relaxation may change code sizes and require re-alignment. - if let Some(s) = self.base.sections.get_mut(&self.base.current_section) { - if (s.sh_flags & SHF_EXECINSTR) != 0 { - s.relocs.push(ObjReloc { - offset: offset_before, - reloc_type: Self::R_RISCV_ALIGN, - symbol_name: String::new(), - addend: padding as i64, - }); - } - } - } - } - - /// Process all parsed assembly statements. - pub fn process_statements(&mut self, statements: &[AsmStatement]) -> Result<(), String> { - let statements = resolve_numeric_label_refs(statements); - for stmt in &statements { - self.process_statement(stmt)?; - } - // Merge subsections (e.g., .text.__subsection.1 → .text) before resolving - // relocations. This is critical for kernel ALTERNATIVE macros which use - // .subsection 1 to place alternative code within the same section. - let remap = self.base.merge_subsections(); - // Fix up pending references that pointed to now-merged subsection names. 
- // Deferred expressions and branch relocs created inside a subsection store the - // subsection name as their section; after merging, that section no longer exists. - // Remap them to the parent section with the correct offset adjustment. - if !remap.is_empty() { - for reloc in &mut self.pending_branch_relocs { - if let Some((parent, offset_adj)) = remap.get(&reloc.section) { - reloc.offset += offset_adj; - reloc.section = parent.clone(); - } - } - for expr in &mut self.deferred_exprs { - if let Some((parent, offset_adj)) = remap.get(&expr.section) { - expr.offset += offset_adj; - expr.section = parent.clone(); - } - } - } - self.resolve_deferred_exprs()?; - // Compression is disabled: the linker handles relaxation via - // R_RISCV_RELAX. Running our own compression would change code - // layout in ways the linker's relaxation pass doesn't expect. - // self.compress_executable_sections(); - self.resolve_local_branches()?; - Ok(()) - } - - fn process_statement(&mut self, stmt: &AsmStatement) -> Result<(), String> { - match stmt { - AsmStatement::Empty => Ok(()), - - AsmStatement::Label(name) => { - self.base.ensure_text_section(); - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.base.labels.insert(name.clone(), (section.clone(), offset)); - if is_numeric_label(name) { - self.numeric_labels - .entry(name.clone()) - .or_default() - .push((section, offset)); - } - Ok(()) - } - - AsmStatement::Directive(directive) => { - self.process_directive(directive) - } - - AsmStatement::Instruction { mnemonic, operands, raw_operands } => { - self.process_instruction(mnemonic, operands, raw_operands) - } - } - } - - fn process_directive(&mut self, directive: &Directive) -> Result<(), String> { - match directive { - Directive::PushSection(info) => { - self.base.push_section( - &info.name, - &info.flags, - info.flags_explicit, - Some(info.sec_type.as_str()), - ); - Ok(()) - } - Directive::PopSection => { - self.base.pop_section(); 
- Ok(()) - } - Directive::Previous => { - self.base.restore_previous_section(); - Ok(()) - } - Directive::Section(info) => { - self.base.process_section_directive( - &info.name, - &info.flags, - info.flags_explicit, - Some(info.sec_type.as_str()), - ); - Ok(()) - } - - Directive::Text => { - self.base.switch_to_standard_section(".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR); - Ok(()) - } - Directive::Data => { - self.base.switch_to_standard_section(".data", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE); - Ok(()) - } - Directive::Bss => { - self.base.switch_to_standard_section(".bss", SHT_NOBITS, SHF_ALLOC | SHF_WRITE); - Ok(()) - } - Directive::Rodata => { - self.base.switch_to_standard_section(".rodata", SHT_PROGBITS, SHF_ALLOC); - Ok(()) - } - - Directive::Globl(sym) => { - for s in sym.split(',') { - let s = s.trim(); - if !s.is_empty() { self.base.set_global(s); } - } - Ok(()) - } - Directive::Weak(sym) => { - for s in sym.split(',') { - let s = s.trim(); - if !s.is_empty() { self.base.set_weak(s); } - } - Ok(()) - } - - Directive::SymVisibility(sym, vis) => { - let v = match vis { - Visibility::Hidden => STV_HIDDEN, - Visibility::Protected => STV_PROTECTED, - Visibility::Internal => STV_INTERNAL, - }; - self.base.set_visibility(sym, v); - Ok(()) - } - - Directive::Type(sym, st) => { - let elf_type = match st { - SymbolType::Function => STT_FUNC, - SymbolType::Object => STT_OBJECT, - SymbolType::TlsObject => STT_TLS, - SymbolType::NoType => STT_NOTYPE, - }; - self.base.set_symbol_type(sym, elf_type); - Ok(()) - } - - Directive::Size(sym, size_expr) => { - match size_expr { - SizeExpr::CurrentMinus(label) => { - self.base.set_symbol_size(sym, Some(label), None); - } - SizeExpr::Absolute(size) => { - self.base.set_symbol_size(sym, None, Some(*size)); - } - } - Ok(()) - } - - Directive::Align(val) => { - // RISC-V .align N means 2^N bytes (same as .p2align) - let bytes = 1u64 << val; - self.emit_align_with_reloc(bytes); - Ok(()) - } - - Directive::Balign(val) => { - 
self.emit_align_with_reloc(*val); - Ok(()) - } - - Directive::Byte(values) => { - for dv in values { - match dv { - DataValue::Integer(v) => self.base.emit_bytes(&[*v as u8]), - DataValue::Symbol { name, addend } => { - // Try resolving as alias (.set/.equ) — may be a forward ref - let resolved = self.base.resolve_expr_aliases(name); - let deferred_expr = if *addend != 0 { - format!("({}) + ({})", name, addend) - } else { - name.clone() - }; - let eval_expr = if *addend != 0 { - format!("({}) + ({})", resolved, addend) - } else { - resolved.clone() - }; - if let Ok(v) = crate::backend::asm_expr::parse_integer_expr(&eval_expr) { - self.base.emit_bytes(&[v as u8]); - } else { - // Defer: alias not yet defined (forward reference) - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.deferred_exprs.push(DeferredExpr { - section, - offset, - size: 1, - expr: deferred_expr, - }); - self.base.emit_placeholder(1); - } - } - _ => self.base.emit_bytes(&[0u8]), - } - } - Ok(()) - } - - Directive::Short(values) => { - for dv in values { - match dv { - DataValue::Integer(v) => self.base.emit_bytes(&(*v as u16).to_le_bytes()), - DataValue::Expression(expr) => { - let resolved = self.base.resolve_expr_aliases(expr); - let resolved = self.base.resolve_expr_labels(&resolved); - match crate::backend::asm_expr::parse_integer_expr(&resolved) { - Ok(v) => self.base.emit_bytes(&(v as u16).to_le_bytes()), - Err(_) => { - // Defer: expression contains forward references - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.deferred_exprs.push(DeferredExpr { - section, - offset, - size: 2, - expr: expr.clone(), - }); - self.base.emit_placeholder(2); - } - } - } - DataValue::Symbol { name, addend } => { - // Try resolving as alias (.set/.equ) — may be a forward ref - let resolved = self.base.resolve_expr_aliases(name); - let deferred_expr = if *addend != 0 { - format!("({}) + ({})", name, addend) - 
} else { - name.clone() - }; - let eval_expr = if *addend != 0 { - format!("({}) + ({})", resolved, addend) - } else { - resolved.clone() - }; - if let Ok(v) = crate::backend::asm_expr::parse_integer_expr(&eval_expr) { - self.base.emit_bytes(&(v as u16).to_le_bytes()); - } else { - // Defer: alias not yet defined (forward reference) - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.deferred_exprs.push(DeferredExpr { - section, - offset, - size: 2, - expr: deferred_expr, - }); - self.base.emit_placeholder(2); - } - } - DataValue::SymbolDiff { sym_a, sym_b, addend } => { - let add_type = RelocType::Add16.elf_type(); - let sub_type = RelocType::Sub16.elf_type(); - let (base_a, extra_a) = decompose_symbol_addend(sym_a); - let (base_b, extra_b) = decompose_symbol_addend(sym_b); - self.base.add_reloc(add_type, base_a, *addend + extra_a); - self.base.add_reloc(sub_type, base_b, extra_b); - self.base.emit_placeholder(2); - } - } - } - Ok(()) - } - - Directive::Long(values) => { - for dv in values { - self.emit_data_value(dv, 4)?; - } - Ok(()) - } - - Directive::Quad(values) => { - for dv in values { - self.emit_data_value(dv, 8)?; - } - Ok(()) - } - - Directive::Zero { size, fill } => { - self.base.emit_bytes(&vec![*fill; *size]); - Ok(()) - } - - Directive::Asciz(s) => { - self.base.emit_bytes(s); - self.base.emit_bytes(&[0]); - Ok(()) - } - - Directive::Ascii(s) => { - self.base.emit_bytes(s); - Ok(()) - } - - Directive::Comm { sym, size, align } => { - self.base.emit_comm(sym, *size, *align); - Ok(()) - } - - Directive::Local(_) => Ok(()), - - Directive::Set(alias, target) => { - // If the target expression contains '.', resolve it as - // current offset, then try to evaluate to a constant. 
- let mut resolved = self.base.resolve_expr_aliases(target); - if resolved.contains('.') { - resolved = self.base.resolve_expr_labels(&resolved); - } - // Try to evaluate to a constant; if so, store the constant - if let Ok(v) = crate::backend::asm_expr::parse_integer_expr(&resolved) { - self.base.set_alias(alias, &v.to_string()); - } else { - self.base.set_alias(alias, &resolved); - } - Ok(()) - } - - Directive::ArchOption(opt) => { - let opt = opt.trim(); - if opt == "norelax" { - self.no_relax = true; - } else if opt == "relax" { - self.no_relax = false; - } else if opt == "push" { - self.option_stack.push(self.no_relax); - } else if opt == "pop" { - if let Some(saved) = self.option_stack.pop() { - self.no_relax = saved; - } - } - // rvc/norvc are silently accepted (compression not yet supported) - Ok(()) - } - - Directive::Attribute(_) => Ok(()), - - Directive::Cfi | Directive::Ignored => Ok(()), - - Directive::Insn(args) => { - self.base.ensure_text_section(); - match encode_insn_directive(args) { - Ok(EncodeResult::Word(word)) => { - self.base.emit_u32_le(word); - Ok(()) - } - Ok(EncodeResult::Half(half)) => { - self.base.emit_u16_le(half); - Ok(()) - } - Ok(_) => Ok(()), - Err(e) => Err(e), - } - } - - Directive::Incbin { path, skip, count } => { - let data = std::fs::read(path) - .map_err(|e| format!(".incbin: failed to read '{}': {}", path, e))?; - let skip = *skip as usize; - let data = if skip < data.len() { &data[skip..] } else { &[] }; - let data = match count { - Some(c) => { - let c = *c as usize; - if c < data.len() { &data[..c] } else { data } - } - None => data, - }; - self.base.emit_bytes(data); - Ok(()) - } - - Directive::Subsection(n) => { - self.base.set_subsection(*n); - Ok(()) - } - - Directive::Unknown { name, args } => { - Err(format!("unsupported RISC-V assembler directive: {} {}", name, args)) - } - } - } - - /// Emit a typed data value for .long (size=4) or .quad (size=8). 
- /// - /// Note: `SymbolDiff` sym_a/sym_b may contain embedded addends (e.g. - /// `"cgroup_bpf_enabled_key+144"`) from inline asm operand substitution. - /// These are decomposed via [`decompose_symbol_addend`] so the ELF - /// relocation references the base symbol with a proper addend. - fn emit_data_value(&mut self, dv: &DataValue, size: usize) -> Result<(), String> { - match dv { - DataValue::SymbolDiff { sym_a, sym_b, addend } => { - let (add_type, sub_type) = if size == 4 { - (RelocType::Add32.elf_type(), RelocType::Sub32.elf_type()) - } else { - (RelocType::Add64.elf_type(), RelocType::Sub64.elf_type()) - }; - // Decompose sym_a if it contains an embedded addend (e.g. - // "cgroup_bpf_enabled_key+144") so the relocation references - // the base symbol with a numeric addend rather than creating a - // bogus symbol named "symbol+offset". - let (base_a, extra_a) = decompose_symbol_addend(sym_a); - let (base_b, extra_b) = decompose_symbol_addend(sym_b); - self.base.add_reloc(add_type, base_a, *addend + extra_a); - self.base.add_reloc(sub_type, base_b, extra_b); - self.base.emit_placeholder(size); - } - DataValue::Symbol { name, addend } => { - // Try resolving as alias (.set/.equ) first — the "symbol" - // may actually be a compile-time constant defined via .set. 
- let resolved = self.base.resolve_expr_aliases(name); - if let Ok(v) = crate::backend::asm_expr::parse_integer_expr(&resolved) { - self.base.emit_data_integer(v + addend, size); - } else { - let reloc_type = if size == 4 { - RelocType::Abs32.elf_type() - } else { - RelocType::Abs64.elf_type() - }; - self.base.emit_data_symbol_ref(name, *addend, size, reloc_type); - } - } - DataValue::Integer(v) => { - self.base.emit_data_integer(*v, size); - } - DataValue::Expression(expr) => { - let mut resolved = self.base.resolve_expr_aliases(expr); - // Resolve .Ldot_N synthetic labels to current offset - resolved = self.base.resolve_expr_labels(&resolved); - match crate::backend::asm_expr::parse_integer_expr(&resolved) { - Ok(v) => self.base.emit_data_integer(v, size), - Err(_) => { - // Expression contains unresolved symbols (e.g., forward references). - // Defer resolution until all labels are known. - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.deferred_exprs.push(DeferredExpr { - section, - offset, - size, - expr: expr.clone(), - }); - // Emit placeholder bytes that will be patched later - self.base.emit_placeholder(size); - } - } - } - } - Ok(()) - } - - /// Resolve deferred data expressions now that all labels are known. - fn resolve_deferred_exprs(&mut self) -> Result<(), String> { - let deferred = std::mem::take(&mut self.deferred_exprs); - for def in &deferred { - // Re-resolve with all labels now available (using stored label offsets) - let resolved = self.base.resolve_expr_aliases(&def.expr); - let resolved = self.base.resolve_expr_all_labels(&resolved, &def.section); - let value = match crate::backend::asm_expr::parse_integer_expr(&resolved) { - Ok(v) => v, - Err(_) => { - // Cross-section label reference: try resolving labels from - // ANY section. This handles kernel ALTERNATIVE macros where - // .2byte expressions in .alternative reference labels placed - // in .text (or a subsection merged into .text). 
- let cross_resolved = self.base.resolve_expr_cross_section(&def.expr); - // TODO: emit a warning on Err instead of silently producing 0. - // This fallback handles cases where macro argument splitting - // produces single-symbol expressions (e.g., ".Lnum_889_0" - // from "889f - 888f" being split by whitespace in macro args). - // Proper fix: make split_macro_args respect parameter count. - crate::backend::asm_expr::parse_integer_expr(&cross_resolved).unwrap_or_default() - } - }; - if let Some(section) = self.base.sections.get_mut(&def.section) { - let off = def.offset as usize; - if def.size == 4 && off + 4 <= section.data.len() { - section.data[off..off + 4].copy_from_slice(&(value as u32).to_le_bytes()); - } else if def.size == 8 && off + 8 <= section.data.len() { - section.data[off..off + 8].copy_from_slice(&value.to_le_bytes()); - } else if def.size == 2 && off + 2 <= section.data.len() { - section.data[off..off + 2].copy_from_slice(&(value as u16).to_le_bytes()); - } else if def.size == 1 && off < section.data.len() { - section.data[off] = value as u8; - } - } - } - Ok(()) - } - - fn process_instruction(&mut self, mnemonic: &str, operands: &[Operand], raw_operands: &str) -> Result<(), String> { - self.base.ensure_text_section(); - - match encode_instruction(mnemonic, operands, raw_operands) { - Ok(EncodeResult::Word(word)) => { - self.base.emit_u32_le(word); - Ok(()) - } - Ok(EncodeResult::Half(half)) => { - self.base.emit_u16_le(half); - Ok(()) - } - Ok(EncodeResult::WordWithReloc { word, reloc }) => { - let elf_type = reloc.reloc_type.elf_type(); - let is_pcrel_hi = elf_type == 23 || elf_type == 20 || elf_type == 22 || elf_type == 21; - - if is_pcrel_hi { - let label = format!(".Lpcrel_hi{}", self.pcrel_hi_counter); - self.pcrel_hi_counter += 1; - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.base.labels.insert(label, (section, offset)); - } - - // For BRANCH (16), JAL (17): always emit as external 
relocations - // so the linker resolves offsets correctly after relaxation. - // For CALL_PLT (19): emit with paired R_RISCV_RELAX. - let is_branch_or_jal = elf_type == 16 || elf_type == 17; - let is_call_plt = elf_type == 19; - - if is_call_plt { - self.base.add_reloc(elf_type, reloc.symbol.clone(), reloc.addend); - if !self.no_relax { - self.base.add_reloc(Self::R_RISCV_RELAX, String::new(), 0); - } - self.base.emit_u32_le(word); - } else if is_branch_or_jal { - self.base.add_reloc(elf_type, reloc.symbol.clone(), reloc.addend); - self.base.emit_u32_le(word); - } else { - let is_local = reloc.symbol.starts_with(".L") || reloc.symbol.starts_with(".l") - || parse_numeric_label_ref(&reloc.symbol).is_some(); - - if is_local { - let offset = self.base.current_offset(); - self.pending_branch_relocs.push(PendingReloc { - section: self.base.current_section.clone(), - offset, - reloc_type: elf_type, - symbol: reloc.symbol.clone(), - addend: reloc.addend, - pcrel_hi_offset: None, - }); - self.base.emit_u32_le(word); - } else { - self.base.add_reloc(elf_type, reloc.symbol.clone(), reloc.addend); - self.base.emit_u32_le(word); - } - } - Ok(()) - } - Ok(EncodeResult::Words(words)) => { - for word in words { - self.base.emit_u32_le(word); - } - Ok(()) - } - Ok(EncodeResult::WordsWithRelocs(items)) => { - let mut pcrel_hi_label: Option = None; - - for (word, reloc_opt) in &items { - if let Some(reloc) = reloc_opt { - let elf_type = reloc.reloc_type.elf_type(); - let is_pcrel_hi = elf_type == 23; - let is_got_hi = elf_type == 20; - let is_tls_gd_hi = elf_type == 22; - let is_tls_got_hi = elf_type == 21; - - if is_pcrel_hi || is_got_hi || is_tls_gd_hi || is_tls_got_hi { - let label = format!(".Lpcrel_hi{}", self.pcrel_hi_counter); - self.pcrel_hi_counter += 1; - let section = self.base.current_section.clone(); - let offset = self.base.current_offset(); - self.base.labels.insert(label.clone(), (section, offset)); - pcrel_hi_label = Some(label); - - self.base.add_reloc(elf_type, 
reloc.symbol.clone(), reloc.addend); - if !self.no_relax { - self.base.add_reloc(Self::R_RISCV_RELAX, String::new(), 0); - } - self.base.emit_u32_le(*word); - continue; - } - - let is_pcrel_lo12_i = elf_type == 24; - let is_pcrel_lo12_s = elf_type == 25; - - if let Some(hi_label) = pcrel_hi_label.as_ref().filter(|_| is_pcrel_lo12_i || is_pcrel_lo12_s) { - let hi_label = hi_label.clone(); - self.base.add_reloc(elf_type, hi_label, 0); - if !self.no_relax { - self.base.add_reloc(Self::R_RISCV_RELAX, String::new(), 0); - } - self.base.emit_u32_le(*word); - continue; - } - - // For BRANCH (16), JAL (17): always emit as external - // relocations. For CALL_PLT (19): emit with R_RISCV_RELAX. - let is_branch_or_jal = elf_type == 16 || elf_type == 17; - let is_call_plt = elf_type == 19; - if is_call_plt { - self.base.add_reloc(elf_type, reloc.symbol.clone(), reloc.addend); - if !self.no_relax { - self.base.add_reloc(Self::R_RISCV_RELAX, String::new(), 0); - } - } else if is_branch_or_jal { - self.base.add_reloc(elf_type, reloc.symbol.clone(), reloc.addend); - } else { - let is_local = reloc.symbol.starts_with(".L") || reloc.symbol.starts_with(".l") - || parse_numeric_label_ref(&reloc.symbol).is_some(); - if is_local { - let offset = self.base.current_offset(); - self.pending_branch_relocs.push(PendingReloc { - section: self.base.current_section.clone(), - offset, - reloc_type: elf_type, - symbol: reloc.symbol.clone(), - addend: reloc.addend, - pcrel_hi_offset: None, - }); - } else { - self.base.add_reloc(elf_type, reloc.symbol.clone(), reloc.addend); - } - } - } - self.base.emit_u32_le(*word); - } - Ok(()) - } - Ok(EncodeResult::Skip) => Ok(()), - Err(e) => Err(e), - } - } - - /// Compress eligible 32-bit instructions in executable sections to 16-bit - /// RV64C equivalents. 
- fn compress_executable_sections(&mut self) { - let exec_sections: Vec = self.base.sections.iter() - .filter(|(_, s)| (s.sh_flags & SHF_EXECINSTR) != 0) - .map(|(name, _)| name.clone()) - .collect(); - - for sec_name in &exec_sections { - let mut reloc_offsets = HashSet::new(); - - for pr in &self.pending_branch_relocs { - if pr.section == *sec_name { - reloc_offsets.insert(pr.offset); - if pr.reloc_type == 19 { - reloc_offsets.insert(pr.offset + 4); - } - } - } - - if let Some(section) = self.base.sections.get(sec_name) { - for r in §ion.relocs { - reloc_offsets.insert(r.offset); - if r.reloc_type == 19 { - reloc_offsets.insert(r.offset + 4); - } - } - } - - let section_data = match self.base.sections.get(sec_name) { - Some(s) => s.data.clone(), - None => continue, - }; - - let (new_data, offset_map) = compress::compress_section(§ion_data, &reloc_offsets); - - if new_data.len() == section_data.len() { - continue; - } - - if let Some(section) = self.base.sections.get_mut(sec_name) { - section.data = new_data; - for r in &mut section.relocs { - r.offset = compress::remap_offset(r.offset, &offset_map); - } - } - - for pr in &mut self.pending_branch_relocs { - if pr.section == *sec_name { - pr.offset = compress::remap_offset(pr.offset, &offset_map); - } - } - - for (_, (label_sec, label_offset)) in self.base.labels.iter_mut() { - if label_sec == sec_name { - *label_offset = compress::remap_offset(*label_offset, &offset_map); - } - } - - for (_, defs) in self.numeric_labels.iter_mut() { - for (def_sec, def_offset) in defs.iter_mut() { - if def_sec == sec_name { - *def_offset = compress::remap_offset(*def_offset, &offset_map); - } - } - } - - for sym in &mut self.base.extra_symbols { - if sym.section_name == *sec_name { - sym.value = compress::remap_offset(sym.value, &offset_map); - } - } - } - } - - /// Resolve a numeric label reference like "1b" or "1f" to a (section, offset). 
- fn resolve_numeric_label_ref( - &self, - label_name: &str, - is_backward: bool, - ref_section: &str, - ref_offset: u64, - ) -> Option<(String, u64)> { - let defs = self.numeric_labels.get(label_name)?; - if is_backward { - let mut best: Option<&(String, u64)> = None; - for def in defs { - if def.0 == ref_section && def.1 <= ref_offset { - best = Some(def); - } - } - best.cloned() - } else { - for def in defs { - if def.0 == ref_section && def.1 > ref_offset { - return Some(def.clone()); - } - } - None - } - } - - /// Resolve local branch labels to PC-relative offsets using RISC-V relocation types. - fn resolve_local_branches(&mut self) -> Result<(), String> { - for reloc in &self.pending_branch_relocs { - // pcrel_lo12 relocations must always be emitted as external relocations - // so the linker can pair them with their corresponding pcrel_hi20. - let is_pcrel_lo = reloc.reloc_type == 24 || reloc.reloc_type == 25; - if is_pcrel_lo { - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: reloc.symbol.clone(), - addend: reloc.addend, - }); - } - continue; - } - - let resolved = if let Some((label_name, is_backward)) = parse_numeric_label_ref(&reloc.symbol) { - self.resolve_numeric_label_ref(label_name, is_backward, &reloc.section, reloc.offset) - } else { - self.base.labels.get(&reloc.symbol).cloned() - }; - - let (target_section, target_offset) = match resolved { - Some(v) => v, - None => { - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: reloc.symbol.clone(), - addend: reloc.addend, - }); - } - continue; - } - }; - - if target_section != reloc.section { - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: 
reloc.symbol.clone(), - addend: reloc.addend, - }); - } - continue; - } - - let ref_offset = reloc.pcrel_hi_offset.unwrap_or(reloc.offset); - let pc_offset = (target_offset as i64) - (ref_offset as i64) + reloc.addend; - - if let Some(section) = self.base.sections.get_mut(&reloc.section) { - let instr_offset = reloc.offset as usize; - - match reloc.reloc_type { - 16 => { - // R_RISCV_BRANCH (B-type, 12-bit) - if instr_offset + 4 > section.data.len() { continue; } - let mut word = u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - section.data[instr_offset + 3], - ]); - let imm = pc_offset as u32; - let bit12 = (imm >> 12) & 1; - let bit11 = (imm >> 11) & 1; - let bits10_5 = (imm >> 5) & 0x3F; - let bits4_1 = (imm >> 1) & 0xF; - word &= 0x01FFF07F; - word |= (bit12 << 31) | (bits10_5 << 25) | (bits4_1 << 8) | (bit11 << 7); - section.data[instr_offset..instr_offset + 4].copy_from_slice(&word.to_le_bytes()); - } - 17 => { - // R_RISCV_JAL (J-type, 20-bit) - if instr_offset + 4 > section.data.len() { continue; } - let mut word = u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - section.data[instr_offset + 3], - ]); - let imm = pc_offset as u32; - let bit20 = (imm >> 20) & 1; - let bits10_1 = (imm >> 1) & 0x3FF; - let bit11 = (imm >> 11) & 1; - let bits19_12 = (imm >> 12) & 0xFF; - word &= 0x00000FFF; - word |= (bit20 << 31) | (bits10_1 << 21) | (bit11 << 20) | (bits19_12 << 12); - section.data[instr_offset..instr_offset + 4].copy_from_slice(&word.to_le_bytes()); - } - 19 => { - // R_RISCV_CALL_PLT (AUIPC + JALR pair, 8 bytes) - if instr_offset + 8 > section.data.len() { continue; } - - let hi = ((pc_offset as i32 + 0x800) >> 12) as u32; - let lo = ((pc_offset as i32) << 20 >> 20) as u32; - - let mut auipc = u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - 
section.data[instr_offset + 3], - ]); - auipc = (auipc & 0xFFF) | (hi << 12); - section.data[instr_offset..instr_offset + 4].copy_from_slice(&auipc.to_le_bytes()); - - let mut jalr = u32::from_le_bytes([ - section.data[instr_offset + 4], - section.data[instr_offset + 5], - section.data[instr_offset + 6], - section.data[instr_offset + 7], - ]); - jalr = (jalr & 0xFFFFF) | ((lo & 0xFFF) << 20); - section.data[instr_offset + 4..instr_offset + 8].copy_from_slice(&jalr.to_le_bytes()); - } - 23 => { - // R_RISCV_PCREL_HI20 (AUIPC hi20) - if instr_offset + 4 > section.data.len() { continue; } - let hi = ((pc_offset as i32 + 0x800) >> 12) as u32; - let mut word = u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - section.data[instr_offset + 3], - ]); - word = (word & 0xFFF) | (hi << 12); - section.data[instr_offset..instr_offset + 4].copy_from_slice(&word.to_le_bytes()); - } - 24 => { - // R_RISCV_PCREL_LO12_I (ADDI/LD lo12 I-type) - if instr_offset + 4 > section.data.len() { continue; } - let lo = (pc_offset as i32) & 0xFFF; - let mut word = u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - section.data[instr_offset + 3], - ]); - word = (word & 0xFFFFF) | (((lo as u32) & 0xFFF) << 20); - section.data[instr_offset..instr_offset + 4].copy_from_slice(&word.to_le_bytes()); - } - 25 => { - // R_RISCV_PCREL_LO12_S (SW/SD lo12 S-type) - if instr_offset + 4 > section.data.len() { continue; } - let lo = (pc_offset as i32) & 0xFFF; - let mut word = u32::from_le_bytes([ - section.data[instr_offset], - section.data[instr_offset + 1], - section.data[instr_offset + 2], - section.data[instr_offset + 3], - ]); - let imm_lo = (lo as u32) & 0x1F; - let imm_hi = ((lo as u32) >> 5) & 0x7F; - word &= 0x01FFF07F; - word |= (imm_hi << 25) | (imm_lo << 7); - section.data[instr_offset..instr_offset + 4].copy_from_slice(&word.to_le_bytes()); - } - _ => { - 
section.relocs.push(ObjReloc { - offset: reloc.offset, - reloc_type: reloc.reloc_type, - symbol_name: reloc.symbol.clone(), - addend: reloc.addend, - }); - } - } - } - } - Ok(()) - } - - /// Write the final ELF object file. - pub fn write_elf(&mut self, output_path: &str) -> Result<(), String> { - let config = elf::ElfConfig { - e_machine: EM_RISCV, - e_flags: self.elf_flags, - elf_class: self.elf_class, - // RISC-V always uses RELA relocations, even in 32-bit mode - force_rela: true, - }; - // RISC-V needs include_referenced_locals=true for pcrel_hi synthetic labels - self.base.write_elf(output_path, &config, true) - } -} diff --git a/src/backend/riscv/assembler/encoder/atomics.rs b/src/backend/riscv/assembler/encoder/atomics.rs deleted file mode 100644 index b9ceeb59e8..0000000000 --- a/src/backend/riscv/assembler/encoder/atomics.rs +++ /dev/null @@ -1,106 +0,0 @@ -use super::*; - -// ── Atomics ── - -pub(crate) fn encode_lr(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - let (rs1, _offset) = get_mem(operands, 1)?; - // LR: funct7 = 00010 | aq | rl, rs2 = 0 - let funct7 = 0b0001000; // aq=0, rl=0 by default - Ok(EncodeResult::Word(encode_r(OP_AMO, rd, funct3, rs1, 0, funct7))) -} - -pub(crate) fn encode_sc(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let (rs1, _offset) = get_mem(operands, 2)?; - let funct7 = 0b0001100; // SC: 00011 | aq=0 | rl=0 - Ok(EncodeResult::Word(encode_r(OP_AMO, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_amo(operands: &[Operand], funct3: u32, funct5: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let (rs1, _offset) = get_mem(operands, 2)?; - let funct7 = funct5 << 2; // aq=0, rl=0 - Ok(EncodeResult::Word(encode_r(OP_AMO, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_lr_suffixed(mnemonic: &str, operands: &[Operand]) -> Result { - // Parse lr.w, lr.d, lr.w.aq, 
lr.w.rl, lr.w.aqrl, etc. - let parts: Vec<&str> = mnemonic.split('.').collect(); - let funct3 = match parts.get(1).copied() { - Some("w") => 0b010, - Some("d") => 0b011, - _ => return Err(format!("lr: invalid width: {}", mnemonic)), - }; - let (aq, rl) = parse_aq_rl(&parts[2..]); - let rd = get_reg(operands, 0)?; - let (rs1, _) = get_mem(operands, 1)?; - let funct7 = (0b00010 << 2) | (aq << 1) | rl; - Ok(EncodeResult::Word(encode_r(OP_AMO, rd, funct3, rs1, 0, funct7))) -} - -pub(crate) fn encode_sc_suffixed(mnemonic: &str, operands: &[Operand]) -> Result { - let parts: Vec<&str> = mnemonic.split('.').collect(); - let funct3 = match parts.get(1).copied() { - Some("w") => 0b010, - Some("d") => 0b011, - _ => return Err(format!("sc: invalid width: {}", mnemonic)), - }; - let (aq, rl) = parse_aq_rl(&parts[2..]); - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let (rs1, _) = get_mem(operands, 2)?; - let funct7 = (0b00011 << 2) | (aq << 1) | rl; - Ok(EncodeResult::Word(encode_r(OP_AMO, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_amo_suffixed(mnemonic: &str, operands: &[Operand]) -> Result { - // Parse e.g. amoswap.w.aqrl, amoadd.d.aq, etc. 
- let parts: Vec<&str> = mnemonic.split('.').collect(); - if parts.len() < 2 { - return Err(format!("amo: invalid mnemonic: {}", mnemonic)); - } - - let op_name = parts[0]; // e.g., "amoswap", "amoadd" - let funct3 = match parts.get(1).copied() { - Some("w") => 0b010, - Some("d") => 0b011, - _ => return Err(format!("amo: invalid width in {}", mnemonic)), - }; - let (aq, rl) = parse_aq_rl(&parts[2..]); - - let funct5 = match op_name { - "amoswap" => 0b00001, - "amoadd" => 0b00000, - "amoxor" => 0b00100, - "amoand" => 0b01100, - "amoor" => 0b01000, - "amomin" => 0b10000, - "amomax" => 0b10100, - "amominu" => 0b11000, - "amomaxu" => 0b11100, - _ => return Err(format!("amo: unknown op: {}", op_name)), - }; - - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let (rs1, _) = get_mem(operands, 2)?; - let funct7 = (funct5 << 2) | (aq << 1) | rl; - Ok(EncodeResult::Word(encode_r(OP_AMO, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn parse_aq_rl(suffixes: &[&str]) -> (u32, u32) { - let mut aq = 0u32; - let mut rl = 0u32; - for s in suffixes { - match *s { - "aq" => aq = 1, - "rl" => rl = 1, - "aqrl" => { aq = 1; rl = 1; } - _ => {} - } - } - (aq, rl) -} diff --git a/src/backend/riscv/assembler/encoder/base.rs b/src/backend/riscv/assembler/encoder/base.rs deleted file mode 100644 index f0a165ee84..0000000000 --- a/src/backend/riscv/assembler/encoder/base.rs +++ /dev/null @@ -1,320 +0,0 @@ -use super::*; - -// ── Instruction encoders ────────────────────────────────────────────── - -pub(crate) fn encode_lui(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - match &operands.get(1) { - Some(Operand::Imm(imm)) => { - Ok(EncodeResult::Word(encode_u(OP_LUI, rd, (*imm as u32) << 12))) - } - Some(Operand::Symbol(s)) => { - // %hi(symbol) - Ok(EncodeResult::WordWithReloc { - word: encode_u(OP_LUI, rd, 0), - reloc: Relocation { - reloc_type: if s.starts_with("%tprel_hi(") { - RelocType::TprelHi20 - } else { - RelocType::Hi20 - }, - symbol: 
extract_modifier_symbol(s), - addend: 0, - }, - }) - } - _ => Err("lui: invalid operands".to_string()), - } -} - -pub(crate) fn encode_auipc(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - match &operands.get(1) { - Some(Operand::Imm(imm)) => { - Ok(EncodeResult::Word(encode_u(OP_AUIPC, rd, (*imm as u32) << 12))) - } - Some(Operand::Symbol(s)) => { - let (reloc_type, symbol) = parse_reloc_modifier(s); - Ok(EncodeResult::WordWithReloc { - word: encode_u(OP_AUIPC, rd, 0), - reloc: Relocation { - reloc_type, - symbol, - addend: 0, - }, - }) - } - _ => Err("auipc: invalid operands".to_string()), - } -} - -pub(crate) fn encode_jal(operands: &[Operand]) -> Result { - // jal rd, offset OR jal offset (rd = ra) - if operands.len() == 1 { - // jal offset (implicit rd = ra) - match &operands[0] { - Operand::Imm(imm) => { - Ok(EncodeResult::Word(encode_j(OP_JAL, 1, *imm as i32))) - } - Operand::Symbol(s) | Operand::Label(s) | Operand::Reg(s) => { - Ok(EncodeResult::WordWithReloc { - word: encode_j(OP_JAL, 1, 0), - reloc: Relocation { - reloc_type: RelocType::Jal, - symbol: s.clone(), - addend: 0, - }, - }) - } - _ => Err("jal: invalid operand".to_string()), - } - } else { - let rd = get_reg(operands, 0)?; - match &operands[1] { - Operand::Imm(imm) => { - Ok(EncodeResult::Word(encode_j(OP_JAL, rd, *imm as i32))) - } - Operand::Symbol(s) | Operand::Label(s) | Operand::Reg(s) => { - Ok(EncodeResult::WordWithReloc { - word: encode_j(OP_JAL, rd, 0), - reloc: Relocation { - reloc_type: RelocType::Jal, - symbol: s.clone(), - addend: 0, - }, - }) - } - _ => Err("jal: invalid operand".to_string()), - } - } -} - -pub(crate) fn encode_jalr(operands: &[Operand]) -> Result { - // jalr rd, rs1, offset OR jalr rd, offset(rs1) OR jalr rs1 - match operands.len() { - 1 => { - // jalr rs1 (rd = ra, offset = 0) - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_JALR, 1, 0, rs1, 0))) - } - 2 => { - // jalr rd, rs1 (offset = 0) - let rd = get_reg(operands, 
0)?; - match &operands[1] { - Operand::Reg(name) => { - let rs1 = reg_num(name).ok_or("invalid register")?; - Ok(EncodeResult::Word(encode_i(OP_JALR, rd, 0, rs1, 0))) - } - Operand::Mem { base, offset } => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - Ok(EncodeResult::Word(encode_i(OP_JALR, rd, 0, rs1, *offset as i32))) - } - _ => Err("jalr: invalid operands".to_string()), - } - } - 3 => { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let imm = get_imm(operands, 2)?; - Ok(EncodeResult::Word(encode_i(OP_JALR, rd, 0, rs1, imm as i32))) - } - _ => Err("jalr: wrong number of operands".to_string()), - } -} - -pub(crate) fn encode_branch_instr(operands: &[Operand], funct3: u32) -> Result { - let rs1 = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - - match &operands.get(2) { - Some(Operand::Imm(imm)) => { - Ok(EncodeResult::Word(encode_b(OP_BRANCH, funct3, rs1, rs2, *imm as i32))) - } - Some(Operand::Symbol(s)) | Some(Operand::Label(s)) | Some(Operand::Reg(s)) => { - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, funct3, rs1, rs2, 0), - reloc: Relocation { - reloc_type: RelocType::Branch, - symbol: s.clone(), - addend: 0, - }, - }) - } - _ => Err("branch: expected offset or label as 3rd operand".to_string()), - } -} - -pub(crate) fn encode_load(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - match &operands.get(1) { - Some(Operand::Mem { base, offset }) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - Ok(EncodeResult::Word(encode_i(OP_LOAD, rd, funct3, rs1, *offset as i32))) - } - Some(Operand::MemSymbol { base, symbol, .. 
}) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - let (reloc_type, sym) = parse_reloc_modifier(symbol); - // Use Lo12I for load-type relocations - let reloc_type = match reloc_type { - RelocType::PcrelHi20 => RelocType::PcrelLo12I, - RelocType::Hi20 => RelocType::Lo12I, - RelocType::TprelHi20 => RelocType::TprelLo12I, - other => other, - }; - Ok(EncodeResult::WordWithReloc { - word: encode_i(OP_LOAD, rd, funct3, rs1, 0), - reloc: Relocation { - reloc_type, - symbol: sym, - addend: 0, - }, - }) - } - // Bare symbol: "ld rd, symbol" pseudo-instruction - // Expand to: auipc rd, %pcrel_hi(symbol) ; ld rd, 0(rd) - // with R_RISCV_PCREL_HI20 on auipc and R_RISCV_PCREL_LO12_I on ld - Some(Operand::Symbol(s)) | Some(Operand::Label(s)) => { - Ok(EncodeResult::WordsWithRelocs(vec![ - (encode_u(OP_AUIPC, rd, 0), Some(Relocation { - reloc_type: RelocType::PcrelHi20, - symbol: s.clone(), - addend: 0, - })), - (encode_i(OP_LOAD, rd, funct3, rd, 0), Some(Relocation { - reloc_type: RelocType::PcrelLo12I, - symbol: s.clone(), - addend: 0, - })), - ])) - } - _ => Err("load: expected memory operand".to_string()), - } -} - -pub(crate) fn encode_store(operands: &[Operand], funct3: u32) -> Result { - let rs2 = get_reg(operands, 0)?; - match &operands.get(1) { - Some(Operand::Mem { base, offset }) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - Ok(EncodeResult::Word(encode_s(OP_STORE, funct3, rs1, rs2, *offset as i32))) - } - Some(Operand::MemSymbol { base, symbol, .. 
}) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - let (reloc_type, sym) = parse_reloc_modifier(symbol); - let reloc_type = match reloc_type { - RelocType::PcrelHi20 => RelocType::PcrelLo12S, - RelocType::Hi20 => RelocType::Lo12S, - RelocType::TprelHi20 => RelocType::TprelLo12S, - other => other, - }; - Ok(EncodeResult::WordWithReloc { - word: encode_s(OP_STORE, funct3, rs1, rs2, 0), - reloc: Relocation { - reloc_type, - symbol: sym, - addend: 0, - }, - }) - } - _ => Err("store: expected memory operand".to_string()), - } -} - -pub(crate) fn encode_alu_imm(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - match &operands.get(2) { - Some(Operand::Imm(imm)) => { - Ok(EncodeResult::Word(encode_i(OP_OP_IMM, rd, funct3, rs1, *imm as i32))) - } - Some(Operand::Symbol(s)) => { - let (reloc_type, sym) = parse_reloc_modifier(s); - let reloc_type = match reloc_type { - RelocType::PcrelHi20 => RelocType::PcrelLo12I, - RelocType::Hi20 => RelocType::Lo12I, - RelocType::TprelHi20 => RelocType::TprelLo12I, - other => other, - }; - Ok(EncodeResult::WordWithReloc { - word: encode_i(OP_OP_IMM, rd, funct3, rs1, 0), - reloc: Relocation { - reloc_type, - symbol: sym, - addend: 0, - }, - }) - } - _ => Err("alu_imm: expected immediate".to_string()), - } -} - -pub(crate) fn encode_shift_imm(operands: &[Operand], funct3: u32, funct6: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let shamt = get_imm(operands, 2)? 
as u32; - // For RV64, shift amount is 6 bits - let imm = (funct6 << 6) | (shamt & 0x3F); - Ok(EncodeResult::Word(encode_i(OP_OP_IMM, rd, funct3, rs1, imm as i32))) -} - -pub(crate) fn encode_alu_reg(operands: &[Operand], funct3: u32, funct7: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let rs2 = get_reg(operands, 2)?; - Ok(EncodeResult::Word(encode_r(OP_OP, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_alu_imm_w(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let imm = get_imm(operands, 2)? as i32; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM_32, rd, funct3, rs1, imm))) -} - -pub(crate) fn encode_shift_imm_w(operands: &[Operand], funct3: u32, funct7: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let shamt = get_imm(operands, 2)? as u32; - // For RV32/W operations, shift amount is 5 bits - let imm = (funct7 << 5) | (shamt & 0x1F); - Ok(EncodeResult::Word(encode_i(OP_OP_IMM_32, rd, funct3, rs1, imm as i32))) -} - -pub(crate) fn encode_alu_reg_w(operands: &[Operand], funct3: u32, funct7: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let rs2 = get_reg(operands, 2)?; - Ok(EncodeResult::Word(encode_r(OP_OP_32, rd, funct3, rs1, rs2, funct7))) -} - -// ── Zbb (bit manipulation) helpers ── - -/// Encode a Zbb unary instruction (clz, ctz, cpop, sext.b, sext.h, rev8). -/// These are I-type with funct3=001 and the 12-bit immediate encoding the operation. -pub(crate) fn encode_zbb_unary(operands: &[Operand], imm12: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM, rd, 0b001, rs1, imm12 as i32))) -} - -/// Encode a Zbb unary instruction with funct3=101 (rev8, orc.b). 
-pub(crate) fn encode_zbb_unary_f5(operands: &[Operand], imm12: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM, rd, 0b101, rs1, imm12 as i32))) -} - -/// Encode a Zbb unary word instruction (clzw, ctzw, cpopw). -/// These are I-type on OP-IMM-32 with funct3=001. -pub(crate) fn encode_zbb_unary_w(operands: &[Operand], imm12: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM_32, rd, 0b001, rs1, imm12 as i32))) -} - -/// Encode zext.h rd, rs1 (R-type on OP-32: funct7=0000100, rs2=0, funct3=100). -pub(crate) fn encode_zbb_zexth(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_32, rd, 0b100, rs1, 0, 0b0000100))) -} diff --git a/src/backend/riscv/assembler/encoder/compressed.rs b/src/backend/riscv/assembler/encoder/compressed.rs deleted file mode 100644 index 80e76d5237..0000000000 --- a/src/backend/riscv/assembler/encoder/compressed.rs +++ /dev/null @@ -1,196 +0,0 @@ -use super::*; - -// ── Explicit compressed instruction encoders ── - -// c.lui rd, nzimm -pub(crate) fn encode_c_lui(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - if rd == 0 || rd == 2 { return Err("c.lui: rd cannot be x0 or x2".into()); } - let imm = get_imm(operands, 1)?; - let nzimm = imm as i32; - if nzimm == 0 { return Err("c.lui: nzimm must not be zero".into()); } - let bit17 = ((nzimm >> 5) & 1) as u16; - let bits16_12 = (nzimm & 0x1F) as u16; - Ok(EncodeResult::Half(0b01 | ((bits16_12 & 0x1F) << 2) | ((rd as u16) << 7) | (bit17 << 12) | (0b011 << 13))) -} - -// c.li rd, imm -pub(crate) fn encode_c_li(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let imm = get_imm(operands, 1)? 
as i32; - let bit5 = ((imm >> 5) & 1) as u16; - let bits4_0 = (imm & 0x1F) as u16; - Ok(EncodeResult::Half(0b01 | (bits4_0 << 2) | ((rd as u16) << 7) | (bit5 << 12) | (0b010 << 13))) -} - -// c.addi rd, nzimm -pub(crate) fn encode_c_addi(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let imm = get_imm(operands, 1)? as i32; - let bit5 = ((imm >> 5) & 1) as u16; - let bits4_0 = (imm & 0x1F) as u16; - Ok(EncodeResult::Half(0b01 | (bits4_0 << 2) | ((rd as u16) << 7) | (bit5 << 12))) -} - -// c.mv rd, rs2 -pub(crate) fn encode_c_mv(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - Ok(EncodeResult::Half(0b10 | ((rs2 as u16) << 2) | ((rd as u16) << 7) | (0b100 << 13))) -} - -// c.add rd, rs2 -pub(crate) fn encode_c_add(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - Ok(EncodeResult::Half(0b10 | ((rs2 as u16) << 2) | ((rd as u16) << 7) | (1 << 12) | (0b100 << 13))) -} - -// c.jr rs1 -pub(crate) fn encode_c_jr(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Half(0b10 | ((rs1 as u16) << 7) | (0b100 << 13))) -} - -// c.jalr rs1 -pub(crate) fn encode_c_jalr(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Half(0b10 | ((rs1 as u16) << 7) | (1 << 12) | (0b100 << 13))) -} - -// ── .insn directive encoder ── - -/// Encode an .insn directive that allows arbitrary instruction encoding -pub fn encode_insn_directive(args: &str) -> Result { - let args = args.trim(); - - // Parse the format: .insn , , , , - // or .insn , , , , , - let parts: Vec<&str> = args.splitn(2, |c: char| c.is_whitespace() || c == ',').collect(); - if parts.is_empty() { - return Err("empty .insn directive".into()); - } - - let format = parts[0].trim().to_lowercase(); - - // Get remaining args after the format keyword - let rest = if parts.len() > 1 { parts[1].trim_start_matches(',').trim() } else { "" }; 
- let fields: Vec<&str> = rest.split(',').map(|s| s.trim()).filter(|s| !s.is_empty()).collect(); - - match format.as_str() { - "r" => encode_insn_r(&fields), - "i" => encode_insn_i(&fields), - "s" => encode_insn_s(&fields), - "b" | "sb" => encode_insn_b(&fields), - "u" => encode_insn_u(&fields), - "j" | "uj" => encode_insn_j(&fields), - // Raw 32-bit word: .insn 0x12345678 - _ => { - // Try parsing as a raw 32-bit value - if let Ok(word) = parse_insn_int(parts[0]) { - Ok(EncodeResult::Word(word as u32)) - } else { - Err(format!("unsupported .insn format: {}", format)) - } - } - } -} - -pub(crate) fn parse_insn_int(s: &str) -> Result { - let s = s.trim(); - if s.starts_with("0x") || s.starts_with("0X") { - i64::from_str_radix(&s[2..], 16).map_err(|e| format!("invalid hex in .insn: {}: {}", s, e)) - } else if s.starts_with("0b") || s.starts_with("0B") { - i64::from_str_radix(&s[2..], 2).map_err(|e| format!("invalid bin in .insn: {}: {}", s, e)) - } else { - s.parse::().map_err(|e| format!("invalid int in .insn: {}: {}", s, e)) - } -} - -pub(crate) fn parse_insn_reg(s: &str) -> Result { - let s = s.trim(); - reg_num(s).ok_or_else(|| format!("invalid register in .insn: {}", s)) -} - -pub(crate) fn encode_insn_r(fields: &[&str]) -> Result { - // .insn r opcode, funct3, funct7, rd, rs1, rs2 - if fields.len() < 6 { - return Err(format!(".insn r requires 6 fields (opcode, funct3, funct7, rd, rs1, rs2), got {}", fields.len())); - } - let opcode = parse_insn_int(fields[0])? as u32; - let funct3 = parse_insn_int(fields[1])? as u32; - let funct7 = parse_insn_int(fields[2])? 
as u32; - let rd = parse_insn_reg(fields[3])?; - let rs1 = parse_insn_reg(fields[4])?; - let rs2 = parse_insn_reg(fields[5])?; - Ok(EncodeResult::Word(encode_r(opcode, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_insn_i(fields: &[&str]) -> Result { - // .insn i opcode, funct3, rd, rs1, imm - if fields.len() < 5 { - return Err(format!(".insn i requires 5 fields (opcode, funct3, rd, rs1, imm), got {}", fields.len())); - } - let opcode = parse_insn_int(fields[0])? as u32; - let funct3 = parse_insn_int(fields[1])? as u32; - let rd = parse_insn_reg(fields[2])?; - let rs1 = parse_insn_reg(fields[3])?; - let imm = parse_insn_int(fields[4])? as i32; - Ok(EncodeResult::Word(encode_i(opcode, rd, funct3, rs1, imm))) -} - -pub(crate) fn encode_insn_s(fields: &[&str]) -> Result { - // .insn s opcode, funct3, rs2, imm(rs1) - if fields.len() < 4 { - return Err(format!(".insn s requires 4 fields, got {}", fields.len())); - } - let opcode = parse_insn_int(fields[0])? as u32; - let funct3 = parse_insn_int(fields[1])? as u32; - let rs2 = parse_insn_reg(fields[2])?; - // Parse imm(rs1) - let last = fields[3].trim(); - if let Some(paren_pos) = last.find('(') { - let imm_str = &last[..paren_pos]; - let rs1_str = last[paren_pos+1..].trim_end_matches(')'); - let imm = parse_insn_int(imm_str)? as i32; - let rs1 = parse_insn_reg(rs1_str)?; - Ok(EncodeResult::Word(encode_s(opcode, funct3, rs1, rs2, imm))) - } else { - Err(".insn s: expected imm(rs1) format for last field".into()) - } -} - -pub(crate) fn encode_insn_b(fields: &[&str]) -> Result { - // .insn b/sb opcode, funct3, rs1, rs2, offset - if fields.len() < 5 { - return Err(format!(".insn b requires 5 fields, got {}", fields.len())); - } - let opcode = parse_insn_int(fields[0])? as u32; - let funct3 = parse_insn_int(fields[1])? as u32; - let rs1 = parse_insn_reg(fields[2])?; - let rs2 = parse_insn_reg(fields[3])?; - let imm = parse_insn_int(fields[4])? 
as i32; - Ok(EncodeResult::Word(encode_b(opcode, funct3, rs1, rs2, imm))) -} - -pub(crate) fn encode_insn_u(fields: &[&str]) -> Result { - // .insn u opcode, rd, imm - if fields.len() < 3 { - return Err(format!(".insn u requires 3 fields, got {}", fields.len())); - } - let opcode = parse_insn_int(fields[0])? as u32; - let rd = parse_insn_reg(fields[1])?; - let imm = parse_insn_int(fields[2])? as u32; - Ok(EncodeResult::Word(encode_u(opcode, rd, imm))) -} - -pub(crate) fn encode_insn_j(fields: &[&str]) -> Result { - // .insn j/uj opcode, rd, imm - if fields.len() < 3 { - return Err(format!(".insn j requires 3 fields, got {}", fields.len())); - } - let opcode = parse_insn_int(fields[0])? as u32; - let rd = parse_insn_reg(fields[1])?; - let imm = parse_insn_int(fields[2])? as i32; - Ok(EncodeResult::Word(encode_j(opcode, rd, imm))) -} diff --git a/src/backend/riscv/assembler/encoder/float.rs b/src/backend/riscv/assembler/encoder/float.rs deleted file mode 100644 index ea0647c431..0000000000 --- a/src/backend/riscv/assembler/encoder/float.rs +++ /dev/null @@ -1,191 +0,0 @@ -use super::*; - -// ── Floating-point instructions ── - -pub(crate) fn encode_float_load(operands: &[Operand], funct3: u32) -> Result { - let rd = get_freg(operands, 0)?; - match &operands.get(1) { - Some(Operand::Mem { base, offset }) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - Ok(EncodeResult::Word(encode_i(OP_LOAD_FP, rd, funct3, rs1, *offset as i32))) - } - Some(Operand::MemSymbol { base, symbol, .. 
}) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - let (reloc_type, sym) = parse_reloc_modifier(symbol); - let reloc_type = match reloc_type { - RelocType::PcrelHi20 => RelocType::PcrelLo12I, - RelocType::Hi20 => RelocType::Lo12I, - other => other, - }; - Ok(EncodeResult::WordWithReloc { - word: encode_i(OP_LOAD_FP, rd, funct3, rs1, 0), - reloc: Relocation { - reloc_type, - symbol: sym, - addend: 0, - }, - }) - } - _ => Err("float load: expected memory operand".to_string()), - } -} - -pub(crate) fn encode_float_store(operands: &[Operand], funct3: u32) -> Result { - let rs2 = get_freg(operands, 0)?; - match &operands.get(1) { - Some(Operand::Mem { base, offset }) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - Ok(EncodeResult::Word(encode_s(OP_STORE_FP, funct3, rs1, rs2, *offset as i32))) - } - Some(Operand::MemSymbol { base, symbol, .. }) => { - let rs1 = reg_num(base).ok_or("invalid base register")?; - let (reloc_type, sym) = parse_reloc_modifier(symbol); - let reloc_type = match reloc_type { - RelocType::PcrelHi20 => RelocType::PcrelLo12S, - RelocType::Hi20 => RelocType::Lo12S, - other => other, - }; - Ok(EncodeResult::WordWithReloc { - word: encode_s(OP_STORE_FP, funct3, rs1, rs2, 0), - reloc: Relocation { - reloc_type, - symbol: sym, - addend: 0, - }, - }) - } - _ => Err("float store: expected memory operand".to_string()), - } -} - -pub(crate) fn encode_fp_arith(operands: &[Operand], funct7: u32) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rs2 = get_freg(operands, 2)?; - // Check for optional rounding mode - let rm = if operands.len() > 3 { - match &operands[3] { - Operand::RoundingMode(s) => parse_rm(s), - _ => 0b111, // dynamic - } - } else { - 0b111 - }; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, rm, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fp_arith_d(operands: &[Operand], funct7: u32) -> Result { - encode_fp_arith(operands, funct7) -} - -pub(crate) fn 
encode_fp_unary(operands: &[Operand], funct7: u32, rs2: u32) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rm = if operands.len() > 2 { - match &operands[2] { - Operand::RoundingMode(s) => parse_rm(s), - _ => 0b111, - } - } else { - 0b111 - }; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, rm, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fp_sgnj(operands: &[Operand], funct7: u32, funct3: u32) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rs2 = get_freg(operands, 2)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fp_cmp(operands: &[Operand], funct7: u32, funct3: u32) -> Result { - // Result goes to integer register - let rd = get_reg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rs2 = get_freg(operands, 2)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, funct3, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fclass(operands: &[Operand], funct7: u32) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b001, rs1, 0, funct7))) -} - -pub(crate) fn encode_fcvt_int(operands: &[Operand], funct7: u32, rs2: u32) -> Result { - // Float to integer: result in integer register, source in float register - let rd = get_reg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rm = if operands.len() > 2 { - match &operands[2] { - Operand::RoundingMode(s) => parse_rm(s), - _ => 0b111, - } - } else { - 0b111 - }; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, rm, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fcvt_from_int(operands: &[Operand], funct7: u32, rs2: u32) -> Result { - // Integer to float: result in float register, source in integer register - let rd = get_freg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let rm = if operands.len() > 2 { - match &operands[2] { - Operand::RoundingMode(s) => parse_rm(s), - _ => 0b111, - } - } else { - 
0b111 - }; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, rm, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fcvt_fp(operands: &[Operand], funct7: u32, rs2: u32) -> Result { - // Float to float conversion (e.g., fcvt.s.d, fcvt.d.s) - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rm = if operands.len() > 2 { - match &operands[2] { - Operand::RoundingMode(s) => parse_rm(s), - _ => 0b111, - } - } else { - 0b111 - }; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, rm, rs1, rs2, funct7))) -} - -pub(crate) fn encode_fmv_x_f(operands: &[Operand], funct7: u32, _fmt: u32) -> Result { - // Float to integer register move - let rd = get_reg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b000, rs1, 0, funct7))) -} - -pub(crate) fn encode_fmv_f_x(operands: &[Operand], funct7: u32, _fmt: u32) -> Result { - // Integer to float register move - let rd = get_freg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b000, rs1, 0, funct7))) -} - -pub(crate) fn encode_fma(operands: &[Operand], opcode: u32, fmt: u32) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - let rs2 = get_freg(operands, 2)?; - let rs3 = get_freg(operands, 3)?; - let rm = if operands.len() > 4 { - match &operands[4] { - Operand::RoundingMode(s) => parse_rm(s), - _ => 0b111, - } - } else { - 0b111 - }; - // R4-type: rs3[31:27] | fmt[26:25] | rs2[24:20] | rs1[19:15] | rm[14:12] | rd[11:7] | opcode[6:0] - let word = (rs3 << 27) | (fmt << 25) | (rs2 << 20) | (rs1 << 15) | (rm << 12) | (rd << 7) | opcode; - Ok(EncodeResult::Word(word)) -} diff --git a/src/backend/riscv/assembler/encoder/mod.rs b/src/backend/riscv/assembler/encoder/mod.rs deleted file mode 100644 index 6aff3c0b8c..0000000000 --- a/src/backend/riscv/assembler/encoder/mod.rs +++ /dev/null @@ -1,926 +0,0 @@ -//! RISC-V instruction encoder. -//! -//! 
Encodes RISC-V instructions into 32-bit machine code words. -//! This covers the subset of instructions emitted by our codegen (RV64GC + Zbb). -//! -//! RISC-V base instructions are always 4 bytes (32 bits), little-endian. -//! The encoding uses six main formats: R, I, S, B, U, J. - -// Encoding helpers for all RISC-V instruction formats; not all formats used yet. -#![allow(dead_code)] - -mod base; -mod atomics; -mod system; -mod float; -mod pseudo; -mod compressed; -mod vector; - -pub(crate) use base::*; -pub(crate) use atomics::*; -pub(crate) use system::*; -pub(crate) use float::*; -pub(crate) use pseudo::*; -pub(crate) use compressed::*; -pub(crate) use vector::*; - -use super::parser::Operand; - -/// Result of encoding an instruction. -#[derive(Debug, Clone)] -pub enum EncodeResult { - /// Successfully encoded as a 4-byte instruction word - Word(u32), - /// Successfully encoded as a 2-byte compressed instruction - Half(u16), - /// Two 4-byte instruction words (e.g., pseudo-instructions like `call`, `li` with large imm) - Words(Vec), - /// Instruction needs a relocation to be applied later - WordWithReloc { - word: u32, - reloc: Relocation, - }, - /// Multiple words with relocations (e.g., `call` = auipc + jalr) - WordsWithRelocs(Vec<(u32, Option)>), - /// Skip this instruction (e.g., pseudo handled elsewhere) - Skip, -} - -/// RISC-V ELF relocation types -#[derive(Debug, Clone)] -pub enum RelocType { - /// R_RISCV_CALL_PLT (combined auipc+jalr, 8 bytes) - CallPlt, - /// R_RISCV_PCREL_HI20 - for AUIPC (high 20 bits of PC-relative) - PcrelHi20, - /// R_RISCV_PCREL_LO12_I - for ADDI/LW/LD (low 12 bits of PC-relative, I-type) - PcrelLo12I, - /// R_RISCV_PCREL_LO12_S - for SW/SD (low 12 bits of PC-relative, S-type) - PcrelLo12S, - /// R_RISCV_HI20 - for LUI (absolute high 20 bits) - Hi20, - /// R_RISCV_LO12_I - for ADDI/LW/LD (absolute low 12 bits, I-type) - Lo12I, - /// R_RISCV_LO12_S - for SW/SD (absolute low 12 bits, S-type) - Lo12S, - /// R_RISCV_BRANCH - 
12-bit PC-relative branch (B-type) - Branch, - /// R_RISCV_JAL - 20-bit PC-relative jump (J-type) - Jal, - /// R_RISCV_64 - 64-bit absolute - Abs64, - /// R_RISCV_32 - 32-bit absolute - Abs32, - /// R_RISCV_GOT_HI20 - GOT-relative AUIPC - GotHi20, - /// R_RISCV_TLS_GD_HI20 - TlsGdHi20, - /// R_RISCV_TLS_GOT_HI20 - TlsGotHi20, - /// R_RISCV_TPREL_HI20 - TprelHi20, - /// R_RISCV_TPREL_LO12_I - TprelLo12I, - /// R_RISCV_TPREL_LO12_S - TprelLo12S, - /// R_RISCV_TPREL_ADD - TprelAdd, - /// R_RISCV_ADD16 - 16-bit addition (for symbol differences) - Add16, - /// R_RISCV_SUB16 - 16-bit subtraction (for symbol differences) - Sub16, - /// R_RISCV_ADD32 - 32-bit addition (for symbol differences) - Add32, - /// R_RISCV_SUB32 - 32-bit subtraction (for symbol differences) - Sub32, - /// R_RISCV_ADD64 - 64-bit addition (for symbol differences) - Add64, - /// R_RISCV_SUB64 - 64-bit subtraction (for symbol differences) - Sub64, -} - -impl RelocType { - /// Get the ELF relocation type number. - pub fn elf_type(&self) -> u32 { - match self { - RelocType::Branch => 16, // R_RISCV_BRANCH - RelocType::Jal => 17, // R_RISCV_JAL - RelocType::CallPlt => 19, // R_RISCV_CALL_PLT - RelocType::GotHi20 => 20, // R_RISCV_GOT_HI20 - RelocType::TlsGdHi20 => 22, // R_RISCV_TLS_GD_HI20 - RelocType::TlsGotHi20 => 21, // R_RISCV_TLS_GOT_HI20 - RelocType::PcrelHi20 => 23, // R_RISCV_PCREL_HI20 = 23 - RelocType::PcrelLo12I => 24, // R_RISCV_PCREL_LO12_I = 24 - RelocType::PcrelLo12S => 25, // R_RISCV_PCREL_LO12_S = 25 - RelocType::Hi20 => 26, // R_RISCV_HI20 - RelocType::Lo12I => 27, // R_RISCV_LO12_I - RelocType::Lo12S => 28, // R_RISCV_LO12_S - RelocType::TprelHi20 => 29, // R_RISCV_TPREL_HI20 - RelocType::TprelLo12I => 30, // R_RISCV_TPREL_LO12_I - RelocType::TprelLo12S => 31, // R_RISCV_TPREL_LO12_S - RelocType::TprelAdd => 32, // R_RISCV_TPREL_ADD - RelocType::Abs32 => 1, // R_RISCV_32 - RelocType::Abs64 => 2, // R_RISCV_64 - RelocType::Add16 => 34, // R_RISCV_ADD16 - RelocType::Sub16 => 38, // 
R_RISCV_SUB16 - RelocType::Add32 => 35, // R_RISCV_ADD32 - RelocType::Sub32 => 39, // R_RISCV_SUB32 - RelocType::Add64 => 36, // R_RISCV_ADD64 - RelocType::Sub64 => 40, // R_RISCV_SUB64 - } - } -} - -/// A relocation to be applied. -#[derive(Debug, Clone)] -pub struct Relocation { - pub reloc_type: RelocType, - pub symbol: String, - pub addend: i64, -} - -// ── Register encoding ────────────────────────────────────────────────── - -/// Parse a register name to its 5-bit encoding number (0-31). -pub fn reg_num(name: &str) -> Option { - let name = name.to_lowercase(); - match name.as_str() { - // ABI names for integer registers - "zero" => Some(0), - "ra" => Some(1), - "sp" => Some(2), - "gp" => Some(3), - "tp" => Some(4), - "t0" => Some(5), - "t1" => Some(6), - "t2" => Some(7), - "s0" | "fp" => Some(8), - "s1" => Some(9), - "a0" => Some(10), - "a1" => Some(11), - "a2" => Some(12), - "a3" => Some(13), - "a4" => Some(14), - "a5" => Some(15), - "a6" => Some(16), - "a7" => Some(17), - "s2" => Some(18), - "s3" => Some(19), - "s4" => Some(20), - "s5" => Some(21), - "s6" => Some(22), - "s7" => Some(23), - "s8" => Some(24), - "s9" => Some(25), - "s10" => Some(26), - "s11" => Some(27), - "t3" => Some(28), - "t4" => Some(29), - "t5" => Some(30), - "t6" => Some(31), - _ => { - // x0-x31 - if let Some(rest) = name.strip_prefix('x') { - let n: u32 = rest.parse().ok()?; - if n <= 31 { Some(n) } else { None } - } else { - None - } - } - } -} - -/// Parse a floating-point register name to its 5-bit encoding (0-31). 
-pub fn freg_num(name: &str) -> Option { - let name = name.to_lowercase(); - match name.as_str() { - "ft0" => Some(0), - "ft1" => Some(1), - "ft2" => Some(2), - "ft3" => Some(3), - "ft4" => Some(4), - "ft5" => Some(5), - "ft6" => Some(6), - "ft7" => Some(7), - "fs0" => Some(8), - "fs1" => Some(9), - "fa0" => Some(10), - "fa1" => Some(11), - "fa2" => Some(12), - "fa3" => Some(13), - "fa4" => Some(14), - "fa5" => Some(15), - "fa6" => Some(16), - "fa7" => Some(17), - "fs2" => Some(18), - "fs3" => Some(19), - "fs4" => Some(20), - "fs5" => Some(21), - "fs6" => Some(22), - "fs7" => Some(23), - "fs8" => Some(24), - "fs9" => Some(25), - "fs10" => Some(26), - "fs11" => Some(27), - "ft8" => Some(28), - "ft9" => Some(29), - "ft10" => Some(30), - "ft11" => Some(31), - _ => { - // f0-f31 - if name.starts_with('f') && !name.starts_with("ft") - && !name.starts_with("fs") && !name.starts_with("fa") - { - let n: u32 = name[1..].parse().ok()?; - if n <= 31 { Some(n) } else { None } - } else { - None - } - } - } -} - -/// Parse a vector register name to its 5-bit encoding (0-31). -/// Handles v0-v31. -pub fn vreg_num(name: &str) -> Option { - let name = name.to_lowercase(); - if let Some(rest) = name.strip_prefix('v') { - // Avoid matching "vector" etc. 
- must be just digits after 'v' - let n: u32 = rest.parse().ok()?; - if n <= 31 { Some(n) } else { None } - } else { - None - } -} - -/// Try integer register first, then float register -fn any_reg_num(name: &str) -> Option { - reg_num(name).or_else(|| freg_num(name)) -} - -/// Check if a register name is an integer register -fn is_int_reg(name: &str) -> bool { - reg_num(name).is_some() -} - -/// Check if a register name is a floating-point register -fn is_fp_reg(name: &str) -> bool { - freg_num(name).is_some() -} - -// ── Instruction format encoders ────────────────────────────────────── - -/// R-type: funct7[31:25] | rs2[24:20] | rs1[19:15] | funct3[14:12] | rd[11:7] | opcode[6:0] -fn encode_r(opcode: u32, rd: u32, funct3: u32, rs1: u32, rs2: u32, funct7: u32) -> u32 { - (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | opcode -} - -/// I-type: imm[31:20] | rs1[19:15] | funct3[14:12] | rd[11:7] | opcode[6:0] -fn encode_i(opcode: u32, rd: u32, funct3: u32, rs1: u32, imm: i32) -> u32 { - let imm = (imm as u32) & 0xFFF; - (imm << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | opcode -} - -/// S-type: imm[11:5] | rs2[24:20] | rs1[19:15] | funct3[14:12] | imm[4:0] | opcode[6:0] -fn encode_s(opcode: u32, funct3: u32, rs1: u32, rs2: u32, imm: i32) -> u32 { - let imm = imm as u32; - let imm_11_5 = (imm >> 5) & 0x7F; - let imm_4_0 = imm & 0x1F; - (imm_11_5 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (imm_4_0 << 7) | opcode -} - -/// B-type: imm[12|10:5] | rs2 | rs1 | funct3 | imm[4:1|11] | opcode -fn encode_b(opcode: u32, funct3: u32, rs1: u32, rs2: u32, imm: i32) -> u32 { - let imm = imm as u32; - let bit12 = (imm >> 12) & 1; - let bit11 = (imm >> 11) & 1; - let bits10_5 = (imm >> 5) & 0x3F; - let bits4_1 = (imm >> 1) & 0xF; - (bit12 << 31) | (bits10_5 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) - | (bits4_1 << 8) | (bit11 << 7) | opcode -} - -/// U-type: imm[31:12] | rd[11:7] | opcode[6:0] -fn encode_u(opcode: u32, rd: u32, 
imm: u32) -> u32 { - (imm & 0xFFFFF000) | (rd << 7) | opcode -} - -/// J-type: imm[20|10:1|11|19:12] | rd[11:7] | opcode[6:0] -fn encode_j(opcode: u32, rd: u32, imm: i32) -> u32 { - let imm = imm as u32; - let bit20 = (imm >> 20) & 1; - let bits10_1 = (imm >> 1) & 0x3FF; - let bit11 = (imm >> 11) & 1; - let bits19_12 = (imm >> 12) & 0xFF; - (bit20 << 31) | (bits10_1 << 21) | (bit11 << 20) | (bits19_12 << 12) | (rd << 7) | opcode -} - -// ── Opcode constants ────────────────────────────────────────────────── - -const OP_LUI: u32 = 0b0110111; -const OP_AUIPC: u32 = 0b0010111; -const OP_JAL: u32 = 0b1101111; -const OP_JALR: u32 = 0b1100111; -const OP_BRANCH: u32 = 0b1100011; -const OP_LOAD: u32 = 0b0000011; -const OP_STORE: u32 = 0b0100011; -const OP_OP_IMM: u32 = 0b0010011; -const OP_OP: u32 = 0b0110011; -const OP_OP_IMM_32: u32 = 0b0011011; -const OP_OP_32: u32 = 0b0111011; -const OP_SYSTEM: u32 = 0b1110011; -const OP_MISC_MEM: u32 = 0b0001111; -const OP_AMO: u32 = 0b0101111; -const OP_LOAD_FP: u32 = 0b0000111; -const OP_STORE_FP: u32 = 0b0100111; -const OP_OP_FP: u32 = 0b1010011; -const OP_FMADD: u32 = 0b1000011; -const OP_FMSUB: u32 = 0b1000111; -const OP_FNMSUB: u32 = 0b1001011; -const OP_FNMADD: u32 = 0b1001111; -const OP_V: u32 = 0b1010111; // Vector arithmetic/config (RVV 1.0) -const OP_V_CRYPTO: u32 = 0b1110111; // Vector crypto (Zvk*) — uses OP-P encoding space per RVV Crypto spec - -// ── Helper functions ────────────────────────────────────────────────── - -fn get_reg(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Reg(name)) => { - reg_num(name).ok_or_else(|| format!("invalid integer register: {}", name)) - } - // GCC sometimes emits bare register numbers (0-31) in inline asm - Some(Operand::Imm(n)) if *n >= 0 && *n <= 31 => Ok(*n as u32), - other => Err(format!("expected register at operand {}, got {:?}", idx, other)), - } -} - -fn get_freg(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { 
- Some(Operand::Reg(name)) => { - freg_num(name).ok_or_else(|| format!("invalid float register: {}", name)) - } - other => Err(format!("expected float register at operand {}, got {:?}", idx, other)), - } -} - -fn get_any_reg(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Reg(name)) => { - any_reg_num(name).ok_or_else(|| format!("invalid register: {}", name)) - } - // GCC sometimes emits bare register numbers (0-31) in inline asm - Some(Operand::Imm(n)) if *n >= 0 && *n <= 31 => Ok(*n as u32), - other => Err(format!("expected register at operand {}, got {:?}", idx, other)), - } -} - -fn get_vreg(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Reg(name)) => { - vreg_num(name).ok_or_else(|| format!("invalid vector register: {}", name)) - } - other => Err(format!("expected vector register at operand {}, got {:?}", idx, other)), - } -} - -fn get_imm(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Imm(v)) => Ok(*v), - other => Err(format!("expected immediate at operand {}, got {:?}", idx, other)), - } -} - -fn get_symbol(operands: &[Operand], idx: usize) -> Result<(String, i64), String> { - match operands.get(idx) { - Some(Operand::Symbol(s)) => Ok((s.clone(), 0)), - Some(Operand::Label(s)) => Ok((s.clone(), 0)), - Some(Operand::SymbolOffset(s, off)) => Ok((s.clone(), *off)), - // Register names like "f1", "a0", "ra", "zero", "s1" etc. can also be - // symbol names (e.g. `call f1` where f1 is a function). When an encoder - // expects a symbol operand, treat a Reg as a symbol name. 
- Some(Operand::Reg(s)) => Ok((s.clone(), 0)), - other => Err(format!("expected symbol at operand {}, got {:?}", idx, other)), - } -} - -fn get_mem(operands: &[Operand], idx: usize) -> Result<(u32, i64), String> { - match operands.get(idx) { - Some(Operand::Mem { base, offset }) => { - let base_reg = reg_num(base) - .ok_or_else(|| format!("invalid base register: {}", base))?; - Ok((base_reg, *offset)) - } - other => Err(format!("expected memory operand at operand {}, got {:?}", idx, other)), - } -} - -/// Parse a fence ordering string (e.g., "iorw") into a 4-bit mask. -fn parse_fence_bits(s: &str) -> u32 { - let s = s.to_lowercase(); - let mut bits = 0u32; - if s.contains('i') { bits |= 8; } - if s.contains('o') { bits |= 4; } - if s.contains('r') { bits |= 2; } - if s.contains('w') { bits |= 1; } - bits -} - -/// Parse a rounding mode to 3-bit encoding. -fn parse_rm(s: &str) -> u32 { - match s.to_lowercase().as_str() { - "rne" => 0b000, - "rtz" => 0b001, - "rdn" => 0b010, - "rup" => 0b011, - "rmm" => 0b100, - "dyn" => 0b111, - _ => 0b111, // default to dynamic - } -} - -// ── Main encode function ────────────────────────────────────────────── - -/// Encode a RISC-V instruction from its mnemonic and parsed operands. 
-pub fn encode_instruction(mnemonic: &str, operands: &[Operand], raw_operands: &str) -> Result { - let mn = mnemonic.to_lowercase(); - - match mn.as_str() { - // ── RV64I Base Instructions ── - - // U-type - "lui" => encode_lui(operands), - "auipc" => encode_auipc(operands), - - // J-type - "jal" => encode_jal(operands), - "jalr" => encode_jalr(operands), - - // B-type branches - "beq" => encode_branch_instr(operands, 0b000), - "bne" => encode_branch_instr(operands, 0b001), - "blt" => encode_branch_instr(operands, 0b100), - "bge" => encode_branch_instr(operands, 0b101), - "bltu" => encode_branch_instr(operands, 0b110), - "bgeu" => encode_branch_instr(operands, 0b111), - - // Loads (I-type) - "lb" => encode_load(operands, 0b000), - "lh" => encode_load(operands, 0b001), - "lw" => encode_load(operands, 0b010), - "ld" => encode_load(operands, 0b011), - "lbu" => encode_load(operands, 0b100), - "lhu" => encode_load(operands, 0b101), - "lwu" => encode_load(operands, 0b110), - - // Stores (S-type) - "sb" => encode_store(operands, 0b000), - "sh" => encode_store(operands, 0b001), - "sw" => encode_store(operands, 0b010), - "sd" => encode_store(operands, 0b011), - - // Immediate arithmetic (I-type) - "addi" => encode_alu_imm(operands, 0b000), - "slti" => encode_alu_imm(operands, 0b010), - "sltiu" => encode_alu_imm(operands, 0b011), - "xori" => encode_alu_imm(operands, 0b100), - "ori" => encode_alu_imm(operands, 0b110), - "andi" => encode_alu_imm(operands, 0b111), - - // Shifts immediate - "slli" => encode_shift_imm(operands, 0b001, 0b000000), - "srli" => encode_shift_imm(operands, 0b101, 0b000000), - "srai" => encode_shift_imm(operands, 0b101, 0b010000), - - // Register-register arithmetic (R-type) - // Auto-convert to immediate variants when 3rd operand is an immediate - "add" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm(operands, 0b000) // -> addi - } else { - encode_alu_reg(operands, 0b000, 0b0000000) - }, - "sub" => 
encode_alu_reg(operands, 0b000, 0b0100000), - "sll" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm(operands, 0b001, 0b000000) - } else { - encode_alu_reg(operands, 0b001, 0b0000000) - }, - "slt" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm(operands, 0b010) // -> slti - } else { - encode_alu_reg(operands, 0b010, 0b0000000) - }, - "sltu" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm(operands, 0b011) // -> sltiu - } else { - encode_alu_reg(operands, 0b011, 0b0000000) - }, - "xor" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm(operands, 0b100) // -> xori - } else { - encode_alu_reg(operands, 0b100, 0b0000000) - }, - "srl" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm(operands, 0b101, 0b000000) - } else { - encode_alu_reg(operands, 0b101, 0b0000000) - }, - "sra" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm(operands, 0b101, 0b010000) - } else { - encode_alu_reg(operands, 0b101, 0b0100000) - }, - "or" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm(operands, 0b110) // -> ori - } else { - encode_alu_reg(operands, 0b110, 0b0000000) - }, - "and" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm(operands, 0b111) // -> andi - } else { - encode_alu_reg(operands, 0b111, 0b0000000) - }, - - // RV64I word (32-bit) operations - "addiw" => encode_alu_imm_w(operands, 0b000), - "slliw" => encode_shift_imm_w(operands, 0b001, 0b0000000), - "srliw" => encode_shift_imm_w(operands, 0b101, 0b0000000), - "sraiw" => encode_shift_imm_w(operands, 0b101, 0b0100000), - "addw" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_alu_imm_w(operands, 0b000) // -> addiw - } else { - encode_alu_reg_w(operands, 0b000, 0b0000000) - }, - "subw" => 
encode_alu_reg_w(operands, 0b000, 0b0100000), - // sllw/srlw/sraw: auto-convert to slliw/srliw/sraiw when 3rd operand is immediate - "sllw" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm_w(operands, 0b001, 0b0000000) - } else { - encode_alu_reg_w(operands, 0b001, 0b0000000) - }, - "srlw" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm_w(operands, 0b101, 0b0000000) - } else { - encode_alu_reg_w(operands, 0b101, 0b0000000) - }, - "sraw" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm_w(operands, 0b101, 0b0100000) - } else { - encode_alu_reg_w(operands, 0b101, 0b0100000) - }, - - // ── M Extension (multiply/divide) ── - "mul" => encode_alu_reg(operands, 0b000, 0b0000001), - "mulh" => encode_alu_reg(operands, 0b001, 0b0000001), - "mulhsu" => encode_alu_reg(operands, 0b010, 0b0000001), - "mulhu" => encode_alu_reg(operands, 0b011, 0b0000001), - "div" => encode_alu_reg(operands, 0b100, 0b0000001), - "divu" => encode_alu_reg(operands, 0b101, 0b0000001), - "rem" => encode_alu_reg(operands, 0b110, 0b0000001), - "remu" => encode_alu_reg(operands, 0b111, 0b0000001), - "mulw" => encode_alu_reg_w(operands, 0b000, 0b0000001), - "divw" => encode_alu_reg_w(operands, 0b100, 0b0000001), - "divuw" => encode_alu_reg_w(operands, 0b101, 0b0000001), - "remw" => encode_alu_reg_w(operands, 0b110, 0b0000001), - "remuw" => encode_alu_reg_w(operands, 0b111, 0b0000001), - - // ── Zbb Extension (basic bit manipulation) ── - // Unary operations (encoded as I-type with rs2 field in immediate) - "clz" => encode_zbb_unary(operands, 0x600), // clz rd, rs1 - "ctz" => encode_zbb_unary(operands, 0x601), // ctz rd, rs1 - "cpop" => encode_zbb_unary(operands, 0x602), // cpop rd, rs1 - "sext.b" => encode_zbb_unary(operands, 0x604), // sext.b rd, rs1 - "sext.h" => encode_zbb_unary(operands, 0x605), // sext.h rd, rs1 - "rev8" => encode_zbb_unary_f5(operands, 0x6B8), // rev8 rd, rs1 
(RV64, funct3=101) - "orc.b" => encode_zbb_unary_f5(operands, 0x287), // orc.b rd, rs1 (funct3=101) - // Unary word operations - "clzw" => encode_zbb_unary_w(operands, 0x600), // clzw rd, rs1 - "ctzw" => encode_zbb_unary_w(operands, 0x601), // ctzw rd, rs1 - "cpopw" => encode_zbb_unary_w(operands, 0x602), // cpopw rd, rs1 - // Register-register operations - "andn" => encode_alu_reg(operands, 0b111, 0b0100000), - "orn" => encode_alu_reg(operands, 0b110, 0b0100000), - "xnor" => encode_alu_reg(operands, 0b100, 0b0100000), - "max" => encode_alu_reg(operands, 0b110, 0b0000101), - "maxu" => encode_alu_reg(operands, 0b111, 0b0000101), - "min" => encode_alu_reg(operands, 0b100, 0b0000101), - "minu" => encode_alu_reg(operands, 0b101, 0b0000101), - "rol" => encode_alu_reg(operands, 0b001, 0b0110000), - "ror" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm(operands, 0b101, 0b011000) // -> rori - } else { - encode_alu_reg(operands, 0b101, 0b0110000) - }, - "rolw" => encode_alu_reg_w(operands, 0b001, 0b0110000), - "rorw" => if operands.len() >= 3 && matches!(operands[2], Operand::Imm(_)) { - encode_shift_imm_w(operands, 0b101, 0b0110000) // -> roriw - } else { - encode_alu_reg_w(operands, 0b101, 0b0110000) - }, - // Shift-immediate rotate - "rori" => encode_shift_imm(operands, 0b101, 0b011000), - "roriw" => encode_shift_imm_w(operands, 0b101, 0b0110000), - // zext.h is R-type with rs2=0 on OP-32 - "zext.h" => encode_zbb_zexth(operands), - - // ── A Extension (atomics) ── - "lr.w" => encode_lr(operands, 0b010), - "lr.d" => encode_lr(operands, 0b011), - "sc.w" => encode_sc(operands, 0b010), - "sc.d" => encode_sc(operands, 0b011), - "amoswap.w" => encode_amo(operands, 0b010, 0b00001), - "amoadd.w" => encode_amo(operands, 0b010, 0b00000), - "amoxor.w" => encode_amo(operands, 0b010, 0b00100), - "amoand.w" => encode_amo(operands, 0b010, 0b01100), - "amoor.w" => encode_amo(operands, 0b010, 0b01000), - "amomin.w" => encode_amo(operands, 0b010, 
0b10000), - "amomax.w" => encode_amo(operands, 0b010, 0b10100), - "amominu.w" => encode_amo(operands, 0b010, 0b11000), - "amomaxu.w" => encode_amo(operands, 0b010, 0b11100), - "amoswap.d" => encode_amo(operands, 0b011, 0b00001), - "amoadd.d" => encode_amo(operands, 0b011, 0b00000), - "amoxor.d" => encode_amo(operands, 0b011, 0b00100), - "amoand.d" => encode_amo(operands, 0b011, 0b01100), - "amoor.d" => encode_amo(operands, 0b011, 0b01000), - "amomin.d" => encode_amo(operands, 0b011, 0b10000), - "amomax.d" => encode_amo(operands, 0b011, 0b10100), - "amominu.d" => encode_amo(operands, 0b011, 0b11000), - "amomaxu.d" => encode_amo(operands, 0b011, 0b11100), - - // Handle .aq, .rl, .aqrl suffixes for atomics - s if s.starts_with("lr.") => encode_lr_suffixed(s, operands), - s if s.starts_with("sc.") => encode_sc_suffixed(s, operands), - s if s.starts_with("amo") => encode_amo_suffixed(s, operands), - - // ── System ── - "ecall" => Ok(EncodeResult::Word(0x00000073)), - "ebreak" => Ok(EncodeResult::Word(0x00100073)), - "fence" => encode_fence(operands), - "fence.i" => Ok(EncodeResult::Word(0x0000100F)), - "fence.tso" => Ok(EncodeResult::Word(0x8330000F)), - "pause" => Ok(EncodeResult::Word(0x0100000F)), - - // ── Privileged instructions ── - "wfi" => Ok(EncodeResult::Word(0x10500073)), - "mret" => Ok(EncodeResult::Word(0x30200073)), - "sret" => Ok(EncodeResult::Word(0x10200073)), - "sfence.vma" => encode_sfence_vma(operands), - - "csrrw" => encode_csr(operands, 0b001), - "csrrs" => encode_csr(operands, 0b010), - "csrrc" => encode_csr(operands, 0b011), - "csrrwi" => encode_csri(operands, 0b101), - "csrrsi" => encode_csri(operands, 0b110), - "csrrci" => encode_csri(operands, 0b111), - - // ── F Extension (single-precision float) ── - "flw" => encode_float_load(operands, 0b010), - "fsw" => encode_float_store(operands, 0b010), - "fadd.s" => encode_fp_arith(operands, 0b0000000), - "fsub.s" => encode_fp_arith(operands, 0b0000100), - "fmul.s" => encode_fp_arith(operands, 
0b0001000), - "fdiv.s" => encode_fp_arith(operands, 0b0001100), - "fsqrt.s" => encode_fp_unary(operands, 0b0101100, 0b00000), - "fsgnj.s" => encode_fp_sgnj(operands, 0b0010000, 0b000), - "fsgnjn.s" => encode_fp_sgnj(operands, 0b0010000, 0b001), - "fsgnjx.s" => encode_fp_sgnj(operands, 0b0010000, 0b010), - "fmin.s" => encode_fp_sgnj(operands, 0b0010100, 0b000), - "fmax.s" => encode_fp_sgnj(operands, 0b0010100, 0b001), - "feq.s" => encode_fp_cmp(operands, 0b1010000, 0b010), - "flt.s" => encode_fp_cmp(operands, 0b1010000, 0b001), - "fle.s" => encode_fp_cmp(operands, 0b1010000, 0b000), - "fclass.s" => encode_fclass(operands, 0b1110000), - "fcvt.w.s" => encode_fcvt_int(operands, 0b1100000, 0b00000), - "fcvt.wu.s" => encode_fcvt_int(operands, 0b1100000, 0b00001), - "fcvt.l.s" => encode_fcvt_int(operands, 0b1100000, 0b00010), - "fcvt.lu.s" => encode_fcvt_int(operands, 0b1100000, 0b00011), - "fcvt.s.w" => encode_fcvt_from_int(operands, 0b1101000, 0b00000), - "fcvt.s.wu" => encode_fcvt_from_int(operands, 0b1101000, 0b00001), - "fcvt.s.l" => encode_fcvt_from_int(operands, 0b1101000, 0b00010), - "fcvt.s.lu" => encode_fcvt_from_int(operands, 0b1101000, 0b00011), - "fmv.x.w" | "fmv.x.s" => encode_fmv_x_f(operands, 0b1110000, 0b00), - "fmv.w.x" | "fmv.s.x" => encode_fmv_f_x(operands, 0b1111000, 0b00), - - // ── D Extension (double-precision float) ── - "fld" => encode_float_load(operands, 0b011), - "fsd" => encode_float_store(operands, 0b011), - "fadd.d" => encode_fp_arith_d(operands, 0b0000001), - "fsub.d" => encode_fp_arith_d(operands, 0b0000101), - "fmul.d" => encode_fp_arith_d(operands, 0b0001001), - "fdiv.d" => encode_fp_arith_d(operands, 0b0001101), - "fsqrt.d" => encode_fp_unary(operands, 0b0101101, 0b00000), - "fsgnj.d" => encode_fp_sgnj(operands, 0b0010001, 0b000), - "fsgnjn.d" => encode_fp_sgnj(operands, 0b0010001, 0b001), - "fsgnjx.d" => encode_fp_sgnj(operands, 0b0010001, 0b010), - "fmin.d" => encode_fp_sgnj(operands, 0b0010101, 0b000), - "fmax.d" => 
encode_fp_sgnj(operands, 0b0010101, 0b001), - "feq.d" => encode_fp_cmp(operands, 0b1010001, 0b010), - "flt.d" => encode_fp_cmp(operands, 0b1010001, 0b001), - "fle.d" => encode_fp_cmp(operands, 0b1010001, 0b000), - "fclass.d" => encode_fclass(operands, 0b1110001), - "fcvt.w.d" => encode_fcvt_int(operands, 0b1100001, 0b00000), - "fcvt.wu.d" => encode_fcvt_int(operands, 0b1100001, 0b00001), - "fcvt.l.d" => encode_fcvt_int(operands, 0b1100001, 0b00010), - "fcvt.lu.d" => encode_fcvt_int(operands, 0b1100001, 0b00011), - "fcvt.d.w" => encode_fcvt_from_int(operands, 0b1101001, 0b00000), - "fcvt.d.wu" => encode_fcvt_from_int(operands, 0b1101001, 0b00001), - "fcvt.d.l" => encode_fcvt_from_int(operands, 0b1101001, 0b00010), - "fcvt.d.lu" => encode_fcvt_from_int(operands, 0b1101001, 0b00011), - "fcvt.s.d" => encode_fcvt_fp(operands, 0b0100000, 0b00001), - "fcvt.d.s" => encode_fcvt_fp(operands, 0b0100001, 0b00000), - "fmv.x.d" => encode_fmv_x_f(operands, 0b1110001, 0b00), - "fmv.d.x" => encode_fmv_f_x(operands, 0b1111001, 0b00), - - // ── Fused multiply-add ── - "fmadd.s" => encode_fma(operands, OP_FMADD, 0b00), - "fmsub.s" => encode_fma(operands, OP_FMSUB, 0b00), - "fnmsub.s" => encode_fma(operands, OP_FNMSUB, 0b00), - "fnmadd.s" => encode_fma(operands, OP_FNMADD, 0b00), - "fmadd.d" => encode_fma(operands, OP_FMADD, 0b01), - "fmsub.d" => encode_fma(operands, OP_FMSUB, 0b01), - "fnmsub.d" => encode_fma(operands, OP_FNMSUB, 0b01), - "fnmadd.d" => encode_fma(operands, OP_FNMADD, 0b01), - - // ── Pseudo-instructions ── - "nop" => Ok(EncodeResult::Word(encode_i(OP_OP_IMM, 0, 0, 0, 0))), // addi x0, x0, 0 - "li" => encode_li(operands), - "mv" | "move" => encode_mv(operands), - "not" => encode_not(operands), - "neg" => encode_neg(operands), - "negw" => encode_negw(operands), - "sext.w" => encode_sext_w(operands), - "seqz" => encode_seqz(operands), - "snez" => encode_snez(operands), - "sltz" => encode_sltz(operands), - "sgtz" => encode_sgtz(operands), - - // Branch pseudo-instructions 
- "beqz" => encode_beqz(operands), - "bnez" => encode_bnez(operands), - "blez" => encode_blez(operands), - "bgez" => encode_bgez(operands), - "bltz" => encode_bltz(operands), - "bgtz" => encode_bgtz(operands), - "bgt" => encode_bgt(operands), - "ble" => encode_ble(operands), - "bgtu" => encode_bgtu(operands), - "bleu" => encode_bleu(operands), - - // Jump pseudo-instructions - "j" => encode_j_pseudo(operands), - "jr" => encode_jr(operands), - "ret" => Ok(EncodeResult::Word(encode_i(OP_JALR, 0, 0, 1, 0))), // jalr x0, x1, 0 - "call" => encode_call(operands), - "tail" => encode_tail(operands), - - // Address pseudo-instructions - "la" => encode_la(operands), - "lla" => encode_lla(operands), - - // CSR pseudo-instructions - "rdcycle" | "rdtime" | "rdinstret" => encode_rdcsr(mnemonic, operands), - "csrr" => encode_csrr(operands), - "csrw" => encode_csrw(operands), - "csrs" => encode_csrs(operands), - "csrc" => encode_csrc(operands), - - // Misc pseudo-instructions - "fmv.s" => encode_fmv_s(operands), - "fmv.d" => encode_fmv_d(operands), - "fabs.s" => encode_fabs_s(operands), - "fabs.d" => encode_fabs_d(operands), - "fneg.s" => encode_fneg_s(operands), - "fneg.d" => encode_fneg_d(operands), - - // `jump` pseudo-instruction (our codegen emits this) - "jump" => encode_jump(operands), - - // F/D CSR pseudo-instructions - // frcsr rd -> csrrs rd, fcsr, x0 - "frcsr" => { - let rd = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b010, 0, 0x003))) - }, - // fscsr rs -> csrrw x0, fcsr, rs (or fscsr rd, rs -> csrrw rd, fcsr, rs) - "fscsr" => { - if operands.len() >= 2 { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b001, rs1, 0x003))) - } else { - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b001, rs1, 0x003))) - } - }, - // frrm rd -> csrrs rd, frm, x0 - "frrm" => { - let rd = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 
0b010, 0, 0x002))) - }, - // fsrm rs -> csrrw x0, frm, rs (or fsrm rd, rs -> csrrw rd, frm, rs) - "fsrm" => { - if operands.len() >= 2 { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b001, rs1, 0x002))) - } else { - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b001, rs1, 0x002))) - } - }, - // frflags rd -> csrrs rd, fflags, x0 - "frflags" => { - let rd = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b010, 0, 0x001))) - }, - // fsflags rs -> csrrw x0, fflags, rs - "fsflags" => { - if operands.len() >= 2 { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b001, rs1, 0x001))) - } else { - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b001, rs1, 0x001))) - } - }, - - // Explicit compressed instructions - "c.nop" => Ok(EncodeResult::Half(0x0001)), - "c.ebreak" => Ok(EncodeResult::Half(0x9002)), - "c.lui" => encode_c_lui(operands), - "c.li" => encode_c_li(operands), - "c.addi" => encode_c_addi(operands), - "c.mv" => encode_c_mv(operands), - "c.add" => encode_c_add(operands), - "c.jr" => encode_c_jr(operands), - "c.jalr" => encode_c_jalr(operands), - - // ── RVV (Vector) Extension ── - // TODO: masked variants (v0.t) are not yet supported; vm is hardcoded to 1 (unmasked). 
- - // Vector configuration: vsetvli rd, rs1, vtypei - "vsetvli" => encode_vsetvli(operands), - // Vector configuration: vsetivli rd, uimm, vtypei - "vsetivli" => encode_vsetivli(operands), - // Vector configuration: vsetvl rd, rs1, rs2 - "vsetvl" => encode_vsetvl(operands), - - // Vector loads - "vle8.v" => encode_vload(operands, 0b000, 0), // EEW=8 - "vle16.v" => encode_vload(operands, 0b101, 0), // EEW=16 - "vle32.v" => encode_vload(operands, 0b110, 0), // EEW=32 - "vle64.v" => encode_vload(operands, 0b111, 0), // EEW=64 - - // Vector stores - "vse8.v" => encode_vstore(operands, 0b000, 0), // EEW=8 - "vse16.v" => encode_vstore(operands, 0b101, 0), // EEW=16 - "vse32.v" => encode_vstore(operands, 0b110, 0), // EEW=32 - "vse64.v" => encode_vstore(operands, 0b111, 0), // EEW=64 - - // Vector load/store mask - "vlm.v" => encode_vload(operands, 0b000, 0x0B), // lumop=0b01011 - "vsm.v" => encode_vstore(operands, 0b000, 0x0B), // sumop=0b01011 - - // Vector integer arithmetic - OPIVV (funct3=000), OPIVX (funct3=100), OPIVI (funct3=011) - "vadd.vv" => encode_v_arith_vv(operands, 0b000000), - "vadd.vx" => encode_v_arith_vx(operands, 0b000000), - "vadd.vi" => encode_v_arith_vi(operands, 0b000000), - "vsub.vv" => encode_v_arith_vv(operands, 0b000010), - "vsub.vx" => encode_v_arith_vx(operands, 0b000010), - "vand.vv" => encode_v_arith_vv(operands, 0b001001), - "vand.vx" => encode_v_arith_vx(operands, 0b001001), - "vand.vi" => encode_v_arith_vi(operands, 0b001001), - "vor.vv" => encode_v_arith_vv(operands, 0b001010), - "vor.vx" => encode_v_arith_vx(operands, 0b001010), - "vor.vi" => encode_v_arith_vi(operands, 0b001010), - "vxor.vv" => encode_v_arith_vv(operands, 0b001011), - "vxor.vx" => encode_v_arith_vx(operands, 0b001011), - "vxor.vi" => encode_v_arith_vi(operands, 0b001011), - - // Vector slide instructions - "vslideup.vx" => encode_v_arith_vx(operands, 0b001110), - "vslideup.vi" => encode_v_arith_vi(operands, 0b001110), - "vslidedown.vx" => encode_v_arith_vx(operands, 
0b001111), - "vslidedown.vi" => encode_v_arith_vi(operands, 0b001111), - - // Vector merge/move - "vmv.v.v" => encode_vmv_v_v(operands), - "vmv.v.x" => encode_vmv_v_x(operands), - "vmv.v.i" => encode_vmv_v_i(operands), - - // Vector misc - "vid.v" => encode_vid_v(operands), - - // Zvksh (SM3 crypto) - "vsm3c.vi" => encode_v_crypto_vi(operands, 0b101011), - "vsm3me.vv" => encode_v_crypto_vv(operands, 0b100000), - // Zvksed (SM4 crypto) - "vsm4k.vi" => encode_v_crypto_vi(operands, 0b100001), - "vsm4r.vs" => encode_v_crypto_vs(operands, 0b101001), - - _ => { - Err(format!("unsupported instruction: {} {}", mnemonic, raw_operands)) - } - } -} diff --git a/src/backend/riscv/assembler/encoder/pseudo.rs b/src/backend/riscv/assembler/encoder/pseudo.rs deleted file mode 100644 index 7e5a577769..0000000000 --- a/src/backend/riscv/assembler/encoder/pseudo.rs +++ /dev/null @@ -1,608 +0,0 @@ -use super::*; - -// ── Pseudo-instruction encoders ────────────────────────────────────── - -pub(crate) fn encode_li(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let imm = get_imm(operands, 1)?; - - let words = encode_li_immediate(rd, imm); - if words.len() == 1 { - Ok(EncodeResult::Word(words[0])) - } else { - Ok(EncodeResult::Words(words)) - } -} - -/// Sign-extend a value from `bits` width to i64. -pub(crate) fn sign_extend_li(val: i64, bits: u32) -> i64 { - let shift = 64 - bits; - (val << shift) >> shift -} - -/// Emit lui + addiw (or just addi) for a 32-bit signed value into register `rd`. -/// -/// On RV64, the `li` pseudo-instruction uses `lui + addiw` (not `lui + addi`) -/// to ensure proper 32-bit sign extension. GAS always uses `addiw` after `lui` -/// for `li` on RV64. For small values (fits in 12 bits), `addi rd, x0, imm` -/// is sufficient since the result is the same. 
-pub(crate) fn encode_li_32bit(rd: u32, imm: i32) -> Vec { - if (-2048..=2047).contains(&imm) { - return vec![encode_i(OP_OP_IMM, rd, 0, 0, imm)]; // addi rd, x0, imm - } - let lo = (imm << 20) >> 20; // sign-extend low 12 bits - let hi = ((imm as u32).wrapping_add(if lo < 0 { 0x1000 } else { 0 })) & 0xFFFFF000; - let mut words = vec![encode_u(OP_LUI, rd, hi)]; - if lo != 0 { - // Use addiw (OP_OP_IMM_32) to match GAS behavior on RV64. - // lui sign-extends the 20-bit immediate to 64 bits, and addiw - // ensures the final 32-bit result is properly sign-extended. - words.push(encode_i(OP_OP_IMM_32, rd, 0, rd, lo)); // addiw rd, rd, lo - } - words -} - -/// Encode `li` pseudo-instruction for an arbitrary 64-bit immediate. -/// -/// Decomposes the value into a sequence of lui/addiw/slli/addi instructions. -/// For 64-bit values that don't fit in 32 bits, finds optimal shift amounts -/// such that the value = ((upper << shift1) + lo1) << shift2 + lo2 ... -/// where upper fits in 32 bits and each lo fits in 12 signed bits. 
-pub(crate) fn encode_li_immediate(rd: u32, imm: i64) -> Vec { - // Case 1: fits in 12 bits (addi rd, x0, imm) - if (-2048..=2047).contains(&imm) { - return vec![encode_i(OP_OP_IMM, rd, 0, 0, imm as i32)]; - } - - // Case 2: fits in 32 bits (lui + addi) - if (-0x80000000..=0x7FFFFFFF).contains(&imm) { - return encode_li_32bit(rd, imm as i32); - } - - // Case 3: 64-bit — try single shift: imm = (upper << shift) + lo12 - let lo12 = sign_extend_li(imm & 0xFFF, 12); - let mut best: Option> = None; - - for shift in 12..45 { - let remainder = imm.wrapping_sub(lo12); - if remainder & ((1i64 << shift) - 1) != 0 { - continue; - } - let upper = remainder >> shift; - if !(-0x80000000..=0x7FFFFFFF).contains(&upper) { - continue; - } - - let mut words = encode_li_32bit(rd, upper as i32); - // Convert addi to addiw after lui for proper 64-bit sign extension - if words.len() == 2 { - let first_opcode = words[0] & 0x7F; - let second_opcode = words[1] & 0x7F; - if first_opcode == OP_LUI && second_opcode == OP_OP_IMM { - words[1] = (words[1] & !0x7F) | OP_OP_IMM_32; - } - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, shift)); // slli - if lo12 != 0 { - words.push(encode_i(OP_OP_IMM, rd, 0, rd, lo12 as i32)); // addi - } - - if best.is_none() || words.len() < best.as_ref().unwrap().len() { - best = Some(words); - } - } - - if let Some(words) = best { - return words; - } - - // Case 4: two-level shift — imm = ((A << shift1) + lo_b) << shift2 + lo_c - for shift2 in 12..33 { - let remainder_c = imm.wrapping_sub(lo12); - if remainder_c & ((1i64 << shift2) - 1) != 0 { - continue; - } - let inner = remainder_c >> shift2; - let lo12_b = sign_extend_li(inner & 0xFFF, 12); - - for shift1 in 12..33 { - let remainder_b = inner.wrapping_sub(lo12_b); - if remainder_b & ((1i64 << shift1) - 1) != 0 { - continue; - } - let upper = remainder_b >> shift1; - if !(-0x80000000..=0x7FFFFFFF).contains(&upper) { - continue; - } - - let mut words = encode_li_32bit(rd, upper as i32); - if words.len() == 2 
{ - let first_opcode = words[0] & 0x7F; - let second_opcode = words[1] & 0x7F; - if first_opcode == OP_LUI && second_opcode == OP_OP_IMM { - words[1] = (words[1] & !0x7F) | OP_OP_IMM_32; - } - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, shift1)); // slli - if lo12_b != 0 { - words.push(encode_i(OP_OP_IMM, rd, 0, rd, lo12_b as i32)); // addi - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, shift2)); // slli - if lo12 != 0 { - words.push(encode_i(OP_OP_IMM, rd, 0, rd, lo12 as i32)); // addi - } - - if best.is_none() || words.len() < best.as_ref().unwrap().len() { - best = Some(words); - } - } - } - - if let Some(words) = best { - return words; - } - - // Case 5: three-level shift (needed for dense bit patterns across all 64 bits) - for shift3 in 12..23 { - let rem_c = imm.wrapping_sub(lo12); - if rem_c & ((1i64 << shift3) - 1) != 0 { - continue; - } - let v2 = rem_c >> shift3; - let lo12_b = sign_extend_li(v2 & 0xFFF, 12); - - for shift2 in 12..23 { - let rem_b = v2.wrapping_sub(lo12_b); - if rem_b & ((1i64 << shift2) - 1) != 0 { - continue; - } - let v1 = rem_b >> shift2; - let lo12_a = sign_extend_li(v1 & 0xFFF, 12); - - for shift1 in 12..23 { - let rem_a = v1.wrapping_sub(lo12_a); - if rem_a & ((1i64 << shift1) - 1) != 0 { - continue; - } - let upper = rem_a >> shift1; - if !(-0x80000000..=0x7FFFFFFF).contains(&upper) { - continue; - } - - let mut words = encode_li_32bit(rd, upper as i32); - if words.len() == 2 { - let first_opcode = words[0] & 0x7F; - let second_opcode = words[1] & 0x7F; - if first_opcode == OP_LUI && second_opcode == OP_OP_IMM { - words[1] = (words[1] & !0x7F) | OP_OP_IMM_32; - } - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, shift1)); - if lo12_a != 0 { - words.push(encode_i(OP_OP_IMM, rd, 0, rd, lo12_a as i32)); - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, shift2)); - if lo12_b != 0 { - words.push(encode_i(OP_OP_IMM, rd, 0, rd, lo12_b as i32)); - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, shift3)); - if lo12 != 0 { 
- words.push(encode_i(OP_OP_IMM, rd, 0, rd, lo12 as i32)); - } - - if best.is_none() || words.len() < best.as_ref().unwrap().len() { - best = Some(words); - } - } - } - } - - if let Some(words) = best { - return words; - } - - // Fallback: lui + addiw + slli 32, then add lower bits via addi chain - eprintln!("warning: li fallback for 0x{:x}", imm as u64); - let upper = (imm >> 32) as i32; - let mut words = encode_li_32bit(rd, upper); - if words.len() == 2 { - let first_opcode = words[0] & 0x7F; - let second_opcode = words[1] & 0x7F; - if first_opcode == OP_LUI && second_opcode == OP_OP_IMM { - words[1] = (words[1] & !0x7F) | OP_OP_IMM_32; - } - } - words.push(encode_i(OP_OP_IMM, rd, 0b001, rd, 32)); // slli rd, rd, 32 - let mut remaining = imm as i32 as i64; - while remaining != 0 { - let chunk = remaining.clamp(-2048, 2047); - words.push(encode_i(OP_OP_IMM, rd, 0, rd, chunk as i32)); - remaining -= chunk; - } - words -} - -pub(crate) fn encode_mv(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs = get_reg(operands, 1)?; - // Use `add rd, x0, rs` instead of `addi rd, rs, 0` so the instruction - // is eligible for RV64C compression to C.MV (which requires the ADD form). 
- Ok(EncodeResult::Word(encode_r(OP_OP, rd, 0b000, 0, rs, 0b0000000))) // add rd, x0, rs -} - -pub(crate) fn encode_not(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM, rd, 0b100, rs1, -1))) // xori rd, rs1, -1 -} - -pub(crate) fn encode_neg(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP, rd, 0b000, 0, rs2, 0b0100000))) // sub rd, x0, rs2 -} - -pub(crate) fn encode_negw(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_32, rd, 0b000, 0, rs2, 0b0100000))) -} - -pub(crate) fn encode_sext_w(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM_32, rd, 0, rs1, 0))) // addiw rd, rs1, 0 -} - -pub(crate) fn encode_seqz(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_OP_IMM, rd, 0b011, rs1, 1))) // sltiu rd, rs1, 1 -} - -pub(crate) fn encode_snez(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP, rd, 0b011, 0, rs2, 0b0000000))) // sltu rd, x0, rs2 -} - -pub(crate) fn encode_sltz(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP, rd, 0b010, rs1, 0, 0b0000000))) // slt rd, rs1, x0 -} - -pub(crate) fn encode_sgtz(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP, rd, 0b010, 0, rs2, 0b0000000))) // slt rd, x0, rs2 -} - -// Branch pseudo-instructions -pub(crate) fn encode_beqz(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - 
let label = get_branch_target(operands, 1)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b000, rs1, 0, 0), - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bnez(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let label = get_branch_target(operands, 1)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b001, rs1, 0, 0), - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_blez(operands: &[Operand]) -> Result { - let rs2 = get_reg(operands, 0)?; - let label = get_branch_target(operands, 1)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b101, 0, rs2, 0), // bge x0, rs - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bgez(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let label = get_branch_target(operands, 1)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b101, rs1, 0, 0), // bge rs, x0 - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bltz(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let label = get_branch_target(operands, 1)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b100, rs1, 0, 0), // blt rs, x0 - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bgtz(operands: &[Operand]) -> Result { - let rs2 = get_reg(operands, 0)?; - let label = get_branch_target(operands, 1)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b100, 0, rs2, 0), // blt x0, rs - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bgt(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let label = 
get_branch_target(operands, 2)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b100, rs2, rs1, 0), // blt rs2, rs1 - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_ble(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let label = get_branch_target(operands, 2)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b101, rs2, rs1, 0), // bge rs2, rs1 - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bgtu(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let label = get_branch_target(operands, 2)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b110, rs2, rs1, 0), // bltu rs2, rs1 - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn encode_bleu(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - let rs2 = get_reg(operands, 1)?; - let label = get_branch_target(operands, 2)?; - Ok(EncodeResult::WordWithReloc { - word: encode_b(OP_BRANCH, 0b111, rs2, rs1, 0), // bgeu rs2, rs1 - reloc: Relocation { reloc_type: RelocType::Branch, symbol: label, addend: 0 }, - }) -} - -pub(crate) fn get_branch_target(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Symbol(s)) | Some(Operand::Label(s)) => Ok(s.clone()), - Some(Operand::Imm(v)) => Ok(format!("{}", v)), - // A register name can also be a symbol/label name (e.g. `beqz a0, t1` - // where t1 is a label). Treat Reg as symbol in branch target context. 
- Some(Operand::Reg(s)) => Ok(s.clone()), - _ => Err(format!("expected branch target at operand {}", idx)), - } -} - -pub(crate) fn encode_j_pseudo(operands: &[Operand]) -> Result { - // j offset -> jal x0, offset - match &operands[0] { - Operand::Symbol(s) | Operand::Label(s) | Operand::Reg(s) => { - Ok(EncodeResult::WordWithReloc { - word: encode_j(OP_JAL, 0, 0), - reloc: Relocation { - reloc_type: RelocType::Jal, - symbol: s.clone(), - addend: 0, - }, - }) - } - Operand::Imm(imm) => { - Ok(EncodeResult::Word(encode_j(OP_JAL, 0, *imm as i32))) - } - _ => Err("j: expected offset or label".to_string()), - } -} - -pub(crate) fn encode_jr(operands: &[Operand]) -> Result { - let rs1 = get_reg(operands, 0)?; - Ok(EncodeResult::Word(encode_i(OP_JALR, 0, 0, rs1, 0))) -} - -pub(crate) fn encode_call(operands: &[Operand]) -> Result { - // call symbol -> auipc ra, %pcrel_hi(symbol) ; jalr ra, %pcrel_lo(symbol)(ra) - let (symbol, addend) = get_symbol(operands, 0)?; - Ok(EncodeResult::WordsWithRelocs(vec![ - (encode_u(OP_AUIPC, 1, 0), Some(Relocation { - reloc_type: RelocType::CallPlt, - symbol: symbol.clone(), - addend, - })), - (encode_i(OP_JALR, 1, 0, 1, 0), None), // jalr ra, 0(ra) - ])) -} - -pub(crate) fn encode_tail(operands: &[Operand]) -> Result { - // tail symbol -> auipc t1, %pcrel_hi(symbol) ; jalr x0, %pcrel_lo(symbol)(t1) - let (symbol, addend) = get_symbol(operands, 0)?; - Ok(EncodeResult::WordsWithRelocs(vec![ - (encode_u(OP_AUIPC, 6, 0), Some(Relocation { // t1 = x6 - reloc_type: RelocType::CallPlt, - symbol: symbol.clone(), - addend, - })), - (encode_i(OP_JALR, 0, 0, 6, 0), None), - ])) -} - -pub(crate) fn encode_jump(operands: &[Operand]) -> Result { - // jump label, temp_reg -> auipc temp, %pcrel_hi(label) ; jalr x0, %pcrel_lo(label)(temp) - // Our codegen emits: jump .LBB42, t6 - let (symbol, addend) = get_symbol(operands, 0)?; - let temp = if operands.len() > 1 { - get_reg(operands, 1)? 
- } else { - 31 // t6 - }; - - Ok(EncodeResult::WordsWithRelocs(vec![ - (encode_u(OP_AUIPC, temp, 0), Some(Relocation { - reloc_type: RelocType::CallPlt, - symbol: symbol.clone(), - addend, - })), - (encode_i(OP_JALR, 0, 0, temp, 0), None), - ])) -} - -pub(crate) fn encode_la(operands: &[Operand]) -> Result { - // la rd, symbol -> auipc rd, %pcrel_hi(symbol) ; addi rd, rd, %pcrel_lo(symbol) - // TODO: For PIC, this should use GOT - encode_lla(operands) // for now, same as lla -} - -pub(crate) fn encode_lla(operands: &[Operand]) -> Result { - // lla rd, symbol -> auipc rd, %pcrel_hi(symbol) ; addi rd, rd, %pcrel_lo(symbol) - let rd = get_reg(operands, 0)?; - let (symbol, addend) = get_symbol(operands, 1)?; - - Ok(EncodeResult::WordsWithRelocs(vec![ - (encode_u(OP_AUIPC, rd, 0), Some(Relocation { - reloc_type: RelocType::PcrelHi20, - symbol: symbol.clone(), - addend, - })), - (encode_i(OP_OP_IMM, rd, 0, rd, 0), Some(Relocation { - reloc_type: RelocType::PcrelLo12I, - symbol, // TODO: This should reference the auipc label, not the symbol directly - addend, - })), - ])) -} - -pub(crate) fn encode_rdcsr(mnemonic: &str, operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let csr = match mnemonic { - "rdcycle" => 0xC00, - "rdtime" => 0xC01, - "rdinstret" => 0xC02, - _ => return Err(format!("unknown CSR pseudo: {}", mnemonic)), - }; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b010, 0, csr))) // csrrs rd, csr, x0 -} - -pub(crate) fn encode_csrr(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let csr = get_csr_num(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, 0b010, 0, csr as i32))) -} - -pub(crate) fn encode_csrw(operands: &[Operand]) -> Result { - let csr = get_csr_num(operands, 0)?; - if matches!(operands.get(1), Some(Operand::Imm(_))) { - let zimm = get_imm(operands, 1)? 
as u32 & 0x1F; - return Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b101, zimm, csr as i32))); - } - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b001, rs1, csr as i32))) -} - -pub(crate) fn encode_csrs(operands: &[Operand]) -> Result { - let csr = get_csr_num(operands, 0)?; - if matches!(operands.get(1), Some(Operand::Imm(_))) { - let zimm = get_imm(operands, 1)? as u32 & 0x1F; - return Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b110, zimm, csr as i32))); - } - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b010, rs1, csr as i32))) -} - -pub(crate) fn encode_csrc(operands: &[Operand]) -> Result { - let csr = get_csr_num(operands, 0)?; - if matches!(operands.get(1), Some(Operand::Imm(_))) { - let zimm = get_imm(operands, 1)? as u32 & 0x1F; - return Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b111, zimm, csr as i32))); - } - let rs1 = get_reg(operands, 1)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, 0, 0b011, rs1, csr as i32))) -} - -// Float pseudo-instructions -pub(crate) fn encode_fmv_s(operands: &[Operand]) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - // fsgnj.s rd, rs, rs - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b000, rs1, rs1, 0b0010000))) -} - -pub(crate) fn encode_fmv_d(operands: &[Operand]) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b000, rs1, rs1, 0b0010001))) -} - -pub(crate) fn encode_fabs_s(operands: &[Operand]) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - // fsgnjx.s rd, rs, rs - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b010, rs1, rs1, 0b0010000))) -} - -pub(crate) fn encode_fabs_d(operands: &[Operand]) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b010, rs1, rs1, 0b0010001))) -} - -pub(crate) fn 
encode_fneg_s(operands: &[Operand]) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - // fsgnjn.s rd, rs, rs - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b001, rs1, rs1, 0b0010000))) -} - -pub(crate) fn encode_fneg_d(operands: &[Operand]) -> Result { - let rd = get_freg(operands, 0)?; - let rs1 = get_freg(operands, 1)?; - Ok(EncodeResult::Word(encode_r(OP_OP_FP, rd, 0b001, rs1, rs1, 0b0010001))) -} - -// ── Relocation modifier parsing ────────────────────────────────────── - -/// Extract symbol name from %modifier(symbol) expressions -pub(crate) fn extract_modifier_symbol(s: &str) -> String { - if let Some(start) = s.find('(') { - if let Some(end) = s.rfind(')') { - return s[start + 1..end].to_string(); - } - } - s.to_string() -} - -/// Parse a relocation modifier like %pcrel_hi(symbol) and return (RelocType, symbol) -pub(crate) fn parse_reloc_modifier(s: &str) -> (RelocType, String) { - if s.starts_with("%pcrel_hi(") { - (RelocType::PcrelHi20, extract_modifier_symbol(s)) - } else if s.starts_with("%pcrel_lo(") { - (RelocType::PcrelLo12I, extract_modifier_symbol(s)) - } else if s.starts_with("%hi(") { - (RelocType::Hi20, extract_modifier_symbol(s)) - } else if s.starts_with("%lo(") { - (RelocType::Lo12I, extract_modifier_symbol(s)) - } else if s.starts_with("%tprel_hi(") { - (RelocType::TprelHi20, extract_modifier_symbol(s)) - } else if s.starts_with("%tprel_lo(") { - (RelocType::TprelLo12I, extract_modifier_symbol(s)) - } else if s.starts_with("%tprel_add(") { - (RelocType::TprelAdd, extract_modifier_symbol(s)) - } else if s.starts_with("%got_pcrel_hi(") { - (RelocType::GotHi20, extract_modifier_symbol(s)) - } else if s.starts_with("%tls_ie_pcrel_hi(") { - (RelocType::TlsGotHi20, extract_modifier_symbol(s)) - } else if s.starts_with("%tls_gd_pcrel_hi(") { - (RelocType::TlsGdHi20, extract_modifier_symbol(s)) - } else { - // Plain symbol - use as PC-relative - (RelocType::PcrelHi20, s.to_string()) - } -} diff --git 
a/src/backend/riscv/assembler/encoder/system.rs b/src/backend/riscv/assembler/encoder/system.rs deleted file mode 100644 index 056f022bac..0000000000 --- a/src/backend/riscv/assembler/encoder/system.rs +++ /dev/null @@ -1,115 +0,0 @@ -use super::*; - -// ── Fence ── - -pub(crate) fn encode_fence(operands: &[Operand]) -> Result { - let (pred, succ) = if operands.is_empty() { - (0xF, 0xF) // fence iorw, iorw - } else if operands.len() >= 2 { - let pred = match &operands[0] { - Operand::FenceArg(s) => parse_fence_bits(s), - _ => 0xF, - }; - let succ = match &operands[1] { - Operand::FenceArg(s) => parse_fence_bits(s), - _ => 0xF, - }; - (pred, succ) - } else { - (0xF, 0xF) - }; - let imm = ((pred << 4) | succ) as i32; - Ok(EncodeResult::Word(encode_i(OP_MISC_MEM, 0, 0, 0, imm))) -} - -/// Encode sfence.vma rs1, rs2 -/// Format: funct7=0001001 | rs2 | rs1 | funct3=000 | rd=00000 | opcode=1110011 -/// If no operands: sfence.vma zero, zero -/// If 1 operand: sfence.vma rs1, zero -/// If 2 operands: sfence.vma rs1, rs2 -pub(crate) fn encode_sfence_vma(operands: &[Operand]) -> Result { - let rs1 = if operands.is_empty() { 0 } else { get_reg(operands, 0)? }; - let rs2 = if operands.len() < 2 { 0 } else { get_reg(operands, 1)? }; - // sfence.vma is encoded as: funct7=0001001(0x09) | rs2 | rs1 | 000 | 00000 | SYSTEM(1110011) - let word = encode_r(OP_SYSTEM, 0, 0b000, rs1, rs2, 0b0001001); - Ok(EncodeResult::Word(word)) -} - -// ── CSR ── - -pub(crate) fn encode_csr(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - let csr = get_csr_num(operands, 1)?; - // If operand 2 is a bare immediate (not a register name), use the immediate - // CSR encoding (csrrwi/csrrsi/csrrci) instead of the register form. - // GNU as allows e.g. `csrrc t0, sstatus, 2` and auto-selects the immediate form. - if matches!(operands.get(2), Some(Operand::Imm(_))) { - let zimm = get_imm(operands, 2)? 
as u32; - let rs1 = zimm & 0x1F; - let imm_funct3 = funct3 | 0b100; // 001->101, 010->110, 011->111 - return Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, imm_funct3, rs1, csr as i32))); - } - let rs1 = get_reg(operands, 2)?; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, funct3, rs1, csr as i32))) -} - -pub(crate) fn encode_csri(operands: &[Operand], funct3: u32) -> Result { - let rd = get_reg(operands, 0)?; - let csr = get_csr_num(operands, 1)?; - let zimm = get_imm(operands, 2)? as u32; - let rs1 = zimm & 0x1F; - Ok(EncodeResult::Word(encode_i(OP_SYSTEM, rd, funct3, rs1, csr as i32))) -} - -pub(crate) fn get_csr_num(operands: &[Operand], idx: usize) -> Result { - match operands.get(idx) { - Some(Operand::Imm(v)) => Ok(*v as u32), - Some(Operand::Csr(name)) => csr_name_to_num(name), - Some(Operand::Symbol(name)) => csr_name_to_num(name), - Some(Operand::Reg(name)) => csr_name_to_num(name), // sometimes CSR names look like regs - other => Err(format!("expected CSR at operand {}, got {:?}", idx, other)), - } -} - -pub(crate) fn csr_name_to_num(name: &str) -> Result { - match name.to_lowercase().as_str() { - "fflags" => Ok(0x001), - "frm" => Ok(0x002), - "fcsr" => Ok(0x003), - "cycle" => Ok(0xC00), - "time" => Ok(0xC01), - "instret" => Ok(0xC02), - "cycleh" => Ok(0xC80), - "timeh" => Ok(0xC81), - "instreth" => Ok(0xC82), - "mstatus" => Ok(0x300), - "misa" => Ok(0x301), - "mie" => Ok(0x304), - "mtvec" => Ok(0x305), - "mscratch" => Ok(0x340), - "mepc" => Ok(0x341), - "mcause" => Ok(0x342), - "mtval" => Ok(0x343), - "mip" => Ok(0x344), - "sstatus" => Ok(0x100), - "sip" => Ok(0x144), - "sie" => Ok(0x104), - "stvec" => Ok(0x105), - "sscratch" => Ok(0x140), - "sepc" => Ok(0x141), - "scause" => Ok(0x142), - "stval" => Ok(0x143), - "satp" => Ok(0x180), - _ => { - // Try parsing as a number - if let Ok(v) = name.parse::() { - Ok(v) - } else if let Some(hex) = name.strip_prefix("0x") { - u32::from_str_radix(hex, 16) - .map_err(|_| format!("invalid CSR: {}", name)) - } else { 
- Err(format!("unknown CSR: {}", name)) - } - } - } -} diff --git a/src/backend/riscv/assembler/encoder/vector.rs b/src/backend/riscv/assembler/encoder/vector.rs deleted file mode 100644 index 2566f02a19..0000000000 --- a/src/backend/riscv/assembler/encoder/vector.rs +++ /dev/null @@ -1,216 +0,0 @@ -use super::*; - -// ── RVV (Vector) Extension Encoders ────────────────────────────────── - -/// Parse a vtypei field from operands starting at `start_idx`. -/// The vtypei is specified as a sequence of operands: e.g., e8, m8, ta, ma -/// Returns the encoded vtypei value. -pub(crate) fn parse_vtypei(operands: &[Operand], start_idx: usize) -> Result { - let mut sew: u32 = 0; // SEW encoding (3 bits): e8=000, e16=001, e32=010, e64=011 - let mut lmul: u32 = 0; // LMUL encoding (3 bits): m1=000, m2=001, m4=010, m8=011, mf2=111, mf4=110, mf8=101 - let mut ta: u32 = 0; // Tail agnostic - let mut ma: u32 = 0; // Mask agnostic - - for i in start_idx..operands.len() { - let name = match &operands[i] { - Operand::Symbol(s) => s.to_lowercase(), - Operand::Reg(s) => s.to_lowercase(), - // Raw immediate: treat as pre-encoded vtypei value - Operand::Imm(v) => return Ok(*v as u32 & 0x7FF), - _ => continue, - }; - match name.as_str() { - "e8" => sew = 0b000, - "e16" => sew = 0b001, - "e32" => sew = 0b010, - "e64" => sew = 0b011, - "m1" => lmul = 0b000, - "m2" => lmul = 0b001, - "m4" => lmul = 0b010, - "m8" => lmul = 0b011, - "mf2" => lmul = 0b111, - "mf4" => lmul = 0b110, - "mf8" => lmul = 0b101, - "ta" => ta = 1, - "tu" => ta = 0, - "ma" => ma = 1, - "mu" => ma = 0, - _ => return Err(format!("unknown vtypei field: {}", name)), - } - } - - // vtypei: [ma][ta][sew[2:0]][lmul[2:0]] - Ok((ma << 7) | (ta << 6) | (sew << 3) | lmul) -} - -/// Encode vsetvli rd, rs1, vtypei -/// Format: [0][vtypei[10:0]][rs1][111][rd][1010111] -pub(crate) fn encode_vsetvli(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let vtypei = 
parse_vtypei(operands, 2)?; - // bit 31 = 0 for vsetvli - let word = ((vtypei & 0x7FF) << 20) | (rs1 << 15) | (0b111 << 12) | (rd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// Encode vsetivli rd, uimm[4:0], vtypei -/// Format: [11][vtypei[9:0]][uimm[4:0]][111][rd][1010111] -pub(crate) fn encode_vsetivli(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let uimm = get_imm(operands, 1)? as u32 & 0x1F; - let vtypei = parse_vtypei(operands, 2)?; - // bits [31:30] = 11 for vsetivli - let word = (0b11u32 << 30) | ((vtypei & 0x3FF) << 20) | (uimm << 15) | (0b111 << 12) | (rd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// Encode vsetvl rd, rs1, rs2 -/// Format: [1000000][rs2][rs1][111][rd][1010111] -pub(crate) fn encode_vsetvl(operands: &[Operand]) -> Result { - let rd = get_reg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - let rs2 = get_reg(operands, 2)?; - let word = (0b1000000u32 << 25) | (rs2 << 20) | (rs1 << 15) | (0b111 << 12) | (rd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// Encode vector unit-stride load: vle{8,16,32,64}.v vd, (rs1) -/// Format: nf[31:29] | mew[28] | mop[27:26]=00 | vm[25] | lumop[24:20] | rs1[19:15] | width[14:12] | vd[11:7] | 0000111 -/// For unit-stride: mop=00, lumop=00000 (or 01011 for whole-reg/mask) -pub(crate) fn encode_vload(operands: &[Operand], width: u32, lumop: u32) -> Result { - let vd = get_vreg(operands, 0)?; - // The second operand should be a memory operand (rs1) like (a1) - let rs1 = match operands.get(1) { - Some(Operand::Mem { base, offset: 0 }) => { - reg_num(base).ok_or_else(|| format!("invalid base register: {}", base))? - } - Some(Operand::Reg(name)) => { - // Parenthesized register may be parsed differently - reg_num(name).ok_or_else(|| format!("invalid register: {}", name))? 
- } - other => return Err(format!("expected (rs1) at operand 1, got {:?}", other)), - }; - // vm=1 means unmasked (no v0.t) - let vm: u32 = 1; - // nf=000 (single segment), mew=0, mop=00 (bits 31:26 = 0) - let word = (vm << 25) - | (lumop << 20) | (rs1 << 15) | (width << 12) | (vd << 7) | OP_LOAD_FP; - Ok(EncodeResult::Word(word)) -} - -/// Encode vector unit-stride store: vse{8,16,32,64}.v vs3, (rs1) -/// Format: nf[31:29] | mew[28] | mop[27:26]=00 | vm[25] | sumop[24:20] | rs1[19:15] | width[14:12] | vs3[11:7] | 0100111 -pub(crate) fn encode_vstore(operands: &[Operand], width: u32, sumop: u32) -> Result { - let vs3 = get_vreg(operands, 0)?; - let rs1 = match operands.get(1) { - Some(Operand::Mem { base, offset: 0 }) => { - reg_num(base).ok_or_else(|| format!("invalid base register: {}", base))? - } - Some(Operand::Reg(name)) => { - reg_num(name).ok_or_else(|| format!("invalid register: {}", name))? - } - other => return Err(format!("expected (rs1) at operand 1, got {:?}", other)), - }; - let vm: u32 = 1; - // nf=000, mew=0, mop=00 (bits 31:26 = 0) - let word = (vm << 25) - | (sumop << 20) | (rs1 << 15) | (width << 12) | (vs3 << 7) | OP_STORE_FP; - Ok(EncodeResult::Word(word)) -} - -/// Encode vector arithmetic VV (vector-vector): funct6[31:26] | vm[25] | vs2[24:20] | vs1[19:15] | funct3[14:12]=000 | vd[11:7] | OP_V -pub(crate) fn encode_v_arith_vv(operands: &[Operand], funct6: u32) -> Result { - let vd = get_vreg(operands, 0)?; - let vs2 = get_vreg(operands, 1)?; - let vs1 = get_vreg(operands, 2)?; - let vm: u32 = 1; // unmasked - // funct3=000 (OPIVV) - let word = (funct6 << 26) | (vm << 25) | (vs2 << 20) | (vs1 << 15) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// Encode vector arithmetic VX (vector-scalar): funct6[31:26] | vm[25] | vs2[24:20] | rs1[19:15] | funct3[14:12]=100 | vd[11:7] | OP_V -pub(crate) fn encode_v_arith_vx(operands: &[Operand], funct6: u32) -> Result { - let vd = get_vreg(operands, 0)?; - let vs2 = get_vreg(operands, 1)?; - let 
rs1 = get_reg(operands, 2)?; - let vm: u32 = 1; - let word = (funct6 << 26) | (vm << 25) | (vs2 << 20) | (rs1 << 15) | (0b100 << 12) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// Encode vector arithmetic VI (vector-immediate): funct6[31:26] | vm[25] | vs2[24:20] | simm5[19:15] | funct3[14:12]=011 | vd[11:7] | OP_V -pub(crate) fn encode_v_arith_vi(operands: &[Operand], funct6: u32) -> Result { - let vd = get_vreg(operands, 0)?; - let vs2 = get_vreg(operands, 1)?; - let simm5 = get_imm(operands, 2)? as u32 & 0x1F; - let vm: u32 = 1; - let word = (funct6 << 26) | (vm << 25) | (vs2 << 20) | (simm5 << 15) | (0b011 << 12) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// vmv.v.v vd, vs1: OPIVV, funct6=010111, vm=1, vs2=0 -pub(crate) fn encode_vmv_v_v(operands: &[Operand]) -> Result { - let vd = get_vreg(operands, 0)?; - let vs1 = get_vreg(operands, 1)?; - // funct6=010111, vm=1, vs2=0, funct3=000 (OPIVV) - let word = (0b010111u32 << 26) | (1u32 << 25) | (vs1 << 15) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// vmv.v.x vd, rs1: OPIVX, funct6=010111, vm=1, vs2=0 -pub(crate) fn encode_vmv_v_x(operands: &[Operand]) -> Result { - let vd = get_vreg(operands, 0)?; - let rs1 = get_reg(operands, 1)?; - // funct6=010111, vm=1, vs2=0 - let word = (0b010111u32 << 26) | (1u32 << 25) | (rs1 << 15) | (0b100 << 12) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// vmv.v.i vd, simm5: OPIVI, funct6=010111, vm=1, vs2=0 -pub(crate) fn encode_vmv_v_i(operands: &[Operand]) -> Result { - let vd = get_vreg(operands, 0)?; - let simm5 = get_imm(operands, 1)? 
as u32 & 0x1F; - // funct6=010111, vm=1, vs2=0 - let word = (0b010111u32 << 26) | (1u32 << 25) | (simm5 << 15) | (0b011 << 12) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// vid.v vd: OPMVV, funct6=010100, vm=1, vs2=00000, rs1=10001 -/// Encoding: funct6=010100 | vm=1 | vs2=00000 | 10001 | 010 | vd | OP_V -pub(crate) fn encode_vid_v(operands: &[Operand]) -> Result { - let vd = get_vreg(operands, 0)?; - // vs2=0 (bits 24:20), funct6=010100, vm=1 - let word = (0b010100u32 << 26) | (1u32 << 25) | (0b10001u32 << 15) | (0b010 << 12) | (vd << 7) | OP_V; - Ok(EncodeResult::Word(word)) -} - -/// Encode Zvksh/Zvksed crypto instructions with VI format -/// vsm3c.vi, vsm4k.vi: funct6 | vm=1 | vs2 | uimm5 | 010 | vd | OP_V_CRYPTO -pub(crate) fn encode_v_crypto_vi(operands: &[Operand], funct6: u32) -> Result { - let vd = get_vreg(operands, 0)?; - let vs2 = get_vreg(operands, 1)?; - let uimm5 = get_imm(operands, 2)? as u32 & 0x1F; - let word = (funct6 << 26) | (1u32 << 25) | (vs2 << 20) | (uimm5 << 15) | (0b010 << 12) | (vd << 7) | OP_V_CRYPTO; - Ok(EncodeResult::Word(word)) -} - -/// Encode Zvksh crypto instructions with VV format -/// vsm3me.vv: funct6 | vm=1 | vs2 | vs1 | 010 | vd | OP_V_CRYPTO -pub(crate) fn encode_v_crypto_vv(operands: &[Operand], funct6: u32) -> Result { - let vd = get_vreg(operands, 0)?; - let vs2 = get_vreg(operands, 1)?; - let vs1 = get_vreg(operands, 2)?; - let word = (funct6 << 26) | (1u32 << 25) | (vs2 << 20) | (vs1 << 15) | (0b010 << 12) | (vd << 7) | OP_V_CRYPTO; - Ok(EncodeResult::Word(word)) -} - -/// Encode Zvksed crypto instructions with VS format -/// vsm4r.vs: funct6 | vm=1 | vs2 | 10000 | 010 | vd | OP_V_CRYPTO -pub(crate) fn encode_v_crypto_vs(operands: &[Operand], funct6: u32) -> Result { - let vd = get_vreg(operands, 0)?; - let vs2 = get_vreg(operands, 1)?; - let word = (funct6 << 26) | (1u32 << 25) | (vs2 << 20) | (0b10000u32 << 15) | (0b010 << 12) | (vd << 7) | OP_V_CRYPTO; - Ok(EncodeResult::Word(word)) -} diff --git 
a/src/backend/riscv/assembler/mod.rs b/src/backend/riscv/assembler/mod.rs deleted file mode 100644 index ececb1d286..0000000000 --- a/src/backend/riscv/assembler/mod.rs +++ /dev/null @@ -1,101 +0,0 @@ -//! Native RISC-V assembler. -//! -//! Parses `.s` assembly text (as emitted by the RISC-V codegen) and produces -//! ELF `.o` object files, removing the dependency on `riscv64-linux-gnu-gcc` -//! for assembly. -//! -//! Architecture: -//! - `parser.rs` – Tokenize + parse assembly text into `AsmStatement` items -//! - `encoder.rs` – Encode RISC-V instructions into 32-bit machine words -//! - `compress.rs` – RV64C compressed instruction support (32-bit → 16-bit) -//! - `elf_writer.rs` – Write ELF object files with sections, symbols, and relocations - -pub mod parser; -pub mod encoder; -pub mod compress; -pub mod elf_writer; - -use parser::parse_asm; -use elf_writer::{ElfWriter, EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_SINGLE, EF_RISCV_FLOAT_ABI_DOUBLE, EF_RISCV_FLOAT_ABI_QUAD}; -use crate::backend::elf::{ELFCLASS32, ELFCLASS64}; - -/// Assemble RISC-V assembly text into an ELF object file, with extra args. -/// -/// Supports `-mabi=` to control ELF float ABI flags and ELF class (32/64-bit), -/// and `-march=` to control ELF class (rv32 vs rv64) and RVC flag. 
-pub fn assemble_with_args(asm_text: &str, output_path: &str, extra_args: &[String]) -> Result<(), String> { - let statements = parse_asm(asm_text)?; - let mut writer = ElfWriter::new(); - - // Collect ABI and arch info from extra args (last value wins, matching GCC behavior) - let mut abi_name: Option = None; - let mut march_name: Option = None; - for arg in extra_args { - if let Some(abi) = arg.strip_prefix("-mabi=") { - abi_name = Some(abi.to_string()); - } - if let Some(march) = arg.strip_prefix("-march=") { - march_name = Some(march.to_string()); - } - } - - // Determine RVC from -march= (check for 'c' extension in the arch string) - let has_rvc = match &march_name { - Some(march) => march_has_c_extension(march), - None => true, // default: assume RVC (matches rv64gc default) - }; - - // Set ELF flags based on ABI + RVC - if let Some(ref abi) = abi_name { - writer.set_elf_flags(elf_flags_for_abi(abi, has_rvc)); - if abi.starts_with("ilp32") { - writer.set_elf_class(ELFCLASS32); - } else { - writer.set_elf_class(ELFCLASS64); - } - } else if !has_rvc { - // No -mabi= but -march= without 'c': clear RVC from default flags - let default_flags = EF_RISCV_FLOAT_ABI_DOUBLE; - writer.set_elf_flags(default_flags); - } - - // -march= overrides ELF class (takes precedence, processed after -mabi=) - if let Some(ref march) = march_name { - if march.starts_with("rv32") { - writer.set_elf_class(ELFCLASS32); - } else if march.starts_with("rv64") { - writer.set_elf_class(ELFCLASS64); - } - } - - writer.process_statements(&statements)?; - writer.write_elf(output_path)?; - Ok(()) -} - -/// Map an ABI name to ELF e_flags, with optional RVC flag. 
-fn elf_flags_for_abi(abi: &str, has_rvc: bool) -> u32 { - let float_abi = match abi { - "lp64" | "ilp32" => 0x0, // soft-float - "lp64f" | "ilp32f" => EF_RISCV_FLOAT_ABI_SINGLE, - "lp64d" | "ilp32d" => EF_RISCV_FLOAT_ABI_DOUBLE, - "lp64q" | "ilp32q" => EF_RISCV_FLOAT_ABI_QUAD, - _ => EF_RISCV_FLOAT_ABI_DOUBLE, // default - }; - if has_rvc { float_abi | EF_RISCV_RVC } else { float_abi } -} - -/// Check if a -march= string includes the 'c' (compressed) extension. -/// Handles both shorthand (rv64gc) and explicit (rv64imafdc_zicsr) formats. -fn march_has_c_extension(march: &str) -> bool { - // Strip the rv32/rv64 prefix - let rest = if march.starts_with("rv32") || march.starts_with("rv64") { - &march[4..] - } else { - march - }; - // The base ISA letters come before the first '_' (extension separator) - let base = rest.split('_').next().unwrap_or(rest); - // 'g' expands to 'imafd' (no 'c'), so only check for explicit 'c' - base.contains('c') -} diff --git a/src/backend/riscv/assembler/parser.rs b/src/backend/riscv/assembler/parser.rs deleted file mode 100644 index 74a2a37a51..0000000000 --- a/src/backend/riscv/assembler/parser.rs +++ /dev/null @@ -1,1062 +0,0 @@ -//! RISC-V assembly parser. -//! -//! Parses the textual assembly format emitted by our RISC-V codegen into -//! structured `AsmStatement` values. The parser handles: -//! - Labels (global and local) -//! - Directives (.section, .globl, .type, .align, .byte, .long, .dword, etc.) -//! with fully typed representation (no string re-parsing in ELF writer) -//! - RISC-V instructions (add, sub, ld, sd, beq, call, ret, etc.) -//! - CFI directives (passed through as-is for DWARF unwind info) - -// Some parser helper functions and enum variants are defined for completeness -// and used only by the encoder or ELF writer, not the parser entry point itself. 
-#![allow(dead_code)] - -use crate::backend::asm_expr; -use crate::backend::asm_preprocess::{self, CommentStyle}; -use crate::backend::elf; - -/// A parsed assembly operand. -#[derive(Debug, Clone)] -pub enum Operand { - /// Register: x0-x31, zero, ra, sp, gp, tp, t0-t6, s0-s11, a0-a7, - /// f0-f31, ft0-ft11, fs0-fs11, fa0-fa7 - Reg(String), - /// Immediate value: 42, -1, 0x1000 - Imm(i64), - /// Symbol reference: function name, label, etc. - Symbol(String), - /// Symbol with addend: symbol+offset or symbol-offset - SymbolOffset(String, i64), - /// Memory operand: offset(base) e.g., 8(sp) or -16(s0) - Mem { base: String, offset: i64 }, - /// Memory operand with symbol: %lo(symbol)(base) or similar - MemSymbol { base: String, symbol: String, modifier: String }, - /// Label reference for branches - Label(String), - /// Fence operand: iorw etc. - FenceArg(String), - /// CSR register name or number - Csr(String), - /// Rounding mode: rne, rtz, rdn, rup, rmm, dyn - RoundingMode(String), -} - -/// A data value in a .byte/.short/.long/.quad directive. -/// Can be a literal integer, a symbol reference (with optional addend), -/// or a symbol difference expression (A - B, with optional addend on A). -#[derive(Debug, Clone)] -pub enum DataValue { - /// A literal integer value. - Integer(i64), - /// A symbol reference, possibly with an addend: `sym` or `sym+4` or `sym-8`. - Symbol { name: String, addend: i64 }, - /// A symbol difference: `sym_a - sym_b`, possibly with addend on sym_a. - SymbolDiff { sym_a: String, sym_b: String, addend: i64 }, - /// A raw expression string that needs alias resolution at emit time. - Expression(String), -} - -/// Symbol type as parsed from `.type sym, @function` etc. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum SymbolType { - Function, - Object, - TlsObject, - NoType, -} - -/// Symbol visibility as parsed from `.hidden`, `.protected`, `.internal`. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Visibility { - Hidden, - Protected, - Internal, -} - -/// A size expression from `.size sym, expr`. -#[derive(Debug, Clone)] -pub enum SizeExpr { - /// `.- label` — current position minus label - CurrentMinus(String), - /// A literal size value. - Absolute(u64), -} - -/// Section type from `.section` directive. -#[derive(Debug, Clone)] -pub struct SectionInfo { - pub name: String, - pub flags: String, - pub sec_type: String, - /// True when the directive explicitly included a flags field (even if empty). - pub flags_explicit: bool, -} - -/// A typed assembly directive. All argument parsing happens in the parser, -/// so the ELF writer only needs to pattern-match on these variants. -#[derive(Debug, Clone)] -pub enum Directive { - /// `.section name, "flags", @type` - Section(SectionInfo), - /// `.text` - Text, - /// `.data` - Data, - /// `.bss` - Bss, - /// `.rodata` - Rodata, - /// `.globl sym` or `.global sym` - Globl(String), - /// `.weak sym` - Weak(String), - /// `.hidden sym`, `.protected sym`, `.internal sym` - SymVisibility(String, Visibility), - /// `.type sym, @function` etc. 
- Type(String, SymbolType), - /// `.size sym, expr` - Size(String, SizeExpr), - /// `.align N` or `.p2align N` — power-of-2 alignment - Align(u64), - /// `.balign N` — byte alignment - Balign(u64), - /// `.byte val, val, ...` - Byte(Vec), - /// `.short val, ...` / `.hword` / `.2byte` / `.half` - Short(Vec), - /// `.long val, ...` / `.4byte` / `.word` - Long(Vec), - /// `.quad val, ...` / `.8byte` / `.xword` / `.dword` - Quad(Vec), - /// `.zero N[, fill]` / `.space N[, fill]` - Zero { size: usize, fill: u8 }, - /// `.asciz "str"` / `.string "str"` — null-terminated string (raw bytes) - Asciz(Vec), - /// `.ascii "str"` — string without null terminator (raw bytes) - Ascii(Vec), - /// `.comm sym, size[, align]` - Comm { sym: String, size: u64, align: u64 }, - /// `.local sym` - Local(String), - /// `.set sym, val` / `.equ sym, val` - Set(String, String), - /// `.option ...` — RISC-V specific - ArchOption(String), - /// `.attribute ...` — RISC-V attribute - Attribute(String), - /// CFI directives — silently ignored - Cfi, - /// Other ignorable directives: .file, .loc, .ident, etc. - Ignored, - /// `.pushsection name, "flags", @type` — push current section and switch - PushSection(SectionInfo), - /// `.popsection` — pop section stack - PopSection, - /// `.previous` — swap to previous section - Previous, - /// `.insn ...` — emit raw instruction encoding - Insn(String), - /// `.incbin "file"[, skip[, count]]` — include binary file contents - Incbin { path: String, skip: u64, count: Option }, - /// `.subsection N` — switch to numbered subsection - Subsection(u64), - /// Unknown directive — preserved for forward compatibility - Unknown { name: String, args: String }, -} - -/// A parsed assembly statement. -#[derive(Debug, Clone)] -pub enum AsmStatement { - /// A label definition: "name:" - Label(String), - /// A typed assembly directive. 
- Directive(Directive), - /// A RISC-V instruction with mnemonic and operands - Instruction { - mnemonic: String, - operands: Vec, - /// The raw text of the operand string (for fallback/debugging) - raw_operands: String, - }, - /// An empty line or comment - Empty, -} - -/// Comment style for RISC-V GAS: `#` and `//`. -const COMMENT_STYLE: CommentStyle = CommentStyle::HashAndSlashSlash; - -pub fn parse_asm(text: &str) -> Result, String> { - // Pre-process: strip C-style /* ... */ comments (may span multiple lines) - let text = asm_preprocess::strip_c_comments(text); - - // Expand .macro/.endm definitions and invocations - let raw_lines: Vec<&str> = text.lines().collect(); - let macro_expanded = asm_preprocess::expand_macros(&raw_lines, &COMMENT_STYLE)?; - let macro_refs: Vec<&str> = macro_expanded.iter().map(|s| s.as_str()).collect(); - - // Expand .rept/.endr and .irp/.endr blocks - let expanded_lines = asm_preprocess::expand_rept_blocks(¯o_refs, &COMMENT_STYLE, parse_int_literal)?; - - let mut statements = Vec::new(); - // Stack for .if/.else/.endif conditional assembly. - // Each entry is true if the current block is active (emitting code). 
- let mut if_stack: Vec = Vec::new(); - for (line_num, line) in expanded_lines.iter().enumerate() { - let line = line.trim(); - - // Skip empty lines - if line.is_empty() { - statements.push(AsmStatement::Empty); - continue; - } - - // Strip comments - let line = strip_comment(line); - let line = line.trim(); - if line.is_empty() { - statements.push(AsmStatement::Empty); - continue; - } - - // Handle .if/.else/.elseif/.endif before anything else - let lower = line.to_ascii_lowercase(); - if lower.starts_with(".endif") { - if if_stack.pop().is_none() { - return Err(format!("Line {}: .endif without matching .if", line_num + 1)); - } - continue; - } - if lower.starts_with(".else") { - if let Some(top) = if_stack.last_mut() { - *top = !*top; - } - continue; - } - if lower.starts_with(".if ") || lower.starts_with(".if\t") { - let cond_str = line[3..].trim(); - // Evaluate the condition: if we're already in a false block, push false - let active = if if_stack.last().copied().unwrap_or(true) { - asm_preprocess::eval_if_condition(cond_str) - } else { - false - }; - if_stack.push(active); - continue; - } - - // If we're inside a false .if block, skip this line - if !if_stack.last().copied().unwrap_or(true) { - continue; - } - - // Handle ';' as statement separator (GAS syntax). - // Split the line on ';' and parse each part independently. - let parts = asm_preprocess::split_on_semicolons(line); - for part in parts { - let part = part.trim(); - if part.is_empty() { - continue; - } - match parse_line(part) { - Ok(stmts) => statements.extend(stmts), - Err(e) => return Err(format!("Line {}: {}: '{}'", line_num + 1, e, part)), - } - } - } - Ok(statements) -} - -/// Convenience wrapper: strip line comment using RISC-V comment style. 
-fn strip_comment(line: &str) -> &str { - asm_preprocess::strip_comment(line, &COMMENT_STYLE) -} - -fn parse_line(line: &str) -> Result, String> { - // Check for label definition (name:) - // Labels can be at the start of the line, possibly followed by an instruction - if let Some(colon_pos) = line.find(':') { - let potential_label = line[..colon_pos].trim(); - // Verify it looks like a valid label (no spaces before colon, alphanumeric + _ + .) - if !potential_label.is_empty() - && !potential_label.contains(' ') - && !potential_label.contains('\t') - && (!potential_label.starts_with('.') - || potential_label.starts_with(".L") - || potential_label.starts_with(".l")) - { - // Make sure this isn't a directive - if !potential_label.starts_with('.') - || potential_label.starts_with(".L") - || potential_label.starts_with(".l") - { - let mut result = vec![AsmStatement::Label(potential_label.to_string())]; - // Check for instruction/directive after the label on the same line - let rest = line[colon_pos + 1..].trim(); - if !rest.is_empty() { - result.extend(parse_line(rest)?); - } - return Ok(result); - } - } - } - - let trimmed = line.trim(); - - // Handle quoted instructions from macro expansion (e.g. "nop" or "j label") - if trimmed.starts_with('"') && trimmed.ends_with('"') && trimmed.len() >= 2 { - let inner = trimmed[1..trimmed.len() - 1].trim(); - if inner.is_empty() { - return Ok(vec![AsmStatement::Empty]); - } - return parse_line(inner); - } - - // Directive: starts with . - if trimmed.starts_with('.') { - return Ok(vec![parse_directive(trimmed)?]); - } - - // Instruction - Ok(vec![parse_instruction(trimmed)?]) -} - -/// Split a directive line into name and args, then dispatch to typed parsing. 
-fn parse_directive(line: &str) -> Result { - let (name, args) = if let Some(space_pos) = line.find([' ', '\t']) { - let name = &line[..space_pos]; - let args = line[space_pos..].trim(); - (name, args) - } else { - (line, "") - }; - - let directive = match name { - ".section" => { - let info = parse_section_args(args); - Directive::Section(info) - } - ".text" => Directive::Text, - ".data" => Directive::Data, - ".bss" => Directive::Bss, - ".rodata" => Directive::Rodata, - - ".globl" | ".global" => Directive::Globl(args.trim().to_string()), - ".weak" => Directive::Weak(args.trim().to_string()), - ".hidden" => Directive::SymVisibility(args.trim().to_string(), Visibility::Hidden), - ".protected" => Directive::SymVisibility(args.trim().to_string(), Visibility::Protected), - ".internal" => Directive::SymVisibility(args.trim().to_string(), Visibility::Internal), - - ".type" => parse_type_directive(args), - - ".size" => parse_size_directive(args), - - ".align" | ".p2align" => { - let val: u64 = args.trim().split(',').next() - .and_then(|s| parse_int_literal(s.trim()).ok()) - .unwrap_or(0) as u64; - Directive::Align(val) - } - ".balign" => { - let val: u64 = args.trim().split(',').next() - .and_then(|s| parse_int_literal(s.trim()).ok()) - .unwrap_or(1) as u64; - Directive::Balign(val) - } - - ".byte" => { - let values = parse_data_values(args)?; - Directive::Byte(values) - } - ".short" | ".hword" | ".2byte" | ".half" => { - let values = parse_data_values(args)?; - Directive::Short(values) - } - ".long" | ".4byte" | ".word" | ".int" => { - let values = parse_data_values(args)?; - Directive::Long(values) - } - ".quad" | ".8byte" | ".xword" | ".dword" => { - let values = parse_data_values(args)?; - Directive::Quad(values) - } - - ".zero" | ".space" => { - let parts: Vec<&str> = args.trim().split(',').collect(); - let size: usize = parse_int_literal(parts[0].trim()) - .map_err(|_| format!("invalid .zero size: {}", args))? 
as usize; - let fill: u8 = if parts.len() > 1 { - parse_data_value_int(parts[1].trim())? as u8 - } else { - 0 - }; - Directive::Zero { size, fill } - } - ".fill" => { - // .fill repeat, size, value - let parts: Vec<&str> = args.splitn(3, ',').collect(); - let repeat = parse_int_literal(parts[0].trim()) - .map_err(|_| format!("bad .fill repeat: {}", parts[0].trim()))? as u64; - let size = if parts.len() > 1 { - parse_int_literal(parts[1].trim()) - .map_err(|_| format!("bad .fill size: {}", parts[1].trim()))? as u64 - } else { - 1 - }; - let value = if parts.len() > 2 { - parse_int_literal(parts[2].trim()) - .map_err(|_| format!("bad .fill value: {}", parts[2].trim()))? as u64 - } else { - 0 - }; - let total_bytes = (repeat * size.min(8)) as usize; - if value == 0 { - Directive::Zero { size: total_bytes, fill: 0 } - } else { - let mut data = Vec::with_capacity(total_bytes); - let value_bytes = value.to_le_bytes(); - for _ in 0..repeat { - for j in 0..size.min(8) as usize { - data.push(value_bytes[j]); - } - } - Directive::Ascii(data) - } - } - - ".asciz" | ".string" => { - let s = elf::parse_string_literal(args)?; - Directive::Asciz(s) - } - ".ascii" => { - let s = elf::parse_string_literal(args)?; - Directive::Ascii(s) - } - - ".comm" => { - let parts: Vec<&str> = args.split(',').collect(); - let sym = if !parts.is_empty() { parts[0].trim().to_string() } else { String::new() }; - let size: u64 = if parts.len() >= 2 { parts[1].trim().parse().unwrap_or(0) } else { 0 }; - let align: u64 = if parts.len() > 2 { parts[2].trim().parse().unwrap_or(1) } else { 1 }; - Directive::Comm { sym, size, align } - } - - ".local" => Directive::Local(args.trim().to_string()), - - ".set" | ".equ" => { - let parts: Vec<&str> = args.splitn(2, ',').collect(); - let sym = if !parts.is_empty() { parts[0].trim().to_string() } else { String::new() }; - let val = if parts.len() > 1 { parts[1].trim().to_string() } else { String::new() }; - Directive::Set(sym, val) - } - ".symver" => { - // 
.symver name, alias@@VERSION -> treat as Set(alias, name) for default version - let parts: Vec<&str> = args.splitn(2, ',').collect(); - if parts.len() == 2 { - let name = parts[0].trim(); - let ver_string = parts[1].trim(); - if let Some(at_pos) = ver_string.find('@') { - let alias = &ver_string[..at_pos]; - if !alias.is_empty() { - Directive::Set(alias.to_string(), name.to_string()) - } else { - Directive::Ignored - } - } else { - Directive::Ignored - } - } else { - Directive::Ignored - } - } - - ".option" => Directive::ArchOption(args.to_string()), - ".attribute" => Directive::Attribute(args.to_string()), - ".insn" => Directive::Insn(args.to_string()), - - // CFI directives - ".cfi_startproc" | ".cfi_endproc" | ".cfi_def_cfa_offset" - | ".cfi_offset" | ".cfi_def_cfa_register" | ".cfi_restore" - | ".cfi_remember_state" | ".cfi_restore_state" - | ".cfi_adjust_cfa_offset" | ".cfi_def_cfa" - | ".cfi_sections" | ".cfi_personality" | ".cfi_lsda" - | ".cfi_rel_offset" | ".cfi_register" | ".cfi_return_column" - | ".cfi_undefined" | ".cfi_same_value" | ".cfi_escape" - | ".cfi_signal_frame" => Directive::Cfi, - - ".pushsection" => { - Directive::PushSection(parse_section_args(args)) - } - ".popsection" => Directive::PopSection, - ".previous" => Directive::Previous, - - ".org" => { - // .org expressions like ". - (X) + (Y)" are used as size assertions - // in kernel alternative macros. Silently ignore them. 
- Directive::Ignored - } - - ".incbin" => { - let parts: Vec<&str> = args.splitn(3, ',').collect(); - let path = elf::parse_string_literal(parts[0].trim())?; - let path = String::from_utf8(path).map_err(|_| "invalid .incbin path".to_string())?; - let skip = if parts.len() > 1 { - parse_int_literal(parts[1].trim()).unwrap_or(0) as u64 - } else { 0 }; - let count = if parts.len() > 2 { - Some(parse_int_literal(parts[2].trim()).unwrap_or(0) as u64) - } else { None }; - Directive::Incbin { path, skip, count } - } - - ".subsection" => { - let n = parse_int_literal(args.trim()).unwrap_or(0) as u64; - Directive::Subsection(n) - } - - // Other ignorable directives - ".file" | ".loc" | ".ident" | ".addrsig" | ".addrsig_sym" - | ".build_attributes" | ".eabi_attribute" | ".end" - | ".altmacro" | ".noaltmacro" - | ".purgem" => Directive::Ignored, - - _ => Directive::Unknown { - name: name.to_string(), - args: args.to_string(), - }, - }; - - Ok(AsmStatement::Directive(directive)) -} - -/// Parse `.section name, "flags", @type` arguments. -fn parse_section_args(args: &str) -> SectionInfo { - let parts: Vec<&str> = args.split(',').collect(); - let name = parts[0].trim().trim_matches('"').to_string(); - let flags_explicit = parts.len() > 1; - let flags = if flags_explicit { - parts[1].trim().trim_matches('"').to_string() - } else { - String::new() - }; - let sec_type = if parts.len() > 2 { - parts[2].trim().to_string() - } else { - // Default type based on section name - if name == ".bss" || name.starts_with(".bss.") || name.starts_with(".tbss") { - "@nobits".to_string() - } else { - "@progbits".to_string() - } - }; - SectionInfo { name, flags, sec_type, flags_explicit } -} - -/// Parse `.type sym, @function` etc. 
-fn parse_type_directive(args: &str) -> Directive { - let parts: Vec<&str> = args.splitn(2, ',').collect(); - if parts.len() == 2 { - let sym = parts[0].trim().to_string(); - let ty = parts[1].trim(); - let st = match ty { - "%function" | "@function" => SymbolType::Function, - "%object" | "@object" => SymbolType::Object, - "@tls_object" => SymbolType::TlsObject, - _ => SymbolType::NoType, - }; - Directive::Type(sym, st) - } else { - // Malformed .type directive — treat as no-type - Directive::Type(args.trim().to_string(), SymbolType::NoType) - } -} - -/// Parse `.size sym, expr`. -fn parse_size_directive(args: &str) -> Directive { - let parts: Vec<&str> = args.splitn(2, ',').collect(); - if parts.len() == 2 { - let sym = parts[0].trim().to_string(); - let size_expr = parts[1].trim(); - if let Some(label) = size_expr.strip_prefix(".-") { - Directive::Size(sym, SizeExpr::CurrentMinus(label.to_string())) - } else if let Ok(size) = size_expr.parse::() { - Directive::Size(sym, SizeExpr::Absolute(size)) - } else { - // Can't parse — use 0 - Directive::Size(sym, SizeExpr::Absolute(0)) - } - } else { - // Malformed .size directive - Directive::Size(args.trim().to_string(), SizeExpr::Absolute(0)) - } -} - -/// Parse a comma-separated list of data values for .byte/.short/.long/.quad. -/// Each value can be an integer, a symbol reference, or a symbol difference. -fn parse_data_values(args: &str) -> Result, String> { - let mut values = Vec::new(); - for part in args.split(',') { - let trimmed = part.trim(); - if trimmed.is_empty() { - continue; - } - values.push(parse_single_data_value(trimmed)?); - } - Ok(values) -} - -/// Check if a string looks like a GNU numeric label reference (e.g. "2f", "1b", "42f"). -fn is_numeric_label_ref(s: &str) -> bool { - asm_preprocess::is_numeric_label_ref(s) -} - -/// Strip balanced outer parentheses from an expression. -/// E.g. "((1b) - .)" -> "(1b) - ." -> calls recursively until no outer parens. 
-fn strip_outer_parens(s: &str) -> &str { - let s = s.trim(); - if !s.starts_with('(') || !s.ends_with(')') { - return s; - } - // Check if the outer parens are balanced (the open paren at 0 matches the close at end) - let inner = &s[1..s.len() - 1]; - let mut depth = 0i32; - for ch in inner.bytes() { - match ch { - b'(' => depth += 1, - b')' => { - depth -= 1; - if depth < 0 { - return s; // Close paren inside doesn't match, outer parens aren't a pair - } - } - _ => {} - } - } - if depth == 0 { - strip_outer_parens(inner) - } else { - s - } -} - -/// Parse a single data value: integer, symbol, symbol+offset, or symbol_diff. -fn parse_single_data_value(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Ok(DataValue::Integer(0)); - } - - // Strip balanced outer parentheses (GCC wraps inline asm operands in parens, - // e.g. "((1b) - .)" for numeric label diff expressions) - let s = strip_outer_parens(s); - - // First check if this could be a symbol difference: A - B - // The first char must be a symbol-start character (letter, _, .), a digit - // (for numeric label references like "2f - 1f"), or '(' for parenthesized symbols. - // However, if there are extra operators (/, *, <<, >>), this is a complex - // expression and should go through the Expression path instead. 
- let first = s.chars().next().unwrap(); - let is_sym_start = first.is_ascii_alphabetic() || first == '_' || first == '.'; - let could_be_numeric_ref = first.is_ascii_digit(); - let starts_with_paren = first == '('; - let has_complex_ops = s.contains('/') || s.contains('*') || s.contains("<<") || s.contains(">>"); - if !has_complex_ops && (is_sym_start || could_be_numeric_ref || starts_with_paren) { - // Try to find a symbol difference expression - if let Some(minus_pos) = find_symbol_diff_minus(s) { - let sym_a_raw = strip_outer_parens(s[..minus_pos].trim()); - let rest = s[minus_pos + 1..].trim(); - - // rest might be "B" or "B + offset" - let (sym_b_raw, addend) = if let Some(plus_pos) = rest.find('+') { - let b = rest[..plus_pos].trim(); - let add_str = rest[plus_pos + 1..].trim(); - let add_val: i64 = add_str.parse().unwrap_or(0); - (b, add_val) - } else { - (rest, 0i64) - }; - let sym_b = strip_outer_parens(sym_b_raw); - - // Verify sym_b looks like a symbol (or a numeric label ref) - if !sym_b.is_empty() { - let b_first = sym_b.chars().next().unwrap(); - let b_is_sym = b_first.is_ascii_alphabetic() || b_first == '_' || b_first == '.'; - if b_is_sym || is_numeric_label_ref(sym_b) { - return Ok(DataValue::SymbolDiff { - sym_a: sym_a_raw.to_string(), - sym_b: sym_b.to_string(), - addend, - }); - } - } - } - - if is_sym_start { - // Not a symbol diff — parse as symbol reference with optional addend - let (sym, addend) = parse_symbol_addend(s); - return Ok(DataValue::Symbol { name: sym, addend }); - } else if could_be_numeric_ref && is_numeric_label_ref(s) { - // Standalone numeric label reference like "2f" or "1b" - return Ok(DataValue::Symbol { name: s.to_string(), addend: 0 }); - } else if starts_with_paren { - // Parenthesized single symbol, e.g. 
"(1b)" or "(.Lfoo)" - let inner = strip_outer_parens(s); - if inner != s { - return parse_single_data_value(inner); - } - } - } - - // Try to parse as constant+symbol (e.g., "0x80000000 + some_symbol") - // This handles cases where the constant comes before the symbol in an additive expression. - if could_be_numeric_ref || first == '(' { - for (i, c) in s.char_indices().skip(1) { - if c == '+' { - let left = s[..i].trim(); - let right = s[i + 1..].trim(); - if let Ok(offset) = parse_data_value_int(left) { - // Check if right side looks like a symbol - let r_first = right.chars().next().unwrap_or('\0'); - if r_first.is_ascii_alphabetic() || r_first == '_' || r_first == '.' { - let (sym, extra_addend) = parse_symbol_addend(right); - return Ok(DataValue::Symbol { - name: sym, - addend: offset + extra_addend, - }); - } - } - } - } - } - - // Try to parse as integer; if it fails and contains symbol-like chars, - // store as a raw expression for alias resolution at emit time. - match parse_data_value_int(s) { - Ok(v) => Ok(DataValue::Integer(v)), - Err(_) => Ok(DataValue::Expression(s.to_string())), - } -} - -/// Parse a data value as a plain integer (used by .byte, .zero fill, etc). -fn parse_data_value_int(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Ok(0); - } - asm_expr::parse_integer_expr(s) -} - -/// Find the position of the '-' operator in a symbol difference expression. 
fn find_symbol_diff_minus(expr: &str) -> Option<usize> {
    // Delegates to the shared preprocessor helper; returns a byte index into `expr`.
    asm_preprocess::find_symbol_diff_minus(expr)
}

/// Split "sym", "sym + off" or "sym-off" into (symbol, addend).
/// Unparsable offsets fall back to 0; a leading '-' is never treated as a split point.
fn parse_symbol_addend(s: &str) -> (String, i64) {
    if let Some(plus_pos) = s.find('+') {
        let sym = s[..plus_pos].trim().to_string();
        // NOTE(review): decimal-only addend — "sym + 0x10" yields 0; confirm intended.
        let off: i64 = s[plus_pos + 1..].trim().parse().unwrap_or(0);
        (sym, off)
    } else if let Some(minus_pos) = s.find('-') {
        if minus_pos > 0 {
            let sym = s[..minus_pos].trim().to_string();
            // Keep the '-' so the parse yields a negative addend.
            let off_str = &s[minus_pos..];
            let off: i64 = off_str.parse().unwrap_or(0);
            (sym, off)
        } else {
            (s.to_string(), 0)
        }
    } else {
        (s.to_string(), 0)
    }
}

fn parse_instruction(line: &str) -> Result<AsmStatement, String> {
    // Split mnemonic from operands at the first space or tab.
    let (mnemonic, operands_str) = if let Some(space_pos) = line.find([' ', '\t']) {
        (&line[..space_pos], line[space_pos..].trim())
    } else {
        (line, "")
    };

    // Mnemonics are matched case-insensitively downstream.
    let mnemonic = mnemonic.to_lowercase();
    let operands = parse_operands(operands_str, &mnemonic)?;

    Ok(AsmStatement::Instruction {
        mnemonic,
        operands,
        raw_operands: operands_str.to_string(),
    })
}

/// Determine which operand positions for a given instruction mnemonic must be
/// parsed as symbols rather than registers. This prevents function/variable
/// names that happen to match register names (e.g., `f1`, `a0`, `ra`, `s1`)
/// from being misclassified.
fn symbol_operand_mask(mnemonic: &str) -> u8 {
    match mnemonic {
        // call — operand 0 is always a symbol
        "call" | "tail" => 0b0000_0001,
        // la/lla rd, symbol — operand 1 is always a symbol
        "la" | "lla" => 0b0000_0010,
        // jump