From bd5f561610ae65133ba98f3f3a92ecf796ddb9b6 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Mon, 23 Feb 2026 15:34:18 +0000 Subject: [PATCH 1/2] [experimental] [AArch64] Reorder RTA functions by runtime call density On AArch64, non-smashable calls to the runtime (movz/movk/blr sequences) are significantly more expensive than on x86 due to instruction encoding overhead. This patch reorders functions during retranslateAll so that functions with the highest runtime call density (weighted by profile execution counts) are placed first in code.hot. The idea is that clustering call-heavy functions near the start of the TC improves icache locality for runtime call sequences. Controlled by two new config options: - Jit.RuntimeCallReorder (bool, default: true, AArch64 only) - Jit.RuntimeCallReorderLimitMB (uint32, default: 64) Diagnostic output available via TRACE=mcg:6. --- hphp/doc/configs.specification | 2 + hphp/runtime/vm/jit/mcgen-translate.cpp | 335 ++++++++++++++++++++++++ 2 files changed, 337 insertions(+) diff --git a/hphp/doc/configs.specification b/hphp/doc/configs.specification index a7e8e7df58aa4..9a2c086f6f9ae 100644 --- a/hphp/doc/configs.specification +++ b/hphp/doc/configs.specification @@ -1800,6 +1800,8 @@ The format can be found in hphp/tools/configs/generate_configs.rs - uint32_t Jit.RetranslateAllRequest, UNKNOWN - uint32_t Jit.RetranslateAllSeconds, UNKNOWN - bool Jit.RerunRetranslateAll = false, UNKNOWN +- bool Jit.RuntimeCallReorder = true, UNKNOWN +- uint32_t Jit.RuntimeCallReorderLimitMB = 64, UNKNOWN - bool Jit.BuildOutliningHashes = false, UNKNOWN - bool Jit.PGOLayoutSplitHotCold = true, UNKNOWN - bool Jit.PGOLayoutResplitFrozen = true, UNKNOWN diff --git a/hphp/runtime/vm/jit/mcgen-translate.cpp b/hphp/runtime/vm/jit/mcgen-translate.cpp index 0972790a5eb16..121ed1d6afa59 100644 --- a/hphp/runtime/vm/jit/mcgen-translate.cpp +++ b/hphp/runtime/vm/jit/mcgen-translate.cpp @@ -42,6 +42,8 @@ #include "hphp/runtime/base/vm-worker.h" 
#include "hphp/runtime/ext/server/ext_server.h" +#include "hphp/util/abi-cxx.h" +#include "hphp/util/arch.h" #include "hphp/util/boot-stats.h" #include "hphp/util/configs/jit.h" #include "hphp/util/job-queue.h" @@ -52,10 +54,16 @@ #include "hphp/util/roar.h" #include "hphp/util/trace.h" +#include "hphp/vixl/a64/instructions-a64.h" + #include "hphp/zend/zend-strtod.h" #include +#include +#include +#include + TRACE_SET_MOD(mcg) namespace HPHP::jit::mcgen { @@ -372,6 +380,326 @@ void scheduleSerializeOptProf() { } } +/* + * Per-function runtime call density info, computed by scanning the generated + * machine code for runtime (non-JIT) call sites and weighing them by profile + * block execution counts. + * + * density = weighted_runtime_calls / hot_bytes + * where weighted_runtime_calls = sum over all call sites of max(blockWeight, 0) + */ +struct FuncRuntimeCallDensity { + size_t hotBytes{0}; + int64_t weightedCalls{0}; // sum of block weights for runtime call sites + size_t numCalls{0}; // total number of runtime call sites + double density() const { + return hotBytes > 0 ? (double)weightedCalls / (double)hotBytes : 0.0; + } +}; + +struct RuntimeCallSiteInfo { + TCA callAddr; // fake TC address of the movz instruction + uint64_t target; // absolute address being called + std::string symbol; // resolved symbol name + int64_t blockWeight; // profile count of enclosing block + SrcKey blockSk; // SrcKey of enclosing block +}; + +/* + * Scan a single FuncMetaInfo for runtime (non-JIT) call sites. + * Returns per-call-site details and aggregate density info. 
+ */ +std::pair, FuncRuntimeCallDensity> +scanRuntimeCalls(tc::FuncMetaInfo& info, const ProfData* pd) { + using namespace vixl; + + std::vector runtimeCalls; + FuncRuntimeCallDensity density; + + auto const func = info.func; + if (!func) return {runtimeCalls, density}; + + auto view = info.tcBuf.view(); + if (!view) return {runtimeCalls, density}; + + for (auto const& translator : info.translators) { + if (!translator->translateSuccess()) continue; + + auto const range = translator->range(); + auto const mainBegin = range.main.begin(); + auto const mainEnd = range.main.end(); + if (!mainBegin || !mainEnd || mainBegin >= mainEnd) { + continue; + } + + density.hotBytes += (mainEnd - mainBegin); + + // Get the region for block weight lookup. + auto* rt = dynamic_cast(translator.get()); + auto const& meta = translator->meta(); + + // Scan addressImmediates entries within this translation's main range + // for movz/movk/blr sequences targeting runtime (non-JIT) addresses. + for (auto aiAddr : meta.addressImmediates) { + // Skip entries outside this translation's main range. + if (aiAddr < mainBegin || aiAddr >= mainEnd) continue; + + // Convert fake TC address to real (readable) memory. + auto const realAddr = view->main().toDestAddress(aiAddr); + auto const realEnd = view->main().toDestAddress(mainEnd); + auto const inst = Instruction::Cast(realAddr); + auto const instEnd = Instruction::Cast(realEnd); + + uint64_t target = 0; + uint32_t rd = 0; + auto const seqLen = [&]() -> size_t { + // Inline decode of movz/movk sequence (same logic as + // decodePossibleMovSequence in relocation-arm.cpp). 
+ if (inst->IsMovz()) { + target = (uint64_t)inst->ImmMoveWide() + << (16 * inst->ShiftMoveWide()); + } else if (inst->IsMovn()) { + target = ~((uint64_t)inst->ImmMoveWide() + << (16 * inst->ShiftMoveWide())); + if (!inst->SixtyFourBits()) { + target &= (1UL << 32) - 1; + } + } else if (inst->IsLogicalImmediate() && + inst->Rn() == kZeroRegCode) { + target = inst->ImmLogical(); + } else { + return 0; + } + size_t length = 1; + rd = inst->Rd(); + auto next = inst->NextInstruction(); + while ((next < instEnd && next->IsMovk() && next->Rd() == rd) || + (next < instEnd && next->IsNop())) { + if (next->IsMovk()) { + auto const shift = 16 * next->ShiftMoveWide(); + auto const mask = 0xffffLL << shift; + target &= ~mask; + target |= (uint64_t)next->ImmMoveWide() << shift; + } + length++; + next = next->NextInstruction(); + } + return length; + }(); + + if (seqLen == 0) continue; + + // Check if the instruction after the mov sequence is BLR with + // the same register. + auto const afterSeq = inst + seqLen * kInstructionSize; + if (afterSeq >= instEnd) continue; + if (!afterSeq->IsUncondBranchReg()) continue; + if (afterSeq->Rn() != rd) continue; + // Only count BLR (calls), not BR (jumps). + if (afterSeq->Mask(UnconditionalBranchToRegisterMask) != BLR) + continue; + + // Check if target is a runtime call (not JIT code). + if (tc::isValidCodeAddress(reinterpret_cast(target))) continue; + + // Resolve the symbol name. + auto symbol = getNativeFunctionName( + reinterpret_cast(target)); + if (symbol.empty()) { + std::ostringstream tmp; + tmp << "0x" << std::hex << target; + symbol = tmp.str(); + } + + // Look up block weight via bcMap. + int64_t blockWeight = -1; + SrcKey blockSk; + if (rt && rt->region) { + // Find the bcMap entry covering this TCA. + auto const& bcMap = meta.bcMap; + for (size_t i = 0; i < bcMap.size(); ++i) { + auto const entryStart = bcMap[i].aStart; + auto const entryEnd = (i + 1 < bcMap.size()) + ? 
bcMap[i + 1].aStart : mainEnd; + if (aiAddr >= entryStart && aiAddr < entryEnd) { + blockSk = bcMap[i].sk; + // Find the region block matching this SrcKey and get its + // profile execution count. + for (auto const& block : rt->region->blocks()) { + if (block->start() == blockSk || + (block->start().func() == blockSk.func() && + block->start().offset() <= blockSk.offset() && + blockSk.offset() <= block->last().offset())) { + auto const bid = block->profTransID(); + if (pd && bid != kInvalidTransID) { + blockWeight = pd->transCounter(bid); + } + break; + } + } + break; + } + } + } + + runtimeCalls.push_back( + RuntimeCallSiteInfo{aiAddr, target, std::move(symbol), + blockWeight, blockSk}); + + // Accumulate weighted count (use 0 for unknown weights). + density.numCalls++; + if (blockWeight > 0) { + density.weightedCalls += blockWeight; + } + } + } + + return {std::move(runtimeCalls), density}; +} + +/* + * Dump per-function runtime call info for all functions in jobs. + * For each function, scan the machine code for movz/movk/blr sequences + * (non-smashable calls emitted by vasm `call` on ARM) and report: + * - function name, hot bytes size, runtime call density + * - for each runtime call: the call-site TCA, target address + symbol name, + * and the weight (profile execution count) of the enclosing block. + * + * Gated by FTRACE level 6. + */ +void dumpRuntimeCallInfo(std::vector& jobs) { + auto const pd = globalProfData(); + + std::ostringstream os; + os << "=== Runtime Call Info (pre-relocation) ===\n"; + + int funcIdx = 0; + for (auto& info : jobs) { + auto const func = info.func; + if (!func) { funcIdx++; continue; } + + auto [runtimeCalls, densityInfo] = scanRuntimeCalls(info, pd); + + // Only print functions that have runtime calls. 
+ if (runtimeCalls.empty()) { funcIdx++; continue; } + + auto const funcName = func->fullName()->data(); + os << "\n func[" << funcIdx << "] " << funcName + << " hot_bytes=" << densityInfo.hotBytes + << " runtime_calls=" << runtimeCalls.size() + << " runtime_call_density=" + << std::fixed << std::setprecision(3) << densityInfo.density() + << " (" << densityInfo.weightedCalls + << " / " << densityInfo.hotBytes << ")\n"; + + for (auto const& ci : runtimeCalls) { + os << " call @ " << (void*)ci.callAddr + << " -> " << ci.symbol + << " weight=" << ci.blockWeight; + if (ci.blockSk.valid()) { + os << " block=" << showShort(ci.blockSk); + } + os << "\n"; + } + + funcIdx++; + } + + os << "\n=== End Runtime Call Info ===\n"; + FTRACE(6, "{}", os.str()); +} + +/* + * Reorder functions in `jobs` by runtime call density so that functions with + * the highest density (weighted_runtime_calls / hot_bytes) are placed first. + * + * The greedy algorithm sorts functions by density in descending order and + * promotes them to the front of the jobs vector until the cumulative hot bytes + * of promoted functions reaches a configurable limit. + * + * The limit is controlled by the Cfg::Jit::RuntimeCallReorderLimitMB + * config option (default: 64 MB). + * + * Functions that have no runtime calls or zero density retain their original + * relative order after the promoted functions. + */ +void reorderByRuntimeCallDensity(std::vector& jobs) { + // Wait for all worker threads to finish translating. + if (auto const disp = s_dispatcher.load(std::memory_order_acquire)) { + disp->waitEmpty(false); + } + + auto const limitMB = Cfg::Jit::RuntimeCallReorderLimitMB; + auto const limitBytes = static_cast(limitMB) * 1024UL * 1024UL; + + auto const pd = globalProfData(); + auto const n = jobs.size(); + + // Compute density for each function. 
+ struct IndexedDensity { + size_t idx; + FuncRuntimeCallDensity info; + }; + std::vector densities; + densities.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto [_, densityInfo] = scanRuntimeCalls(jobs[i], pd); + densities.push_back(IndexedDensity{i, densityInfo}); + } + + // Separate functions with positive density from the rest. + std::vector promoted; + std::vector remaining; + + // Sort candidates by density descending. + std::vector candidates; + for (size_t i = 0; i < n; ++i) { + if (densities[i].info.density() > 0.0) { + candidates.push_back(i); + } + } + std::sort(candidates.begin(), candidates.end(), + [&](size_t a, size_t b) { + return densities[a].info.density() > + densities[b].info.density(); + }); + + // Greedily select functions until we reach the size limit. + size_t cumulativeBytes = 0; + hphp_fast_set promotedSet; + for (auto idx : candidates) { + auto const hotBytes = densities[idx].info.hotBytes; + if (cumulativeBytes + hotBytes > limitBytes && !promoted.empty()) { + break; + } + promoted.push_back(idx); + promotedSet.insert(idx); + cumulativeBytes += hotBytes; + } + + // Collect remaining indices in original order. + for (size_t i = 0; i < n; ++i) { + if (promotedSet.count(i) == 0) { + remaining.push_back(i); + } + } + + // Build the reordered jobs vector. + std::vector reordered; + reordered.reserve(n); + for (auto idx : promoted) { + reordered.push_back(std::move(jobs[idx])); + } + for (auto idx : remaining) { + reordered.push_back(std::move(jobs[idx])); + } + jobs = std::move(reordered); + + FTRACE(4, "reorderByRuntimeCallDensity: promoted {} functions " + "({} bytes) to the front (limit {} MB)\n", + promoted.size(), cumulativeBytes, limitMB); +} + // GCC GCOV API extern "C" void __gcov_reset() __attribute__((__weak__)); // LLVM/clang API. 
See llvm-project/compiler-rt/lib/profile/InstrProfiling.h @@ -504,6 +832,13 @@ void retranslateAll(bool skipSerialize) { } // 6) Relocate the machine code into code.hot in the desired order + if (arch() == Arch::ARM && Cfg::Jit::RuntimeCallReorder) { + reorderByRuntimeCallDensity(jobs); + if (Trace::moduleEnabled(Trace::mcg, 6)) { + dumpRuntimeCallInfo(jobs); + } + } + tc::relocatePublishSortedOptFuncs(std::move(jobs)); if (auto const dispatcher = s_dispatcher.load(std::memory_order_acquire)) { From 9af06bdfe79ae01011b9d875ec745d8736b79821 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Fri, 27 Feb 2026 14:42:19 +0000 Subject: [PATCH 2/2] Fix assertion error --- hphp/runtime/vm/jit/mcgen-translate.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hphp/runtime/vm/jit/mcgen-translate.cpp b/hphp/runtime/vm/jit/mcgen-translate.cpp index 121ed1d6afa59..b1986a8aaf6cd 100644 --- a/hphp/runtime/vm/jit/mcgen-translate.cpp +++ b/hphp/runtime/vm/jit/mcgen-translate.cpp @@ -527,7 +527,8 @@ scanRuntimeCalls(tc::FuncMetaInfo& info, const ProfData* pd) { // profile execution count. for (auto const& block : rt->region->blocks()) { if (block->start() == blockSk || - (block->start().func() == blockSk.func() && + (!blockSk.prologue() && !blockSk.funcEntry() && + block->start().func() == blockSk.func() && block->start().offset() <= blockSk.offset() && blockSk.offset() <= block->last().offset())) { auto const bid = block->profTransID();