Skip to content

Commit 34a42c3

Browse files
authored
Merge pull request #2151 from riscv-software-src/speed-up-fetch
Further speed up instruction fetch
2 parents 8697fdd + fa368ff commit 34a42c3

File tree

12 files changed

+108
-48
lines changed

12 files changed

+108
-48
lines changed

riscv/common.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,16 @@
1919
# define UNUSED
2020
#endif
2121

22+
// Feature-test fallbacks. Not every preprocessor provides __has_builtin or
// __has_cpp_attribute; defining both to 0 when absent keeps the #if chain
// below well-formed and makes it fall through to the no-op definition.
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

#ifndef __has_cpp_attribute
# define __has_cpp_attribute(x) 0
#endif

// assume(x): optimizer hint that condition x holds at this point.
// Expands to the [[assume]] attribute when the compiler reports it, else to
// __builtin_assume when that builtin exists, else to a no-op that discards x
// without evaluating it. Callers must therefore not rely on side effects of x.
#if __has_cpp_attribute(assume)
# define assume(x) [[assume(x)]]
#elif __has_builtin(__builtin_assume)
# define assume(x) __builtin_assume(x)
#else
# define assume(x) ((void) 0)
#endif
33+
2234
#endif

riscv/csrs.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,8 @@ bool misa_csr_t::unlogged_write(const reg_t val) noexcept {
773773
}
774774
}
775775

776+
proc->get_mmu()->flush_tlb();
777+
776778
return basic_csr_t::unlogged_write(new_misa);
777779
}
778780

riscv/decode_macros.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,8 @@ static inline bool is_aligned(const unsigned val, const unsigned pos)
225225
#define zext_xlen(x) zext(x, xlen)
226226

227227
#define set_pc(x) \
228-
do { p->check_pc_alignment(x); \
228+
do { if (unlikely((x) & ~p->pc_alignment_mask())) \
229+
return p->throw_instruction_address_misaligned(x); \
229230
npc = sext_xlen(x); \
230231
} while (0)
231232

riscv/execute.cc

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -229,19 +229,19 @@ void processor_t::step(size_t n)
229229
state.prv_changed = false;
230230
state.v_changed = false;
231231

232-
#define advance_pc() \
232+
#define advance_pc() { \
233233
if (unlikely(invalid_pc(pc))) { \
234234
switch (pc) { \
235235
case PC_SERIALIZE_BEFORE: state.serialized = true; break; \
236236
case PC_SERIALIZE_AFTER: ++instret; break; \
237237
default: abort(); \
238238
} \
239239
pc = state.pc; \
240-
break; \
240+
goto serialize; \
241241
} else { \
242242
state.pc = pc; \
243243
instret++; \
244-
}
244+
}}
245245

246246
try
247247
{
@@ -302,19 +302,21 @@ void processor_t::step(size_t n)
302302
else while (instret < n)
303303
{
304304
// Main simulation loop, fast path.
305-
for (auto ic_entry = _mmu->access_icache(pc); ; ) {
305+
for (auto ic_entry = _mmu->access_icache(pc); instret < n; instret++) {
306306
auto fetch = ic_entry->data;
307-
pc = execute_insn_fast(this, pc, fetch);
308-
ic_entry = &_mmu->icache[_mmu->icache_index(pc)];
309-
if (unlikely(ic_entry->tag != pc))
310-
break;
311-
if (unlikely(instret + 1 == n))
312-
break;
313-
instret++;
314-
state.pc = pc;
307+
ic_entry = ic_entry->next;
308+
auto new_pc = execute_insn_fast(this, pc, fetch);
309+
if (unlikely(ic_entry->tag != new_pc)) {
310+
ic_entry = &_mmu->icache[_mmu->icache_index(new_pc)];
311+
_mmu->icache[_mmu->icache_index(pc)].next = ic_entry;
312+
if (ic_entry->tag != new_pc) {
313+
pc = new_pc;
314+
advance_pc();
315+
break;
316+
}
317+
}
318+
state.pc = pc = ic_entry->tag;
315319
}
316-
317-
advance_pc();
318320
}
319321
}
320322
catch(trap_t& t)
@@ -361,6 +363,7 @@ void processor_t::step(size_t n)
361363
in_wfi = true;
362364
}
363365

366+
serialize:
364367
state.minstret->bump((state.mcountinhibit->read() & MCOUNTINHIBIT_IR) ? 0 : instret);
365368

366369
// Model a hart whose CPI is 1.

riscv/insn_template.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
#define DECODE_MACRO_USAGE_LOGGED 0
77

88
#define PROLOGUE \
9-
reg_t npc = sext_xlen(pc + insn_length(OPCODE))
9+
reg_t npc = sext_xlen(pc + insn_length(OPCODE)); \
10+
if (!p->extension_enabled(EXT_ZCA)) assume(insn_length(OPCODE) % 4 == 0)
1011

1112
#define EPILOGUE \
1213
trace_opcode(p, OPCODE, insn); \

riscv/insns/c_add.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
require_extension(EXT_ZCA);
2-
require(insn.rvc_rs2() != 0);
32
WRITE_RD(sext_xlen(RVC_RS1 + RVC_RS2));

riscv/insns/c_jalr.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
require_extension(EXT_ZCA);
2-
require(insn.rvc_rs1() != 0);
32
reg_t tmp = npc;
43
set_pc(RVC_RS1 & ~reg_t(1));
54
WRITE_REG(X_RA, tmp);

riscv/insns/c_mv.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
require_extension(EXT_ZCA);
2-
require(insn.rvc_rs2() != 0);
32
WRITE_RD(RVC_RS2);

riscv/mmu.cc

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ void mmu_t::flush_tlb()
3939
memset(tlb_insn, -1, sizeof(tlb_insn));
4040
memset(tlb_load, -1, sizeof(tlb_load));
4141
memset(tlb_store, -1, sizeof(tlb_store));
42+
memset(pte_cache, -1, sizeof(pte_cache));
4243

4344
flush_icache();
4445
}
@@ -106,7 +107,9 @@ mmu_t::insn_parcel_t mmu_t::fetch_slow_path(reg_t vaddr)
106107

107108
auto [tlb_hit, host_addr, paddr] = access_tlb(tlb_insn, vaddr, TLB_FLAGS);
108109
auto access_info = generate_access_info(vaddr, FETCH, {});
109-
check_triggers(triggers::OPERATION_EXECUTE, vaddr, access_info.effective_virt);
110+
111+
if (check_triggers_fetch)
112+
check_triggers(triggers::OPERATION_EXECUTE, vaddr, access_info.effective_virt);
110113

111114
if (!tlb_hit) {
112115
paddr = translate(access_info, sizeof(insn_parcel_t));
@@ -125,7 +128,8 @@ mmu_t::insn_parcel_t mmu_t::fetch_slow_path(reg_t vaddr)
125128

126129
auto res = perform_intrapage_fetch(vaddr, host_addr, paddr);
127130

128-
check_triggers(triggers::OPERATION_EXECUTE, vaddr, access_info.effective_virt, from_le(res));
131+
if (check_triggers_fetch)
132+
check_triggers(triggers::OPERATION_EXECUTE, vaddr, access_info.effective_virt, from_le(res));
129133

130134
return res;
131135
}
@@ -248,16 +252,17 @@ void mmu_t::load_slow_path_intrapage(reg_t len, uint8_t* bytes, mem_access_info_
248252
{
249253
reg_t vaddr = access_info.vaddr;
250254
auto [tlb_hit, host_addr, paddr] = access_tlb(tlb_load, vaddr, TLB_FLAGS);
251-
if (!tlb_hit || access_info.flags.is_special_access()) {
255+
bool special = access_info.flags.is_special_access() && !access_info.flags.lr;
256+
if (!tlb_hit || special) {
252257
paddr = translate(access_info, len);
253258
host_addr = (uintptr_t)sim->addr_to_mem(paddr);
254259

255-
if (!access_info.flags.is_special_access())
260+
if (!special)
256261
refill_tlb(vaddr, paddr, (char*)host_addr, LOAD);
262+
}
257263

258-
if (access_info.flags.lr && !sim->reservable(paddr)) {
259-
throw trap_load_access_fault(access_info.effective_virt, access_info.transformed_vaddr, 0, 0);
260-
}
264+
if (access_info.flags.lr && !sim->reservable(paddr)) {
265+
throw trap_load_access_fault(access_info.effective_virt, access_info.transformed_vaddr, 0, 0);
261266
}
262267

263268
perform_intrapage_load(vaddr, host_addr, paddr, len, bytes, access_info.flags);
@@ -282,7 +287,9 @@ void mmu_t::load_slow_path(reg_t original_addr, reg_t len, uint8_t* bytes, xlate
282287

283288
auto access_info = generate_access_info(original_addr, LOAD, xlate_flags);
284289
reg_t transformed_addr = access_info.transformed_vaddr;
285-
check_triggers(triggers::OPERATION_LOAD, transformed_addr, access_info.effective_virt);
290+
291+
if (check_triggers_load)
292+
check_triggers(triggers::OPERATION_LOAD, transformed_addr, access_info.effective_virt);
286293

287294
if ((transformed_addr & (len - 1)) == 0) {
288295
load_slow_path_intrapage(len, bytes, access_info);
@@ -302,12 +309,14 @@ void mmu_t::load_slow_path(reg_t original_addr, reg_t len, uint8_t* bytes, xlate
302309
}
303310
}
304311

305-
while (len > sizeof(reg_t)) {
306-
check_triggers(triggers::OPERATION_LOAD, transformed_addr, access_info.effective_virt, reg_from_bytes(sizeof(reg_t), bytes));
307-
len -= sizeof(reg_t);
308-
bytes += sizeof(reg_t);
312+
if (check_triggers_load) {
313+
while (len > sizeof(reg_t)) {
314+
check_triggers(triggers::OPERATION_LOAD, transformed_addr, access_info.effective_virt, reg_from_bytes(sizeof(reg_t), bytes));
315+
len -= sizeof(reg_t);
316+
bytes += sizeof(reg_t);
317+
}
318+
check_triggers(triggers::OPERATION_LOAD, transformed_addr, access_info.effective_virt, reg_from_bytes(len, bytes));
309319
}
310-
check_triggers(triggers::OPERATION_LOAD, transformed_addr, access_info.effective_virt, reg_from_bytes(len, bytes));
311320

312321
if (proc && unlikely(proc->get_log_commits_enabled()))
313322
proc->state.log_mem_read.push_back(std::make_tuple(original_addr, 0, len));
@@ -367,7 +376,8 @@ void mmu_t::store_slow_path(reg_t original_addr, reg_t len, const uint8_t* bytes
367376

368377
auto access_info = generate_access_info(original_addr, STORE, xlate_flags);
369378
reg_t transformed_addr = access_info.transformed_vaddr;
370-
if (actually_store) {
379+
380+
if (actually_store && check_triggers_store) {
371381
reg_t trig_len = len;
372382
const uint8_t* trig_bytes = bytes;
373383
while (trig_len > sizeof(reg_t)) {

riscv/mmu.h

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ struct insn_fetch_t
4343

4444
struct icache_entry_t { // One slot of the MMU's instruction cache (mmu_t::icache).
4545
reg_t tag; // Address whose instruction this slot holds; the fetch loop compares it against the pc to detect a hit.
46+
icache_entry_t* next; // Slot expected to hold the following instruction: set to the slot for addr + length on refill, and re-pointed by the fetch loop when the actual next pc maps to a different slot.
4647
insn_fetch_t data; // Fetched instruction for `tag`, filled in on refill from the decoded instruction.
4748
};
4849

@@ -56,6 +57,11 @@ struct dtlb_entry_t {
5657
reg_t tag;
5758
};
5859

60+
struct pte_cache_entry_t { // One slot of mmu_t::pte_cache, which maps a PTE's physical address to its value.
61+
reg_t paddr; // Lookup key: physical address of the cached PTE (all-ones after flush_tlb()'s memset to -1).
62+
reg_t pte; // PTE value returned by pte_cache_access() when `paddr` matches the requested key.
63+
};
64+
5965
struct xlate_flags_t {
6066
const bool forced_virt : 1 {false};
6167
const bool hlvx : 1 {false};
@@ -129,7 +135,7 @@ class mmu_t
129135
T ss_load(reg_t addr) {
130136
if ((addr & (sizeof(T) - 1)) != 0)
131137
throw trap_store_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
132-
return load<T>(addr, {.forced_virt=false, .hlvx=false, .lr=false, .ss_access=true});
138+
return load<T>(addr, {.ss_access=true});
133139
}
134140

135141
template<typename T>
@@ -156,7 +162,7 @@ class mmu_t
156162
void ss_store(reg_t addr, T val) {
157163
if ((addr & (sizeof(T) - 1)) != 0)
158164
throw trap_store_access_fault((proc) ? proc->state.v : false, addr, 0, 0);
159-
store<T>(addr, val, {.forced_virt=false, .hlvx=false, .lr=false, .ss_access=true});
165+
store<T>(addr, val, {.ss_access=true});
160166
}
161167

162168
// AMO/Zicbom faults should be reported as store faults
@@ -188,13 +194,9 @@ class mmu_t
188194
// for shadow stack amoswap
189195
template<typename T>
190196
T ssamoswap(reg_t addr, reg_t value) {
191-
bool forced_virt = false;
192-
bool hlvx = false;
193-
bool lr = false;
194-
bool ss_access = true;
195-
store_slow_path(addr, sizeof(T), nullptr, {forced_virt, hlvx, lr, ss_access}, false, true);
196-
auto data = load<T>(addr, {forced_virt, hlvx, lr, ss_access});
197-
store<T>(addr, value, {forced_virt, hlvx, lr, ss_access});
197+
store_slow_path(addr, sizeof(T), nullptr, {.ss_access=true}, false, true);
198+
auto data = load<T>(addr, {.ss_access=true});
199+
store<T>(addr, value, {.ss_access=true});
198200
return data;
199201
}
200202

@@ -272,7 +274,10 @@ class mmu_t
272274
store_slow_path(vaddr, size, nullptr, {}, false, true);
273275
}
274276

275-
reg_t paddr = translate(generate_access_info(vaddr, STORE, {}), 1);
277+
auto [tlb_hit, host_addr, paddr] = access_tlb(tlb_store, vaddr);
278+
if (!tlb_hit)
279+
paddr = translate(generate_access_info(vaddr, STORE, {}), 1);
280+
276281
if (sim->reservable(paddr))
277282
return load_reservation_address == paddr;
278283
else
@@ -321,6 +326,7 @@ class mmu_t
321326

322327
insn_fetch_t fetch = {proc->decode_insn(insn), insn};
323328
entry->tag = addr;
329+
entry->next = &icache[icache_index(addr + length)];
324330
entry->data = fetch;
325331

326332
auto [check_tracer, _, paddr] = access_tlb(tlb_insn, addr, TLB_FLAGS, TLB_CHECK_TRACER);
@@ -407,6 +413,9 @@ class mmu_t
407413
dtlb_entry_t tlb_store[TLB_ENTRIES];
408414
dtlb_entry_t tlb_insn[TLB_ENTRIES];
409415

416+
static const reg_t PTE_CACHE_ENTRIES = 251;
417+
pte_cache_entry_t pte_cache[PTE_CACHE_ENTRIES];
418+
410419
typedef bloom_filter_t<reg_t, simple_hash1, simple_hash2, TLB_ENTRIES * 16, 3> reverse_tags_t;
411420
reverse_tags_t tlb_store_reverse_tags;
412421
reverse_tags_t tlb_insn_reverse_tags;
@@ -463,6 +472,9 @@ class mmu_t
463472

464473
template<typename T> inline reg_t pte_load(reg_t pte_paddr, reg_t addr, bool virt, access_type trap_type)
465474
{
475+
if (auto [hit, pte] = pte_cache_access(pte_paddr); hit)
476+
return pte;
477+
466478
const size_t ptesize = sizeof(T);
467479

468480
if (!pmp_ok(pte_paddr, ptesize, LOAD, PRV_S, false))
@@ -475,7 +487,10 @@ class mmu_t
475487
} else if (!mmio_load(pte_paddr, ptesize, (uint8_t*)&target_pte)) {
476488
throw_access_exception(virt, addr, trap_type);
477489
}
478-
return from_target(target_pte);
490+
491+
auto res = from_target(target_pte);
492+
pte_cache_insert(pte_paddr, res);
493+
return res;
479494
}
480495

481496
template<typename T> inline void pte_store(reg_t pte_paddr, reg_t new_pte, reg_t addr, bool virt, access_type trap_type)
@@ -492,6 +507,20 @@ class mmu_t
492507
} else if (!mmio_store(pte_paddr, ptesize, (uint8_t*)&target_pte)) {
493508
throw_access_exception(virt, addr, trap_type);
494509
}
510+
511+
pte_cache_insert(pte_paddr, new_pte);
512+
}
513+
514+
// Looks up `key` (the physical address of a PTE) in pte_cache.
// The slot is chosen by key % PTE_CACHE_ENTRIES. Returns {hit, pte}, where
// `hit` tells whether the slot currently belongs to `key`; `pte` is the
// slot's stored value and is only meaningful when `hit` is true.
std::tuple<bool, reg_t> pte_cache_access(reg_t key)
{
  const auto& slot = pte_cache[key % PTE_CACHE_ENTRIES];
  const bool hit = (slot.paddr == key);
  return std::tuple<bool, reg_t>(hit, slot.pte);
}
519+
520+
// Records `value` as the cached PTE for the physical address `key`.
//
// Only PTEs with PTE_V set are cached. If an invalid PTE is supplied for a
// key that currently owns its slot, the slot is invalidated so that a later
// pte_cache_access() cannot return the stale, previously valid value.
// Slots owned by other keys are left untouched.
void pte_cache_insert(reg_t key, reg_t value)
{
  auto& slot = pte_cache[key % PTE_CACHE_ENTRIES];
  if (value & PTE_V) {
    slot = {key, value};
  } else if (slot.paddr == key) {
    // Same all-ones pattern that flush_tlb() writes with memset(..., -1, ...).
    slot = {~reg_t(0), ~reg_t(0)};
  }
}
496525

497526
inline insn_parcel_t fetch_insn_parcel(reg_t addr) {

0 commit comments

Comments
 (0)