Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/chip/.yamllint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,15 @@ rules:
allowed-values: ["true", "false", "on", "off"]
comments:
min-spaces-from-content: 1
# Indentation stays enforced for hand-authored YAML. The chip tree also
# carries PyYAML-generated evidence reports whose default `safe_dump` block
# style does not match yamllint's expected sequence indentation; relax the
# rule only for those generator-owned subtrees and dated audit files. Schema
# and content validators still enforce semantics.
indentation:
ignore: |
board/kicad/e1-phone/production/
board/kicad/e1-phone/e1-phone-objective-completion-audit-*.yaml
board/kicad/e1-phone/e1-phone-readiness-unblock-register-*.yaml
board/kicad/e1-phone/kicad-route-readiness-inventory-*.yaml
docs/evidence/pd/e1-soc-pd-input-contract.yaml
65 changes: 38 additions & 27 deletions packages/chip/benchmarks/compiler/autovec/run_vector_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Invoked by scripts/run_e1_rvv_vector.sh, which owns tool discovery and the
fail-closed evidence emission.
"""

from __future__ import annotations

import argparse
Expand Down Expand Up @@ -63,9 +64,16 @@ def _mnemonic(disasm: str) -> str:

def build(gcc: Path, march: str, kernel: str, driver: Path, kernels: Path, out: Path) -> None:
cmd = [
str(gcc), "-O3", f"-march={march}", "-mabi=lp64d",
str(gcc),
"-O3",
f"-march={march}",
"-mabi=lp64d",
f"-DKERNEL_{kernel}=1",
str(driver), str(kernels), "-lm", "-o", str(out),
str(driver),
str(kernels),
"-lm",
"-o",
str(out),
]
subprocess.run(cmd, check=True, capture_output=True, text=True)

Expand All @@ -77,7 +85,8 @@ def run_execlog(qemu: Path, cpu: str, plugin: Path, elf: Path, log: Path) -> int
with log.open("wb") as fh:
proc = subprocess.run(
[str(qemu), "-cpu", cpu, "-plugin", str(plugin), "-d", "plugin", str(elf)],
stdout=subprocess.DEVNULL, stderr=fh,
stdout=subprocess.DEVNULL,
stderr=fh,
)
return proc.returncode

Expand Down Expand Up @@ -146,10 +155,7 @@ def main() -> int:
# Kernels the driver wraps (driver.c uses -DKERNEL_<name>). 2D / collision
# kernels (conv2d, rope, histogram, trmv) are out of this functional sweep.
driver_src = args.driver.read_text()
wrapped = {
k["name"] for k in spec["kernels"]
if f"KERNEL_{k['name']}" in driver_src
}
wrapped = {k["name"] for k in spec["kernels"] if f"KERNEL_{k['name']}" in driver_src}

results = []
for entry in spec["kernels"]:
Expand All @@ -175,26 +181,32 @@ def main() -> int:
s_total, s_vec, _ = measure_region(s_log, s_begin, s_end)

reduction = round(s_total / v_total, 3) if v_total else None
results.append({
"kernel": name,
"group": entry.get("group"),
"elem_type": entry.get("elem_type"),
"expected_vectorized": entry.get("expected_vectorized"),
"scalar_dynamic_insns": s_total,
"vector_dynamic_insns": v_total,
"vector_dynamic_vec_ops": v_vec,
"scalar_dynamic_vec_ops": s_vec,
"dynamic_insn_reduction_x": reduction,
"autovectorized": v_vec > 0,
"result_checksum_match": v_exit == s_exit,
"vector_op_histogram": dict(sorted(v_hist.items(), key=lambda kv: -kv[1])),
})
print(f" {name:28s} scalar={s_total:>9d} vector={v_total:>9d} "
f"reduction={reduction}x vec_ops={v_vec}", file=sys.stderr)
results.append(
{
"kernel": name,
"group": entry.get("group"),
"elem_type": entry.get("elem_type"),
"expected_vectorized": entry.get("expected_vectorized"),
"scalar_dynamic_insns": s_total,
"vector_dynamic_insns": v_total,
"vector_dynamic_vec_ops": v_vec,
"scalar_dynamic_vec_ops": s_vec,
"dynamic_insn_reduction_x": reduction,
"autovectorized": v_vec > 0,
"result_checksum_match": v_exit == s_exit,
"vector_op_histogram": dict(sorted(v_hist.items(), key=lambda kv: -kv[1])),
}
)
print(
f" {name:28s} scalar={s_total:>9d} vector={v_total:>9d} "
f"reduction={reduction}x vec_ops={v_vec}",
file=sys.stderr,
)

vectorized = [r for r in results if r["autovectorized"]]
reductions = [r["dynamic_insn_reduction_x"] for r in vectorized
if r["dynamic_insn_reduction_x"]]
reductions = [
r["dynamic_insn_reduction_x"] for r in vectorized if r["dynamic_insn_reduction_x"]
]
geomean = None
if reductions:
prod = 1.0
Expand Down Expand Up @@ -227,8 +239,7 @@ def main() -> int:
"metric": "dynamic_instruction_count (kernel region, execlog-windowed)",
"kernel_count": len(results),
"autovectorized_count": len(vectorized),
"checksum_mismatches": [r["kernel"] for r in results
if not r["result_checksum_match"]],
"checksum_mismatches": [r["kernel"] for r in results if not r["result_checksum_match"]],
"checksum_note": (
"A mismatch on a floating-point reduction kernel (e.g. "
"dot_product_f32_unrolled4) is expected: vectorized reductions "
Expand Down
42 changes: 21 additions & 21 deletions packages/chip/benchmarks/cpu/branch/bpu_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from collections import defaultdict
from collections.abc import Iterable
from dataclasses import dataclass, field
from typing import Any

BR_NONE = 0
BR_COND = 1
Expand All @@ -29,7 +30,7 @@
BR_IND = 4

# Per-table geometry mirrors rtl/cpu/bpu/bpu_pkg.sv.
DEFAULT_GEOMETRY: dict[str, object] = {
DEFAULT_GEOMETRY: dict[str, Any] = {
"FETCH_BLOCK_BYTES": 32,
# Experiment-only front-end limit: how many conditional-branch predictions
# can be carried for one fetched block. Many production predictors carry
Expand Down Expand Up @@ -397,9 +398,8 @@
self.storage[tid][idx] = ctr - 1
if self.local_history_bits > 0:
idx = (pc >> 1) % self.local_history_entries
self.local_history[idx] = (
((self.local_history[idx] << 1) | int(taken)) &
_mask(self.local_history_bits)
self.local_history[idx] = ((self.local_history[idx] << 1) | int(taken)) & _mask(
self.local_history_bits
)


Expand Down Expand Up @@ -530,56 +530,58 @@
return entry["target"], t + 1, entry["ctr"]
return None, 0, 0

def update(self, pc: int, hist: int, target: int, provider: int, misp: bool) -> None:
self.updates += 1
if self.updates % self.geo["ITTAGE_USEFUL_RESET_PERIOD"] == 0:
for table in self.storage:
for entry in table.values():
entry["useful"] = max(entry.get("useful", 0) - 1, 0)
if provider > 0:
idx, tag = self._index_tag(provider - 1, pc, hist)
entry = self.storage[provider - 1].get(idx)
if entry is not None and entry["tag"] == tag:
if entry["target"] == target:
entry["ctr"] = min(entry["ctr"] + 1, _mask(self.geo["ITTAGE_CTR_W"]))
entry["useful"] = min(
entry.get("useful", 0) + 1,
provider_entry = self.storage[provider - 1].get(idx)
if provider_entry is not None and provider_entry["tag"] == tag:
if provider_entry["target"] == target:
provider_entry["ctr"] = min(
provider_entry["ctr"] + 1, _mask(self.geo["ITTAGE_CTR_W"])
)
provider_entry["useful"] = min(
provider_entry.get("useful", 0) + 1,
_mask(self.geo["ITTAGE_USEFUL_W"]),
)
elif (
provider >= self.geo["ITTAGE_REPLACE_MIN_PROVIDER"]
and entry["ctr"] <= self.geo["ITTAGE_REPLACE_WEAK_CTR"]
and provider_entry["ctr"] <= self.geo["ITTAGE_REPLACE_WEAK_CTR"]
):
entry["target"] = target
entry["ctr"] = 1 << (self.geo["ITTAGE_CTR_W"] - 1)
entry["useful"] = 0
elif entry["ctr"] == 0:
provider_entry["target"] = target
provider_entry["ctr"] = 1 << (self.geo["ITTAGE_CTR_W"] - 1)
provider_entry["useful"] = 0
elif provider_entry["ctr"] == 0:
self.storage[provider - 1].pop(idx, None)
else:
entry["ctr"] -= 1
entry["useful"] = max(entry.get("useful", 0) - 1, 0)
provider_entry["ctr"] -= 1
provider_entry["useful"] = max(provider_entry.get("useful", 0) - 1, 0)
if misp:
for higher in range(max(provider, 0), self.geo["ITTAGE_TABLES"]):
idx, tag = self._index_tag(higher, pc, hist)
if idx not in self.storage[higher]:
self.storage[higher][idx] = {
"tag": tag,
"target": target,
"ctr": 1 << (self.geo["ITTAGE_CTR_W"] - 1),
"useful": 0,
}
return
for higher in range(max(provider, 0), self.geo["ITTAGE_TABLES"]):
idx, tag = self._index_tag(higher, pc, hist)
victim = self.storage[higher].get(idx)
if victim.get("useful", 0) == 0:
if victim is not None and victim.get("useful", 0) == 0:
self.storage[higher][idx] = {
"tag": tag,
"target": target,
"ctr": 1 << (self.geo["ITTAGE_CTR_W"] - 1),
"useful": 0,
}
return

Check notice on line 584 in packages/chip/benchmarks/cpu/branch/bpu_model.py

View check run for this annotation

codefactor.io / CodeFactor

packages/chip/benchmarks/cpu/branch/bpu_model.py#L533-L584

Complex Method


@dataclass
Expand Down Expand Up @@ -684,75 +686,73 @@
return taken, target
return False, event.pc + self.geometry["FETCH_BLOCK_BYTES"]

def _step(self, event: BranchEvent) -> None:
ittage_hist = self._ittage_history()
pred_taken, pred_target = self._predict(event)
pred_taken, pred_target = self._apply_fetch_block_slot_limit(
event, pred_taken, pred_target
)
pred_taken, pred_target = self._apply_fetch_block_slot_limit(event, pred_taken, pred_target)
actual_taken = event.taken
actual_target = event.target
misp = (pred_taken != actual_taken) or (actual_taken and pred_target != actual_target)

# Update PMU-style counters.
self.counters["pred"] += 1
if event.kind == BR_COND:
self.counters["cond"] += 1
if misp:
self.counters["cond_misp"] += 1
elif event.kind == BR_CALL:
self.counters["call"] += 1
if misp:
self.counters["ind_misp"] += 1
elif event.kind == BR_IND:
self.counters["ind"] = self.counters.get("ind", 0) + 1
if misp:
self.counters["ind_misp"] += 1
elif event.kind == BR_RET:
self.counters["ret"] += 1
if misp:
self.counters["ret_misp"] += 1
if misp:
self.counters["misp"] += 1

# Train tables.
if event.kind == BR_COND:
_, provider, low_conf = self.tage.predict(event.pc, self.hist)
sc_override, _ = self.sc.predict(event.pc, self.hist, low_conf)
if sc_override:
self.counters["sc_override"] += 1
self.tage.update(event.pc, self.hist, self.hist, actual_taken, provider, misp)
self.sc.update(event.pc, self.hist, actual_taken, low_conf)
self.loop.update(event.pc, actual_target, actual_taken)
self.ftb.update(event.pc, actual_target, event.kind)
elif event.kind == BR_CALL:
_, provider, _ = self.ittage.predict(event.pc, ittage_hist)
self.ittage.update(event.pc, ittage_hist, actual_target, provider, misp)
return_pc = (
event.call_return_pc
if event.call_return_pc is not None
else event.pc + self.geometry["FETCH_BLOCK_BYTES"]
)
self.ras.commit_push(return_pc)
self.ftb.update(event.pc, actual_target, event.kind)
elif event.kind == BR_IND:
_, provider, _ = self.ittage.predict(event.pc, ittage_hist)
self.ittage.update(event.pc, ittage_hist, actual_target, provider, misp)
self.ftb.update(event.pc, actual_target, event.kind)
elif event.kind == BR_RET:
self.ras.commit_pop()
self.ftb.update(event.pc, actual_target, event.kind)

# Shift the global history register.
if event.kind == BR_COND:
self.hist = ((self.hist << 1) | int(actual_taken)) & _mask(
self.geometry["TAGE_HIST_LEN"][-1]
)
elif event.kind in (BR_CALL, BR_IND):
self._update_target_history(actual_target)
self._update_path_history(event.pc)

self._advance_fetch_block_slot_state(event)

Check notice on line 755 in packages/chip/benchmarks/cpu/branch/bpu_model.py

View check run for this annotation

codefactor.io / CodeFactor

packages/chip/benchmarks/cpu/branch/bpu_model.py#L689-L755

Complex Method

def _ittage_history(self) -> int:
hist = self.hist
Expand Down
24 changes: 7 additions & 17 deletions packages/chip/benchmarks/cpu/branch/compare_mpki_rtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ def build_evidence() -> dict:
e1_model = m["e1_mpki"]
cva6_model = m["cva6_mpki"]
ratio_rtl = (cva6_model / e1_rtl) if e1_rtl > 0 else None
rel_gap = (abs(e1_rtl - e1_model) / e1_model) if e1_model > 0 else (
0.0 if e1_rtl == 0 else None
rel_gap = (
(abs(e1_rtl - e1_model) / e1_model) if e1_model > 0 else (0.0 if e1_rtl == 0 else None)
)
per_trace[name] = {
"trace_class": info["trace_class"] or m.get("trace_class"),
Expand All @@ -169,9 +169,7 @@ def build_evidence() -> dict:
"improvement_ratio_cva6_model_over_e1_rtl": (
round(ratio_rtl, 4) if ratio_rtl is not None else None
),
"e1_rtl_vs_model_rel_gap": (
round(rel_gap, 4) if rel_gap is not None else None
),
"e1_rtl_vs_model_rel_gap": (round(rel_gap, 4) if rel_gap is not None else None),
}

paired = list(per_trace.values())
Expand All @@ -186,12 +184,8 @@ def build_evidence() -> dict:
pearson = _pearson(e1_rtl_vals, e1_model_vals)
spearman = _spearman(e1_rtl_vals, e1_model_vals)

rel_geo_gap = (
abs(e1_rtl_geo - e1_model_geo) / e1_model_geo if e1_model_geo > 0 else None
)
converged = bool(
rel_geo_gap is not None and rel_geo_gap <= RTL_MODEL_REL_GAP_CONVERGED
)
rel_geo_gap = abs(e1_rtl_geo - e1_model_geo) / e1_model_geo if e1_model_geo > 0 else None
converged = bool(rel_geo_gap is not None and rel_geo_gap <= RTL_MODEL_REL_GAP_CONVERGED)

ratio_rtl_geo = cva6_model_geo / e1_rtl_geo if e1_rtl_geo > 0 else None

Expand Down Expand Up @@ -251,9 +245,7 @@ def build_evidence() -> dict:
"rtl_citations": CVA6_RTL_CITATIONS,
},
"model_comparison_source": str(MODEL_COMPARISON_PATH.relative_to(ROOT)),
"rtl_evidence_sources": sorted(
{p["rtl_source"] for p in per_trace.values()}
),
"rtl_evidence_sources": sorted({p["rtl_source"] for p in per_trace.values()}),
"shared_trace_count": len(per_trace),
"rtl_traces_without_model_pair": sorted(missing_model),
"per_trace": per_trace,
Expand All @@ -270,9 +262,7 @@ def build_evidence() -> dict:
"spearman_rho": round(spearman, 6) if spearman is not None else None,
"e1_model_geomean_mpki": round(e1_model_geo, 6),
"e1_rtl_geomean_mpki": round(e1_rtl_geo, 6),
"geomean_relative_gap": (
round(rel_geo_gap, 6) if rel_geo_gap is not None else None
),
"geomean_relative_gap": (round(rel_geo_gap, 6) if rel_geo_gap is not None else None),
"convergence_band_rel_gap": RTL_MODEL_REL_GAP_CONVERGED,
"converged": converged,
},
Expand Down
8 changes: 5 additions & 3 deletions packages/chip/benchmarks/cpu/branch/test_bpu_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@
synthetic_alias_thrash,
synthetic_alternating,
synthetic_always_taken_loop,
synthetic_dual_branch_fetch_block,
synthetic_btb_confidence_churn,
synthetic_control_indirect_pair,
synthetic_gpu_occupancy_phase,
synthetic_dual_branch_fetch_block,
synthetic_gpu_nested_reconvergence,
synthetic_gpu_occupancy_phase,
synthetic_loop_known_count,
synthetic_phase_change_server,
synthetic_recursive_call_return,
Expand Down Expand Up @@ -293,7 +293,9 @@ def test_weak_ittage_yields_to_stable_ftb_target():
"ctr": 1 << (sim.geometry["ITTAGE_CTR_W"] - 1),
}

pred_taken, pred_target = sim._predict(BranchEvent(pc=pc, target=stable, taken=True, kind=BR_IND))
pred_taken, pred_target = sim._predict(
BranchEvent(pc=pc, target=stable, taken=True, kind=BR_IND)
)

assert pred_taken
assert pred_target == stable
Expand Down
4 changes: 3 additions & 1 deletion packages/chip/benchmarks/cpu/branch/traces.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,9 @@ def synthetic_vtable_path_correlated(paths: int = 4, repeats: int = 192) -> Iter
)


def synthetic_interpreter_dispatch_mixed(opcodes: int = 9, repeats: int = 160) -> Iterator[BranchEvent]:
def synthetic_interpreter_dispatch_mixed(
opcodes: int = 9, repeats: int = 160
) -> Iterator[BranchEvent]:
"""Interpreter/VM dispatch: bytecode indirects mixed with local branches."""
dispatch_pc = 0x8007_A000
guard_pc = 0x8007_A040
Expand Down
4 changes: 3 additions & 1 deletion packages/chip/benchmarks/mlperf/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ def modeled_energy_joules_per_inference(config: NpuScaleConfig) -> float:
return energy_nj / 1e9


def energy_block(config: NpuScaleConfig, integration_window_seconds: float, sample_count: int) -> dict[str, Any]:
def energy_block(
config: NpuScaleConfig, integration_window_seconds: float, sample_count: int
) -> dict[str, Any]:
"""Build the schema ``energy_joules_per_inference`` object (G-7).

Matches ``docs/benchmarks/report-schema.yaml`` field-for-field:
Expand Down
12 changes: 4 additions & 8 deletions packages/chip/benchmarks/mlperf/loadgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
import time
from collections.abc import Sequence
from dataclasses import dataclass, field
from enum import Enum
from enum import StrEnum
from typing import Protocol


class Scenario(str, Enum):
class Scenario(StrEnum):
SINGLE_STREAM = "SingleStream"
OFFLINE = "Offline"

Expand Down Expand Up @@ -137,9 +137,7 @@ def run_loadgen(sut: SystemUnderTest, config: LoadGenConfig) -> LoadGenResult:
batch_response = sut.issue_query([sample])
issue_end = time.perf_counter_ns()
if len(batch_response) != 1:
raise ValueError(
"SingleStream SUT must return exactly one response per query"
)
raise ValueError("SingleStream SUT must return exactly one response per query")
responses.append(batch_response[0])
latencies_ns.append(issue_end - issue_start)
wall_time_ns = time.perf_counter_ns() - wall_start
Expand All @@ -162,9 +160,7 @@ def run_loadgen(sut: SystemUnderTest, config: LoadGenConfig) -> LoadGenResult:
batch_response = sut.issue_query(samples)
wall_time_ns = time.perf_counter_ns() - wall_start
if len(batch_response) != config.query_count:
raise ValueError(
"Offline SUT must return one response per submitted sample"
)
raise ValueError("Offline SUT must return one response per submitted sample")
# Offline reports throughput, not per-query latency, but we still
# capture the batch wall time so the throughput is auditable.
throughput = config.query_count / (wall_time_ns / 1e9) if wall_time_ns else 0.0
Expand Down
4 changes: 3 additions & 1 deletion packages/chip/benchmarks/mlperf/run_e1_npu_mlperf.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def _scenario_from_cli(value: str) -> Scenario:


def _score(dataset: list[Any], responses: list[Any]) -> dict[str, Any]:
correct = sum(1 for response in responses if response.prediction == dataset[response.index].label)
correct = sum(
1 for response in responses if response.prediction == dataset[response.index].label
)
total = len(responses)
return {
"correct": correct,
Expand Down
7 changes: 6 additions & 1 deletion packages/chip/benchmarks/mlperf/test_mlperf_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,12 @@ def test_single_stream_uses_one_query_at_a_time_and_npu_counters(self) -> None:
self.assertEqual(sut.counters.inferences, len(dataset))
self.assertEqual(sut.counters.npu_commands, len(dataset) * 2)
self.assertEqual(sut.counters.npu_macs, len(dataset) * macs_per_inference())
self.assertTrue(all(response.prediction == dataset[response.index].label for response in result.responses))
self.assertTrue(
all(
response.prediction == dataset[response.index].label
for response in result.responses
)
)

def test_offline_reports_throughput_and_accuracy(self) -> None:
report = build_report([Scenario.SINGLE_STREAM, Scenario.OFFLINE], 8)
Expand Down
4 changes: 3 additions & 1 deletion packages/chip/benchmarks/mlperf/test_mlperf_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def test_report_is_self_consistent_and_fail_closed_on_power() -> None:
assert scenario["accuracy"]["top1_accuracy"] == 1.0
assert scenario["energy_joules_per_inference"]["value"] > 0
assert scenario["npu_counters"]["npu_commands"] == scenario["query_count"] * 2
assert scenario["npu_counters"]["npu_macs"] == scenario["query_count"] * macs_per_inference()
assert (
scenario["npu_counters"]["npu_macs"] == scenario["query_count"] * macs_per_inference()
)
assert scenario["observed_macs_per_inference"] == float(macs_per_inference())
assert report["workload"]["macs_per_inference"] == macs_per_inference()
assert report["summary"]["npu_macs_total"] == (
Expand Down
Loading
Loading