Skip to content
This repository was archived by the owner on May 31, 2026. It is now read-only.

Commit 90bbf01

Browse files
Paradoxdov and claude committed
v0.4.20: critical — fix kernel-name lookup + summary aggregation
Two CRITICAL data-integrity bugs (plus one minor labelling fix, BUG 3 below) found by carefully re-reading a Habr user's 16-hour marathon log and report.json from an Intel i3-12100 + 2× Netac DDR4-3200 system. The marathon caught 24 real errors, but the verdict and JSON had been misreporting them for months:

BUG 1 — KERNEL NAME MISATTRIBUTION
g_err_records[i].test stores the kernel_id_t enum value, but the display code used g_tests[r->test].name to look up a name — indexing the test catalogue array by an enum value rather than by array position. The enum values do NOT match array positions: KER_AVX2_SUSTAINED = 12, but position 12 in g_tests[] happens to be L3 Cache Stress. Result: every error from AVX2 Sustained was labelled "L3 Cache Stress" in the verdict, in [ERR] log lines, and in report.json. Total misattribution that completely broke field triage.

Field data: 24 errors all sharing XOR=0x20000000000000 (bit 53) were ALL from AVX2 Sustained (the log shows "[1/14] AVX2 Sustained -> errors=1" 24 times and "L3 Cache Stress -> errors=0" 726 times). report.json labelled every one of them as "test":"L3 Cache Stress". The user's Habr question "почему ВСЕГО ОШИБОК: 0?" ("why is TOTAL ERRORS: 0?") pointed straight at this hole.

Fix: added tests_idx_for_kernel(k) / name_for_kernel(k) helpers that walk g_tests[] looking for the matching .k field, and use those in both display sites. All 14 kernel_id_t values now resolve to the right name regardless of the enum-vs-position mismatch.

BUG 2 — PER-TEST SUMMARY OVERWRITTEN, NOT ACCUMULATED
g_summary[i] = r assigned the LAST pass's result to each test's summary. In a 16-hour marathon (726 passes) with intermittent errors (1 per pass), the FINAL pass might be clean, leaving the summary table showing "errors: 0" for every test — completely hiding 24 cumulative errors. This also fed the JSON: summary.total_errors became 0 and verdict became "PASS" even though error_records_total was 24. An automated post-test analyzer reading the JSON would tell the user "all good" while the UI verdict screen said "ЗАМЕНИТЬ" ("REPLACE") — a direct contradiction.

Fix: accumulate errors/bytes/time_ms across passes and keep status "sticky" (FAIL beats PASS beats SKIP). The first pass initializes the summary; subsequent passes add to the running totals.

BUG 3 — "DIMM2" instead of "DDR4-B2" in verdict text
The technical-detail stuck-bit line used didx+1 ("DIMM2"), an internal array index, when the user-facing SMBIOS Type 17 locator string ("DDR4-B2") was available right there in g_dimms[didx].locator. Replaced with the locator everywhere.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent e2bffc4 commit 90bbf01

3 files changed

Lines changed: 80 additions & 34 deletions

File tree

MemForge2.src.c

Lines changed: 78 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,7 @@ static void init_splash(CHAR16 *stage) {
839839
cls();
840840
UINTN cy = g_h / 2;
841841
/* Title — large centered line. */
842-
CHAR16 *title = L"MEMFORGE v0.4.19";
842+
CHAR16 *title = L"MEMFORGE v0.4.20";
843843
UINTN tx = (g_w - StrLen(title) * g_char_w) / 2;
844844
gfx_draw_str_color(tx, cy - g_char_h * 2, title, COL_ACCENT_HI);
845845
/* Stage indicator — what we're doing right now. */
@@ -943,7 +943,7 @@ static UINTN g_card_cols = 1;
943943
compute_layout(). */
944944
static int g_show_cards = 1;
945945

946-
/* v0.4.19 — focused cards layout for small screens (g_h < 900).
946+
/* v0.4.20 — focused cards layout for small screens (g_h < 900).
947947
Instead of one full-width row per test (14 rows × ~40 px = 560 px,
948948
which on a 1024×768 screen eats 70% of vertical space and clips the
949949
core panel + footer), we draw:
@@ -1013,7 +1013,7 @@ static void compute_layout(UINTN n_tests) {
10131013
g_card_w = g_inner;
10141014
g_card_row_h = g_compact ? g_char_h : (g_char_h + 16);
10151015

1016-
/* v0.4.19 — focused layout on small screens.
1016+
/* v0.4.20 — focused layout on small screens.
10171017
On g_h<900 the per-test card list eats 60-70% of vertical space
10181018
and clips the core panel / footer (YgrecK field report on 1024×768
10191019
Radeon HD 4350). Replace with: 1-row strip of all test dots +
@@ -1227,9 +1227,9 @@ static void render_header(UINT64 elapsed_ms, UINTN done, UINTN total) {
12271227
UINTN cols = g_text_cols;
12281228
if (cols >= 110) {
12291229
SPrint(buf, sizeof(buf),
1230-
T(L" MEMFORGE v0.4.19 | %ld.%ld ГБ RAM | %s "
1230+
T(L" MEMFORGE v0.4.20 | %ld.%ld ГБ RAM | %s "
12311231
L"| %s | %02d:%02d | ост ~%02d:%02d | Тесты %d/%d",
1232-
L" MEMFORGE v0.4.19 | %ld.%ld GB RAM | %s "
1232+
L" MEMFORGE v0.4.20 | %ld.%ld GB RAM | %s "
12331233
L"| %s | %02d:%02d | ETA ~%02d:%02d | Tests %d/%d"),
12341234
ram_gb_x10 / 10, ram_gb_x10 % 10,
12351235
pass_tag,
@@ -1239,25 +1239,25 @@ static void render_header(UINT64 elapsed_ms, UINTN done, UINTN total) {
12391239
(UINT32)done, (UINT32)total);
12401240
} else if (cols >= 90) {
12411241
SPrint(buf, sizeof(buf),
1242-
T(L" MEMFORGE v0.4.19 | %ld.%ld ГБ RAM | %s | %s | %02d:%02d | ост ~%02d:%02d",
1243-
L" MEMFORGE v0.4.19 | %ld.%ld GB RAM | %s | %s | %02d:%02d | ETA ~%02d:%02d"),
1242+
T(L" MEMFORGE v0.4.20 | %ld.%ld ГБ RAM | %s | %s | %02d:%02d | ост ~%02d:%02d",
1243+
L" MEMFORGE v0.4.20 | %ld.%ld GB RAM | %s | %s | %02d:%02d | ETA ~%02d:%02d"),
12441244
ram_gb_x10 / 10, ram_gb_x10 % 10,
12451245
pass_tag,
12461246
err_tag,
12471247
secs / 60, secs % 60,
12481248
eta_secs / 60, eta_secs % 60);
12491249
} else if (cols >= 70) {
12501250
SPrint(buf, sizeof(buf),
1251-
T(L" MEMFORGE v0.4.19 | %ld.%ld ГБ RAM | %s | %s | %02d:%02d",
1252-
L" MEMFORGE v0.4.19 | %ld.%ld GB RAM | %s | %s | %02d:%02d"),
1251+
T(L" MEMFORGE v0.4.20 | %ld.%ld ГБ RAM | %s | %s | %02d:%02d",
1252+
L" MEMFORGE v0.4.20 | %ld.%ld GB RAM | %s | %s | %02d:%02d"),
12531253
ram_gb_x10 / 10, ram_gb_x10 % 10,
12541254
pass_tag,
12551255
err_tag,
12561256
secs / 60, secs % 60);
12571257
} else {
12581258
SPrint(buf, sizeof(buf),
1259-
T(L" MEMFORGE v0.4.19 | %s | %s | %02d:%02d",
1260-
L" MEMFORGE v0.4.19 | %s | %s | %02d:%02d"),
1259+
T(L" MEMFORGE v0.4.20 | %s | %s | %02d:%02d",
1260+
L" MEMFORGE v0.4.20 | %s | %s | %02d:%02d"),
12611261
pass_tag,
12621262
err_tag,
12631263
secs / 60, secs % 60);
@@ -1783,7 +1783,7 @@ static int dominant_dimm_idx(void) {
17831783
return best;
17841784
}
17851785

1786-
/* v0.4.19 — detect dual-channel interleave ambiguity.
1786+
/* v0.4.20 — detect dual-channel interleave ambiguity.
17871787
On consumer desktops with dual/quad-channel memory, the iMC interleaves
17881788
addresses between channels at 64-byte (cache-line) granularity. A
17891789
SINGLE bad chip on one stick produces errors that, when mapped through
@@ -1792,7 +1792,7 @@ static int dominant_dimm_idx(void) {
17921792

17931793
Field report from a Habr user (Netac DDR4 kit): same stuck bit
17941794
D[53] was reported 24 times, distributed as A2 (8) + B2 (11) + ? (5).
1795-
Pre-v0.4.19 verdict confidently said "REPLACE: DDR4-B2 (HIGH)" — but
1795+
Pre-v0.4.20 verdict confidently said "REPLACE: DDR4-B2 (HIGH)" — but
17961796
physically it's likely ONE bad chip on one of A2/B2, NOT both.
17971797

17981798
This helper returns the list of DIMM indices that each hold >=25% of
@@ -4774,15 +4774,15 @@ static void amd_thermal_probe(void) {
47744774
}
47754775

47764776
static UINT32 amd_thermal_sample(void) {
4777-
/* v0.4.19 — correct decode per Linux k10temp / FreeBSD amdtemp.c:
4777+
/* v0.4.20 — correct decode per Linux k10temp / FreeBSD amdtemp.c:
47784778
SMN 0x59800 (SMU_THM_TCON_CUR_TMP)
47794779
bits [31:21] raw temperature value (11 bits, mask 0x7FF)
47804780
bit 19 TempRangeSel — when SET, scale is -49°C..+206°C
47814781
(subtract 49°C from the raw decode); when CLEAR
47824782
scale is 0..225°C (no offset).
47834783
temp_c = (raw * 0.125) - (range_sel ? 49 : 0)
47844784

4785-
Pre-v0.4.19 code was missing both the 0x7FF mask AND the bit-19
4785+
Pre-v0.4.20 code was missing both the 0x7FF mask AND the bit-19
47864786
range adjustment, which inflated readings by ~49°C on Ryzen SKUs
47874787
that report on the -49..206 scale (most Renoir/Cezanne/Zen3+
47884788
desktop parts). Field report on Ryzen 5 4500 showed Tctl=93°C at
@@ -6402,6 +6402,26 @@ static test_def_t g_tests[] = {
64026402
};
64036403
#define N_TESTS (sizeof(g_tests) / sizeof(g_tests[0]))
64046404

6405+
/* v0.4.20 — map a kernel enum (KER_*) to its position in g_tests[].
6406+
CRITICAL: do NOT index g_tests[] directly by a kernel_id_t value.
6407+
The enum values do not match array positions (e.g., KER_AVX2_SUSTAINED
6408+
= 12 maps to position 0 in g_tests because AVX2 Sustained is the
6409+
first row of the table, while position 12 happens to be L3 Cache
6410+
Stress). Before this helper existed, an AVX2 error was displayed in
6411+
the verdict, JSON and log as "T=L3 Cache Stress" — total
6412+
misattribution that completely broke field triage. Always use this
6413+
helper for kernel→display-name lookup. */
6414+
/* Linear search of g_tests[] for the entry whose .k field equals k.
   Returns that entry's array position, or -1 when no entry matches. */
static int tests_idx_for_kernel(kernel_id_t k) {
    UINTN pos = 0;
    while (pos < N_TESTS) {
        if (g_tests[pos].k == k) {
            return (int)pos;
        }
        pos++;
    }
    return -1;
}
6420+
/* Display name for kernel k: the .name of the g_tests[] entry found by
   tests_idx_for_kernel(), or the placeholder L"(unknown kernel)" when
   that lookup reports no match (negative index). */
static CHAR16 *name_for_kernel(kernel_id_t k) {
    int pos = tests_idx_for_kernel(k);
    if (pos < 0) {
        return L"(unknown kernel)";
    }
    return g_tests[pos].name;
}
6424+
64056425
/* Activity row painter — invoked by render_header() on every tick to show
64066426
what test is running, how long it's been on this test, and (critically)
64076427
a per-second countdown when Bit Fade is in its silent wait phase. Lives
@@ -6534,7 +6554,7 @@ typedef struct {
65346554
} card_info_t;
65356555
static card_info_t g_cards[N_TESTS];
65366556

6537-
/* v0.4.19 — Forward decls for focused-mode helpers (defined below
6557+
/* v0.4.20 — Forward decls for focused-mode helpers (defined below
65386558
card_paint so they can share the same color-lookup logic). */
65396559
static void card_paint_full(UINTN i);
65406560
static void card_strip_paint(UINTN i);
@@ -6648,7 +6668,7 @@ static void card_paint_full(UINTN i) {
66486668
}
66496669
}
66506670

6651-
/* ---------- Focused-mode card painters (v0.4.19) ---------- */
6671+
/* ---------- Focused-mode card painters (v0.4.20) ---------- */
66526672

66536673
/* Paint the small status dot for test i in the top strip. The strip is
66546674
one row tall and shows N evenly-spaced dots, one per test. The dot
@@ -7982,7 +8002,7 @@ static void render_simple_verdict(UINT64 total_ms) {
79828002
}
79838003
} else { /* VERDICT_FAIL */
79848004
int didx = dominant_dimm_idx();
7985-
/* v0.4.19 — interleave detection.
8005+
/* v0.4.20 — interleave detection.
79868006
If errors are distributed across 2+ DIMMs (typical dual-channel
79878007
interleave hiding a single bad chip behind two DIMM labels),
79888008
we MUST NOT confidently name one DIMM. Verdict instead tells
@@ -8205,8 +8225,8 @@ static void render_summary(UINT64 total_ms) {
82058225
UINTN hrow = (g_hdr_h / 2 - g_char_h / 2) / g_char_h;
82068226
CHAR16 buf[200];
82078227
SPrint(buf, sizeof(buf),
8208-
T(L" MEMFORGE v0.4.19 ИТОГИ | %d сек | Ядра %d/%d",
8209-
L" MEMFORGE v0.4.19 SUMMARY | %d sec | Cores %d/%d"),
8228+
T(L" MEMFORGE v0.4.20 ИТОГИ | %d сек | Ядра %d/%d",
8229+
L" MEMFORGE v0.4.20 SUMMARY | %d sec | Cores %d/%d"),
82108230
(UINT32)(total_ms / 1000),
82118231
(UINT32)g_n_enabled, (UINT32)g_n_cores);
82128232
say_at_rc(0, hrow, buf);
@@ -8288,18 +8308,23 @@ static void render_summary(UINT64 total_ms) {
82888308
CHAR16 chip[64] = L"";
82898309
if (didx >= 0)
82908310
chip_label_for_bit((UINT32)didx, bp, chip, 64);
8311+
/* v0.4.20 — use SMBIOS Type 17 locator string ("DDR4-B2")
8312+
instead of array-index-based "DIMM%d" which had nothing
8313+
to do with the physical slot label the user sees. */
8314+
CHAR8 *loc = (didx >= 0 && g_dimms[didx].locator[0])
8315+
? g_dimms[didx].locator : (CHAR8*)"?";
82918316
if (didx >= 0 && chip[0]) {
82928317
/* Full info: DIMM + exact chip designator */
82938318
SPrint(sb, sizeof(sb),
8294-
T(L"⚠ Застрял бит D[%d] → DIMM%d, %s: %d ошибок",
8295-
L"⚠ Stuck bit D[%d] → DIMM%d, %s: %d errors"),
8296-
bp, didx + 1, chip, stuck_n);
8319+
T(L"⚠ Застрял бит D[%d] → %a, %s: %d ошибок",
8320+
L"⚠ Stuck bit D[%d] → %a, %s: %d errors"),
8321+
bp, loc, chip, stuck_n);
82978322
} else if (didx >= 0) {
82988323
/* DIMM known, exact chip not — say so plainly */
82998324
SPrint(sb, sizeof(sb),
8300-
T(L"⚠ Застрял бит D[%d] → DIMM%d (точный чип не определён по SPD): %d ошибок",
8301-
L"⚠ Stuck bit D[%d] → DIMM%d (exact chip unknown per SPD): %d errors"),
8302-
bp, didx + 1, stuck_n);
8325+
T(L"⚠ Застрял бит D[%d] → %a (точный чип не определён по SPD): %d ошибок",
8326+
L"⚠ Stuck bit D[%d] → %a (exact chip unknown per SPD): %d errors"),
8327+
bp, loc, stuck_n);
83038328
} else {
83048329
SPrint(sb, sizeof(sb),
83058330
T(L"⚠ Застрял бит D[%d] (планку определить не удалось): %d ошибок",
@@ -8839,7 +8864,7 @@ static void write_json_report(UINT64 total_ms) {
88398864
L"\"at\":{\"t_ms\":%ld,\"temp_c\":%d,\"pkg_w\":%d,"
88408865
L"\"throttle\":%d,\"vid_mv\":%d}}",
88418866
(i > 0) ? "," : "",
8842-
g_tests[r->test].name, r->core + 1,
8867+
name_for_kernel(r->test), r->core + 1,
88438868
r->phys_addr, r->expected, r->actual, r->xor_mask, r->pass_idx,
88448869
dimm_lab,
88458870
cc.bank_group, cc.bank, cc.row, cc.column,
@@ -9993,7 +10018,7 @@ EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) {
999310018
}
999410019
}
999510020

9996-
log_line(L"=== MemForge2 v0.4.19 init ===");
10021+
log_line(L"=== MemForge2 v0.4.20 init ===");
999710022
log_line(L"[WATCHDOG] UEFI 5-min watchdog disabled at app entry");
999810023
/* Show splash IMMEDIATELY so the user sees the program is alive while
999910024
INI parsing, SMBus probes and SMBIOS walk happen. Without this, the
@@ -10038,7 +10063,7 @@ EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) {
1003810063
if (uefi_call_wrapper(g_gop->QueryMode, 4,
1003910064
g_gop, m, &info_sz, &info) != EFI_SUCCESS)
1004010065
continue;
10041-
/* v0.4.19 — also log PixelFormat and PixelsPerScanLine
10066+
/* v0.4.20 — also log PixelFormat and PixelsPerScanLine
1004210067
so we can see if a card (e.g. old Radeon HD 4350) only
1004310068
offers BltOnly modes (PixelFormat=3) that prevent
1004410069
direct-fb rendering. */
@@ -10053,7 +10078,7 @@ EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) {
1005310078
log_line(L"[GFX] NO GOP PROTOCOL FOUND — firmware has no UEFI graphics. "
1005410079
L"Falling back to 800x600 default. UI will not render correctly.");
1005510080
}
10056-
/* v0.4.19 — MP Services Protocol diagnostic. Without this log it
10081+
/* v0.4.20 — MP Services Protocol diagnostic. Without this log it
1005710082
was impossible to tell from a field report whether multi-core
1005810083
dispatch failed (LocateProtocol error / GetNumberOfProcessors
1005910084
returned 1) or the test was simply running on a single-core
@@ -10668,7 +10693,28 @@ EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) {
1066810693
per-test results to survive that. Cheap (1× per test, not
1066910694
1× per log line). */
1067010695
flush_log_now();
10671-
g_summary[i] = r;
10696+
/* v0.4.20 — ACCUMULATE across marathon passes, do not OVERWRITE.
10697+
Pre-v0.4.20 the line was `g_summary[i] = r;` which kept only
10698+
the LAST pass's per-test result. On a 16-hour marathon with
10699+
an intermittent error rate of 1 per pass, that meant the
10700+
final summary table showed "errors: 0" because the most
10701+
recent pass happened to be clean — completely hiding the
10702+
24 cumulative errors found across earlier passes. Also fed
10703+
into JSON `summary.total_errors: 0` and `verdict: "PASS"`,
10704+
which then misled any automated post-test analyzer. */
10705+
if (g_run_passes_done == 0) {
10706+
/* First pass: initialize with this pass's result */
10707+
g_summary[i] = r;
10708+
} else {
10709+
/* Subsequent passes: accumulate counts; status is "sticky":
10710+
FAIL wins over PASS wins over SKIP. */
10711+
g_summary[i].errors += r.errors;
10712+
g_summary[i].bytes += r.bytes;
10713+
g_summary[i].time_ms += r.time_ms;
10714+
if (r.status == 2) g_summary[i].status = 2; /* FAIL is sticky */
10715+
else if (g_summary[i].status == 0 && r.status == 1)
10716+
g_summary[i].status = 1; /* upgrade SKIP→PASS */
10717+
}
1067210718
/* Bump cumulative error counter shown in the live header. */
1067310719
g_run_total_errors += r.errors;
1067410720

@@ -10763,7 +10809,7 @@ EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) {
1076310809
SPrint(lb, sizeof(lb),
1076410810
L"[ERR] T=%s Core=%d Addr=0x%lx Exp=0x%lx Act=0x%lx XOR=0x%lx DIMM=%s "
1076510811
L"~bg=%d ~bank=%d ~row=0x%lx ~col=0x%x",
10766-
g_tests[r->test].name, r->core + 1,
10812+
name_for_kernel(r->test), r->core + 1,
1076710813
r->phys_addr, r->expected, r->actual, r->xor_mask,
1076810814
dimm_lab,
1076910815
coords.bank_group, coords.bank, coords.row, coords.column);

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ EnableAVX=1
191191
;MarathonHours=0 ; 0 = off, 1..24 = run for N hours
192192

193193
[Meta]
194-
Version=0.4.19
194+
Version=0.4.20
195195
Language=en ; "ru" or "en"
196196

197197
[Display]

quantai.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ EnableAVX=1
8282

8383
; Used by the UEFI binary for the menu language. Also used by the analyzer.
8484
[Meta]
85-
Version=0.4.19
85+
Version=0.4.20
8686
Debug=0
8787
Language=en
8888

0 commit comments

Comments
 (0)