Skip to content

Commit 355c53c

Browse files
committed
test(hw): driver-probe-error check + rx-capture data-flow assertion
``assert_no_kernel_faults`` only catches panic/oops/BUG/SError — drivers can fail to probe silently (bad DT overlay apply, phandle mismatch, missing regulator) and the suite would reach the IIO-device assertion with a confusing "not found" message instead of the probe-error root cause. Similarly, ``Link status: DATA`` only says JESD trained at link-up — DMA / TPL / clock-path can silently stop delivering samples and the test wouldn't notice. Add two helpers in ``test/hw/hw_helpers.py`` and wire them into every hw test that has IIO verification: 1. ``assert_no_probe_errors(dmesg_txt)`` — scans for ``probe of <dev> failed with error``, ``Error applying overlay`` / ``failed to apply overlay``, ``Error resolving``. Reuses ``_DMESG_BENIGN_SUBSTRINGS`` (now also allowlisting the stock-Kuiper ZCU102 / ZynqMP watchdog, DisplayPort, and Ceva AHCI probes that fire on every boot regardless of overlay). 2. ``assert_rx_capture_valid(ctx, device_candidates, …)`` — uses raw libiio (works with any buffered AXI ADC regardless of whether a pyadi-iio wrapper exists for its device name), enables every non-output scan channel, refills a one-shot buffer, and asserts at least one channel is non-zero + at least one channel's ``|std|`` >= 1 LSB. Device selection tries an ordered candidate list, then falls back to any ``axi-*`` / ``cf-*`` / TPL frontend on the context. ``TimeoutError`` from the refill path gets remapped to a clear ``AssertionError`` pointing at the stalled DMA. Wired into: - ``test_ad9081_zcu102_xsa_hw.py`` → ``axi-ad9081-rx-hpc`` / ``ad_ip_jesd204_tpl_adc``. - ``test_ad9081_zcu102_system_hw.py`` → same. - ``test_adrv9009_zcu102_hw.py`` → ``axi-adrv9009-rx-hpc`` / ``axi-adrv9009-rx-obs-hpc``. - ``test_fmcdaq3_vcu118_hw.py`` → ``axi-ad9680-core-lpc`` / ``axi-ad9680-hpc`` / ``axi-ad9680-rx-hpc``. Also restructured to use the ``board`` fixture (from the ``target``-swap fix) so the VCU118-style teardown power-off runs. 
Four findings surfaced via this check, all fixed (three are itemized below): - ``assert_no_probe_errors`` tripped on stock Kuiper ZynqMP boot noise (``cdns-wdt: probe of ffcb0000.watchdog failed with error -2``, DisplayPort + Ceva AHCI) → three specific device-node addresses added to the benign list so a real watchdog/display/SATA regression elsewhere still trips. - ``adi.ad9081(uri=…)`` fails with ``'NoneType' object has no attribute 'channels'`` when the design exposes the TPL core rather than ``axi-ad9081-rx-hpc`` → moved to raw libiio. - Fallback initially picked the control-plane device (``ad9528-1``) which isn't AXI-DMA-backed → narrowed fallback to ``axi-*`` / ``cf-*`` / TPL only.
1 parent 016f443 commit 355c53c

5 files changed

Lines changed: 244 additions & 22 deletions

File tree

test/hw/hw_helpers.py

Lines changed: 174 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,11 @@ def require_hw_prereqs() -> None:
294294
"failed to load firmware",
295295
# Wifi/USB hotplug noise seen on some Kuiper releases:
296296
"cfg80211: failed to load",
297-
# Harmless driver-level probe deferrals re-tried later:
297+
# Harmless driver-level probe deferrals re-tried later. The kernel
298+
# surfaces these both symbolically (``-EPROBE_DEFER``) and as the
299+
# raw errno (``-517``) depending on the caller.
298300
"EPROBE_DEFER",
301+
"error -517",
299302
# ZynqMP early-boot WARNING: the kernel logs a Call trace through
300303
# gic_of_init / of_irq_init because the RPU-bus interrupt-controller
301304
# cannot be initialized from Linux on ZynqMP. Always benign; the
@@ -305,6 +308,14 @@ def require_hw_prereqs() -> None:
305308
"irqchip_init",
306309
"__primary_switched",
307310
"rpu-bus/interrupt-controller",
311+
# Stock Kuiper ZynqMP (ZCU102) probes these SoC peripherals from the
312+
# base DTS regardless of the overlay we merge in — the hardware is
313+
# either unconfigured (no DisplayPort monitor attached) or not wired
314+
# out on the board (no SATA). Match by device-node address so a
315+
# genuine regression on the same driver elsewhere still trips.
316+
"ffcb0000.watchdog", # Cadence WDT — unroutable clocks
317+
"fd4a0000.display", # ZynqMP DisplayPort — no monitor + DPMS pipe
318+
"fd0c0000.ahci", # Ceva AHCI/SATA — not routed on ZCU102
308319
)
309320

310321
# Hard-fail patterns — these indicate a genuine kernel fault.
@@ -346,6 +357,41 @@ def assert_no_kernel_faults(dmesg_txt: str) -> None:
346357
assert not bad, "Kernel fault(s) detected in dmesg:\n" + "\n".join(bad)
347358

348359

360+
# Driver-probe-failure patterns in dmesg. These appear when a probe()
361+
# callback returns a negative errno other than -EPROBE_DEFER (the defer
362+
# path is the normal retry-until-resolved dance and is allowlisted via
363+
# _DMESG_BENIGN_SUBSTRINGS above). Regex, not plain substrings —
364+
# ``probe of <dev> failed with error <N>`` is the canonical kernel
365+
# message. Overlay-apply errors fall in the same bucket because a
366+
# failed overlay almost always cascades into silent probe misses.
367+
_DMESG_PROBE_ERROR_PATTERNS = (
    r"probe of \S+ failed with error",
    r"Error applying overlay",
    r"failed to apply overlay",
    r"Error resolving",
)


def assert_no_probe_errors(dmesg_txt: str) -> None:
    """Raise ``AssertionError`` when *dmesg_txt* reports a driver-probe failure.

    Each line of *dmesg_txt* is checked independently.  A line is ignored
    when it contains any entry of :data:`_DMESG_BENIGN_SUBSTRINGS`;
    otherwise it is reported when any regex in
    :data:`_DMESG_PROBE_ERROR_PATTERNS` matches somewhere in it.

    This is the companion of :func:`assert_no_kernel_faults`: a probe or
    overlay failure does not have to produce a kernel fault, so it needs
    its own scan.

    Args:
        dmesg_txt: Kernel log text, one message per line.

    Raises:
        AssertionError: If at least one non-benign line matches a probe-error
            pattern.  The message lists every offending line.
    """
    probe_regexes = tuple(_re.compile(pattern) for pattern in _DMESG_PROBE_ERROR_PATTERNS)

    def _is_benign(entry: str) -> bool:
        # The allowlist is checked first so known boot noise never reaches
        # the regex scan.
        return any(token in entry for token in _DMESG_BENIGN_SUBSTRINGS)

    def _is_probe_error(entry: str) -> bool:
        return any(regex.search(entry) is not None for regex in probe_regexes)

    offenders = [
        entry
        for entry in dmesg_txt.splitlines()
        if not _is_benign(entry) and _is_probe_error(entry)
    ]
    assert not offenders, "Driver probe errors detected in dmesg:\n" + "\n".join(offenders)
393+
394+
349395
def shell_out(shell, cmd: str) -> str:
350396
"""Run *cmd* via an ``ADIShellDriver`` and return the output as a string.
351397
@@ -475,6 +521,133 @@ def assert_jesd_links_data(
475521
return rx_status, tx_status
476522

477523

524+
def assert_rx_capture_valid(
    ctx,
    device_candidates: str | tuple[str, ...],
    n_samples: int = 2**12,
    min_std: float = 1.0,
    context: str = "",
) -> dict:
    """Capture ``n_samples`` from an IIO device and verify data is flowing.

    Covers the "IIO device probed but no samples actually arrive" failure
    mode: the buffer comes back, but every sample is zero, or every sample
    is latched to one value.

    Asserts:

    - At least one RX channel is not all-zero.
    - At least one RX channel has a sample standard deviation of
      ``>= min_std`` raw LSBs (samples actually vary).

    Device selection: the first entry of *device_candidates* that is both
    present on *ctx* and exposes at least one non-output scan channel
    wins.  If none qualifies, the first device on *ctx* whose name starts
    with ``axi-`` / ``cf-`` or contains ``tpl`` and that has a non-output
    scan channel is used instead.

    Uses raw libiio (``iio.Buffer`` / ``Channel.read``) rather than a
    device-specific wrapper.

    Args:
        ctx: A live ``iio.Context``.
        device_candidates: IIO device name to capture from, or a tuple of
            candidate names tried in order.
        n_samples: buffer depth for the capture.
        min_std: threshold, in raw-LSB units, that the *largest*
            per-channel standard deviation must reach.  Only one channel
            has to clear it.
        context: tag appended to log and assertion-failure messages.

    Returns:
        ``dict`` mapping channel id → captured ``numpy.ndarray`` (the raw
        channel bytes viewed as ``int16``).

    Raises:
        AssertionError: no suitable device was found, the buffer refill
            timed out, every channel was all-zero, or no channel reached
            ``min_std``.
    """
    import iio
    import numpy as np

    suffix = f" ({context})" if context else ""
    candidates = (
        (device_candidates,) if isinstance(device_candidates, str) else tuple(device_candidates)
    )
    all_names = sorted(d.name for d in ctx.devices if d.name)

    def _rx_scan_channels(device):
        """Return the non-output scan channels of *device*."""
        return [c for c in device.channels if c.scan_element and not c.output]

    # Walk the candidate list until one is both present AND RX-buffered.
    # Stopping at the first name that merely exists would skip later,
    # valid candidates whenever an earlier one has no RX scan channels.
    dev = None
    for name in candidates:
        found = ctx.find_device(name)
        if found is not None and _rx_scan_channels(found):
            dev = found
            break

    if dev is None:
        # No named candidate is RX-buffered — fall back to the first
        # *AXI DMA frontend* on the context (name starts with ``axi-`` /
        # ``cf-`` or contains ``tpl``).  Control-plane devices like
        # ``ad9528`` or ``ad9371-phy`` may expose scan channels too, but
        # they aren't wired to an AXI-DMA and ``buf.refill()`` would just
        # time out on them.
        dev = next(
            (
                d
                for d in ctx.devices
                if d.name
                and (d.name.startswith(("axi-", "cf-")) or "tpl" in d.name)
                and _rx_scan_channels(d)
            ),
            None,
        )
    assert dev is not None, (
        f"No RX-buffered IIO device found{suffix}. "
        f"Tried: {list(candidates)}. Present: {all_names}"
    )

    # Non-empty by construction: both selection paths require it.
    scan_channels = _rx_scan_channels(dev)

    print(f"rx capture{suffix}: selected IIO device {dev.name!r}")
    per_channel: dict[str, np.ndarray] = {}
    buf = None
    try:
        for ch in scan_channels:
            ch.enabled = True
        buf = iio.Buffer(dev, n_samples, False)
        try:
            buf.refill()
        except TimeoutError as exc:
            # Re-raise as an assertion so the test report names the
            # stalled data path instead of a bare libiio timeout.
            raise AssertionError(
                f"Buffer refill timed out on {dev.name!r}{suffix} — "
                "AXI DMA is not delivering samples (JESD or DMA path "
                "stalled). Present devices: " + ", ".join(all_names)
            ) from exc
        for ch in scan_channels:
            # AXI ADC frontends emit signed int16 (or sign-extended
            # int14/int12); dtype=int16 is correct for every chip this
            # suite currently runs against.
            per_channel[ch.id] = np.frombuffer(ch.read(buf), dtype=np.int16)
    finally:
        if buf is not None:
            del buf
        for ch in scan_channels:
            # Best-effort cleanup: a failure to disable a channel must not
            # mask the original capture error (or a successful capture).
            try:
                ch.enabled = False
            except Exception:
                pass

    nonzero = [ch_id for ch_id, samples in per_channel.items() if samples.any()]
    assert nonzero, (
        f"All channels on {dev.name!r} returned zero samples{suffix} — "
        "JESD/DMA/clock path is likely stalled."
    )

    # Standard deviation of the samples themselves, promoted to float64.
    # Taking abs() first would measure the spread of the magnitudes: a
    # signal toggling between +A and -A would look constant, and
    # abs(int16(-32768)) wraps around.
    stds = {
        ch_id: float(samples.astype(np.float64).std())
        for ch_id, samples in per_channel.items()
    }
    max_std = max(stds.values())
    assert max_std >= min_std, (
        f"All channels on {dev.name!r} latched to a constant value{suffix} "
        f"(max |std|={max_std:.3g} < {min_std}) — data path stuck."
    )

    print(
        f"rx capture{suffix}: device={dev.name}, "
        f"{len(per_channel)} channel(s), {n_samples} samples, "
        f"non-zero={list(nonzero)}, max |std|={max_std:.2f}"
    )
    return per_channel
649+
650+
478651
def _kernel_cache_key(platform_arch: str, config_path: Path) -> str:
479652
"""Return a short sha256 over *platform_arch* and the config file bytes."""
480653
import hashlib

test/hw/test_ad9081_zcu102_system_hw.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
acquire_xsa,
4747
assert_jesd_links_data,
4848
assert_no_kernel_faults,
49+
assert_no_probe_errors,
50+
assert_rx_capture_valid,
4951
collect_dmesg,
5052
compile_dts_to_dtb,
5153
deploy_and_boot,
@@ -266,6 +268,7 @@ def test_ad9081_zcu102_system_hw(board, built_kernel_image_zynqmp, tmp_path):
266268

267269
# --- 9. Verify: kernel probe + IIO context + JESD DATA state ---
268270
assert_no_kernel_faults(dmesg_txt)
271+
assert_no_probe_errors(dmesg_txt)
269272
assert "AD9081 Rev." in dmesg_txt or "probed ADC AD9081" in dmesg_txt, (
270273
"AD9081 probe signature was not found in kernel dmesg output"
271274
)
@@ -292,3 +295,11 @@ def test_ad9081_zcu102_system_hw(board, built_kernel_image_zynqmp, tmp_path):
292295
)
293296
print(f"$ cat .../84a90000.axi?jesd204?rx/status\n{rx_status}")
294297
print(f"$ cat .../84b90000.axi?jesd204?tx/status\n{tx_status}")
298+
299+
# Data-path smoke test: capture a real buffer and verify samples flow.
300+
assert_rx_capture_valid(
301+
ctx,
302+
("axi-ad9081-rx-hpc", "ad_ip_jesd204_tpl_adc"),
303+
n_samples=2**12,
304+
context="ad9081 system",
305+
)

test/hw/test_ad9081_zcu102_xsa_hw.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
acquire_xsa,
3636
assert_jesd_links_data,
3737
assert_no_kernel_faults,
38+
assert_no_probe_errors,
39+
assert_rx_capture_valid,
3840
collect_dmesg,
3941
compile_dts_to_dtb,
4042
deploy_and_boot,
@@ -184,6 +186,7 @@ def test_ad9081_zcu102_xsa_hw(board, built_kernel_image_zynqmp, tmp_path):
184186

185187
# --- 7. Verify: kernel probe + IIO context + JESD DATA state ---
186188
assert_no_kernel_faults(dmesg_txt)
189+
assert_no_probe_errors(dmesg_txt)
187190
assert "AD9081 Rev." in dmesg_txt or "probed ADC AD9081" in dmesg_txt, (
188191
"AD9081 probe signature was not found in kernel dmesg output"
189192
)
@@ -210,3 +213,11 @@ def test_ad9081_zcu102_xsa_hw(board, built_kernel_image_zynqmp, tmp_path):
210213
)
211214
print(f"$ cat .../84a90000.axi?jesd204?rx/status\n{rx_status}")
212215
print(f"$ cat .../84b90000.axi?jesd204?tx/status\n{tx_status}")
216+
217+
# Data-path smoke test: capture a real buffer and verify samples flow.
218+
assert_rx_capture_valid(
219+
ctx,
220+
("axi-ad9081-rx-hpc", "ad_ip_jesd204_tpl_adc"),
221+
n_samples=2**12,
222+
context="ad9081 xsa",
223+
)

test/hw/test_adrv9009_zcu102_hw.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
acquire_xsa,
3939
assert_jesd_links_data,
4040
assert_no_kernel_faults,
41+
assert_no_probe_errors,
42+
assert_rx_capture_valid,
4143
collect_dmesg,
4244
compile_dts_to_dtb,
4345
deploy_and_boot,
@@ -214,6 +216,7 @@ def test_adrv9009_zcu102_hw(board, built_kernel_image_zynqmp, tmp_path):
214216

215217
# --- 7. Verify: kernel probe + IIO context + JESD DATA state ---
216218
assert_no_kernel_faults(dmesg_txt)
219+
assert_no_probe_errors(dmesg_txt)
217220
assert "adrv9009-phy" in dmesg_txt or "Talise" in dmesg_txt, (
218221
"ADRV9009 phy probe signature was not found in kernel dmesg output"
219222
)
@@ -238,6 +241,14 @@ def test_adrv9009_zcu102_hw(board, built_kernel_image_zynqmp, tmp_path):
238241
print(f"$ cat .../axi?jesd204?rx/status\n{rx_status}")
239242
print(f"$ cat .../axi?jesd204?tx/status\n{tx_status}")
240243

244+
# --- 8b. Data-path smoke test: capture a real RX buffer. ---
245+
assert_rx_capture_valid(
246+
ctx,
247+
("axi-adrv9009-rx-hpc", "axi-adrv9009-rx-obs-hpc"),
248+
n_samples=2**12,
249+
context="adrv9009 initial boot",
250+
)
251+
241252
# --- 9. Load all four canonical Talise filter profiles ---
242253
# The remote libiio write path drops its TCP socket when the driver
243254
# holds the CPU for Talise re-init, surfacing as ``BrokenPipeError``.
@@ -279,5 +290,7 @@ def test_adrv9009_zcu102_hw(board, built_kernel_image_zynqmp, tmp_path):
279290
# relock both links before re-reading sysfs status.
280291
time.sleep(3.0)
281292
assert_jesd_links_data(shell, context=f"after {filename}")
282-
assert_no_kernel_faults(shell_out(shell, "dmesg"))
293+
dmesg = shell_out(shell, "dmesg")
294+
assert_no_kernel_faults(dmesg)
295+
assert_no_probe_errors(dmesg)
283296
print(f" {filename}: RX+TX JESD DATA OK")

test/hw/test_fmcdaq3_vcu118_hw.py

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,34 +33,48 @@
3333
allow_module_level=True,
3434
)
3535

36+
from test.hw.hw_helpers import ( # noqa: E402
37+
DEFAULT_OUT_DIR,
38+
assert_no_kernel_faults,
39+
assert_no_probe_errors,
40+
assert_rx_capture_valid,
41+
collect_dmesg,
42+
open_iio_context,
43+
)
44+
3645

3746
@pytest.mark.lg_feature(["fmcdaq3", "vcu118"])
38-
def test_fmcdaq3_vcu118_boot_hw(target):
47+
def test_fmcdaq3_vcu118_boot_hw(board):
3948
"""Boot FMCDAQ3+VCU118 with the prebuilt Kuiper image and verify IIO."""
40-
shell = _boot_and_get_shell(target)
41-
_assert_probed_drivers(shell)
42-
_assert_iio_devices(shell)
49+
out_dir = DEFAULT_OUT_DIR
50+
out_dir.mkdir(parents=True, exist_ok=True)
51+
52+
board.transition("shell")
53+
shell = board.target.get_driver("ADIShellDriver")
4354

55+
dmesg_txt = collect_dmesg(
56+
shell,
57+
out_dir,
58+
label="fmcdaq3_vcu118",
59+
grep_pattern="ad9680|ad9152|ad9528|jesd204|probe|failed|error",
60+
)
61+
assert_no_kernel_faults(dmesg_txt)
62+
assert_no_probe_errors(dmesg_txt)
4463

45-
def _boot_and_get_shell(target):
46-
"""Drive ``BootFabric`` through ``powered_off`` → ``shell`` and return shell."""
47-
strategy = target.get_driver("Strategy")
48-
strategy.transition("powered_off")
49-
strategy.transition("shell")
50-
return target.get_driver("ADIShellDriver")
64+
lowered = dmesg_txt.lower()
65+
assert "ad9680" in lowered, "AD9680 driver messages not seen in dmesg"
66+
assert "ad9152" in lowered, "AD9152 driver messages not seen in dmesg"
5167

68+
_assert_iio_devices(shell)
5269

53-
def _assert_probed_drivers(shell) -> None:
54-
"""Fail unless dmesg shows AD9680 / AD9152 / AD9528 / JESD driver probes."""
55-
out = shell.run_check(
56-
"dmesg | grep -Ei 'ad9680|ad9152|ad9528|jesd204|fail|error' | tail -n 200; true"
70+
# Data-path smoke test: capture a real AD9680 RX buffer.
71+
ctx, _ = open_iio_context(shell)
72+
assert_rx_capture_valid(
73+
ctx,
74+
("axi-ad9680-core-lpc", "axi-ad9680-hpc", "axi-ad9680-rx-hpc"),
75+
n_samples=2**12,
76+
context="fmcdaq3 boot",
5777
)
58-
dmesg = "\n".join(out) if isinstance(out, list) else str(out)
59-
print("\n=== FMCDAQ3 probe-relevant dmesg ===")
60-
print(dmesg)
61-
print("====================================")
62-
assert "ad9680" in dmesg.lower(), "AD9680 driver messages not seen in dmesg"
63-
assert "ad9152" in dmesg.lower(), "AD9152 driver messages not seen in dmesg"
6478

6579

6680
def _assert_iio_devices(shell) -> None:

0 commit comments

Comments
 (0)