kiritigowda
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 77 additions & 18 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 77 additions & 18 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 61 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎scripts/merge_reports.py‎
Lines changed: 183 additions & 0 deletions b/‎scripts/merge_reports.py‎
Lines changed: 183 additions & 0 deletions
@@ -48,8 +48,16 @@ concurrency:
 #                     would dutifully report 19 SKIPPED rows, which is
 #                     accurate but uninformative noise on every run —
 #                     so we omit it.
-#   * Khronos sample — `vision,enhanced_vision,framework`. CTS-conformant
-#                     reference impl; ships both profiles.
+#   * Khronos sample — `vision,framework` + `enhanced_vision` split
+#                     across TWO invocations. CTS-conformant reference
+#                     impl, ships both profiles on paper, but the
+#                     sample-impl's enhanced_vision tensor kernels
+#                     (TensorAdd, TensorSub, ...) SIGSEGV inside
+#                     vxProcessGraph and take the whole bench process
+#                     down. The split (rock-solid set first, crash-
+#                     prone set second, merge via merge_reports.py)
+#                     guarantees we ALWAYS get vision+framework data
+#                     even when the enhanced_vision invocation dies.
 #   * rustVX        — `vision,enhanced_vision,framework`. CTS-conformant
 #                     for Vision (5923/5923) and Enhanced Vision
 #                     (1235/1235) per the rustVX README.
@@ -304,20 +312,47 @@ jobs:
 
       # Khronos sample is a CTS-conformant reference impl that ships
       # both the Vision (42 kernels) and Enhanced Vision (19 kernels)
-      # profiles, so we exercise `vision,enhanced_vision,framework` at
-      # smoke time. `continue-on-error: true` keeps the artifact upload
-      # alive if any specific kernel crashes mid-run; the comparison job
-      # downstream handles whichever JSON files actually got produced.
-      - name: Run smoke benchmark (vision + enhanced_vision + framework, VGA × 5 iters)
+      # profiles. In practice the enhanced_vision tensor kernels in
+      # the sample-impl are buggy at runtime — `TensorAdd` SIGSEGVs
+      # the moment we invoke `vxProcessGraph`, taking the whole bench
+      # process down with it and losing JSON output for every kernel
+      # that hadn't run yet (openvx-mark writes its report only at
+      # end-of-run).
+      #
+      # Workaround: split the smoke into TWO invocations along
+      # feature-set lines, each writing to its own output dir, then
+      # merge with scripts/merge_reports.py. The first invocation
+      # (vision + framework) is rock-solid and always produces a
+      # JSON. The second invocation (enhanced_vision) is the one that
+      # might crash — `|| echo …` keeps the step alive, and if it
+      # crashed, the merger silently skips its missing JSON. End
+      # result: we ALWAYS get the vision+framework smoke data, and
+      # we get enhanced_vision data when the sample impl cooperates.
+      - name: Run smoke benchmark (split: vision+framework, then enhanced_vision, VGA × 5 iters)
         continue-on-error: true
         run: |
           set -eo pipefail
           cd build-smoke
           export LD_LIBRARY_PATH=${{ steps.stage.outputs.lib_dir }}:${LD_LIBRARY_PATH:-}
           ./openvx-mark --validate-timing
-          ./openvx-mark --feature-set vision,enhanced_vision,framework \
+          # 1. Rock-solid set first — always produces a JSON.
+          ./openvx-mark --feature-set vision,framework \
             --resolution VGA --iterations 5 --warmup 1 --threads 1 \
-            --output-dir smoke-results
+            --output-dir smoke-results-base
+          # 2. Crash-prone set — `|| echo …` so the step survives a
+          #    SIGSEGV inside e.g. the Khronos sample's TensorAdd.
+          ./openvx-mark --feature-set enhanced_vision \
+            --resolution VGA --iterations 5 --warmup 1 --threads 1 \
+            --output-dir smoke-results-extra \
+            || echo "enhanced_vision smoke crashed (Khronos sample known issue) — vision results still saved"
+          # 3. Merge whichever JSONs survived into the final smoke
+          #    report. merge_reports.py handles the missing-input
+          #    case silently.
+          mkdir -p smoke-results
+          python3 ../scripts/merge_reports.py \
+            smoke-results-base/benchmark_results.json \
+            smoke-results-extra/benchmark_results.json \
+            --output smoke-results/benchmark_results.json
 
       - name: Upload Khronos sample artifact
         if: always()
@@ -718,13 +753,21 @@ jobs:
 
       - name: Build & bench against Khronos sample (single-threaded, FHD × 20)
         if: always() && steps.detect.outputs.khronos == 'true'
-        # `continue-on-error: true` so a crash inside a single
-        # enhanced_vision kernel (the reference impl has known per-
-        # kernel quirks under heavy use) doesn't take out the
-        # comparison signal for whichever kernels did complete.
-        # `openvx-mark` only writes its JSON at end-of-run, but the
-        # surrounding job steps still upload artifacts as long as we
-        # reach them.
+        # Khronos sample's enhanced_vision tensor kernels
+        # (TensorAdd, TensorSub, ...) SIGSEGV inside vxProcessGraph
+        # and take the whole bench process down — losing JSON output
+        # for every kernel that hadn't run yet (openvx-mark writes
+        # its report only at end-of-run). Same architecture as the
+        # split smoke step above: bench the rock-solid feature sets
+        # in their own invocation first (always produces a JSON),
+        # then bench enhanced_vision in a second invocation that's
+        # allowed to crash (`|| echo …`), and merge whichever JSONs
+        # survived into the final per-impl report consumed by the
+        # comparison phase downstream.
+        #
+        # `continue-on-error: true` is belt-and-suspenders — even if
+        # the merge step itself fails for some reason, the
+        # comparison job continues with whatever Khronos data it has.
         continue-on-error: true
         run: |
           set -eo pipefail
@@ -738,9 +781,25 @@ jobs:
           cmake --build . -j$(nproc)
           export LD_LIBRARY_PATH=${{ github.workspace }}/impl/khronos/lib:${LD_LIBRARY_PATH:-}
           ./openvx-mark --validate-timing
-          ./openvx-mark --feature-set vision,enhanced_vision,framework \
+          # 1. Rock-solid: vision (42 kernels) + framework benchmarks.
+          ./openvx-mark --feature-set vision,framework \
             --resolution FHD --iterations 20 --warmup 5 --threads 1 \
-            --output-dir results
+            --output-dir results-base
+          # 2. Crash-prone: enhanced_vision (19 kernels). Note the
+          #    Khronos sample's HOG and tensor support is patchy —
+          #    HOGCells / HOGFeatures graph_setup tends to fail
+          #    cleanly (SKIPPED) but tensor kernels often SIGSEGV.
+          ./openvx-mark --feature-set enhanced_vision \
+            --resolution FHD --iterations 20 --warmup 5 --threads 1 \
+            --output-dir results-extra \
+            || echo "enhanced_vision FHD bench crashed (Khronos sample known issue) — vision+framework results still saved"
+          # 3. Merge into the canonical `results/` dir the downstream
+          #    comparison phase expects.
+          mkdir -p results
+          python3 ../scripts/merge_reports.py \
+            results-base/benchmark_results.json \
+            results-extra/benchmark_results.json \
+            --output results/benchmark_results.json
           ./openvx-mark --dump-outputs dump-khronos --seed 42 || true
 
       - name: Build & bench against rustVX (single-threaded, FHD × 20)
 
@@ -6,6 +6,67 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ## [Unreleased]
 
+### Fixed — Khronos sample compatibility (verify_fns + CI split-and-merge)
+
+Three Khronos OpenVX-sample-impl issues surfaced once rustVX was
+fully green:
+
+- **`LaplacianPyramid_S16` / `LaplacianReconstruct_S16`** showed up
+  as `SKIPPED (vxVerifyGraph failed)` and `VERIFY FAILED`. The
+  Khronos sample rejects S16 LaplacianPyramid at vxVerifyGraph with
+  `VX_ERROR_INVALID_PARAMETERS` (-10) — a different error code than
+  AMD MIVisionX's `VX_ERROR_INVALID_FORMAT` (-14). The bench's
+  status accept-list already handled -14; added -10 so the
+  standalone `verify_fn` path agrees with the runner's bench-level
+  skip decision on both impls. Both code paths are spec-compliant
+  ways to express "impl gap — S16 input is not supported".
+
+- **`MatchTemplate`** showed up as `VERIFY FAILED`. The previous L2
+  test embedded a `250`-valued bright square in a `10`-valued dark
+  source, giving a per-pixel diff of 240 — the resulting L2 sum
+  saturates INT16 (`256 × 240² / 256 × 256 = 14.7M` ≫ 32767). The
+  saturation DIRECTION (positive clamp vs negative wraparound) is
+  impl-dependent — the Khronos sample's saturated cells came out
+  negative, so the bench's `argmin` search picked one of those
+  spurious negatives instead of the true match at (24, 24).
+  Switched the bench to a much smaller intensity delta (`100` vs
+  `110`, diff 10) so the L2 output stays well under INT16_MAX on
+  every impl. Also replaced the strict argmin search with a
+  structural "match cell value is notably smaller than two
+  far-away corner cells" check — less fragile across impls.
+
+### Changed — Khronos sample CI now splits bench into 2 invocations
+
+The OpenVX-sample-impl's enhanced_vision tensor kernels are buggy
+at runtime — `TensorAdd` SIGSEGVs inside `vxProcessGraph` and takes
+the entire bench process down, losing JSON output for every kernel
+that hadn't run yet (openvx-mark writes its report only at
+end-of-run).
+
+Fix in CI: split the Khronos sample bench step into TWO
+invocations, each writing to its own output dir, then merge with
+the new `scripts/merge_reports.py`:
+
+  1. **`vision,framework`** — rock-solid set; always produces a JSON.
+  2. **`enhanced_vision`**   — crash-prone set; `|| echo …` keeps
+                               the step alive when the impl SIGSEGVs.
+  3. **`merge_reports.py`**  — silently skips any missing input
+                               (the crashed-invocation case),
+                               produces a valid merged JSON from
+                               whichever invocations survived.
+
+End result: we ALWAYS get vision+framework data from Khronos
+sample (the data the downstream compare reports rely on), and we
+get enhanced_vision data on top when the sample impl cooperates.
+Applied to both Phase 1 smoke and Phase 2 FHD×20 comparison runs.
+
+`scripts/merge_reports.py` is a new utility — takes N openvx-mark
+JSON reports and concatenates their `results` arrays into one
+report with the original schema. The availability blocks
+(`feature_set_availability`, `kernel_availability`, `conformance`)
+are unioned per-key; `scores` and `scaling_analysis` come from the
+last input that has a non-empty value; every other top-level block
+(`system`, `openvx`, etc.) is taken from the first input. It's
+reusable for any future impl/setup where a single bench invocation
+can crash mid-run.
+
 ### Fixed — HOGFeatures vxProcessGraph UAF on raw-pointer params (rustVX)
 
 `HOGFeatures` was still failing on rustVX as `SKIPPED (vxProcessGraph
 
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""
+merge_reports.py — merge multiple openvx-mark / opencv-mark JSON
+benchmark reports produced by separate invocations into a single
+JSON file whose schema matches the original output format.
+
+WHY THIS EXISTS
+---------------
+openvx-mark writes its JSON report ONLY at the end of a successful
+benchmark run. If the linked OpenVX implementation crashes mid-run
+(SIGSEGV, hang, abort, etc.) we lose ALL the data for the benchmarks
+that hadn't been measured yet. The classic case is the Khronos sample
+implementation's enhanced_vision tensor kernels — its TensorAdd /
+TensorSub / TensorMul kernels have known buggy implementations that
+SIGSEGV the moment we invoke `vxProcessGraph`, taking the whole bench
+process down with them.
+
+Workaround: split a single bench invocation into N invocations along
+feature-set lines (e.g. `vision,framework` first, then `enhanced_vision`
+second), each writing to its own output directory. If the
+enhanced_vision invocation crashes, the vision+framework JSON is
+already on disk and the comparison report can be built from it.
+
+This script merges the JSONs from those split invocations back into
+one report whose schema matches what a single full invocation would
+have produced. The downstream comparison + summary scripts
+(compare_reports.py, three_way_summary.py, ci_pairwise_summary.py)
+then operate on the merged JSON without needing any awareness of the
+split.
+
+MERGE SEMANTICS
+---------------
+  * `results`                 : concat from every input (this is the
+                                per-bench measurements array — the
+                                main signal we care about).
+  * `feature_set_availability`,
+    `kernel_availability`,
+    `conformance`             : per-key union from every input. When a
+                                key (e.g. "vision") exists in only
+                                one input, take that value; when in
+                                multiple, take the most-permissive
+                                value (true > false).
+  * `scores`, `scaling_analysis`
+                              : pick from whichever input ran the
+                                relevant feature set. We prefer the
+                                LAST input that produced a non-empty
+                                value (CI orders feature-set runs
+                                most-likely-to-succeed first so the
+                                last successful one wins).
+  * everything else (`system`, `openvx`, `benchmark`, `build`,
+    `threading`, `timing_audit`, `config`)
+                              : taken from the first input, since
+                                these describe the test environment
+                                and don't differ between split runs
+                                of the same binary.
+
+USAGE
+-----
+  scripts/merge_reports.py \
+      results-vision/benchmark_results.json \
+      results-enhanced/benchmark_results.json \
+      --output merged/benchmark_results.json
+
+Any input that does not exist (e.g. because that invocation crashed
+before writing its JSON) is silently skipped. As long as at least
+ONE input exists, the merge produces a valid output.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+def _load_optional(path: Path) -> Optional[Dict[str, Any]]:
+    """Return parsed JSON or None if file missing/empty/malformed.
+
+    Empty/missing inputs are valid here — they mean "this invocation
+    crashed before writing", which is the case we exist to handle.
+    """
+    if not path.exists():
+        return None
+    try:
+        text = path.read_text()
+        if not text.strip():
+            return None
+        return json.loads(text)
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"WARNING: skipping {path}: {exc}", file=sys.stderr)
+        return None
+
+
+def _merge_bool_dict(a: Dict[str, Any], b: Dict[str, Any]) -> Dict[str, Any]:
+    """Merge two {string: bool} maps (or nested maps of bools).
+
+    Resolution rule: for any key present in both, OR the values so the
+    merged map reflects "this feature is available in AT LEAST one of
+    the invocations". For nested dicts, recurse.
+    """
+    out = dict(a)
+    for k, v in b.items():
+        if k not in out:
+            out[k] = v
+        elif isinstance(v, dict) and isinstance(out[k], dict):
+            out[k] = _merge_bool_dict(out[k], v)
+        elif isinstance(v, bool) and isinstance(out[k], bool):
+            out[k] = out[k] or v
+        else:
+            # Last-write-wins for non-bool/non-dict values; this
+            # branch is reached only for unexpected schema drift.
+            out[k] = v
+    return out
+
+
+def merge(reports: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Merge N benchmark JSON reports into one."""
+    assert reports, "merge() requires at least one report"
+
+    # Start from the first report — its `system`, `openvx`,
+    # `benchmark`, `build`, `threading`, `timing_audit`, `config`
+    # blocks describe the test environment and are taken verbatim.
+    merged: Dict[str, Any] = dict(reports[0])
+
+    # Concat the `results` arrays from every input (this is the main
+    # signal — per-bench measurements).
+    all_results: List[Any] = []
+    for r in reports:
+        all_results.extend(r.get("results", []))
+    merged["results"] = all_results
+
+    # Merge feature-set/kernel availability + conformance via union.
+    for key in ("feature_set_availability", "kernel_availability", "conformance"):
+        merged_val: Dict[str, Any] = {}
+        for r in reports:
+            v = r.get(key)
+            if isinstance(v, dict):
+                merged_val = _merge_bool_dict(merged_val, v)
+        if merged_val:
+            merged[key] = merged_val
+
+    # Scores + scaling_analysis: prefer the LAST input that produced a
+    # non-empty value (most-recent-successful-run wins).
+    for key in ("scores", "scaling_analysis"):
+        for r in reversed(reports):
+            v = r.get(key)
+            if v:
+                merged[key] = v
+                break
+
+    return merged
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("inputs", nargs="+", type=Path,
+                        help="One or more openvx-mark JSON report files. "
+                             "Files that don't exist are silently skipped "
+                             "(useful when an invocation crashed before "
+                             "writing).")
+    parser.add_argument("--output", "-o", required=True, type=Path,
+                        help="Path to write the merged JSON.")
+    args = parser.parse_args()
+
+    reports = [r for r in (_load_optional(p) for p in args.inputs) if r]
+    if not reports:
+        print("ERROR: no valid input reports — every input file was "
+              "missing/empty/malformed.", file=sys.stderr)
+        return 1
+
+    merged = merge(reports)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    args.output.write_text(json.dumps(merged, indent=2))
+    print(f"merged {len(reports)} report(s) into {args.output} "
+          f"({len(merged.get('results', []))} total benchmark rows)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())