Skip to content

mem: free intermediate arrays during YoloX inference #483

Open
KRRT7 wants to merge 2 commits into Unstructured-IO:main from
KRRT7:mem/free-intermediates-yolox-inference
Open

mem: free intermediate arrays during YoloX inference #483
KRRT7 wants to merge 2 commits into Unstructured-IO:main from
KRRT7:mem/free-intermediates-yolox-inference

Conversation

@KRRT7
Copy link

@KRRT7 KRRT7 commented Mar 19, 2026

Free origin_img, img/ort_inputs, and output at the points where they become dead in image_processing(), instead of letting them linger until function return.

The biggest win is origin_img — the full-resolution numpy copy of the input PIL image — which currently stays alive through the entire ONNX session.run() call. Savings are proportional to image size: larger pages (higher DPI renders) carry a bigger unused array through inference.

Benchmark

Measured with memray (memray run + memray stats --json), 3 iterations per approach, on Apple M3 Max / Python 3.12. ONNX inference workspace simulated as a 35 MiB allocation.

bench_free_intermediates

Free intermediates in YoloX image_processing()
Simulated ONNX workspace: 35 MiB  |  3 iterations  |  Python 3.12.12

Image size                         Baseline  Optimized      Saved      %
--------------------------------------------------------------------------
612x792 (fast.pdf)                   56.1MB     54.7MB      1.4MB   2.5%
1700x2200 (letter@200dpi)            77.8MB     67.1MB     10.7MB  13.8%
2550x3300 (letter@300dpi)           109.1MB     89.2MB     19.9MB  18.2%

At the default 200 DPI render resolution (1700×2200 for US Letter), this frees ~11 MiB of dead weight before ONNX inference. Zero behavior change — just earlier cleanup of arrays that are never read again.

Reproduce

pip install memray numpy pillow opencv-python-headless plotly kaleido
python benchmarks/bench_free_intermediates.py --runs 3 --report
benchmarks/bench_free_intermediates.py
"""Benchmark: freeing intermediate arrays during YoloX image_processing().

Measures how much memory is saved by explicitly deleting origin_img,
img/ort_inputs, and output at the points where they become dead,
instead of letting them linger until function return.

The savings are proportional to image size -- larger pages (higher DPI)
benefit more because origin_img (the full-resolution numpy copy of the
input PIL image) remains alive through ONNX inference in the baseline code.

Uses memray for accurate measurement of all allocations (Python + native).

Usage:
    pip install memray numpy pillow opencv-python-headless
    python benchmarks/bench_free_intermediates.py
    python benchmarks/bench_free_intermediates.py --runs 5
    python benchmarks/bench_free_intermediates.py --report [PATH]
"""

from __future__ import annotations

import argparse
import gc
import json
import subprocess
import sys
import tempfile
import textwrap
from pathlib import Path

# Model input size (height, width) used by the simulated YoloX preprocess.
INPUT_SHAPE = (1024, 768)
# Size of the simulated ONNX Runtime inference workspace allocation, in MiB.
WORKSPACE_MIB = 35
# Benchmark matrix: (label, width, height) of the synthetic input image.
# Larger pages show bigger savings because origin_img (the dead array kept
# alive through inference in the baseline) scales with image size.
IMAGE_SIZES = [
    ("612x792 (fast.pdf)", 612, 792),
    ("1700x2200 (letter@200dpi)", 1700, 2200),
    ("2550x3300 (letter@300dpi)", 2550, 3300),
]


def _build_script(approach: str, width: int, height: int, runs: int) -> str:
    """Build the source of a standalone script to be profiled under memray.

    The generated script mimics YoloX's ``image_processing()`` in two
    variants: ``baseline`` lets intermediates (origin_img, img, ort_input)
    linger until function return, while ``optimized`` dels each one at the
    point where it becomes dead.  The ONNX inference workspace is simulated
    by a single WORKSPACE_MIB allocation.

    Args:
        approach: ``"baseline"`` selects the baseline variant; any other
            value selects the optimized variant.
        width: synthetic input image width in pixels.
        height: synthetic input image height in pixels.
        runs: number of iterations the generated script performs.

    Returns:
        The complete script source as a string.
    """
    return textwrap.dedent(f"""\
        import gc, cv2, numpy as np
        from PIL import Image
        INPUT_SHAPE = {INPUT_SHAPE}
        WORKSPACE_MIB = {WORKSPACE_MIB}
        def preprocess(img, input_size=INPUT_SHAPE, swap=(2, 0, 1)):
            if len(img.shape) == 3:
                padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
            else:
                padded_img = np.ones(input_size, dtype=np.uint8) * 114
            r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
            r = min(int(img.shape[0] * r) / img.shape[0], int(img.shape[1] * r) / img.shape[1])
            ns = (int(img.shape[0] * r), int(img.shape[1] * r))
            padded_img[:ns[0], :ns[1]] = cv2.resize(
                img, (ns[1], ns[0]), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
            return np.ascontiguousarray(padded_img.transpose(swap), dtype=np.float32), r
        def baseline(image):
            origin_img = np.array(image)
            img, ratio = preprocess(origin_img)
            ort_input = img[None, :, :, :]
            workspace = np.empty(int(WORKSPACE_MIB * 1024 * 1024 / 4), dtype=np.float32)
            output = np.zeros((1, 100, 6), dtype=np.float32)
            del workspace
            return output[0]
        def optimized(image):
            origin_img = np.array(image)
            img, ratio = preprocess(origin_img)
            del origin_img
            ort_input = img[None, :, :, :]
            workspace = np.empty(int(WORKSPACE_MIB * 1024 * 1024 / 4), dtype=np.float32)
            del img, ort_input
            output = np.zeros((1, 100, 6), dtype=np.float32)
            del workspace
            return output[0]
        image = Image.fromarray(np.random.randint(0, 255, ({height}, {width}, 3), dtype=np.uint8))
        gc.collect()
        for _ in range({runs}):
            {"baseline" if approach == "baseline" else "optimized"}(image)
            gc.collect()
    """)


def _run_memray(script_body: str) -> dict:
    """Profile *script_body* under memray and return the parsed stats JSON.

    The script is executed in a subprocess via ``memray run`` (tracking both
    Python-level and native allocations), then ``memray stats --json``
    converts the capture file into a JSON document.

    Args:
        script_body: source code of the benchmark script to profile.

    Returns:
        The decoded ``memray stats`` JSON document (a dict with a
        ``metadata`` section containing ``peak_memory``).

    Raises:
        subprocess.CalledProcessError: if either memray invocation fails.
    """
    # A private temporary directory replaces the deprecated and race-prone
    # tempfile.mktemp() calls, and guarantees cleanup of all three files
    # even when a memray invocation raises.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = Path(tmpdir)
        script_path = tmp / "bench_script.py"
        script_path.write_text(script_body)
        bin_path = tmp / "memray.bin"
        json_path = tmp / "stats.json"
        subprocess.run(
            [sys.executable, "-m", "memray", "run",
             "--trace-python-allocators", "--native", "-o", str(bin_path),
             str(script_path)],
            capture_output=True, check=True)
        subprocess.run(
            [sys.executable, "-m", "memray", "stats",
             "--json", "-n", "30", "-o", str(json_path), str(bin_path)],
            capture_output=True, check=True)
        return json.loads(json_path.read_text())

def _peak_mib(stats):
    return stats["metadata"]["peak_memory"] / (1024 * 1024)


def generate_report(results: list[dict], runs: int, output: str) -> None:
    """Render a grouped bar chart comparing baseline vs optimized peaks.

    Args:
        results: one dict per image size with keys ``label``, ``baseline``,
            ``optimized``, ``saved`` (MiB) and ``pct`` (percent saved).
        runs: number of iterations per approach (shown in the subtitle).
        output: path of the PNG image to write (requires plotly + kaleido).
    """
    # Imported lazily so the benchmark itself has no hard plotly dependency.
    import plotly.graph_objects as go
    # Unpack per-size series for the two bar traces and the annotations.
    sizes = [r["label"] for r in results]
    bl = [r["baseline"] for r in results]
    opt = [r["optimized"] for r in results]
    saved = [r["saved"] for r in results]
    pct = [r["pct"] for r in results]
    fig = go.Figure()
    fig.add_trace(go.Bar(name="Baseline", x=sizes, y=bl, marker_color="#94a3b8",
        text=[f"{v:.1f} MB" for v in bl], textposition="inside",
        insidetextanchor="middle", textfont=dict(size=13, color="white")))
    fig.add_trace(go.Bar(name="With del statements", x=sizes, y=opt, marker_color="#6366f1",
        text=[f"{v:.1f} MB" for v in opt], textposition="inside",
        insidetextanchor="middle", textfont=dict(size=13, color="white")))
    # One "-X MB (Y%)" savings annotation above each pair of bars.
    for i, (s, p) in enumerate(zip(saved, pct)):
        fig.add_annotation(x=sizes[i], y=max(bl[i], opt[i]) + 12,
            text=f"<b>-{s:.1f} MB ({p:.0f}%)</b>", showarrow=False,
            font=dict(size=13, color="#dc2626"))
    fig.update_layout(
        template="simple_white", paper_bgcolor="white", plot_bgcolor="white",
        font=dict(family="Inter, sans-serif", color="#374151"),
        title=dict(text=(
            "<b>Free intermediate arrays during YoloX inference</b>"
            f'<br><span style="font-size:11px;color:#9ca3af">'
            f"Simulated ONNX workspace: {WORKSPACE_MIB} MiB  |  "
            f"{runs} iterations  |  Python {sys.version.split()[0]}  |  memray</span>"),
            x=0.5, xanchor="center", font=dict(size=15)),
        barmode="group", bargap=0.25, bargroupgap=0.1, showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
        # Headroom above the tallest baseline bar keeps the annotations visible.
        yaxis=dict(title="Peak memory (MB)", range=[0, max(bl) * 1.3], gridcolor="#f3f4f6"),
        xaxis=dict(title="Input image size"),
        margin=dict(l=60, r=40, t=100, b=70), height=480, width=700)
    # scale=2 doubles the raster resolution of the exported PNG.
    fig.write_image(output, scale=2)
    print(f"  Report: {output}")


def main() -> None:
    """Run the benchmark matrix, print a comparison table, and optionally
    render a bar-chart report.

    For each image size in IMAGE_SIZES, both the baseline and the optimized
    variants are profiled in separate memray subprocesses and their peak
    memory compared.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--runs", type=int, default=3)
    # `--report` with no value uses const=; omitting the flag leaves None.
    parser.add_argument("--report", nargs="?", const="benchmarks/bench_free_intermediates.png",
        metavar="PATH")
    args = parser.parse_args()
    # Plain string: the original f-string had no placeholders (ruff F541).
    print("Free intermediates in YoloX image_processing()")
    print(f"Simulated ONNX workspace: {WORKSPACE_MIB} MiB  |  "
          f"{args.runs} iterations  |  Python {sys.version.split()[0]}")
    print()
    print(f"{'Image size':<32} {'Baseline':>10} {'Optimized':>10} {'Saved':>10} {'%':>6}")
    print("-" * 74)
    results = []
    for label, w, h in IMAGE_SIZES:
        # Each variant runs in its own subprocess so allocator state from
        # one measurement cannot bleed into the other.
        bp = _peak_mib(_run_memray(_build_script("baseline", w, h, args.runs)))
        op = _peak_mib(_run_memray(_build_script("optimized", w, h, args.runs)))
        # Guard against a zero baseline peak when computing the percentage.
        s, p = bp - op, ((bp - op) / bp * 100) if bp > 0 else 0
        # Two-line chart label: "WxH<br>(suffix)" when the label has a paren note.
        chart_label = f"{w}x{h}<br>({label.split('(')[1]}" if "(" in label else f"{w}x{h}"
        results.append(dict(label=chart_label, baseline=bp, optimized=op, saved=s, pct=p))
        print(f"{label:<32} {bp:>8.1f}MB {op:>8.1f}MB {s:>8.1f}MB {p:>5.1f}%")
    print()
    print("Savings come from freeing origin_img (full image array) before ONNX inference.")
    if args.report is not None:
        generate_report(results, runs=args.runs, output=args.report)


# Script entry point: only run the benchmark when executed directly.
if __name__ == "__main__":
    main()

@KRRT7 KRRT7 force-pushed the mem/free-intermediates-yolox-inference branch 2 times, most recently from 3837285 to 9025807 Compare March 19, 2026 06:58
Delete origin_img, img/ort_inputs, and output at the points where they
become dead instead of letting them linger until function return.

The biggest win is origin_img — the full-resolution numpy copy of the
input PIL image — which stays alive through ONNX inference in the
current code. Savings are proportional to image size.
@KRRT7 KRRT7 force-pushed the mem/free-intermediates-yolox-inference branch from e42e101 to 4bfd7c4 Compare March 19, 2026 07:01
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant