Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/vp8-encoder-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# VP8 encoder + decoder unit tests on all host OS images.
# Excludes:
# - Two tests that require System.Drawing/GDI+ to write BMP artifacts; migrate those to
#   ImageSharp (or similar) to run the full suite here.
name: vp8-encoder

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  vp8-unit-tests:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-dotnet@v4
        with:
          dotnet-version: '10.0.x'
      - name: Restore
        run: dotnet restore test/SIPSorcery.VP8.UnitTest/SIPSorcery.VP8.UnitTest.csproj
      - name: Test
        # The script below relies on bash line continuation ("\") and single-quote
        # quoting. Windows runners default to PowerShell, where "\" does not continue
        # a line, so pin bash explicitly to get identical behaviour on every OS image.
        shell: bash
        run: |
          dotnet test test/SIPSorcery.VP8.UnitTest/SIPSorcery.VP8.UnitTest.csproj -c Release -f net10.0 --no-restore --verbosity normal \
            --filter 'FullyQualifiedName!=Vpx.Net.UnitTest.VP8CodecUnitTest.DecodeKeyFrame&FullyQualifiedName!=Vpx.Net.UnitTest.vpx_decoder_unittest.DecodeKeyFrameFromFileTest'
3 changes: 2 additions & 1 deletion SIPSorcery.VP8.slnf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"path": "SIPSorcery.slnx",
"projects": [
"src\\SIPSorcery.VP8\\SIPSorcery.VP8.csproj",
"test\\SIPSorcery.VP8.UnitTest\\SIPSorcery.VP8.UnitTest.csproj"
"test\\SIPSorcery.VP8.UnitTest\\SIPSorcery.VP8.UnitTest.csproj",
"test\\SIPSorcery.VP8.Benchmarks\\SIPSorcery.VP8.Benchmarks.csproj"
]
}
}
1 change: 1 addition & 0 deletions SIPSorcery.slnx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
<Project Path="test/SIPSorcery.OpenAI.Realtime.UnitTest/SIPSorcery.OpenAI.Realtime.UnitTests.csproj" />
<Project Path="test/SIPSorcery.VP8.TestVectors/SIPSorcery.VP8.TestVectors.csproj" />
<Project Path="test/SIPSorcery.VP8.UnitTest/SIPSorcery.VP8.UnitTest.csproj" />
<Project Path="test/SIPSorcery.VP8.Benchmarks/SIPSorcery.VP8.Benchmarks.csproj" />
<Project Path="test/SIPSorceryMedia.Abstractions.UnitTest/SIPSorceryMedia.Abstractions.UnitTest.csproj" />
<Project Path="test/unit/SIPSorcery.UnitTests.csproj" />
<Project Path="test/VideoCaptureTest/VideoCaptureTest.csproj" />
Expand Down
71 changes: 71 additions & 0 deletions src/SIPSorcery.VP8/DcPredSumKernels.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
//-----------------------------------------------------------------------------
// SIMD byte-sum reductions used by the encoder's DC_PRED helpers
// (DcPred16x16 / DcPred8x8 in mb_encoder).
//
// Sse2: PSADBW against zero yields two unsigned 16-bit partial sums (one per
// 8-byte half) in lanes 0 and 4 of a Vector128<ushort>; their sum is the 16-byte sum.
// AdvSimd: zero-extend the bytes to ushort lanes, then AddAcross.
//
// These tiny reductions (3 calls/MB) are not a major perf target but match the
// plan's Tier 6 polish layer.
//
// Author: Claude Opus 4.7 (commissioned by Aaron Clauson).
//
// License: BSD 3-Clause "New" or "Revised" License, see included LICENSE.md file.
//-----------------------------------------------------------------------------

using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

namespace Vpx.Net
{
    /// <summary>
    /// Horizontal byte-sum helpers with SSE2, AdvSimd (Arm64) and scalar code paths.
    /// All three paths return the plain arithmetic sum of the bytes read.
    /// </summary>
    internal static unsafe class DcPredSumKernels
    {
        /// <summary>Sums the 16 bytes starting at <paramref name="p"/>.</summary>
        /// <param name="p">Pointer to at least 16 readable bytes; no alignment is required by the loads used.</param>
        /// <returns>The sum of the 16 bytes (0..4080).</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int Sum16(byte* p)
        {
            if (Sse2.IsSupported)
            {
                // PSADBW against zero: one 16-bit partial sum per 8-byte half (ushort lanes 0 and 4).
                Vector128<ushort> halves = Sse2.SumAbsoluteDifferences(Sse2.LoadVector128(p), Vector128<byte>.Zero);
                return halves.ToScalar() + halves.GetElement(4);
            }

            if (AdvSimd.Arm64.IsSupported)
            {
                Vector128<byte> bytes = AdvSimd.LoadVector128(p);

                // Widen both halves to ushort and add them lane-wise before the horizontal add.
                Vector128<ushort> pairSums = AdvSimd.Add(
                    AdvSimd.ZeroExtendWideningLower(bytes.GetLower()),
                    AdvSimd.ZeroExtendWideningUpper(bytes));
                return AdvSimd.Arm64.AddAcross(pairSums).ToScalar();
            }

            // Scalar fallback.
            int total = 0;
            byte* end = p + 16;
            while (p < end)
            {
                total += *p++;
            }
            return total;
        }

        /// <summary>Sums the 8 bytes starting at <paramref name="p"/>.</summary>
        /// <param name="p">Pointer to at least 8 readable bytes.</param>
        /// <returns>The sum of the 8 bytes (0..2040).</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int Sum8(byte* p)
        {
            if (Sse2.IsSupported)
            {
                // Only the low qword is loaded (upper half is zero), so lane 0 of the
                // PSADBW result already holds the complete sum.
                Vector128<byte> low = Sse2.LoadScalarVector128((long*)p).AsByte();
                return Sse2.SumAbsoluteDifferences(low, Vector128<byte>.Zero).ToScalar();
            }

            if (AdvSimd.Arm64.IsSupported)
            {
                Vector128<ushort> wide = AdvSimd.ZeroExtendWideningLower(AdvSimd.LoadVector64(p));
                return AdvSimd.Arm64.AddAcross(wide).ToScalar();
            }

            // Scalar fallback.
            int total = 0;
            byte* end = p + 8;
            while (p < end)
            {
                total += *p++;
            }
            return total;
        }
    }
}
149 changes: 149 additions & 0 deletions src/SIPSorcery.VP8/EncodeProfiler.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
//-----------------------------------------------------------------------------
// Optional high-resolution phase timers for encoder diagnostics. When
// EncodeProfiler.Enabled is true, scoped regions attribute wall-clock time
// to broad buckets (DCT, quantize, tokenize, etc.). When disabled, each scope
// costs only a thread-static flag check.
//-----------------------------------------------------------------------------

using System;
using System.Diagnostics;
using System.Text;

namespace Vpx.Net
{
    /// <summary>Encode hot-path phase buckets for <see cref="EncodeProfiler"/>.</summary>
    public enum Vp8EncodeProfilePhase
    {
        /// <summary>Copy and residual-subtract memory helpers.</summary>
        SimdMemOps,
        Fdct,
        Walsh,
        Quantize,
        Tokenize,
        Reconstruct,
        PackTokens,
        StitchLastFrame,
        /// <summary>Keyframe or inter: first-partition header plus 1056 coef-update flags (before MB phase 1).</summary>
        FirstPartitionHeader,
        /// <summary>Per-MB scalar context shuffle (9-byte above load/store) and EOB skip checks only — does not double-count <see cref="SimdMemOps"/>.</summary>
        Phase1MbScalarCtx,
        /// <summary>Phase-2 bit writing in partition 0 after phase 1 (skip probs, KF/inter modes, etc.) before token partition.</summary>
        Phase2FirstPartitionBits,
    }

    /// <summary>
    /// Per-thread encode timings accumulated in <see cref="Stopwatch"/> ticks. Set
    /// <see cref="Enabled"/> on the encoding thread, wrap regions in a <see cref="Scope"/>,
    /// then read <see cref="GetReport"/>, <see cref="GetScopedTotalTicks"/> or
    /// <see cref="FormatWallClockCompare"/> on that same thread.
    /// </summary>
    public static class EncodeProfiler
    {
        // Width of the label column in GetReport. Must be at least as long as the longest
        // bucket label ("Phase2FirstPartitionBits", 24 chars) or that row's value column shifts.
        private const int LabelWidth = 24;

        // ThreadStatic so BenchmarkDotNet host / parallel jobs cannot corrupt a single encode's totals.
        [ThreadStatic] private static long tSimdMemOps;
        [ThreadStatic] private static long tFdct;
        [ThreadStatic] private static long tWalsh;
        [ThreadStatic] private static long tQuantize;
        [ThreadStatic] private static long tTokenize;
        [ThreadStatic] private static long tReconstruct;
        [ThreadStatic] private static long tPack;
        [ThreadStatic] private static long tStitch;
        [ThreadStatic] private static long tFirstPartHdr;
        [ThreadStatic] private static long tPhase1ScalarCtx;
        [ThreadStatic] private static long tPhase2FirstPart;

        [ThreadStatic] private static bool tEnabled;

        /// <summary>When false (default), <see cref="Scope"/> no-ops on this thread.</summary>
        /// <remarks>Thread-local so parallel unit tests (or parallel encodes) do not inherit
        /// profiling from another thread.</remarks>
        public static bool Enabled
        {
            get => tEnabled;
            set => tEnabled = value;
        }

        /// <summary>Zeroes every bucket for the current thread. Does not change <see cref="Enabled"/>.</summary>
        public static void Reset()
        {
            tSimdMemOps = tFdct = tWalsh = tQuantize = tTokenize = tReconstruct = tPack = tStitch = 0;
            tFirstPartHdr = tPhase1ScalarCtx = tPhase2FirstPart = 0;
        }

        /// <summary>
        /// Timed region: captures a timestamp on construction (only if <see cref="Enabled"/> is
        /// true at that moment) and charges the elapsed ticks to its phase on <see cref="Dispose"/>.
        /// </summary>
        public readonly struct Scope : IDisposable
        {
            private readonly Vp8EncodeProfilePhase _phase;

            // Start timestamp; 0 doubles as the "profiling was disabled" sentinel, which also
            // makes default(Scope).Dispose() a no-op.
            private readonly long _t0;

            /// <summary>Starts timing <paramref name="phase"/> if profiling is enabled on this thread.</summary>
            public Scope(Vp8EncodeProfilePhase phase)
            {
                _phase = phase;
                if (!Enabled)
                {
                    _t0 = 0;
                    return;
                }

                _t0 = Stopwatch.GetTimestamp();
            }

            /// <summary>Adds the elapsed ticks to the phase bucket; no-op for an inactive scope.</summary>
            public void Dispose()
            {
                if (_t0 == 0) return;
                long dt = Stopwatch.GetTimestamp() - _t0;
                Add(_phase, dt);
            }
        }

        // Routes a tick delta to the thread-static bucket for the given phase.
        private static void Add(Vp8EncodeProfilePhase phase, long ticks)
        {
            switch (phase)
            {
                case Vp8EncodeProfilePhase.SimdMemOps: tSimdMemOps += ticks; break;
                case Vp8EncodeProfilePhase.Fdct: tFdct += ticks; break;
                case Vp8EncodeProfilePhase.Walsh: tWalsh += ticks; break;
                case Vp8EncodeProfilePhase.Quantize: tQuantize += ticks; break;
                case Vp8EncodeProfilePhase.Tokenize: tTokenize += ticks; break;
                case Vp8EncodeProfilePhase.Reconstruct: tReconstruct += ticks; break;
                case Vp8EncodeProfilePhase.PackTokens: tPack += ticks; break;
                case Vp8EncodeProfilePhase.StitchLastFrame: tStitch += ticks; break;
                case Vp8EncodeProfilePhase.FirstPartitionHeader: tFirstPartHdr += ticks; break;
                case Vp8EncodeProfilePhase.Phase1MbScalarCtx: tPhase1ScalarCtx += ticks; break;
                case Vp8EncodeProfilePhase.Phase2FirstPartitionBits: tPhase2FirstPart += ticks; break;
            }
        }

        /// <summary>Sum of all profiled bucket ticks (current thread). Nested MB work can make this exceed wall-clock.</summary>
        public static long GetScopedTotalTicks() =>
            tSimdMemOps + tFdct + tWalsh + tQuantize + tTokenize + tReconstruct + tPack + tStitch
            + tFirstPartHdr + tPhase1ScalarCtx + tPhase2FirstPart;

        /// <summary>One-line wall-clock vs scoped-sum (scoped may exceed wall when buckets overlap, e.g. SimdMemOps inside other work).</summary>
        /// <param name="wallMilliseconds">Externally measured wall-clock duration, in milliseconds.</param>
        public static string FormatWallClockCompare(double wallMilliseconds)
        {
            double scopedMs = GetScopedTotalTicks() * 1000.0 / Stopwatch.Frequency;
            return $"Wall-clock: {wallMilliseconds:F3} ms | Scoped sum: {scopedMs:F3} ms | Delta: {wallMilliseconds - scopedMs:F3} ms (negative => overlapping scopes or unscoped work)";
        }

        /// <summary>
        /// Human-readable breakdown, one row per bucket (current thread only). Values are
        /// converted from raw Stopwatch ticks to milliseconds; a percentage of the scoped
        /// total is appended when that total is positive.
        /// </summary>
        public static string GetReport()
        {
            long total = GetScopedTotalTicks();
            double ToMs(long t) => t * 1000.0 / Stopwatch.Frequency;
            var sb = new StringBuilder(384);
            sb.AppendLine($"Vp8EncodeProfiler (scoped sum {ToMs(total):F3} ms — sum of buckets; can exceed wall if scopes overlap)");
            void Line(string name, long t)
            {
                // Pad to the longest label so the value column lines up on every row.
                string label = name.PadRight(LabelWidth);
                if (total <= 0) sb.AppendLine($" {label} {ToMs(t):F3} ms");
                else sb.AppendLine($" {label} {ToMs(t):F3} ms ({100.0 * t / total:F1}%)");
            }
            Line("FirstPartitionHeader", tFirstPartHdr);
            Line("Phase1MbScalarCtx", tPhase1ScalarCtx);
            Line("Phase2FirstPartitionBits", tPhase2FirstPart);
            Line("SimdMemOps", tSimdMemOps);
            Line("Fdct", tFdct);
            Line("Walsh", tWalsh);
            Line("Quantize", tQuantize);
            Line("Tokenize", tTokenize);
            Line("Reconstruct", tReconstruct);
            Line("PackTokens", tPack);
            Line("StitchLastFrame", tStitch);
            return sb.ToString();
        }
    }
}
121 changes: 121 additions & 0 deletions src/SIPSorcery.VP8/EncoderMemoryOps.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
//-----------------------------------------------------------------------------
// Pluggable copy / residual helpers for the VP8 frame encoder. Two
// implementations (legacy nested loops vs span + SIMD) are swapped via
// IVp8FrameEncodePipeline — no per-MB feature flags.
//-----------------------------------------------------------------------------

using System;

namespace Vpx.Net
{
/// <summary>
/// Copy and residual-subtract primitives used by the VP8 frame encoder. The member
/// descriptions below follow the indexing of <see cref="LegacyEncoderMemoryOps"/>.
/// </summary>
internal interface IEncoderMemoryOps
{
/// <summary>
/// Pipeline-level capability flag. When false (Legacy pipeline),
/// every SIMD encoder kernel dispatcher in mb_encoder.cs falls
/// back to the scalar reference (fdct/idct/walsh/quantize/dcpred)
/// regardless of whether the host CPU supports the SIMD path.
/// When true (Optimized pipeline) the SIMD modules are used when
/// their CPU-feature gate is also satisfied.
/// </summary>
bool UseSimdEncoderKernels { get; }

/// <summary>
/// Copies a <paramref name="w"/> x <paramref name="h"/> rectangle whose top-left sample is
/// <c>src[baseOffset + y * srcStride + x]</c> into <paramref name="dst"/>, packed row-major
/// from index 0 with a row pitch of <paramref name="w"/>.
/// </summary>
void CopyPlaneRect(byte[] src, int baseOffset, int srcStride, int x, int y, int w, int h, byte[] dst);

/// <summary>
/// Copies <paramref name="count"/> bytes from <c>src[offset..]</c> to the start of <paramref name="dst"/>.
/// </summary>
void CopyRowAt(byte[] src, int offset, int count, byte[] dst);

/// <summary>
/// Gathers <paramref name="rows"/> vertically adjacent bytes: <c>dst[r] = src[(firstRow + r) * srcStride + columnIndex]</c>.
/// </summary>
void CopyColumn(byte[] src, int srcStride, int columnIndex, int firstRow, int rows, byte[] dst);

/// <summary>
/// Copies <paramref name="count"/> bytes starting at <c>src[srcRow * srcStride + srcCol]</c>
/// to <c>dst[dstOffset..]</c>.
/// </summary>
void CopyRowFrom2d(byte[] src, int srcStride, int srcRow, int srcCol, byte[] dst, int dstOffset, int count);

/// <summary>
/// Writes <c>dst[i] = src[i] - pred</c> for every index of <paramref name="src"/>
/// (a single predictor value for the whole block).
/// </summary>
void SubtractFlat(byte[] src, byte pred, short[] dst);

/// <summary>
/// Writes <c>dst[i] = src[i] - pred[i]</c> for every index of <paramref name="src"/>.
/// </summary>
void SubtractPerPixel(byte[] src, byte[] pred, short[] dst);
}

/// <summary>
/// Scalar reference implementation of <see cref="IEncoderMemoryOps"/>: plain element-by-element
/// loops, no profiling scopes, and <see cref="UseSimdEncoderKernels"/> fixed to false.
/// </summary>
internal sealed class LegacyEncoderMemoryOps : IEncoderMemoryOps
{
    /// <summary>Shared stateless instance.</summary>
    public static readonly LegacyEncoderMemoryOps Instance = new LegacyEncoderMemoryOps();

    private LegacyEncoderMemoryOps() { }

    /// <summary>Always false for the legacy implementation.</summary>
    public bool UseSimdEncoderKernels => false;

    /// <summary>Copies a w x h rectangle at (x, y) of a strided plane into <paramref name="dst"/>, packed with pitch w.</summary>
    public void CopyPlaneRect(byte[] src, int baseOffset, int srcStride, int x, int y, int w, int h, byte[] dst)
    {
        int writeAt = 0;
        for (int row = 0; row < h; row++)
        {
            // First source sample of this rectangle row.
            int rowBase = baseOffset + (y + row) * srcStride + x;
            for (int col = 0; col < w; col++)
            {
                dst[writeAt++] = src[rowBase + col];
            }
        }
    }

    /// <summary>Copies <paramref name="count"/> bytes from src[offset..] to dst[0..].</summary>
    public void CopyRowAt(byte[] src, int offset, int count, byte[] dst)
    {
        int readAt = offset;
        for (int writeAt = 0; writeAt < count; writeAt++, readAt++)
        {
            dst[writeAt] = src[readAt];
        }
    }

    /// <summary>Gathers <paramref name="rows"/> bytes down one column, starting at <paramref name="firstRow"/>.</summary>
    public void CopyColumn(byte[] src, int srcStride, int columnIndex, int firstRow, int rows, byte[] dst)
    {
        int readAt = firstRow * srcStride + columnIndex;
        for (int row = 0; row < rows; row++)
        {
            dst[row] = src[readAt];
            readAt += srcStride; // step one row down the plane
        }
    }

    /// <summary>Copies <paramref name="count"/> bytes from (srcRow, srcCol) of a strided plane to dst[dstOffset..].</summary>
    public void CopyRowFrom2d(byte[] src, int srcStride, int srcRow, int srcCol, byte[] dst, int dstOffset, int count)
    {
        int readBase = srcRow * srcStride + srcCol;
        for (int k = 0; k < count; k++)
        {
            dst[dstOffset + k] = src[readBase + k];
        }
    }

    /// <summary>Residual against a single predictor value: dst[i] = src[i] - pred.</summary>
    public void SubtractFlat(byte[] src, byte pred, short[] dst)
    {
        int n = src.Length;
        int i = 0;
        while (i < n)
        {
            dst[i] = (short)(src[i] - pred);
            i++;
        }
    }

    /// <summary>Residual against a per-sample predictor: dst[i] = src[i] - pred[i].</summary>
    public void SubtractPerPixel(byte[] src, byte[] pred, short[] dst)
    {
        int n = src.Length;
        int i = 0;
        while (i < n)
        {
            dst[i] = (short)(src[i] - pred[i]);
            i++;
        }
    }
}

/// <summary>
/// <see cref="IEncoderMemoryOps"/> implementation that forwards every operation to
/// <c>EncoderSimdKernels</c> and charges the call to the
/// <see cref="Vp8EncodeProfilePhase.SimdMemOps"/> profiler bucket.
/// </summary>
internal sealed class SpanSimdEncoderMemoryOps : IEncoderMemoryOps
{
    /// <summary>Shared stateless instance.</summary>
    public static readonly SpanSimdEncoderMemoryOps Instance = new SpanSimdEncoderMemoryOps();

    private SpanSimdEncoderMemoryOps() { }

    /// <summary>Always true for this implementation.</summary>
    public bool UseSimdEncoderKernels => true;

    /// <summary>Forwards to <c>EncoderSimdKernels.CopyPlaneRectSpan</c> inside a SimdMemOps scope.</summary>
    public void CopyPlaneRect(byte[] src, int baseOffset, int srcStride, int x, int y, int w, int h, byte[] dst)
    {
        using (new EncodeProfiler.Scope(Vp8EncodeProfilePhase.SimdMemOps))
        {
            EncoderSimdKernels.CopyPlaneRectSpan(src, baseOffset, srcStride, x, y, w, h, dst);
        }
    }

    /// <summary>Forwards to <c>EncoderSimdKernels.CopySpan</c> with a destination offset of 0.</summary>
    public void CopyRowAt(byte[] src, int offset, int count, byte[] dst)
    {
        using (new EncodeProfiler.Scope(Vp8EncodeProfilePhase.SimdMemOps))
        {
            EncoderSimdKernels.CopySpan(src, offset, dst, 0, count);
        }
    }

    /// <summary>Forwards to <c>EncoderSimdKernels.CopyColumn</c> inside a SimdMemOps scope.</summary>
    public void CopyColumn(byte[] src, int srcStride, int columnIndex, int firstRow, int rows, byte[] dst)
    {
        using (new EncodeProfiler.Scope(Vp8EncodeProfilePhase.SimdMemOps))
        {
            EncoderSimdKernels.CopyColumn(src, srcStride, columnIndex, firstRow, rows, dst);
        }
    }

    /// <summary>Flattens (srcRow, srcCol) to a linear source offset and forwards to <c>EncoderSimdKernels.CopySpan</c>.</summary>
    public void CopyRowFrom2d(byte[] src, int srcStride, int srcRow, int srcCol, byte[] dst, int dstOffset, int count)
    {
        using (new EncodeProfiler.Scope(Vp8EncodeProfilePhase.SimdMemOps))
        {
            int srcOffset = srcRow * srcStride + srcCol;
            EncoderSimdKernels.CopySpan(src, srcOffset, dst, dstOffset, count);
        }
    }

    /// <summary>Forwards to <c>EncoderSimdKernels.SubtractFlatToShort</c> inside a SimdMemOps scope.</summary>
    public void SubtractFlat(byte[] src, byte pred, short[] dst)
    {
        using (new EncodeProfiler.Scope(Vp8EncodeProfilePhase.SimdMemOps))
        {
            EncoderSimdKernels.SubtractFlatToShort(src, pred, dst);
        }
    }

    /// <summary>Forwards to <c>EncoderSimdKernels.SubtractPerPixelToShort</c> inside a SimdMemOps scope.</summary>
    public void SubtractPerPixel(byte[] src, byte[] pred, short[] dst)
    {
        using (new EncodeProfiler.Scope(Vp8EncodeProfilePhase.SimdMemOps))
        {
            EncoderSimdKernels.SubtractPerPixelToShort(src, pred, dst);
        }
    }
}
}
Loading
Loading