Skip to content

Commit 9968f6d

Browse files
committed
Check asm
1 parent 9aefb71 commit 9968f6d

File tree

3 files changed

+229
-5
lines changed

3 files changed

+229
-5
lines changed

src/api/internal.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ impl<T: Pixel> ContextInner<T> {
300300
enc.height,
301301
enc.bit_depth as u8,
302302
enc.chroma_sampling,
303+
CpuFeatureLevel::default(),
303304
))
304305
} else {
305306
None

src/denoise.rs renamed to src/denoise/mod.rs

Lines changed: 101 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use crate::api::FrameQueue;
2+
use crate::cpu_features::CpuFeatureLevel;
23
use crate::EncoderStatus;
34
use arrayvec::ArrayVec;
5+
use cfg_if::cfg_if;
46
use ndarray::{Array3, ArrayView3, ArrayViewMut3};
57
use ndrustfft::{
68
ndfft, ndfft_r2c, ndifft, ndifft_r2c, Complex, FftHandler, R2cFftHandler,
@@ -26,6 +28,9 @@ const COMPLEX_COUNT: usize = (SB_SIZE / 2 + 1) * SB_SIZE * TB_SIZE;
2628
const CCNT2: usize = COMPLEX_COUNT * 2;
2729
const INC: usize = SB_SIZE - SO_SIZE;
2830

31+
#[cfg(nasm_x86_64)]
32+
mod x86;
33+
2934
/// This denoiser is based on the DFTTest plugin from Vapoursynth.
3035
/// This type of denoising was chosen because it provides
3136
/// high quality while not being too slow.
@@ -43,6 +48,8 @@ where
4348
pub(crate) cur_frameno: u64,
4449

4550
denoiser: Box<dyn Denoiser<T>>,
51+
#[cfg(feature = "check_asm")]
52+
rust_denoiser: DftDenoiserRust<T>,
4653
}
4754

4855
impl<T> DftDenoiser<T>
@@ -52,7 +59,7 @@ where
5259
// This should only need to run once per video.
5360
pub fn new(
5461
sigma: f32, width: usize, height: usize, bit_depth: u8,
55-
chroma_sampling: ChromaSampling,
62+
chroma_sampling: ChromaSampling, cpu: CpuFeatureLevel,
5663
) -> Self {
5764
if size_of::<T>() == 1 {
5865
assert!(bit_depth <= 8);
@@ -81,20 +88,62 @@ where
8188
effective_heights.push(e_h);
8289
}
8390

84-
let denoiser: Box<dyn Denoiser<T>> = Box::new(DftDenoiserRust::new(
91+
#[cfg(feature = "check_asm")]
92+
let rust_denoiser = DftDenoiserRust::new(
8593
sigma,
8694
dest_scale,
8795
src_scale,
8896
peak,
89-
pad_dimensions,
90-
effective_heights,
91-
));
97+
pad_dimensions.clone(),
98+
effective_heights.clone(),
99+
);
100+
101+
cfg_if! {
102+
if #[cfg(nasm_x86_64)] {
103+
let denoiser: Box<dyn Denoiser<T>> = match cpu {
104+
CpuFeatureLevel::AVX512ICL |
105+
CpuFeatureLevel::AVX512 |
106+
CpuFeatureLevel::AVX2 => {
107+
// SAFETY: This is safe because we verified that AVX2 is enabled.
108+
Box::new(unsafe {
109+
x86::DftDenoiserAvx2::new(
110+
sigma,
111+
dest_scale,
112+
src_scale,
113+
peak,
114+
pad_dimensions,
115+
effective_heights,
116+
)
117+
})
118+
},
119+
_ => Box::new(DftDenoiserRust::new(
120+
sigma,
121+
dest_scale,
122+
src_scale,
123+
peak,
124+
pad_dimensions,
125+
effective_heights,
126+
))
127+
};
128+
} else {
129+
let denoiser: Box<dyn Denoiser<T>> = Box::new(DftDenoiserRust::new(
130+
sigma,
131+
dest_scale,
132+
src_scale,
133+
peak,
134+
pad_dimensions,
135+
effective_heights,
136+
));
137+
}
138+
}
92139

93140
DftDenoiser {
94141
chroma_sampling,
95142
frame_buffer: VecDeque::with_capacity(TB_MIDPOINT),
96143
cur_frameno: 0,
97144
denoiser,
145+
#[cfg(feature = "check_asm")]
146+
rust_denoiser,
98147
}
99148
}
100149

@@ -173,6 +222,53 @@ where
173222
}
174223
self.denoiser.do_filtering(&pad, &mut dest);
175224

225+
#[cfg(feature = "check_asm")]
226+
{
227+
let mut rust_dest = (**orig_frame).clone();
228+
let mut pad = ArrayVec::<_, TB_SIZE>::new();
229+
for i in 0..TB_SIZE {
230+
let dec = self.chroma_sampling.get_decimation().unwrap_or((0, 0));
231+
let mut pad_frame = [
232+
Plane::new(
233+
self.rust_denoiser.pad_dimensions(0).0,
234+
self.rust_denoiser.pad_dimensions(0).1,
235+
0,
236+
0,
237+
0,
238+
0,
239+
),
240+
Plane::new(
241+
self.rust_denoiser.pad_dimensions(1).0,
242+
self.rust_denoiser.pad_dimensions(1).1,
243+
dec.0,
244+
dec.1,
245+
0,
246+
0,
247+
),
248+
Plane::new(
249+
self.rust_denoiser.pad_dimensions(2).0,
250+
self.rust_denoiser.pad_dimensions(2).1,
251+
dec.0,
252+
dec.1,
253+
0,
254+
0,
255+
),
256+
];
257+
258+
let frame = frames.get(&i).unwrap_or(&frames[&TB_MIDPOINT]);
259+
self.rust_denoiser.copy_pad(frame, &mut pad_frame);
260+
pad.push(pad_frame);
261+
}
262+
self.rust_denoiser.do_filtering(&pad, &mut rust_dest);
263+
264+
for p in 0..3 {
265+
assert_eq!(
266+
dest.planes[p].iter().collect::<Vec<_>>(),
267+
rust_dest.planes[p].iter().collect::<Vec<_>>()
268+
);
269+
}
270+
}
271+
176272
if self.frame_buffer.len() == TB_MIDPOINT {
177273
self.frame_buffer.pop_front();
178274
}

src/denoise/x86.rs

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
use arrayvec::ArrayVec;
2+
use ndrustfft::{Complex, FftHandler, R2cFftHandler};
3+
use v_frame::pixel::Pixel;
4+
5+
use super::{
6+
create_window, Denoiser, BLOCK_VOLUME, CCNT2, COMPLEX_COUNT, SB_SIZE,
7+
TB_SIZE,
8+
};
9+
10+
/// State for the AVX2 variant of the DFT denoiser.
///
/// Every field is filled in by `DftDenoiserAvx2::new` and exposed read-only
/// (or, for `fft`, mutably) through the `Denoiser` trait accessors
/// implemented for this type.
pub(super) struct DftDenoiserAvx2<T>
11+
where
12+
T: Pixel,
13+
{
14+
// Stored verbatim from the constructor argument; returned by `dest_scale()`.
dest_scale: f32,
15+
// Stored verbatim from the constructor argument; returned by `src_scale()`.
src_scale: f32,
16+
// Stored verbatim from the constructor argument; returned by `peak()`.
// NOTE(review): presumably the maximum pixel value for the bit depth — confirm against the caller.
peak: T,
17+
18+
// These indices refer to planes of the input
19+
// One `(usize, usize)` pair per plane, looked up by plane index.
// NOTE(review): the order of the pair (width/height) is not visible here — confirm against the caller.
pad_dimensions: ArrayVec<(usize, usize), 3>,
20+
// One value per plane, looked up by plane index.
effective_heights: ArrayVec<usize, 3>,
21+
22+
// Window coefficients produced by `create_window()`.
hw: [f32; BLOCK_VOLUME],
23+
// Output of `real_to_complex_3d` applied to the window scaled by 255.0.
dftgc: [Complex<f32>; COMPLEX_COUNT],
24+
// FFT handlers, in order: real-to-complex of length `SB_SIZE`,
// complex of length `SB_SIZE`, complex of length `TB_SIZE`.
fft: (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>),
25+
// Uniformly filled with `sigma / wscale` by the constructor.
sigmas: [f32; CCNT2],
26+
}
27+
28+
impl<T> DftDenoiserAvx2<T>
29+
where
30+
T: Pixel,
31+
{
32+
/// Builds the denoiser state: window, FFT handlers, per-coefficient sigmas,
/// and the transformed window (`dftgc`).
///
/// `dest_scale`, `src_scale`, `peak`, `pad_dimensions` and
/// `effective_heights` are stored unchanged. `sigma` is only used to fill
/// the `sigmas` table.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "avx2")]`.
/// The caller must ensure the running CPU supports AVX2 before calling it.
#[target_feature(enable = "avx2")]
33+
pub unsafe fn new(
34+
sigma: f32, dest_scale: f32, src_scale: f32, peak: T,
35+
pad_dimensions: ArrayVec<(usize, usize), 3>,
36+
effective_heights: ArrayVec<usize, 3>,
37+
) -> Self {
38+
let hw = create_window();
39+
// Real-valued input for the transform below: the window scaled by 255.0.
let mut dftgr = [0f32; BLOCK_VOLUME];
40+
41+
let fft = (
42+
R2cFftHandler::new(SB_SIZE),
43+
FftHandler::new(SB_SIZE),
44+
FftHandler::new(TB_SIZE),
45+
);
46+
47+
// Accumulates the sum of squared window coefficients.
let mut wscale = 0.0f32;
48+
for k in 0..BLOCK_VOLUME {
49+
dftgr[k] = 255.0 * hw[k];
50+
wscale += hw[k].powi(2);
51+
}
52+
// Shadowed with the reciprocal of that sum.
let wscale = 1.0 / wscale;
53+
54+
let mut sigmas = [0f32; CCNT2];
55+
// Dividing by the reciprocal: every entry equals `sigma * sum(hw[k]^2)`.
sigmas.fill(sigma / wscale);
56+
57+
// `dftgc` starts as a zeroed placeholder because `real_to_complex_3d` is
// called on the constructed value; the real contents are assigned below.
let mut denoiser = DftDenoiserAvx2 {
58+
dest_scale,
59+
src_scale,
60+
peak,
61+
pad_dimensions,
62+
effective_heights,
63+
hw,
64+
fft,
65+
sigmas,
66+
dftgc: [Complex::default(); COMPLEX_COUNT],
67+
};
68+
69+
let mut dftgc = [Complex::default(); COMPLEX_COUNT];
70+
// NOTE(review): `real_to_complex_3d` is not defined in this file; presumably
// a provided method of the `Denoiser` trait — confirm.
denoiser.real_to_complex_3d(&dftgr, &mut dftgc);
71+
denoiser.dftgc = dftgc;
72+
73+
denoiser
74+
}
75+
}
76+
77+
// `Denoiser` implementation for the AVX2 variant. Every method here is a
// plain accessor over the fields populated by `DftDenoiserAvx2::new`.
impl<T> Denoiser<T> for DftDenoiserAvx2<T>
78+
where
79+
T: Pixel,
80+
{
81+
/// Returns the stored dimension pair for `plane`.
///
/// Panics if `plane` is out of range for the stored list.
#[inline(always)]
82+
fn pad_dimensions(&self, plane: usize) -> (usize, usize) {
83+
self.pad_dimensions[plane]
84+
}
85+
86+
/// Returns the stored effective height for `plane`.
///
/// Panics if `plane` is out of range for the stored list.
#[inline(always)]
87+
fn effective_height(&self, plane: usize) -> usize {
88+
self.effective_heights[plane]
89+
}
90+
91+
/// Returns the source scale factor given at construction.
#[inline(always)]
92+
fn src_scale(&self) -> f32 {
93+
self.src_scale
94+
}
95+
96+
/// Returns the destination scale factor given at construction.
#[inline(always)]
97+
fn dest_scale(&self) -> f32 {
98+
self.dest_scale
99+
}
100+
101+
/// Returns the `peak` value given at construction.
#[inline(always)]
102+
fn peak(&self) -> T {
103+
self.peak
104+
}
105+
106+
/// Borrows the window coefficient table.
#[inline(always)]
107+
fn hw(&self) -> &[f32; BLOCK_VOLUME] {
108+
&self.hw
109+
}
110+
111+
/// Borrows the transformed-window table computed in the constructor.
#[inline(always)]
112+
fn dftgc(&self) -> &[Complex<f32>; COMPLEX_COUNT] {
113+
&self.dftgc
114+
}
115+
116+
/// Borrows the per-coefficient sigma table.
#[inline(always)]
117+
fn sigmas(&self) -> &[f32; CCNT2] {
118+
&self.sigmas
119+
}
120+
121+
/// Mutably borrows the three FFT handlers as a tuple.
#[inline(always)]
122+
fn ffts(
123+
&mut self,
124+
) -> &mut (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>) {
125+
&mut self.fft
126+
}
127+
}

0 commit comments

Comments
 (0)