Skip to content

macro does not work on arrays, no way to have a manual default implementation #18

Open
@fschutt

Description

@fschutt

The problem is that I'm trying to vectorize this code:

https://github.com/fschutt/layout2d/blob/master/src/rect.rs#L43-L107

This crate is not helpful here: if I use SIMD functions inside the annotated function, the same SIMD code also ends up in the default function. There is no way to use SIMD in the feature-specific variants while keeping SIMD out of the default (fallback) function.

Second, loading from fields takes a considerable amount of time in SIMD, which is why I have that weird layout with the four-number array. The macro completely fails on arrays, it does not vectorize arrays at all.

I don't know what the goal is - if you want me to write hand-vectorized code, then you need to provide a way to write a manual fallback function. If you want to do this work "automagically", then the macro should be smarter about arrays.

Currently the code generated with arrays is horrible:

/// Macro-expanded runtime dispatcher: rotates the four points stored in `x`/`y`
/// by `in_angle` (converted from degrees via `to_radians`) around a computed center.
///
/// The public function only loads a function pointer from `PTR` and calls it.
/// `PTR` initially points at `setup`, which picks one of three variants based on
/// the `rt::have_*` checks and stores it for subsequent calls.
///
/// All three variants index elements `0..=3` of both slices, so they panic if
/// either slice holds fewer than four elements.
pub fn rotate(x: &mut [f32], y: &mut [f32], in_angle: f32) {
    pub extern crate runtime_target_feature_rt as rt;
    // Holds the implementation to call; starts out as `setup` so the first call
    // performs the feature detection.
    static PTR: rt::atomic::Atomic<fn(&mut [f32], &mut [f32], f32)> =
        rt::atomic::Atomic::new(setup);
    // Selects an implementation (AVX2 first, then SSE4.1, then the fallback),
    // stores it in `PTR`, and forwards the current call to it.
    fn setup(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        let chosen_function = if rt::have_avx2() {
            with_enable_avx2
        } else if rt::have_sse4_1() {
            with_enable_sse4_1
        } else {
            default
        };
        PTR.store(chosen_function, rt::atomic::Ordering::Relaxed);
        chosen_function(x, y, in_angle)
    }
    // Fallback variant with no `target_feature` attribute.
    fn default(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        // Center is the midpoint between elements 1 and 2 on each axis.
        let center_y = ((y[1] - y[2]) * 0.5) + y[2];
        let center_x = ((x[1] - x[2]) * 0.5) + x[2];
        // Translate all four points so the center sits at the origin.
        x[0] -= center_x;
        x[1] -= center_x;
        x[2] -= center_x;
        x[3] -= center_x;
        y[0] -= center_y;
        y[1] -= center_y;
        y[2] -= center_y;
        y[3] -= center_y;
        let k_angle = in_angle.to_radians();
        let s = k_angle.sin();
        let c = k_angle.cos();
        // 2D rotation: x' = x*c - y*s, y' = x*s + y*c.
        // NOTE(review): tl/tr/bl/br presumably mean top-left, top-right,
        // bottom-left, bottom-right for indices 0..=3 — confirm against the caller.
        let tl_x = (x[0] * c) - (y[0] * s);
        let tr_x = (x[1] * c) - (y[1] * s);
        let bl_x = (x[2] * c) - (y[2] * s);
        let br_x = (x[3] * c) - (y[3] * s);
        let tl_y = (x[0] * s) + (y[0] * c);
        let tr_y = (x[1] * s) + (y[1] * c);
        let bl_y = (x[2] * s) + (y[2] * c);
        let br_y = (x[3] * s) + (y[3] * c);
        // Write back only after every rotated value has been computed, since
        // each result reads both the old x and the old y.
        x[0] = tl_x;
        x[1] = tr_x;
        x[2] = bl_x;
        x[3] = br_x;
        y[0] = tl_y;
        y[1] = tr_y;
        y[2] = bl_y;
        y[3] = br_y;
        // Translate back to the original center.
        x[0] += center_x;
        x[1] += center_x;
        x[2] += center_x;
        x[3] += center_x;
        y[0] += center_y;
        y[1] += center_y;
        y[2] += center_y;
        y[3] += center_y;
    }
    // Same body as `default`; the only difference is the AVX2 target-feature attribute.
    #[target_feature = "+avx2"]
    fn with_enable_avx2(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        let center_y = ((y[1] - y[2]) * 0.5) + y[2];
        let center_x = ((x[1] - x[2]) * 0.5) + x[2];
        x[0] -= center_x;
        x[1] -= center_x;
        x[2] -= center_x;
        x[3] -= center_x;
        y[0] -= center_y;
        y[1] -= center_y;
        y[2] -= center_y;
        y[3] -= center_y;
        let k_angle = in_angle.to_radians();
        let s = k_angle.sin();
        let c = k_angle.cos();
        let tl_x = (x[0] * c) - (y[0] * s);
        let tr_x = (x[1] * c) - (y[1] * s);
        let bl_x = (x[2] * c) - (y[2] * s);
        let br_x = (x[3] * c) - (y[3] * s);
        let tl_y = (x[0] * s) + (y[0] * c);
        let tr_y = (x[1] * s) + (y[1] * c);
        let bl_y = (x[2] * s) + (y[2] * c);
        let br_y = (x[3] * s) + (y[3] * c);
        x[0] = tl_x;
        x[1] = tr_x;
        x[2] = bl_x;
        x[3] = br_x;
        y[0] = tl_y;
        y[1] = tr_y;
        y[2] = bl_y;
        y[3] = br_y;
        x[0] += center_x;
        x[1] += center_x;
        x[2] += center_x;
        x[3] += center_x;
        y[0] += center_y;
        y[1] += center_y;
        y[2] += center_y;
        y[3] += center_y;
    }
    // Same body as `default`; the only difference is the SSE4.1 target-feature attribute.
    #[target_feature = "+sse4.1"]
    fn with_enable_sse4_1(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        let center_y = ((y[1] - y[2]) * 0.5) + y[2];
        let center_x = ((x[1] - x[2]) * 0.5) + x[2];
        x[0] -= center_x;
        x[1] -= center_x;
        x[2] -= center_x;
        x[3] -= center_x;
        y[0] -= center_y;
        y[1] -= center_y;
        y[2] -= center_y;
        y[3] -= center_y;
        let k_angle = in_angle.to_radians();
        let s = k_angle.sin();
        let c = k_angle.cos();
        let tl_x = (x[0] * c) - (y[0] * s);
        let tr_x = (x[1] * c) - (y[1] * s);
        let bl_x = (x[2] * c) - (y[2] * s);
        let br_x = (x[3] * c) - (y[3] * s);
        let tl_y = (x[0] * s) + (y[0] * c);
        let tr_y = (x[1] * s) + (y[1] * c);
        let bl_y = (x[2] * s) + (y[2] * c);
        let br_y = (x[3] * s) + (y[3] * c);
        x[0] = tl_x;
        x[1] = tr_x;
        x[2] = bl_x;
        x[3] = br_x;
        y[0] = tl_y;
        y[1] = tr_y;
        y[2] = bl_y;
        y[3] = br_y;
        x[0] += center_x;
        x[1] += center_x;
        x[2] += center_x;
        x[3] += center_x;
        y[0] += center_y;
        y[1] += center_y;
        y[2] += center_y;
        y[3] += center_y;
    }
    // Call whichever implementation is currently stored (`setup` on the first call).
    PTR.load(rt::atomic::Ordering::Relaxed)(x, y, in_angle)
}

If I, however, use the stdsimd functions, I get those too in the default() function, which completely works against the point of this library:

/// Macro-expanded runtime dispatcher for the hand-vectorized version of `rotate`.
///
/// The public function loads a function pointer from `PTR` and calls it. `PTR`
/// initially points at `setup`, which picks a variant based on the `rt::have_*`
/// checks and stores it for later calls.
///
/// All three variants, including `default`, contain the same explicit
/// `simd::f32x4` code; only the `target_feature` attribute differs.
pub fn rotate(x: &mut [f32], y: &mut [f32], in_angle: f32) {
    pub extern crate runtime_target_feature_rt as rt;
    // Holds the implementation to call; starts out as `setup` so the first call
    // performs the feature detection.
    static PTR: rt::atomic::Atomic<fn(&mut [f32], &mut [f32], f32)> =
        rt::atomic::Atomic::new(setup);
    // Selects an implementation (AVX2 first, then SSE4.1, then the fallback),
    // stores it in `PTR`, and forwards the current call to it.
    fn setup(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        let chosen_function = if rt::have_avx2() {
            with_enable_avx2
        } else if rt::have_sse4_1() {
            with_enable_sse4_1
        } else {
            default
        };
        PTR.store(chosen_function, rt::atomic::Ordering::Relaxed);
        chosen_function(x, y, in_angle)
    }
    // Fallback variant: no `target_feature` attribute, yet it still uses the
    // explicit `simd::f32x4` operations copied from the annotated function.
    fn default(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        use stdsimd;
        // NOTE(review): the center here comes from y[0]/y[2] and x[1]/x[0], while
        // the scalar listing earlier in this report uses y[1]/y[2] and x[1]/x[2]
        // — confirm which pair of indices is intended.
        let center_y = ((y[0] - y[2]) * 0.5) + y[2];
        let center_x = ((x[1] - x[0]) * 0.5) + x[0];
        // Presumably loads four lanes starting at offset 0 of each slice —
        // verify against the `simd::f32x4::load` documentation.
        let mut simd_x_dir = simd::f32x4::load(&x, 0);
        let mut simd_y_dir = simd::f32x4::load(&y, 0);
        // Translate so the center sits at the origin.
        simd_x_dir = simd_x_dir - simd::f32x4::splat(center_x);
        simd_y_dir = simd_y_dir - simd::f32x4::splat(center_y);
        let k_angle = in_angle.to_radians();
        let s = k_angle.sin();
        let c = k_angle.cos();
        // 2D rotation: x' = x*c - y*s goes into a new variable so the old x is
        // still available when y' = x*s + y*c overwrites `simd_y_dir`.
        let mut simd_x_new =
            (simd_x_dir * simd::f32x4::splat(c)) - (simd_y_dir * simd::f32x4::splat(s));
        simd_y_dir = (simd_x_dir * simd::f32x4::splat(s)) + (simd_y_dir * simd::f32x4::splat(c));
        // Translate back to the original center.
        simd_x_new = simd_x_new + simd::f32x4::splat(center_x);
        simd_y_dir = simd_y_dir + simd::f32x4::splat(center_y);
        // Presumably writes the four lanes back starting at offset 0 — verify.
        simd_x_new.store(x, 0);
        simd_y_dir.store(y, 0);
    }
    // Same body as `default`; the only difference is the AVX2 target-feature attribute.
    #[target_feature = "+avx2"]
    fn with_enable_avx2(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        use stdsimd;
        let center_y = ((y[0] - y[2]) * 0.5) + y[2];
        let center_x = ((x[1] - x[0]) * 0.5) + x[0];
        let mut simd_x_dir = simd::f32x4::load(&x, 0);
        let mut simd_y_dir = simd::f32x4::load(&y, 0);
        simd_x_dir = simd_x_dir - simd::f32x4::splat(center_x);
        simd_y_dir = simd_y_dir - simd::f32x4::splat(center_y);
        let k_angle = in_angle.to_radians();
        let s = k_angle.sin();
        let c = k_angle.cos();
        let mut simd_x_new =
            (simd_x_dir * simd::f32x4::splat(c)) - (simd_y_dir * simd::f32x4::splat(s));
        simd_y_dir = (simd_x_dir * simd::f32x4::splat(s)) + (simd_y_dir * simd::f32x4::splat(c));
        simd_x_new = simd_x_new + simd::f32x4::splat(center_x);
        simd_y_dir = simd_y_dir + simd::f32x4::splat(center_y);
        simd_x_new.store(x, 0);
        simd_y_dir.store(y, 0);
    }
    // Same body as `default`; the only difference is the SSE4.1 target-feature attribute.
    #[target_feature = "+sse4.1"]
    fn with_enable_sse4_1(x: &mut [f32], y: &mut [f32], in_angle: f32) {
        use stdsimd;
        let center_y = ((y[0] - y[2]) * 0.5) + y[2];
        let center_x = ((x[1] - x[0]) * 0.5) + x[0];
        let mut simd_x_dir = simd::f32x4::load(&x, 0);
        let mut simd_y_dir = simd::f32x4::load(&y, 0);
        simd_x_dir = simd_x_dir - simd::f32x4::splat(center_x);
        simd_y_dir = simd_y_dir - simd::f32x4::splat(center_y);
        let k_angle = in_angle.to_radians();
        let s = k_angle.sin();
        let c = k_angle.cos();
        let mut simd_x_new =
            (simd_x_dir * simd::f32x4::splat(c)) - (simd_y_dir * simd::f32x4::splat(s));
        simd_y_dir = (simd_x_dir * simd::f32x4::splat(s)) + (simd_y_dir * simd::f32x4::splat(c));
        simd_x_new = simd_x_new + simd::f32x4::splat(center_x);
        simd_y_dir = simd_y_dir + simd::f32x4::splat(center_y);
        simd_x_new.store(x, 0);
        simd_y_dir.store(y, 0);
    }
    // Call whichever implementation is currently stored (`setup` on the first call).
    PTR.load(rt::atomic::Ordering::Relaxed)(x, y, in_angle)
}

Now the default function contains SIMD code as well, which is against the point.

So, in practice, this library is currently useless.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions