EzDev.org

Neon

A powerful Swift programmatic UI layout framework.


Arm Neon Intrinsics vs hand assembly

http://hilbert-space.de/?p=22

On this site which is quite dated it shows that hand written asm would give a much greater improvement then the intrinsics. I am wondering if this is the current truth even now in 2012.

So has the compilation optimization improved for intrinsics using gnu cross compiler ?


Source: (StackOverflow)

Why does arm-gcc decrement/increment the stack pointer even when the stack is never accessed?

When compiling this program with arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
        x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
}

...this is what I get. Notice the two useless instructions marked with @<<<

_Z5crossRK19__simd128_float32_tS1_:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #80 @<<<
    vswp    d17, d16
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80 @<<<
    bx

The stack is never accessed, yet the stack pointer get decremented, then incremented by the same amount. Why?

If I modify the original code to include asm comment at the end of the prologue and the begining of the epilogue, like this:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    asm volatile("# End of prologue");
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
        x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
    asm volatile("# Start of epilogue");
    return res;
}

Then I get slightly different version:

_Z5crossRK19__simd128_float32_tS1_:
    sub sp, sp, #80
    # End of prologue
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    vswp    d17, d16
    # Start of epilogue
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80
    bx  lr

The stack pointer decrement/increment clearly is part of the prologue/epilogue, and happens even if the stack is not used. Is that to comply with some standard, or is it a gcc optimization bug?

EDIT: Compiler is arm-elf-gcc-4.5 (GCC) 4.5.0, configured with: /opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu

EDIT: I managed to pinpoint the problem using the following C source. It only happens when using arrays of vector types as temporaries, such as float32x4x2_t which is declared as struct { float32x4_t val[2]; }, even tho these temporaries are made registers. I believe this is a bug, so I reported it.

#include <arm_neon.h>

// This one is ok
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
    return vaddq_f32(*v1, *v2);
#if 0
    produced assembly:

add:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d18-d19}
    vadd.f32    q8, q8, q9
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    bx  lr

#endif
}

// This one uses float32x4x2_t temporaries and has the bug
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
    float32x4x2_t
        xxyyzz1=vzipq_f32(*v1, *v1),
        xxyyzz2=vzipq_f32(*v2, *v2);

    float32x2_t
        xx1=vget_low_f32(xxyyzz1.val[0]),
        yy1=vget_high_f32(xxyyzz1.val[0]),
        zz1=vget_low_f32(xxyyzz1.val[1]),
        xx2=vget_low_f32(xxyyzz2.val[0]),
        yy2=vget_high_f32(xxyyzz2.val[0]),
        zz2=vget_low_f32(xxyyzz2.val[1]);

    float32x2_t
        x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
        y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
        z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
#if 0
    produced assembly:

cross:
    vldmia  r0, {d18-d19}
    vldmia  r1, {d16-d17}
    vmov    q10, q9
    vmov    q11, q8
    vzip.32 q9, q10
    vzip.32 q8, q11
    vmov    d24, d19
    vmov    d21, d16
    vmov    d16, d17
    vmul.f32    d19, d20, d21
    vmul.f32    d17, d24, d22
    vmls.f32    d17, d20, d16
    vmls.f32    d19, d18, d22
    vmul.f32    d16, d18, d16
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #48           @ here
    vswp    d17, d16
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    add sp, sp, #48           @ and here
    bx  lr

#endif
}

Source: (StackOverflow)

(opencv rc1) What causes Mat multiplication to be 20x slower than per-pixel multiplication?

// 700 ms
cv::Mat in(height,width,CV_8UC1);
in /= 4;

Replaced with

//40 ms
cv::Mat in(height,width,CV_8UC1);
for (int y=0; y < in.rows; ++y)
{
    unsigned char* ptr = in.data + y*in.step1();
    for (int x=0; x < in.cols; ++x)
    {
        ptr[x] /= 4;
    }
}

What can cause such behavior? Is it due to opencv "promoting" Mat with Scalar multiplication to a Mat with Mat multiplication, or is it a specific failed optimization for arm? (NEON is enabled).


Source: (StackOverflow)

Fast conversion of 16-bit big-endian to little-endian in ARM

I need to convert big arrays of 16-bit integer values from big-endian to little-endian format.

Now I use for conversion the following function:

inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    uint16_t value = *(uint16_t*)src;
    *(uint16_t*)dst = value >> 8 | value << 8;
}

void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);
    for(size_t i = 0; i < size; i += 2)
        Reorder16bit(src + i, dst + i);
}

I use GCC. Target platform is ARMv7 (Raspberry Phi 2B).

Is there any way to optimize it?

This conversion is needed for loading audio samples which can be as in little- endian as in big-endian format. Of course it is not a bottleneck now, but it takes about 10% of total processing time. And I think that is too much for such a simple operation.


Source: (StackOverflow)