Rate This Document
Findability
Accuracy
Completeness
Readability

Inline Assembly Implementation

C language implementation:

/*
 * Scalar reference implementation: element-wise sum of two float arrays.
 *
 * dst   - receives src1[i] + src2[i] for each i in [0, count)
 * src1  - first addend array
 * src2  - second addend array
 * count - number of elements to process; nothing is written when it is <= 0
 */
void AddFloatC(float* dst, float* src1, float* src2, int count)
{
    /* Walk all three arrays in lockstep, front to back, one element per pass. */
    for (int remaining = count; remaining > 0; --remaining) {
        *dst++ = *src1++ + *src2++;
    }
}
int main()
{
    float dst[ARRAY_NUMS] = {0.0};
    float src1[ARRAY_NUMS];
    float src2[ARRAY_NUMS];

    struct timeval start;
    struct timeval end;
    double dt;

    InitArray(dst, src1, src2); 
    gettimeofday(&start,NULL);
    AddFloatNeonAsm(dst, src1, src2, ARRAY_NUMS);
    gettimeofday(&end,NULL);
    dt=(end.tv_sec-start.tv_sec) * 1000+(end.tv_usec-start.tv_usec) / 1000.0;
    cout<<"Time used of Normal NEON ASM code: "<<dt<<"ms"<<"\tcheck dst[52] value: "<<dst[52]<<endl;
    return 0;
}

Inline assembly implementation:

/*
 * Element-wise sum of two float arrays using AArch64 NEON inline assembly.
 *
 * dst   - receives src1[i] + src2[i] for each i in [0, count)
 * src1  - first addend array
 * src2  - second addend array
 * count - number of elements to process; nothing is written when it is <= 0
 *
 * Whole groups of four floats are handled by the NEON loop; any remaining
 * 1-3 elements are handled by a scalar loop so the arrays are never overrun.
 * On targets other than AArch64 the scalar loop processes everything.
 */
void AddFloatNeonAsm(float* dst, float* src1, float* src2, int count)
{
    int vec_count = 0;  /* elements consumed by the NEON loop */

#if defined(__aarch64__)
    /* Largest multiple of 4 that fits in count (0 when count <= 0). */
    vec_count = (count > 0) ? (count & ~3) : 0;

    /* The asm loop tests its counter only after the first iteration,
     * so it must not be entered with nothing to do. */
    if (vec_count > 0) {
        int remaining = vec_count;

        __asm__ volatile(
            "1:                               \n"
            "ld1     {v0.4s}, [%[src1]], #16  \n"  /* load 4 floats, post-increment src1 */
            "ld1     {v1.4s}, [%[src2]], #16  \n"  /* load 4 floats, post-increment src2 */
            "fadd    v0.4s, v0.4s, v1.4s      \n"
            "subs    %w[count], %w[count], #4 \n"  /* 32-bit counter; sets NZCV for bgt */
            "st1     {v0.4s}, [%[dst]], #16   \n"  /* store 4 floats, post-increment dst */
            "bgt     1b                       \n"  /* loop while the counter is > 0 */
            /* Every operand is modified by the loop, so each one must be
             * declared read-write; input-only operands may not be changed. */
            : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
              [count] "+r" (remaining)
            :
            /* "cc" because subs rewrites the condition flags. */
            : "cc", "memory", "v0", "v1"
        );
    }
#endif

    /* Scalar tail. After the asm block the three pointers already point past
     * the vectorized part, so indexing restarts at 0. */
    for (int i = 0; i < count - vec_count; i++) {
        dst[i] = src1[i] + src2[i];
    }
}
int main()
{
    float dst[ARRAY_NUMS] = {0.0};
    float src1[ARRAY_NUMS];
    float src2[ARRAY_NUMS];

    struct timeval start;
    struct timeval end;
    double dt;

    InitArray(dst, src1, src2); 
    gettimeofday(&start,NULL);
    AddFloatC(dst, src1, src2, ARRAY_NUMS);
    gettimeofday(&end,NULL);
    dt=(end.tv_sec-start.tv_sec) * 1000 + (end.tv_usec-start.tv_usec) / 1000.0;
    cout<<"Time used of Normal C code: "<<dt<<"ms"<<"\t\tcheck dst[52] value: "<<dst[52]<<endl;

    return 0;
}

The bgt branch instruction reads the condition flags in the NZCV system register to decide whether the branch is taken. Unlike a plain sub instruction, the subs instruction also updates those condition flags according to the result of the subtraction, which is what allows the following bgt to test whether the remaining count is still greater than zero.

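b in bgt 1b: the b suffix after 1 marks it as a reference to the local label 1 rather than an immediate. It tells the assembler to branch to the nearest definition of label 1 in the backward direction (the preceding "1:"); a suffix of f (1f) would instead refer to the next definition of label 1 in the forward direction.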

The execution of the C language code takes 3.062 ms, and the execution of the NEON inline assembly code takes 0.331 ms.