Inline Assembly Implementation
C language implementation:
/*
 * Element-wise float addition in plain C.
 *
 * Stores src1[i] + src2[i] into dst[i] for every i in [0, count).
 * Nothing is read or written when count is zero or negative.
 */
void AddFloatC(float* dst, float* src1, float* src2, int count)
{
    float* out = dst;
    float* lhs = src1;
    float* rhs = src2;
    int remaining = count;

    /* Walk the three arrays in lockstep, one element per iteration. */
    while (remaining > 0) {
        *out = *lhs + *rhs;
        out++;
        lhs++;
        rhs++;
        remaining--;
    }
}
int main()
{
float dst[ARRAY_NUMS] = {0.0};
float src1[ARRAY_NUMS];
float src2[ARRAY_NUMS];
struct timeval start;
struct timeval end;
double dt;
InitArray(dst, src1, src2);
gettimeofday(&start,NULL);
AddFloatNeonAsm(dst, src1, src2, ARRAY_NUMS);
gettimeofday(&end,NULL);
dt=(end.tv_sec-start.tv_sec) * 1000+(end.tv_usec-start.tv_usec) / 1000.0;
cout<<"Time used of Normal NEON ASM code: "<<dt<<"ms"<<"\tcheck dst[52] value: "<<dst[52]<<endl;
return 0;
}
Inline assembly implementation:
/*
 * Element-wise float addition using AArch64 NEON inline assembly.
 *
 * Stores src1[i] + src2[i] into dst[i] for every i in [0, count).
 * Groups of four floats are processed by the NEON loop; any remaining
 * elements (count not a multiple of 4) are handled by a scalar loop, so
 * exactly count elements are written. Nothing is touched when count <= 0.
 * On targets other than AArch64 the scalar loop handles everything.
 */
void AddFloatNeonAsm(float* dst, float* src1, float* src2, int count)
{
    int done = 0;
#if defined(__aarch64__)
    /* Largest multiple of 4 that does not exceed count (0 if count < 4). */
    int vec_count = (count > 0) ? (count & ~3) : 0;
    if (vec_count > 0) {
        /*
         * The asm post-increments all three pointers and decrements the
         * counter, so every operand must be read-write ("+r"). Local
         * copies keep the parameters intact for the scalar tail below.
         */
        float* out = dst;
        float* lhs = src1;
        float* rhs = src2;
        int remaining = vec_count;
        __asm__ volatile(
            "1: \n"
            "ld1 {v0.4s}, [%[src1]], #16 \n"      /* load 4 floats, advance src1 */
            "ld1 {v1.4s}, [%[src2]], #16 \n"      /* load 4 floats, advance src2 */
            "fadd v0.4s, v0.4s, v1.4s \n"
            "subs %w[count], %w[count], #4 \n"    /* 32-bit counter; sets NZCV */
            "st1 {v0.4s}, [%[dst]], #16 \n"       /* store 4 floats, advance dst */
            "bgt 1b \n"                           /* loop while counter > 0 */
            : [dst] "+r" (out), [src1] "+r" (lhs), [src2] "+r" (rhs),
              [count] "+r" (remaining)
            :
            : "memory", "cc", "v0", "v1"          /* subs clobbers the flags */
        );
        done = vec_count;
    }
#endif
    /* Scalar tail (or the whole array when NEON asm is unavailable). */
    for (int i = done; i < count; i++) {
        dst[i] = src1[i] + src2[i];
    }
}
int main()
{
float dst[ARRAY_NUMS] = {0.0};
float src1[ARRAY_NUMS];
float src2[ARRAY_NUMS];
struct timeval start;
struct timeval end;
double dt;
InitArray(dst, src1, src2);
gettimeofday(&start,NULL);
AddFloatC(dst, src1, src2, ARRAY_NUMS);
gettimeofday(&end,NULL);
dt=(end.tv_sec-start.tv_sec) * 1000 + (end.tv_usec-start.tv_usec) / 1000.0;
cout<<"Time used of Normal C code: "<<dt<<"ms"<<"\t\tcheck dst[52] value: "<<dst[52]<<endl;
return 0;
}
The bgt branch instruction reads the condition flags in the NZCV system register to decide whether the branch is taken. Unlike a plain sub instruction, which leaves the flags unchanged, the subs instruction also updates the NZCV condition flags according to the result of the subtraction, so bgt can test whether the remaining count is still greater than zero.
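b in bgt 1b: the b suffix after 1 prevents the assembler from treating 1 as an immediate value; it tells the assembler that 1 is a local label. Specifically, 1b refers to the nearest label 1 defined before the branch (b for "backward"), whereas 1f would refer to the nearest label 1 defined after it (f for "forward").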
The execution of the C language code takes 3.062 ms, and the execution of the NEON inline assembly code takes 0.331 ms.
Parent topic: NEON Assembly Programming