内嵌汇编实现示例
C语言实现版本:
/*
 * Plain C reference implementation of element-wise float addition.
 *
 * Writes src1[i] + src2[i] into dst[i] for the first `count` elements.
 * Nothing is written when count is zero or negative.
 */
void AddFloatC(float* dst, float* src1, float* src2, int count)
{
    float* out = dst;
    float* lhs = src1;
    float* rhs = src2;
    int remaining = count;

    /* Walk the three arrays in lock-step, one element per iteration. */
    while (remaining > 0) {
        *out = *lhs + *rhs;
        out++;
        lhs++;
        rhs++;
        remaining--;
    }
}
int main()
{
float dst[ARRAY_NUMS] = {0.0};
float src1[ARRAY_NUMS];
float src2[ARRAY_NUMS];
struct timeval start;
struct timeval end;
double dt;
InitArray(dst, src1, src2);
gettimeofday(&start,NULL);
AddFloatNeonAsm(dst, src1, src2, ARRAY_NUMS);
gettimeofday(&end,NULL);
dt=(end.tv_sec-start.tv_sec) * 1000+(end.tv_usec-start.tv_usec) / 1000.0;
cout<<"Time used of Normal NEON ASM code: "<<dt<<"ms"<<"\tcheck dst[52] value: "<<dst[52]<<endl;
return 0;
}
内嵌汇编实现版本:
/*
 * Element-wise float addition: dst[i] = src1[i] + src2[i] for i in [0, count).
 *
 * On AArch64 the largest multiple-of-4 prefix is processed four floats per
 * iteration by a NEON inline-assembly loop. The remaining 0..3 elements (and
 * every element when built for another target) are handled by a scalar loop,
 * so the function never touches memory beyond `count` elements.
 * A count of zero or less does nothing.
 */
void AddFloatNeonAsm(float* dst, float* src1, float* src2, int count)
{
    int remaining = count;
#if defined(__aarch64__)
    if (count >= 4) {
        /* Elements handled by the vector loop: a positive multiple of 4. */
        int vecCount = count & ~3;
        remaining = count - vecCount;
        /*
         * All four operands are read-write ("+r"): the post-indexed ld1/st1
         * forms write the advanced address back to the pointer registers and
         * subs decrements the counter. %w[count] selects the 32-bit W view of
         * the register because the counter is an int. "cc" is clobbered
         * because subs updates the condition flags that bgt then tests.
         */
        __asm__ volatile(
            "1: \n"
            "ld1 {v0.4s}, [%[src1]], #16 \n"
            "ld1 {v1.4s}, [%[src2]], #16 \n"
            "fadd v0.4s, v0.4s, v1.4s \n"
            "subs %w[count], %w[count], #4 \n"
            "st1 {v0.4s}, [%[dst]], #16 \n"
            "bgt 1b \n"
            : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2), [count] "+r" (vecCount)
            :
            : "memory", "cc", "v0", "v1"
        );
    }
#endif
    /*
     * Scalar tail. After the asm block the three pointers already point just
     * past the vectorised prefix, so indexing restarts at 0.
     */
    for (int i = 0; i < remaining; i++) {
        dst[i] = src1[i] + src2[i];
    }
}
int main()
{
float dst[ARRAY_NUMS] = {0.0};
float src1[ARRAY_NUMS];
float src2[ARRAY_NUMS];
struct timeval start;
struct timeval end;
double dt;
InitArray(dst, src1, src2);
gettimeofday(&start,NULL);
AddFloatC(dst, src1, src2, ARRAY_NUMS);
gettimeofday(&end,NULL);
dt=(end.tv_sec-start.tv_sec) * 1000 + (end.tv_usec-start.tv_usec) / 1000.0;
cout<<"Time used of Normal C code: "<<dt<<"ms"<<"\t\tcheck dst[52] value: "<<dst[52]<<endl;
return 0;
}
其中的bgt跳转指令会读取NZCV系统寄存器中相应的条件标志位,以判断是否满足跳转条件。关键在于subs指令不同于普通的sub指令:它在做减法的同时,会根据运算结果更新这些条件标志位(condition flags)。
“bgt 1b”中的1是前面定义的局部标号(local label)“1:”,后缀b表示backward,即跳转到该指令之前最近的一个标号1(相应地,后缀f表示forward,即该指令之后最近的一个标号1)。加上后缀还可以让汇编器明确这里的1是label,而不是立即数。
C语言执行时间为3.062ms,而NEON内嵌汇编的版本执行时间为0.331ms。
父主题: NEON汇编编程