Intrinsics实现示例
如需在代码中调用NEON Intrinsics函数,需要包含头文件"arm_neon.h"。下面以数组加法为例,对比C语言实现与NEON Intrinsics实现。
C语言实现:
using namespace std;
// Scalar reference implementation of element-wise array addition.
//
// Writes input1[k] + input2[k] into out[k] for every k in [0, count).
// Nothing is read or written when count is zero or negative.
void add(int* out, int* input1, int* input2, int count)
{
    int idx = 0;
    while (idx < count)
    {
        const int sum = input1[idx] + input2[idx];
        out[idx] = sum;
        ++idx;
    }
}
// Benchmark driver for the scalar add(): fills two arrays with random values,
// calls add() over the whole arrays `count` times, and prints the elapsed
// CPU time reported by clock().
int main()
{
    // A compile-time constant makes the arrays below ordinary fixed-size
    // arrays; sizing them with a non-constant int would create variable-length
    // arrays, which are not part of standard C++.
    const int count = 10000 * 4;
    int a[count];
    int b[count];
    int c[count];
    clock_t start, finish;
    double duration;

    // rand() / 2 is at most RAND_MAX / 2, so a[i] + b[i] never exceeds
    // RAND_MAX and the signed addition inside add() cannot overflow.
    for (int i = 0; i < count; i += 1)
    {
        a[i] = rand() / 2;
    }
    for (int i = 0; i < count; i += 1)
    {
        b[i] = rand() / 2;
    }

    start = clock();
    // Repeat the full-array addition `count` times so the run is long enough
    // to measure.
    for (int i = 0; i < count; i += 1)
    {
        add(c, a, b, count);
    }
    finish = clock();

    duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "%f seconds\n", duration);
    return 0;
}
输出结果如下:
1.910000 seconds
NEON Intrinsics实现:
using namespace std;
// NEON Intrinsics implementation of element-wise array addition.
//
// Writes input1[k] + input2[k] into out[k] for every k in [0, count).
// Elements are processed four at a time with 128-bit int32x4_t vectors; any
// remaining 1-3 elements are handled by a scalar loop so that no element at
// index >= count is ever read or written. Nothing happens when count is zero
// or negative.
void add_neon(int* out, int* input1, int* input2, int count)
{
    int32x4_t input1_neon, input2_neon, out_neon;
    int i = 0;

    // Vector part: run only while a whole group of 4 elements remains.
    // `count - i >= 4` is used instead of `i + 4 <= count` so the bound check
    // itself cannot overflow for large counts.
    for (; count - i >= 4; i += 4)
    {
        input1_neon = vld1q_s32(input1 + i);
        input2_neon = vld1q_s32(input2 + i);
        out_neon = vaddq_s32(input1_neon, input2_neon);
        vst1q_s32(out + i, out_neon);
    }

    // Scalar tail: the leftover elements when count is not a multiple of 4.
    for (; i < count; i += 1)
    {
        out[i] = input1[i] + input2[i];
    }
}
// Benchmark driver for add_neon(): fills two arrays with random values,
// calls add_neon() over the whole arrays `count` times, and prints the
// elapsed CPU time reported by clock().
int main()
{
    // A compile-time constant makes the arrays below ordinary fixed-size
    // arrays; sizing them with a non-constant int would create variable-length
    // arrays, which are not part of standard C++.
    // The value is a multiple of 4, so each add_neon() call works on whole
    // 4-element vectors.
    const int count = 10000 * 4;
    int a[count];
    int b[count];
    int c[count];
    clock_t start, finish;
    double duration;

    for (int i = 0; i < count; i += 1)
    {
        a[i] = rand();
    }
    for (int i = 0; i < count; i += 1)
    {
        b[i] = rand();
    }

    start = clock();
    // Repeat the full-array addition `count` times so the run is long enough
    // to measure.
    for (int i = 0; i < count; i += 1)
    {
        add_neon(c, a, b, count);
    }
    finish = clock();

    duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "%f seconds for neon\n", duration);
    return 0;
}
输出结果:
0.360000 seconds for neon
可以看出,使用NEON Intrinsics实现后,运行时间从1.91秒缩短到0.36秒,速度约为原来的5.3倍,性能有了明显的提升。
父主题: NEON Intrinsics编程