Intrinsics Implementation
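To call NEON intrinsics in the code, include the header file arm_neon.h. The following example uses array addition; the listings omit their #include lines (stdio.h, stdlib.h, and time.h for both versions, plus arm_neon.h for the NEON version).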
C language implementation:
using namespace std;
// Scalar reference: writes input1[k] + input2[k] into out[k] for every k in [0, count).
// A count of zero or less leaves out untouched.
void add(int* out, int* input1, int* input2, int count)
{
    int idx = 0;
    while (idx < count)
    {
        const int sum = input1[idx] + input2[idx];
        out[idx] = sum;
        ++idx;
    }
}
// Benchmark driver for the scalar add(): fills two arrays with rand() values,
// calls add() over the whole arrays `count` times, and prints the elapsed
// processor time reported by clock().
int main()
{
    // A constant expression, so a, b and c are ordinary fixed-size arrays
    // rather than variable-length arrays (a compiler extension in C++).
    const int count = 10000 * 4;
    int a[count];
    int b[count];
    int c[count];

    // rand() may return values up to RAND_MAX, which can equal INT_MAX.
    // Keeping each operand within 30 bits guarantees a[i] + b[i] fits in a
    // 32-bit int, so the additions never hit signed overflow.
    for (int i = 0; i < count; i += 1)
    {
        a[i] = rand() & 0x3FFFFFFF;
    }
    for (int i = 0; i < count; i += 1)
    {
        b[i] = rand() & 0x3FFFFFFF;
    }

    const clock_t start = clock();
    // The full-array addition is repeated count times
    // (presumably so the run is long enough to time).
    for (int i = 0; i < count; i += 1)
    {
        add(c, a, b, count);
    }
    const clock_t finish = clock();

    const double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "%f seconds\n", duration);
    return 0;
}
Return:
1.910000 seconds
NEON intrinsics implementation:
using namespace std;
// NEON version of the element-wise array sum: out[k] = input1[k] + input2[k]
// for every k in [0, count).
//
// Four 32-bit lanes are processed per iteration with vld1q_s32 / vaddq_s32 /
// vst1q_s32. count does not have to be a multiple of 4: the elements left over
// after the last full vector are added one at a time, so nothing is read or
// written beyond index count - 1. A count of zero or less leaves out untouched.
void add_neon(int* out, int* input1, int* input2, int count)
{
    int i = 0;

    // Vector part: runs only while a full group of 4 elements remains.
    for (; count - i >= 4; i += 4)
    {
        const int32x4_t input1_neon = vld1q_s32(input1 + i);
        const int32x4_t input2_neon = vld1q_s32(input2 + i);
        const int32x4_t out_neon = vaddq_s32(input1_neon, input2_neon);
        vst1q_s32(out + i, out_neon);
    }

    // Scalar tail: the 0 to 3 elements that do not fill a whole vector.
    for (; i < count; i += 1)
    {
        out[i] = input1[i] + input2[i];
    }
}
// Benchmark driver for add_neon(): fills two arrays with rand() values,
// calls add_neon() over the whole arrays `count` times, and prints the elapsed
// processor time reported by clock().
int main()
{
    // A constant expression, so a, b and c are ordinary fixed-size arrays
    // rather than variable-length arrays (a compiler extension in C++).
    const int count = 10000 * 4;
    int a[count];
    int b[count];
    int c[count];

    // rand() may return values up to RAND_MAX, which can equal INT_MAX.
    // Keeping each operand within 30 bits guarantees a[i] + b[i] fits in a
    // 32-bit int, so every sum is the exact arithmetic result.
    for (int i = 0; i < count; i += 1)
    {
        a[i] = rand() & 0x3FFFFFFF;
    }
    for (int i = 0; i < count; i += 1)
    {
        b[i] = rand() & 0x3FFFFFFF;
    }

    const clock_t start = clock();
    // The full-array addition is repeated count times
    // (presumably so the run is long enough to time).
    for (int i = 0; i < count; i += 1)
    {
        add_neon(c, a, b, count);
    }
    const clock_t finish = clock();

    const double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "%f seconds for neon\n", duration);
    return 0;
}
Return:
0.360000 seconds for neon
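The NEON version finishes in 0.36 seconds, compared with 1.91 seconds for the scalar loop (roughly a 5.3x speedup), which shows that performance is greatly improved when NEON intrinsics are used.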
Parent topic: NEON Intrinsics Programming