我要评分
获取效率
正确性
完整性
易理解

Intrinsics Implementation

To call NEON intrinsics in the code, add the header file arm_neon.h. Take array addition as an example.

Plain C/C++ (scalar) implementation (the listing assumes that stdio.h, stdlib.h, and time.h are included):

using namespace std;

/*
 * Scalar element-wise addition of two int arrays.
 *
 * Writes out[k] = input1[k] + input2[k] for every k in [0, count).
 * Nothing is read or written when count is zero or negative.
 */
void add(int* out, int* input1, int* input2, int count)
{
        int idx = 0;
        while (idx < count)
        {
                const int lhs = input1[idx];
                const int rhs = input2[idx];
                out[idx] = lhs + rhs;
                ++idx;
        }
}

/*
 * Benchmark driver for the scalar add().
 *
 * Fills two input arrays with pseudo-random values, calls add() over the
 * full arrays `count` times, and prints the elapsed CPU time measured with
 * clock().
 */
int main()
{
        /* Constant expression, so the arrays below have a fixed size
           (a runtime-sized local array is not standard C++). */
        const int count = 10000 * 4;

        /* Static storage keeps the three 40000-element arrays
           (about 480 KB with 4-byte int) off the stack. */
        static int a[count];
        static int b[count];
        static int c[count];

        clock_t start, finish;
        double duration;

        /* Each value is halved so that a[i] + b[i] can never exceed
           RAND_MAX and therefore cannot overflow int inside add(). */
        for(int i = 0; i < count; i += 1)
        {
                a[i] = rand() / 2;
        }

        for(int i = 0; i < count; i += 1)
        {
                b[i] = rand() / 2;
        }

        /* Repeat the whole-array addition count times to get a
           measurable duration. */
        start = clock();
        for(int rep = 0; rep < count; rep += 1)
        {
                add(c, a, b, count);
        }
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf( "%f seconds\n", duration);

        return 0;
}

Output:

1.910000 seconds

NEON intrinsics implementation (the listing assumes that arm_neon.h, stdio.h, stdlib.h, and time.h are included):

using namespace std;

void add_neon(int* out, int* input1, int* input2, int count)
{
        int32x4_t input1_neon, input2_neon, out_neon;
        for(int i = 0; i < count; i += 4)
        {
                input1_neon = vld1q_s32(input1);
                input1 += 4;
                input2_neon = vld1q_s32(input2);
                input2 += 4;
                out_neon = vaddq_s32(input1_neon, input2_neon);
                vst1q_s32(out, out_neon);
                out += 4;
        }
}

/*
 * Benchmark driver for add_neon().
 *
 * Fills two input arrays with pseudo-random values, calls add_neon() over
 * the full arrays `count` times, and prints the elapsed CPU time measured
 * with clock().
 */
int main()
{
        /* Constant expression, so the arrays below have a fixed size
           (a runtime-sized local array is not standard C++). */
        const int count = 10000 * 4;

        /* Static storage keeps the three 40000-element arrays
           (about 480 KB with 4-byte int) off the stack. */
        static int a[count];
        static int b[count];
        static int c[count];

        clock_t start, finish;
        double duration;

        /* Each value is halved so that a[i] + b[i] can never exceed
           RAND_MAX and therefore always fits in an int. */
        for(int i = 0; i < count; i += 1)
        {
                a[i] = rand() / 2;
        }

        for(int i = 0; i < count; i += 1)
        {
                b[i] = rand() / 2;
        }

        /* Repeat the whole-array addition count times to get a
           measurable duration. */
        start = clock();
        for(int rep = 0; rep < count; rep += 1)
        {
                add_neon(c, a, b, count);
        }
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf( "%f seconds for neon\n", duration);

        return 0;
}

Output:

0.360000 seconds for neon

In this test the NEON intrinsics version finishes in 0.36 seconds, compared with 1.91 seconds for the scalar loop, a speedup of roughly 5.3x.