中文
注册
我要评分
文档获取效率
文档正确性
内容完整性
文档易理解
在线提单
论坛求助
鲲鹏小智

intrinsics实现示例

如果需要在代码中调用NEON intrinsics函数，需要包含头文件"arm_neon.h"。下面以数组加法为例进行说明。

C/C++标量实现（不使用NEON）:

using namespace std;

/**
 * Element-wise addition of two int arrays (scalar reference implementation).
 *
 * For every i in [0, count) stores input1[i] + input2[i] into out[i].
 * A sum that does not fit in int wraps around instead of triggering
 * undefined behaviour.
 *
 * @param out     destination array, at least count elements
 * @param input1  first operand array, at least count elements
 * @param input2  second operand array, at least count elements
 * @param count   number of elements to process; nothing is done if count <= 0
 */
void add(int* out, int* input1, int* input2, int count)
{
        for(int i = 0; i < count; i += 1)
        {
                // Signed overflow is undefined behaviour, and large inputs
                // (e.g. two values returned by rand()) can exceed INT_MAX when
                // summed. Unsigned addition is well defined and wraps; the
                // conversion back to int yields the two's-complement result
                // (implementation-defined before C++20, guaranteed since).
                const unsigned int lhs = static_cast<unsigned int>(input1[i]);
                const unsigned int rhs = static_cast<unsigned int>(input2[i]);
                out[i] = static_cast<int>(lhs + rhs);
        }
}

/**
 * Benchmark driver for the scalar add().
 *
 * Fills two input arrays with rand() values, calls add() over the whole
 * arrays `count` times, and prints the processor time reported by clock()
 * for those calls, converted to seconds.
 *
 * @return 0 always
 */
int main()
{
        // A constant expression, so the arrays below are ordinary fixed-size
        // arrays rather than variable-length arrays (a compiler extension in
        // C++ that not every compiler accepts).
        const int count = 10000 * 4;

        // Static storage keeps the three large buffers off the stack.
        static int a[count];
        static int b[count];
        static int c[count];

        for(int i = 0; i < count; i += 1)
        {
                a[i] = rand();
        }

        for(int i = 0; i < count; i += 1)
        {
                b[i] = rand();
        }

        // The addition is repeated `count` times so that the measured
        // interval is long enough to be meaningful.
        const clock_t start = clock();
        for(int i = 0; i < count; i += 1)
        {
                add(c, a, b, count);
        }
        const clock_t finish = clock();

        const double duration = static_cast<double>(finish - start) / CLOCKS_PER_SEC;
        printf( "%f seconds\n", duration);

        return 0;
}

输出结果如下:

1.910000 seconds

NEON intrinsics实现:

using namespace std;

/**
 * Element-wise addition of two int arrays using NEON intrinsics.
 *
 * For every i in [0, count) stores input1[i] + input2[i] into out[i].
 * Elements are processed four at a time with 128-bit vector loads, one
 * vector add and one vector store; any remaining 1-3 elements are handled
 * by a scalar loop, so count does not have to be a multiple of 4.
 *
 * @param out     destination array, at least count elements
 * @param input1  first operand array, at least count elements
 * @param input2  second operand array, at least count elements
 * @param count   number of elements to process; nothing is done if count <= 0
 */
void add_neon(int* out, int* input1, int* input2, int count)
{
        int i = 0;

        // Vector part: run only while a full group of four elements remains,
        // so no load or store ever touches memory past element count - 1.
        for(; count - i >= 4; i += 4)
        {
                const int32x4_t input1_neon = vld1q_s32(input1 + i);
                const int32x4_t input2_neon = vld1q_s32(input2 + i);
                const int32x4_t out_neon = vaddq_s32(input1_neon, input2_neon);
                vst1q_s32(out + i, out_neon);
        }

        // Scalar tail for the last count % 4 elements. The addition is done
        // in unsigned arithmetic so an overflowing sum wraps instead of being
        // undefined behaviour (vaddq_s32 is a non-saturating add, so this is
        // expected to match the vector lanes).
        for(; i < count; i += 1)
        {
                const unsigned int lhs = static_cast<unsigned int>(input1[i]);
                const unsigned int rhs = static_cast<unsigned int>(input2[i]);
                out[i] = static_cast<int>(lhs + rhs);
        }
}

/**
 * Benchmark driver for the NEON add_neon().
 *
 * Fills two input arrays with rand() values, calls add_neon() over the
 * whole arrays `count` times, and prints the processor time reported by
 * clock() for those calls, converted to seconds.
 *
 * @return 0 always
 */
int main()
{
        // A constant expression, so the arrays below are ordinary fixed-size
        // arrays rather than variable-length arrays (a compiler extension in
        // C++ that not every compiler accepts).
        const int count = 10000 * 4;

        // Static storage keeps the three large buffers off the stack.
        static int a[count];
        static int b[count];
        static int c[count];

        for(int i = 0; i < count; i += 1)
        {
                a[i] = rand();
        }

        for(int i = 0; i < count; i += 1)
        {
                b[i] = rand();
        }

        // The addition is repeated `count` times so that the measured
        // interval is long enough to be meaningful.
        const clock_t start = clock();
        for(int i = 0; i < count; i += 1)
        {
                add_neon(c, a, b, count);
        }
        const clock_t finish = clock();

        const double duration = static_cast<double>(finish - start) / CLOCKS_PER_SEC;
        printf( "%f seconds for neon\n", duration);

        return 0;
}

输出结果:

0.360000 seconds for neon

可以看出，使用NEON intrinsics实现后，运行时间从1.91秒降低到0.36秒（约5.3倍加速），性能有了明显的提升。

搜索结果
找到“0”个结果

当前产品无相关内容

未找到相关内容,请尝试其他搜索词