说明

识别 minmax 和 uzp1/uzp2 指令联合优化机会，减少指令数从而提升性能。

使用方法

使用-fconvert-minmax选项使能minmax优化；uzp1/uzp2指令优化在-O3及以上优化等级默认使能。

注：依赖-O3及以上优化等级。

结果

测试用例如下：

typedef unsigned char uint8_t;
typedef long int intptr_t;
typedef signed short int int16_t;

/* Clamp X to the range [0, 255] and return it as uint8_t.

   If X has no bits set outside the low 8 bits it is already in range and is
   returned unchanged.  Otherwise (-x)>>31 & 255 picks the bound: for X > 255
   the negated value is negative, so the shift yields all ones and the mask
   gives 255; for X < 0 the negated value is positive, so the result is 0.

   NOTE(review): this relies on int being 32 bits wide and on >> of a negative
   value being an arithmetic shift (implementation-defined in ISO C); -x also
   overflows for x == INT_MIN.  The expression is kept in exactly this form
   because it is the input of the test case.  */
static __attribute__((always_inline)) inline uint8_t clip (int x )
{
    return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
}

/* Apply a 6-tap filter with weights (1, -5, 20, 20, -5, 1) to SRC, row by
   row, producing three outputs per row:

     DSTV - SRC filtered across rows (taps at x-2*stride .. x+3*stride),
            rounded with (v + 16) >> 5 and clipped to 8 bits;
     DSTC - the unrounded row-direction results (kept in BUF) filtered again
            along the row, rounded with (+ 512) >> 10 and clipped;
     DSTH - SRC filtered along the row (taps at x-2 .. x+3), rounded with
            (+ 16) >> 5 and clipped.

   STRIDE is the element distance between consecutive rows; WIDTH and HEIGHT
   give the number of columns and rows processed.  After each row DSTH, DSTV,
   DSTC and SRC are advanced by STRIDE.

   The first inner loop runs x from -2 to WIDTH+2, so it writes DSTV[-2 ..
   WIDTH+2] and BUF[0 .. WIDTH+4], and reads SRC at the same x offsets on rows
   -2 .. +3 relative to the current one.  The caller must supply buffers that
   are valid over those ranges.  */
void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
     intptr_t stride, int width, int height, int16_t *buf)
{
    /* The condition (8 > 9) is always false, so pad is always 0 here.  */
    const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
    for( int y = 0; y < height; y++ ) {
        /* This loop is not being vectorized now.  */
        for( int x = -2; x < width+3; x++ ) {
            /* The weights sum to 32, hence the >> 5 normalisation below.  */
            int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
             + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
            dstv[x] = clip ( (v + 16) >> 5 );
            /* Keep the unrounded value (shifted by 2 so the index starts at 0)
               for the second filter pass.  */
            buf[x+2] = v + pad;
        }

        /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN.  */
        /* Second pass over buf: the pad added to each of the six taps is
           removed again by - 32*pad (the weights sum to 32), and two passes
           scale by 32*32, hence + 512 and >> 10.  */
        for( int x = 0; x < width; x++ )
            dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
                  + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
                 - 32*pad + 512) >> 10);

        /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN.  */
        for( int x = 0; x < width; x++ )
            dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
                  + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
                 + 16) >> 5);

        /* Advance all row pointers to the next row.  */
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src += stride;
    }
}

测试命令：

gcc -O3 -fconvert-minmax -S test.c -o test.s

图1 选项未打开

图2 选项已经打开

相比选项未打开时，选项打开后生成的汇编代码使用了smax、umin和uzp1指令。

选项 -fconvert-minmax

说明

使用方法

结果