识别并简化向量化过程中生成的冗余指令。
使用参数 --param=tree-forwprop-perm=1 使能该优化，默认值为 0（不使能）。
测试用例如下:
#include <stdio.h>
#include <stdlib.h>

/* Local type aliases used by the test case. */
typedef unsigned short int sum_t;
/* NOTE(review): the packing below assumes sum2_t is twice as wide as sum_t. */
typedef unsigned int sum2_t;
typedef long int intptr_t;
typedef unsigned char data;

/* Width of one sum_t value, in bits. */
#define BITS_PER_SUM (8 * sizeof(sum_t))

/*
 * Treat `a` as two sum_t-wide halves packed into one sum2_t.
 *
 * `s` is derived from the top bit of each half: a half whose top bit is set
 * contributes an all-ones sum_t mask, a half whose top bit is clear
 * contributes zero.  The result (a + s) ^ s negates the flagged halves, so
 * the apparent intent is a per-half absolute value computed without branches.
 */
static sum2_t bar(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
    return (a + s) ^ s;
}

/*
 * Compute a sum over the transformed differences of two 8x4 byte blocks.
 *
 * pix1, pix2     - first byte of each block; 4 rows of 8 bytes are read.
 * i_pix1, i_pix2 - distance in bytes between consecutive rows of each block.
 *
 * Returns the low sum_t half of the accumulated sum plus its high half,
 * shifted right by one.
 */
int foo(data *pix1, intptr_t i_pix1, data *pix2, intptr_t i_pix2)
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;

    /*
     * Row pass: each aN packs the difference of column N in the low half and
     * the difference of column N + 4 in the high half, then the four packed
     * values go through two add/subtract (butterfly) stages.
     */
    for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        sum2_t t0 = a0 + a1;
        sum2_t t1 = a0 - a1;
        sum2_t t2 = a2 + a3;
        sum2_t t3 = a2 - a3;
        tmp[i][0] = t0 + t2;
        tmp[i][2] = t0 - t2;
        tmp[i][1] = t1 + t3;
        tmp[i][3] = t1 - t3;
    }

    /*
     * Column pass: the same add/subtract stages applied down each column of
     * tmp, followed by accumulation of bar() over the four results.
     */
    for (int i = 0; i < 4; i++) {
        sum2_t t0 = tmp[0][i] + tmp[1][i];
        sum2_t t1 = tmp[0][i] - tmp[1][i];
        sum2_t t2 = tmp[2][i] + tmp[3][i];
        sum2_t t3 = tmp[2][i] - tmp[3][i];
        a0 = t0 + t2;
        a2 = t0 - t2;
        a1 = t1 + t3;
        a3 = t1 - t3;
        sum += bar(a0) + bar(a1) + bar(a2) + bar(a3);
    }

    /* Fold the two packed halves of the accumulator together and halve. */
    return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}
测试命令:
gcc -ftree-slp-transpose-vectorize -O3 -mtune=tsv110 --param=tree-forwprop-perm=1 -S test.c -o test.s
相比选项未打开时,选项打开后,生成的汇编代码指令明显减少。