https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
Richard Biener <rguenth at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Blocks| |53947 Target| |x86_64-*-* Keywords| |missed-optimization --- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> --- #pragma omp declare simd simdlen(16) inbranch __attribute__((noinline)) int foo (int a, int b) { return a + b; } we present if-conversion with the following which I think is good: <bb 3> [local count: 1073741824]: # iter.49_5 = PHI <0(2), iter.49_6(7)> # ivtmp_19 = PHI <16(2), ivtmp_18(7)> b_2 = b.45[iter.49_5]; a_1 = a.44[iter.49_5]; _8 = mask.48_7(D) >> iter.49_5; _9 = _8 & 1; if (_9 == 0) goto <bb 8>; [20.00%] else goto <bb 4>; [80.00%] <bb 8> [local count: 214748360]: goto <bb 5>; [100.00%] <bb 4> [local count: 858993464]: _3 = a_1 + b_2; retval.43[iter.49_5] = _3; <bb 5> [local count: 1073741824]: iter.49_6 = iter.49_5 + 1; ivtmp_18 = ivtmp_19 - 1; if (ivtmp_18 != 0) goto <bb 7>; [100.00%] else goto <bb 6>; [0.00%] and we if-convert the body to <bb 3> [local count: 1073741824]: # iter.49_5 = PHI <iter.49_6(7), 0(15)> # ivtmp_19 = PHI <ivtmp_18(7), 16(15)> b_2 = b.45[iter.49_5]; a_1 = a.44[iter.49_5]; _8 = mask.48_7(D) >> iter.49_5; _9 = _8 & 1; _22 = _9 == 0; _36 = (unsigned int) a_1; _37 = (unsigned int) b_2; _38 = _36 + _37; _3 = (int) _38; _39 = ~_22; _40 = &retval.43[iter.49_5]; .MASK_STORE (_40, 32B, _39, _3); iter.49_6 = iter.49_5 + 1; ivtmp_18 = ivtmp_19 - 1; if (ivtmp_18 != 0) goto <bb 7>; [100.00%] else goto <bb 6>; [0.00%] but rather than recovering the mask in its original form we vectorize this as vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)}; <bb 3> [local count: 10631108]: # vect_vec_iv_.125_42 = PHI <_43(7), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }(2)> # vectp_b.126_44 = PHI <vectp_b.126_45(7), &b.45(2)> # vectp_a.129_47 = PHI <vectp_a.129_48(7), &a.44(2)> # vectp_retval.140_61 = PHI <vectp_retval.140_62(7), &retval.43(2)> # ivtmp_64 = PHI <ivtmp_65(7), 0(2)> _43 = vect_vec_iv_.125_42 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; vect_b_2.128_46 = MEM <vector(16) int> [(int *)vectp_b.126_44]; vect_a_1.131_49 = MEM <vector(16) int> [(int *)vectp_a.129_47]; vect__8.132_51 = vect_cst__50 >> vect_vec_iv_.125_42; vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; mask__22.134_55 = vect__9.133_53 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(vect_a_1.131_49); vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(vect_b_2.128_46); vect__38.137_58 = vect__36.135_56 + vect__37.136_57; vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58); mask__39.139_60 = ~mask__22.134_55; if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) goto <bb 20>; [20.00%] else goto <bb 21>; [80.00%] <bb 21> [local count: 8504886]: .MASK_STORE (vectp_retval.140_61, 512B, mask__39.139_60, vect__3.138_59); <bb 20> [local count: 10631108]: vectp_b.126_45 = vectp_b.126_44 + 64; vectp_a.129_48 = vectp_a.129_47 + 64; vectp_retval.140_62 = vectp_retval.140_61 + 64; ivtmp_65 = ivtmp_64 + 1; if (ivtmp_65 < 1) goto <bb 7>; [0.00%] else goto <bb 17>; [100.00%] in the end leaving us with <bb 2> [local count: 1073741824]: vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)}; vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.46_13(D)); vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.47_15(D)); vect__38.137_58 = vect__36.135_56 + vect__37.136_57; vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58); mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) goto <bb 4>; [20.00%] else goto <bb 3>; [80.00%] <bb 3> [local count: 858993419]: .MASK_STORE (&retval.43, 512B, mask__39.139_60, vect__3.138_59); <bb 4> [local count: 1073741824]: _10 = VIEW_CONVERT_EXPR<vector(16) int>(retval.43); return _10; and _ZGVeM16vv_foo: .LFB4: .cfi_startproc movl $1, %eax pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 vpbroadcastd %edi, %zmm2 vpaddd %zmm1, %zmm0, %zmm0 vpbroadcastd %eax, %zmm3 movq %rsp, %rbp .cfi_def_cfa_register 6 andq $-64, %rsp vpsrlvd .LC0(%rip), %zmm2, %zmm2 vpxor %xmm1, %xmm1, %xmm1 vpandd %zmm3, %zmm2, %zmm2 vpcmpd $4, %zmm1, %zmm2, %k1 kortestw %k1, %k1 je .L25 vmovdqa32 %zmm0, -64(%rsp){%k1} .L25: vmovdqa32 -64(%rsp), %zmm0 leave .cfi_def_cfa 7, 8 ret Referenced Bugs: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947 [Bug 53947] [meta-bug] vectorizer missed-optimizations