[Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled

hubicka at gcc dot gnu.org via Gcc-bugs Wed, 16 Nov 2022 06:10:17 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715


            Bug ID: 107715
           Summary: TSVC s161 for double runs at zen4 30 times slower when
                    vectorization is enabled
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

jh@alberti:~/tsvc/bin> more test.c
/*
 * Reduced reproducer for the slowdown described in this report (TSVC s161,
 * double precision).  The program only runs the kernel; it prints nothing
 * and never reads the results back.
 */
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
/* LEN_2D is not referenced anywhere in this reduced testcase. */
#define LEN_2D 256
/*
 * File-scope arrays without initializers: they have static storage duration
 * and are therefore zero-initialized before main() runs.
 */
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
/*
 * Runs the conditional-store kernel iterations/2 times over the arrays.
 * Always returns 0.
 */
int
main()
{
    /* Outer loop only repeats the kernel to make the run time measurable. */
    for (int nl = 0; nl < iterations/2; nl++) {
        /* Stops at LEN_1D-2 so that the c[i+1] store below stays in bounds. */
        for (int i = 0; i < LEN_1D-1; ++i) {
            /*
             * NOTE(review): b is all zeros here, so this comparison is false
             * and the a[i] store path below is the one executed -- this looks
             * at odds with the remark in the report text that the jump to L20
             * is always taken; confirm.
             */
            if (b[i] < (real_t)0.) {
                goto L20;
            }
            /* b[i] >= 0 path: update a[i] from c, d and e. */
            a[i] = c[i] + d[i] * e[i];
            goto L10;
L20:
            /*
             * b[i] < 0 path: store to the NEXT element of c.  That element is
             * read as c[i] by the other path on the following iteration, so
             * the two paths are coupled across iterations.
             */
            c[i+1] = a[i] + d[i] * d[i];
L10:
            /* Empty statement: a label must be followed by a statement. */
            ;
        }
    }
    return 0;
}

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.170s
user    0m1.170s
sys     0m0.000s

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c 
jh@alberti:~/tsvc/bin> time ./a.out

real    0m37.269s
user    0m37.258s
sys     0m0.004s


It is not quite clear to me why this happens. It seems that nearly all the
time is attributed to a single vmovapd:
       │ b0:┌─→vmovapd     0x6bc880(%rax),%zmm2
       │    │  vmovapd     0x63f880(%rax),%zmm0
  0.00 │    │  vcmpltpd    %zmm1,%zmm2,%k1     
       │    │  vmovapd     0x6fb080(%rax),%zmm2
       │    │  vfmadd132pd %zmm0,%zmm2,%zmm0   
       │    │  vmovapd     0x6bc880(%rax),%zmm2
       │    │  vmovupd     %zmm0,0x67e088(%rax){%k1}
 99.94 │    │  vmovapd     0x63f880(%rax),%zmm0
       │    │  add         $0x40,%rax          
       │    │  vcmpgepd    %zmm1,%zmm2,%k1     
       │    │  vmovapd     0x67e040(%rax),%zmm2
  0.02 │    │  vfmadd132pd 0x601040(%rax),%zmm2,%zmm0
  0.04 │    │  vmovapd     0x6fb040(%rax),%zmm2
  0.00 │    │  vblendmpd   %zmm0,%zmm2,%zmm0{%k1}
       │    │  vmovapd     %zmm0,0x6fb040(%rax)  
       │    │  cmp         $0x3e7c0,%rax         
       │    └──jne         b0   

Since I do not initialize the array in reduced testcase we always execute the
jump to L20.

Extending the testcase with array initialization:
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
/* LEN_2D is not referenced anywhere in this testcase. */
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
/* Sentinel values for the `stride` argument of set_1d_array(). */
enum {SET1D_RECIP_IDX = -1, SET1D_RECIP_IDX_SQ = -2};
/*
 * Fill the first `length` elements of `arr` according to `stride`:
 *
 *   SET1D_RECIP_IDX     arr[i] = 1 / (i+1)
 *   SET1D_RECIP_IDX_SQ  arr[i] = 1 / (i+1)^2
 *   stride > 0          arr[i] = value for i = 0, stride, 2*stride, ...;
 *                       the elements in between are left untouched
 *   any other stride    no element is written
 *
 * `value` is only used in the positive-stride case.  A non-positive
 * `length` writes nothing.
 */
void set_1d_array(real_t * arr, int length, real_t value, int stride)
{
    if (stride == SET1D_RECIP_IDX) {
        for (int i = 0; i < length; i++) {
            arr[i] = 1. / (real_t) (i+1);
        }
    } else if (stride == SET1D_RECIP_IDX_SQ) {
        for (int i = 0; i < length; i++) {
            /*
             * Square in floating point: (i+1)*(i+1) evaluated in int
             * overflows (undefined behavior) once i+1 exceeds 46340.
             * For smaller indices both forms give the same exact value.
             */
            const real_t k = (real_t) (i+1);
            arr[i] = 1. / (k * k);
        }
    } else if (stride > 0) {
        /*
         * Requiring a positive stride keeps the loop finite (stride == 0
         * would never advance) and keeps the index from going negative.
         */
        for (int i = 0; i < length; i += stride) {
            arr[i] = value;
        }
    }
}
/*
 * Same kernel as the reduced testcase, but with every array explicitly
 * filled first.  Runs the kernel iterations/2 times and returns 0; the
 * results are never printed or otherwise consumed.
 */
int
main()
{
    /* value 1. with stride 1: every element of each array is set to 1.0. */
    set_1d_array(a, LEN_1D, 1.,1);
    set_1d_array(b, LEN_1D, 1.,1);
    set_1d_array(c, LEN_1D, 1.,1);
    set_1d_array(d, LEN_1D, 1.,1);
    set_1d_array(e, LEN_1D, 1.,1);
    /* Outer loop only repeats the kernel to make the run time measurable. */
    for (int nl = 0; nl < iterations/2; nl++) {
        /* Stops at LEN_1D-2 so that the c[i+1] store below stays in bounds. */
        for (int i = 0; i < LEN_1D-1; ++i) {
            /* b is all 1.0 here, so this test is false on every iteration. */
            if (b[i] < (real_t)0.) {
                goto L20;
            }
            /* b[i] >= 0 path: update a[i] from c, d and e. */
            a[i] = c[i] + d[i] * e[i];
            goto L10;
L20:
            /*
             * b[i] < 0 path: store to the NEXT element of c, which the other
             * path reads as c[i] on the following iteration.
             */
            c[i+1] = a[i] + d[i] * d[i];
L10:
            /* Empty statement: a label must be followed by a statement. */
            ;
        }
    }
    return 0;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c 
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out

real    0m0.910s
user    0m0.910s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c  
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.866s
user    0m1.866s
sys     0m0.000s
jh@alberti:~/tsvc/bin> 

This still shows about a 2x regression when vectorization is enabled.

[Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled

Reply via email to