https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99398

            Bug ID: 99398
           Summary: Miss to optimize vector permutation fed by CTOR and
                    CTOR/CST
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: linkw at gcc dot gnu.org
  Target Milestone: ---

#include "altivec.h"

/* Reproducer for the missed optimization: two vectors are built from a
   scalar plus a constant zero, then vec_perm combines bytes from both.
   The report expects the whole sequence to fold to the single
   constructor {a, b}.  */
vector long long foo(long long a, long long b) {
  vector long long v1 = {a, 0};
  vector long long v2 = {b, 0};
  /* Permute control vector: selectors 0-7 refer to bytes of the first
     vec_perm operand and 16-23 to bytes of the second, i.e. one
     doubleword is taken from each input.  */
  vector unsigned char vc = {0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
  /* The casts only reinterpret the doubleword vectors as byte vectors for
     vec_perm and back again; they show up as VIEW_CONVERT_EXPR in GIMPLE.  */
  vector long long vres = (vector long long)vec_perm ((vector unsigned char)v1,
(vector unsigned char)v2, vc);
  return vres;
}

With gcc -Ofast -mcpu=power9, it generates the following (asm shown for big-endian):

        mtvsrdd 32,3,9
        mtvsrdd 33,4,9
        lxv 34,0(10)
        vperm 2,0,1,2
        blr

But it can be optimized into:

        mtvsrdd 34,3,4
        blr

The GIMPLE in the optimized dump looks like:

__vector long foo (long long int a, long long int b)
{
  __vector long vres;
  __vector long v2;
  __vector long v1;
  __vector unsigned char _5;
  __vector unsigned char _6;
  __vector unsigned char _7;

  <bb 2> [local count: 1073741824]:
  v1_2 = {a_1(D), 0};
  v2_4 = {b_3(D), 0};
  _5 = VIEW_CONVERT_EXPR<__vector unsigned char>(v1_2);
  _6 = VIEW_CONVERT_EXPR<__vector unsigned char>(v2_4);
  _7 = VEC_PERM_EXPR <_5, _6, { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }>;
  vres_8 = VIEW_CONVERT_EXPR<__vector long>(_7);
  return vres_8;

}

But it could be simplified to:

__vector long foo (long long int a, long long int b)
{
  vector(2) long long int _10;

  <bb 2> [local count: 1073741824]:
  _10 = {a_1(D), b_3(D)};
  return _10;

}

Reply via email to