[Bug middle-end/29756] SSE intrinsics hard to use without redundant temporaries appearing

rguenth at gcc dot gnu.org Thu, 12 May 2016 06:40:48 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=29756


--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> ---
So I have it down to an x86 combine issue:

;; v_28 = BIT_FIELD_INSERT <v_27(D), _30, 0 (32 bits)>;

(insn 7 6 8 (set (reg:SF 116)
        (vec_select:SF (reg/v:V4SF 115 [ v ])
            (parallel [
                    (const_int 0 [0])
                ]))) t.c:5 -1
     (nil))

(insn 8 7 9 (set (reg:V4SF 117)
        (reg/v:V4SF 109 [ v ])) t.c:11 -1
     (nil))

(insn 9 8 10 (set (reg:V4SF 117)
        (vec_merge:V4SF (vec_duplicate:V4SF (reg:SF 116))
            (reg:V4SF 117)
            (const_int 1 [0x1]))) t.c:11 -1
     (nil))

(insn 10 9 0 (set (reg/v:V4SF 110 [ v ])
        (reg:V4SF 117)) t.c:11 -1
     (nil))

that's from what vec_set_optab produces

;; _29 = __builtin_ia32_shufps (v_28, v_28, 0);

(insn 11 10 12 (set (reg:V4SF 119)
        (reg/v:V4SF 110 [ v ])) t.c:12 -1
     (nil))

(insn 12 11 13 (set (reg:V4SF 120)
        (reg/v:V4SF 110 [ v ])) t.c:12 -1
     (nil))

(insn 13 12 14 (set (reg:V4SF 118)
        (vec_select:V4SF (vec_concat:V8SF (reg:V4SF 119)
                (reg:V4SF 120))
            (parallel [
                    (const_int 0 [0])
                    (const_int 0 [0])
                    (const_int 4 [0x4])
                    (const_int 4 [0x4])
                ]))) t.c:12 -1
     (nil))

(insn 14 13 0 (set (reg:V4SF 111 [ _29 ])
        (reg:V4SF 118)) t.c:12 -1
     (nil))

and that's the shuffle.  And after combine we have

(insn 7 4 53 2 (set (reg:SF 116)
        (vec_select:SF (reg/v:V4SF 115 [ v ])
            (parallel [
                    (const_int 0 [0])
                ]))) t.c:5 2423 {*vec_extractv4sf_0}
     (nil))
(insn 9 53 13 2 (set (reg:V4SF 117 [ v ])
        (vec_merge:V4SF (vec_duplicate:V4SF (reg:SF 116))
            (const_vector:V4SF [
                    (const_double:SF 0.0 [0x0.0p+0])
                    (const_double:SF 0.0 [0x0.0p+0])
                    (const_double:SF 0.0 [0x0.0p+0])
                    (const_double:SF 0.0 [0x0.0p+0])
                ])
            (const_int 1 [0x1]))) t.c:11 2420 {vec_setv4sf_0}
     (expr_list:REG_DEAD (reg:SF 116)
        (nil)))
(insn 13 9 15 2 (set (reg:V4SF 118)
        (vec_select:V4SF (vec_concat:V8SF (reg:V4SF 117 [ v ])
                (reg:V4SF 117 [ v ]))
            (parallel [
                    (const_int 0 [0])
                    (const_int 0 [0])
                    (const_int 4 [0x4])
                    (const_int 4 [0x4])
                ]))) t.c:12 2405 {sse_shufps_v4sf}
     (expr_list:REG_DEAD (reg:V4SF 117 [ v ])
        (nil)))

which combine doesn't manage to get down to

(insn 9 4 13 2 (set (reg:V4SF 104)
        (vec_select:V4SF (vec_concat:V8SF (reg/v:V4SF 103 [ v ])
                (reg/v:V4SF 103 [ v ]))
            (parallel [
                    (const_int 0 [0])
                    (const_int 0 [0])
                    (const_int 4 [0x4])
                    (const_int 4 [0x4])
                ]))) t.c:18 2405 {sse_shufps_v4sf}
     (nil))



The testcase was the following.

#include <emmintrin.h>

// Returns element N of the vector `v` as a scalar float.
// `v` is taken by value; its address is reinterpreted as a pointer to
// const float and indexed with the compile-time constant N.
template <int N> inline float component(__v4sf v)
{
  return (reinterpret_cast<const float*>(&v))[N];
}

// Builds a vector from the scalar `f` by storing it into the first float of
// a local vector and then shuffling that vector with itself using mask 0.
inline __v4sf fill(float f)
{
  // Only the first float of `v` is written below; the remaining elements
  // are left uninitialized.
  __v4sf v;
  *(reinterpret_cast<float*>(&v))=f;
  // Both shuffle operands are `v` and the selector immediate is 0.  The RTL
  // quoted in the report for this call selects indices {0, 0, 4, 4} of the
  // concatenation (v, v), i.e. the first element of `v` for every lane.
  return ((__m128) __builtin_ia32_shufps ((__v4sf)(v), (__v4sf)(v), 0));
}

// Shuffles `v` with itself using an immediate derived from the compile-time
// constant N, without going through a scalar or a temporary vector.
// The immediate places N at bit offsets 6, 4, 2 and 0, so for N in 0..3 (the
// values used by transform_good) every 2-bit field of the mask equals N.
// NOTE(review): presumably this broadcasts element N to all four lanes, per
// the SHUFPS immediate encoding -- confirm against the instruction reference.
template <int N> inline __v4sf component_fill(__v4sf v)
{
  return ((__m128) __builtin_ia32_shufps ((__v4sf)(v), (__v4sf)(v), 
                  ((((N) << 6) | ((N) << 4) | ((N) << 2) | (N)))));
}

// Computes m[0]*s0 + m[1]*s1 + m[2]*s2 + m[3]*s3, where each s_i is produced
// by extracting element i of `v` as a scalar (component<i>) and expanding it
// back into a vector with fill().
// NOTE(review): judging by the name and the bug title, this is the variant
// for which redundant temporaries appear in the generated code.
__v4sf transform_bad(__v4sf m[4],__v4sf v)
{
  return m[0]*fill(component<0>(v))
      +m[1]*fill(component<1>(v))
      +m[2]*fill(component<2>(v))
      +m[3]*fill(component<3>(v));
}

// Computes m[0]*s0 + m[1]*s1 + m[2]*s2 + m[3]*s3, where each s_i is produced
// directly from `v` by a single shuffle (component_fill<i>), with no scalar
// extraction in the source.
// NOTE(review): judging by the name, this is the reference variant whose
// generated code is the desired result.
__v4sf transform_good(__v4sf m[4],__v4sf v)
{
  return m[0]*component_fill<0>(v)
      +m[1]*component_fill<1>(v)
      +m[2]*component_fill<2>(v)
      +m[3]*component_fill<3>(v);
}

[Bug middle-end/29756] SSE intrinsics hard to use without redundant temporaries appearing

Reply via email to