PR 54300 is a problem in regcprop where the compiler sees
(parallel [(set (x) (y)
(set (y) (x)]) (REG_UNUSED (y))
as a single-set insn (since the other operand, y, is not used) and
replaces a use of x with a use of y. However, it fails to take into
account that y has been clobbered in the insn itself.
I considered changing single_set() to not return this case, but then
decided that would potentially cause missed optimization opportunities
in passes like combine which do know how to deal with cases like this.
The fix consists of two parts:
a) Spotting the unused sets and ensuring that their values are killed in
the value chains
b) Disabling the simple-move optimization when we've killed something in a).
The test is unfortunately ARM specific -- I'm not aware of any generic
code that triggers this.
gcc/
PR rtl-optimization/54300
* regcprop.c (copyprop_hardreg_forward_1): Ensure any unused
outputs in a single-set are killed from the value chains.
gcc/testsuite:
PR rtl-optimization/54300
* gcc.target/arm/pr54300.C: New test.
Bootstrapped/tested on x86_64 and tested on arm-eabi.
R.
Index: regcprop.c
===================================================================
--- regcprop.c (revision 204974)
+++ regcprop.c (working copy)
@@ -747,6 +747,7 @@ copyprop_hardreg_forward_1 (basic_block
int n_ops, i, alt, predicated;
bool is_asm, any_replacements;
rtx set;
+ rtx link;
bool replaced[MAX_RECOG_OPERANDS];
bool changed = false;
struct kill_set_value_data ksvd;
@@ -815,6 +816,23 @@ copyprop_hardreg_forward_1 (basic_block
if (recog_op_alt[i][alt].earlyclobber)
kill_value (recog_data.operand[i], vd);
+ /* If we have dead sets in the insn, then we need to note these as we
+ would clobbers. */
+ for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
+ {
+ if (REG_NOTE_KIND (link) == REG_UNUSED)
+ {
+ kill_value (XEXP (link, 0), vd);
+ /* Furthermore, if the insn looked like a single-set,
+ but the dead store kills the source value of that
+ set, then we can no-longer use the plain move
+ special case below. */
+ if (set
+ && reg_overlap_mentioned_p (XEXP (link, 0), SET_SRC (set)))
+ set = NULL;
+ }
+ }
+
/* Special-case plain move instructions, since we may well
be able to do the move from a different register class. */
if (set && REG_P (SET_SRC (set)))
Index: testsuite/gcc.target/arm/pr54300.C
===================================================================
--- testsuite/gcc.target/arm/pr54300.C (revision 0)
+++ testsuite/gcc.target/arm/pr54300.C (revision 0)
@@ -0,0 +1,61 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+#include <stdlib.h>
+
+struct __attribute__ ((aligned(8))) _v16u8_ {
+ uint8x16_t val;
+ _v16u8_( const int16x8_t &src) { val = vreinterpretq_u8_s16(src); }
+ operator int16x8_t () const { return vreinterpretq_s16_u8(val); }
+};
+typedef struct _v16u8_ v16u8;
+
+struct __attribute__ ((aligned(4))) _v8u8_ {
+ uint8x8_t val;
+ _v8u8_( const uint8x8_t &src) { val = src; }
+ operator int16x4_t () const { return vreinterpret_s16_u8(val); }
+};
+typedef struct _v8u8_ v8u8;
+
+typedef v16u8 v8i16;
+typedef int32x4_t v4i32;
+typedef const short cv1i16;
+typedef const unsigned char cv1u8;
+typedef const v8i16 cv8i16;
+
+static inline __attribute__((always_inline)) v8u8 zero_64(){ return vdup_n_u8(
0 ); }
+
+static inline __attribute__((always_inline)) v8i16 loadlo_8i16( cv8i16* p ){
+ return vcombine_s16( vld1_s16( (cv1i16 *)p ), zero_64() );
+}
+static inline __attribute__((always_inline)) v8i16 _loadlo_8i16( cv8i16* p,
int offset ){
+ return loadlo_8i16( (cv8i16*)(&((cv1u8*)p)[offset]) );
+}
+
+void __attribute__((noinline))
+test(unsigned short *_Inp, int32_t *_Out,
+ unsigned int s1v, unsigned int dv0,
+ unsigned int smask_v)
+{
+ int32x4_t c = vdupq_n_s32(0);
+
+ for(unsigned int sv=0 ; sv!=dv0 ; sv=(sv+s1v)&smask_v )
+ {
+ int32x4_t s;
+ s = vmovl_s16( vget_low_s16( _loadlo_8i16( (cv8i16*) _Inp, sv ) ) );
+ c = vaddq_s32( c, s );
+ }
+ vst1q_s32( _Out, c );
+}
+
+main()
+{
+ unsigned short a[4] = {1, 2, 3, 4};
+ int32_t b[4] = {0, 0, 0, 0};
+ test(a, b, 1, 1, ~0);
+ if (b[0] != 1 || b[1] != 2 || b[2] != 3 || b[3] != 4)
+ abort();
+}