Re: Rework constant subreg folds and handle more variable-length cases

Richard Biener Thu, 11 Jul 2019 02:31:27 -0700

On Thu, Jul 11, 2019 at 10:03 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> This patch rewrites the way simplify_subreg handles constants.
> It uses similar native_encode/native_decode routines to the
> tree-level handling of VIEW_CONVERT_EXPR, meaning that we can
> move between rtx constants and the target memory image of them.
>
> The main point of this patch is to support subregs of constant-length
> vectors for VLA vectors, beyond the very simple cases that were already
> handled.  Many of the new tests failed before the patch for variable-
> length vectors.
>
> The boolean side is tested more by the upcoming SVE ACLE work.
>
> Tested on aarch64-linux-gnu, aarch64_be-elf and x86_64-linux-gnu.
> OK to install?


Hmm.  So is subreg [offset] defined in terms of memory order or in
terms of register order?  I wonder if you need to handle
FLOAT_WORDS_BIG_ENDIAN, REG_WORDS_BIG_ENDIAN
and whether BYTES/WORDS_BIG_ENDIAN have any meaning here
at all?

I'm always struggling with this when working on BIT_FIELD_REFs
on GIMPLE [registers]...

Richard.

> Richard
>
>
> 2019-07-11  Richard Sandiford  <richard.sandif...@arm.com>
>
> gcc/
>         * defaults.h (TARGET_UNIT): New macro.
>         (target_unit): New type.
>         * rtl.h (native_encode_rtx, native_decode_rtx)
>         (native_decode_vector_rtx, subreg_size_lsb): Declare.
>         (subreg_lsb_1): Turn into an inline wrapper around subreg_size_lsb.
>         * rtlanal.c (subreg_lsb_1): Delete.
>         (subreg_size_lsb): New function.
>         * simplify-rtx.c: Include rtx-vector-builder.h
>         (simplify_immed_subreg): Delete.
>         (native_encode_rtx, native_decode_vector_rtx, native_decode_rtx)
>         (simplify_const_vector_byte_offset, simplify_const_vector_subreg): New
>         functions.
>         (simplify_subreg): Use them.
>         (test_vector_subregs_modes, test_vector_subregs_repeating)
>         (test_vector_subregs_fore_back, test_vector_subregs_stepped)
>         (test_vector_subregs): New functions.
>         (test_vector_ops): Call test_vector_subregs for integer vector
>         modes with at least 2 elements.
>
> Index: gcc/defaults.h
> ===================================================================
> *** gcc/defaults.h      2019-07-11 08:33:57.000000000 +0100
> --- gcc/defaults.h      2019-07-11 08:33:58.069250175 +0100
> *************** #define TARGET_VTABLE_USES_DESCRIPTORS 0
> *** 1459,1462 ****
> --- 1459,1474 ----
>   #define DWARF_GNAT_ENCODINGS_DEFAULT DWARF_GNAT_ENCODINGS_GDB
>   #endif
>
> + /* Done this way to keep gengtype happy.  */
> + #if BITS_PER_UNIT == 8
> + #define TARGET_UNIT uint8_t
> + #elif BITS_PER_UNIT == 16
> + #define TARGET_UNIT uint16_t
> + #elif BITS_PER_UNIT == 32
> + #define TARGET_UNIT uint32_t
> + #else
> + #error Unknown BITS_PER_UNIT
> + #endif
> + typedef TARGET_UNIT target_unit;
> +
>   #endif  /* ! GCC_DEFAULTS_H */
> Index: gcc/rtl.h
> ===================================================================
> *** gcc/rtl.h   2019-07-11 08:33:57.000000000 +0100
> --- gcc/rtl.h   2019-07-11 08:33:58.069250175 +0100
> *************** extern int rtx_cost (rtx, machine_mode,
> *** 2400,2411 ****
>   extern int address_cost (rtx, machine_mode, addr_space_t, bool);
>   extern void get_full_rtx_cost (rtx, machine_mode, enum rtx_code, int,
>                                struct full_rtx_costs *);
>   extern poly_uint64 subreg_lsb (const_rtx);
> ! extern poly_uint64 subreg_lsb_1 (machine_mode, machine_mode, poly_uint64);
>   extern poly_uint64 subreg_size_offset_from_lsb (poly_uint64, poly_uint64,
>                                                 poly_uint64);
>   extern bool read_modify_subreg_p (const_rtx);
>
>   /* Return the subreg byte offset for a subreg whose outer mode is
>      OUTER_MODE, whose inner mode is INNER_MODE, and where there are
>      LSB_SHIFT *bits* between the lsb of the outer value and the lsb of
> --- 2400,2429 ----
>   extern int address_cost (rtx, machine_mode, addr_space_t, bool);
>   extern void get_full_rtx_cost (rtx, machine_mode, enum rtx_code, int,
>                                struct full_rtx_costs *);
> + extern bool native_encode_rtx (machine_mode, rtx, vec<target_unit> &,
> +                              unsigned int, unsigned int);
> + extern rtx native_decode_rtx (machine_mode, vec<target_unit>,
> +                             unsigned int);
> + extern rtx native_decode_vector_rtx (machine_mode, vec<target_unit>,
>                                    unsigned int, unsigned int, unsigned int);
>   extern poly_uint64 subreg_lsb (const_rtx);
> ! extern poly_uint64 subreg_size_lsb (poly_uint64, poly_uint64, poly_uint64);
>   extern poly_uint64 subreg_size_offset_from_lsb (poly_uint64, poly_uint64,
>                                                 poly_uint64);
>   extern bool read_modify_subreg_p (const_rtx);
>
> + /* Given a subreg's OUTER_MODE, INNER_MODE, and SUBREG_BYTE, return the
> +    bit offset at which the subreg begins (counting from the least significant
> +    bit of the operand).  */
> +
> + inline poly_uint64
> + subreg_lsb_1 (machine_mode outer_mode, machine_mode inner_mode,
> +             poly_uint64 subreg_byte)
> + {
> +   return subreg_size_lsb (GET_MODE_SIZE (outer_mode),
> +                         GET_MODE_SIZE (inner_mode), subreg_byte);
> + }
> +
>   /* Return the subreg byte offset for a subreg whose outer mode is
>      OUTER_MODE, whose inner mode is INNER_MODE, and where there are
>      LSB_SHIFT *bits* between the lsb of the outer value and the lsb of
> Index: gcc/rtlanal.c
> ===================================================================
> *** gcc/rtlanal.c       2019-07-11 08:33:57.000000000 +0100
> --- gcc/rtlanal.c       2019-07-11 08:33:58.069250175 +0100
> *************** loc_mentioned_in_p (rtx *loc, const_rtx
> *** 3611,3633 ****
>     return 0;
>   }
>
> ! /* Helper function for subreg_lsb.  Given a subreg's OUTER_MODE, INNER_MODE,
> !    and SUBREG_BYTE, return the bit offset where the subreg begins
> !    (counting from the least significant bit of the operand).  */
>
>   poly_uint64
> ! subreg_lsb_1 (machine_mode outer_mode,
> !             machine_mode inner_mode,
> !             poly_uint64 subreg_byte)
>   {
>     poly_uint64 subreg_end, trailing_bytes, byte_pos;
>
>     /* A paradoxical subreg begins at bit position 0.  */
> !   if (paradoxical_subreg_p (outer_mode, inner_mode))
> !     return 0;
>
> !   subreg_end = subreg_byte + GET_MODE_SIZE (outer_mode);
> !   trailing_bytes = GET_MODE_SIZE (inner_mode) - subreg_end;
>     if (WORDS_BIG_ENDIAN && BYTES_BIG_ENDIAN)
>       byte_pos = trailing_bytes;
>     else if (!WORDS_BIG_ENDIAN && !BYTES_BIG_ENDIAN)
> --- 3611,3641 ----
>     return 0;
>   }
>
> ! /* Reinterpret a subreg as a bit extraction from an integer and return
> !    the position of the least significant bit of the extracted value.
> !    In other words, if the extraction were performed as a shift right
> !    and mask, return the number of bits to shift right.
> !
> !    The outer value of the subreg has OUTER_BYTES bytes and starts at
> !    byte offset SUBREG_BYTE within an inner value of INNER_BYTES bytes.  */
>
>   poly_uint64
> ! subreg_size_lsb (poly_uint64 outer_bytes,
> !                poly_uint64 inner_bytes,
> !                poly_uint64 subreg_byte)
>   {
>     poly_uint64 subreg_end, trailing_bytes, byte_pos;
>
>     /* A paradoxical subreg begins at bit position 0.  */
> !   gcc_checking_assert (ordered_p (outer_bytes, inner_bytes));
> !   if (maybe_gt (outer_bytes, inner_bytes))
> !     {
> !       gcc_checking_assert (known_eq (subreg_byte, 0U));
> !       return 0;
> !     }
>
> !   subreg_end = subreg_byte + outer_bytes;
> !   trailing_bytes = inner_bytes - subreg_end;
>     if (WORDS_BIG_ENDIAN && BYTES_BIG_ENDIAN)
>       byte_pos = trailing_bytes;
>     else if (!WORDS_BIG_ENDIAN && !BYTES_BIG_ENDIAN)
> Index: gcc/simplify-rtx.c
> ===================================================================
> *** gcc/simplify-rtx.c  2019-07-11 08:33:57.000000000 +0100
> --- gcc/simplify-rtx.c  2019-07-11 08:33:58.073250143 +0100
> *************** Software Foundation; either version 3, o
> *** 35,40 ****
> --- 35,41 ----
>   #include "flags.h"
>   #include "selftest.h"
>   #include "selftest-rtl.h"
> + #include "rtx-vector-builder.h"
>
>   /* Simplification and canonicalization of RTL.  */
>
> *************** simplify_ternary_operation (enum rtx_cod
> *** 6092,6433 ****
>     return 0;
>   }
>
> ! /* Evaluate a SUBREG of a CONST_INT or CONST_WIDE_INT or CONST_DOUBLE
> !    or CONST_FIXED or CONST_VECTOR, returning another CONST_INT or
> !    CONST_WIDE_INT or CONST_DOUBLE or CONST_FIXED or CONST_VECTOR.
> !
> !    Works by unpacking INNER_BYTES bytes of OP into a collection of 8-bit values
> !    represented as a little-endian array of 'unsigned char', selecting by BYTE,
> !    and then repacking them again for OUTERMODE.  If OP is a CONST_VECTOR,
> !    FIRST_ELEM is the number of the first element to extract, otherwise
> !    FIRST_ELEM is ignored.  */
>
> ! static rtx
> ! simplify_immed_subreg (fixed_size_mode outermode, rtx op,
> !                      machine_mode innermode, unsigned int byte,
> !                      unsigned int first_elem, unsigned int inner_bytes)
> ! {
> !   enum {
> !     value_bit = 8,
> !     value_mask = (1 << value_bit) - 1
> !   };
> !   unsigned char value[MAX_BITSIZE_MODE_ANY_MODE / value_bit];
> !   int value_start;
> !   int i;
> !   int elem;
> !
> !   int num_elem;
> !   rtx * elems;
> !   int elem_bitsize;
> !   rtx result_s = NULL;
> !   rtvec result_v = NULL;
> !   enum mode_class outer_class;
> !   scalar_mode outer_submode;
> !   int max_bitsize;
>
> !   /* Some ports misuse CCmode.  */
> !   if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (op))
> !     return op;
>
> !   /* We have no way to represent a complex constant at the rtl level.  */
> !   if (COMPLEX_MODE_P (outermode))
> !     return NULL_RTX;
>
> !   /* We support any size mode.  */
> !   max_bitsize = MAX (GET_MODE_BITSIZE (outermode),
> !                    inner_bytes * BITS_PER_UNIT);
>
> !   /* Unpack the value.  */
>
> !   if (GET_CODE (op) == CONST_VECTOR)
>       {
> !       num_elem = CEIL (inner_bytes, GET_MODE_UNIT_SIZE (innermode));
> !       elem_bitsize = GET_MODE_UNIT_BITSIZE (innermode);
>       }
> !   else
>       {
> !       num_elem = 1;
> !       elem_bitsize = max_bitsize;
> !     }
> !   /* If this asserts, it is too complicated; reducing value_bit may help.  */
> !   gcc_assert (BITS_PER_UNIT % value_bit == 0);
> !   /* I don't know how to handle endianness of sub-units.  */
> !   gcc_assert (elem_bitsize % BITS_PER_UNIT == 0);
> !
> !   for (elem = 0; elem < num_elem; elem++)
> !     {
> !       unsigned char * vp;
> !       rtx el = (GET_CODE (op) == CONST_VECTOR
> !               ? CONST_VECTOR_ELT (op, first_elem + elem)
> !               : op);
>
> !       /* Vectors are kept in target memory order.  (This is probably
> !        a mistake.)  */
> !       {
> !       unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT;
> !       unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize)
> !                         / BITS_PER_UNIT);
> !       unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte;
> !       unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte;
> !       unsigned bytele = (subword_byte % UNITS_PER_WORD
> !                        + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD);
> !       vp = value + (bytele * BITS_PER_UNIT) / value_bit;
> !       }
>
> !       switch (GET_CODE (el))
>         {
> !       case CONST_INT:
> !         for (i = 0;
> !              i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize;
> !              i += value_bit)
> !           *vp++ = INTVAL (el) >> i;
> !         /* CONST_INTs are always logically sign-extended.  */
> !         for (; i < elem_bitsize; i += value_bit)
> !           *vp++ = INTVAL (el) < 0 ? -1 : 0;
> !         break;
> !
> !       case CONST_WIDE_INT:
> !         {
> !           rtx_mode_t val = rtx_mode_t (el, GET_MODE_INNER (innermode));
> !           unsigned char extend = wi::sign_mask (val);
> !           int prec = wi::get_precision (val);
> !
> !           for (i = 0; i < prec && i < elem_bitsize; i += value_bit)
> !             *vp++ = wi::extract_uhwi (val, i, value_bit);
> !           for (; i < elem_bitsize; i += value_bit)
> !             *vp++ = extend;
> !         }
> !         break;
>
> !       case CONST_DOUBLE:
> !         if (TARGET_SUPPORTS_WIDE_INT == 0 && GET_MODE (el) == VOIDmode)
>             {
> !             unsigned char extend = 0;
> !             /* If this triggers, someone should have generated a
> !                CONST_INT instead.  */
> !             gcc_assert (elem_bitsize > HOST_BITS_PER_WIDE_INT);
> !
> !             for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit)
> !               *vp++ = CONST_DOUBLE_LOW (el) >> i;
> !             while (i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize)
> !               {
> !                 *vp++
> !                   = CONST_DOUBLE_HIGH (el) >> (i - HOST_BITS_PER_WIDE_INT);
> !                 i += value_bit;
> !               }
> !
> !             if (CONST_DOUBLE_HIGH (el) >> (HOST_BITS_PER_WIDE_INT - 1))
> !               extend = -1;
> !             for (; i < elem_bitsize; i += value_bit)
> !               *vp++ = extend;
>             }
> !         else
> !           {
> !             /* This is big enough for anything on the platform.  */
> !             long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32];
> !             scalar_float_mode el_mode;
> !
> !             el_mode = as_a <scalar_float_mode> (GET_MODE (el));
> !             int bitsize = GET_MODE_BITSIZE (el_mode);
> !
> !             gcc_assert (bitsize <= elem_bitsize);
> !             gcc_assert (bitsize % value_bit == 0);
> !
> !             real_to_target (tmp, CONST_DOUBLE_REAL_VALUE (el),
> !                             GET_MODE (el));
> !
> !             /* real_to_target produces its result in words affected by
> !                FLOAT_WORDS_BIG_ENDIAN.  However, we ignore this,
> !                and use WORDS_BIG_ENDIAN instead; see the documentation
> !                of SUBREG in rtl.texi.  */
> !             for (i = 0; i < bitsize; i += value_bit)
> !               {
> !                 int ibase;
> !                 if (WORDS_BIG_ENDIAN)
> !                   ibase = bitsize - 1 - i;
> !                 else
> !                   ibase = i;
> !                 *vp++ = tmp[ibase / 32] >> i % 32;
> !               }
>
> !             /* It shouldn't matter what's done here, so fill it with
> !                zero.  */
> !             for (; i < elem_bitsize; i += value_bit)
> !               *vp++ = 0;
> !           }
> !         break;
>
> !         case CONST_FIXED:
> !         if (elem_bitsize <= HOST_BITS_PER_WIDE_INT)
> !           {
> !             for (i = 0; i < elem_bitsize; i += value_bit)
> !               *vp++ = CONST_FIXED_VALUE_LOW (el) >> i;
> !           }
> !         else
> !           {
> !             for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit)
> !               *vp++ = CONST_FIXED_VALUE_LOW (el) >> i;
> !               for (; i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize;
> !                  i += value_bit)
> !               *vp++ = CONST_FIXED_VALUE_HIGH (el)
> !                       >> (i - HOST_BITS_PER_WIDE_INT);
> !             for (; i < elem_bitsize; i += value_bit)
> !               *vp++ = 0;
> !           }
> !           break;
>
> !       default:
> !         gcc_unreachable ();
>         }
>       }
>
> !   /* Now, pick the right byte to start with.  */
> !   /* Renumber BYTE so that the least-significant byte is byte 0.  A special
> !      case is paradoxical SUBREGs, which shouldn't be adjusted since they
> !      will already have offset 0.  */
> !   if (inner_bytes >= GET_MODE_SIZE (outermode))
>       {
> !       unsigned ibyte = inner_bytes - GET_MODE_SIZE (outermode) - byte;
> !       unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte;
> !       unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte;
> !       byte = (subword_byte % UNITS_PER_WORD
> !             + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD);
>       }
>
> !   /* BYTE should still be inside OP.  (Note that BYTE is unsigned,
> !      so if it's become negative it will instead be very large.)  */
> !   gcc_assert (byte < inner_bytes);
>
> !   /* Convert from bytes to chunks of size value_bit.  */
> !   value_start = byte * (BITS_PER_UNIT / value_bit);
>
> !   /* Re-pack the value.  */
> !   num_elem = GET_MODE_NUNITS (outermode);
>
> !   if (VECTOR_MODE_P (outermode))
>       {
> !       result_v = rtvec_alloc (num_elem);
> !       elems = &RTVEC_ELT (result_v, 0);
>       }
> !   else
> !     elems = &result_s;
>
> !   outer_submode = GET_MODE_INNER (outermode);
> !   outer_class = GET_MODE_CLASS (outer_submode);
> !   elem_bitsize = GET_MODE_BITSIZE (outer_submode);
>
> !   gcc_assert (elem_bitsize % value_bit == 0);
> !   gcc_assert (elem_bitsize + value_start * value_bit <= max_bitsize);
>
> !   for (elem = 0; elem < num_elem; elem++)
> !     {
> !       unsigned char *vp;
>
> !       /* Vectors are stored in target memory order.  (This is probably
> !        a mistake.)  */
> !       {
> !       unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT;
> !       unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize)
> !                         / BITS_PER_UNIT);
> !       unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte;
> !       unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte;
> !       unsigned bytele = (subword_byte % UNITS_PER_WORD
> !                        + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD);
> !       vp = value + value_start + (bytele * BITS_PER_UNIT) / value_bit;
> !       }
>
> !       switch (outer_class)
> !       {
> !       case MODE_INT:
> !       case MODE_PARTIAL_INT:
> !         {
> !           int u;
> !           int base = 0;
> !           int units
> !             = (GET_MODE_BITSIZE (outer_submode) + HOST_BITS_PER_WIDE_INT - 1)
> !             / HOST_BITS_PER_WIDE_INT;
> !           HOST_WIDE_INT tmp[MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_WIDE_INT];
> !           wide_int r;
>
> !           if (GET_MODE_PRECISION (outer_submode) > MAX_BITSIZE_MODE_ANY_INT)
> !             return NULL_RTX;
> !           for (u = 0; u < units; u++)
> !             {
> !               unsigned HOST_WIDE_INT buf = 0;
> !               for (i = 0;
> !                    i < HOST_BITS_PER_WIDE_INT && base + i < elem_bitsize;
> !                    i += value_bit)
> !                 buf |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i;
>
> !               tmp[u] = buf;
> !               base += HOST_BITS_PER_WIDE_INT;
> !             }
> !           r = wide_int::from_array (tmp, units,
> !                                     GET_MODE_PRECISION (outer_submode));
> ! #if TARGET_SUPPORTS_WIDE_INT == 0
> !           /* Make sure r will fit into CONST_INT or CONST_DOUBLE.  */
> !           if (wi::min_precision (r, SIGNED) > HOST_BITS_PER_DOUBLE_INT)
> !             return NULL_RTX;
> ! #endif
> !           elems[elem] = immed_wide_int_const (r, outer_submode);
> !         }
> !         break;
>
> !       case MODE_FLOAT:
> !       case MODE_DECIMAL_FLOAT:
> !         {
> !           REAL_VALUE_TYPE r;
> !           long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32] = { 0 };
>
> !           /* real_from_target wants its input in words affected by
> !              FLOAT_WORDS_BIG_ENDIAN.  However, we ignore this,
> !              and use WORDS_BIG_ENDIAN instead; see the documentation
> !              of SUBREG in rtl.texi.  */
> !           for (i = 0; i < elem_bitsize; i += value_bit)
> !             {
> !               int ibase;
> !               if (WORDS_BIG_ENDIAN)
> !                 ibase = elem_bitsize - 1 - i;
> !               else
> !                 ibase = i;
> !               tmp[ibase / 32] |= (*vp++ & value_mask) << i % 32;
> !             }
>
> !           real_from_target (&r, tmp, outer_submode);
> !           elems[elem] = const_double_from_real_value (r, outer_submode);
> !         }
> !         break;
>
> !       case MODE_FRACT:
> !       case MODE_UFRACT:
> !       case MODE_ACCUM:
> !       case MODE_UACCUM:
> !         {
> !           FIXED_VALUE_TYPE f;
> !           f.data.low = 0;
> !           f.data.high = 0;
> !           f.mode = outer_submode;
> !
> !           for (i = 0;
> !                i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize;
> !                i += value_bit)
> !             f.data.low |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i;
> !           for (; i < elem_bitsize; i += value_bit)
> !             f.data.high |= ((unsigned HOST_WIDE_INT)(*vp++ & value_mask)
> !                            << (i - HOST_BITS_PER_WIDE_INT));
> !
> !           elems[elem] = CONST_FIXED_FROM_FIXED_VALUE (f, outer_submode);
> !           }
> !           break;
>
> !       default:
> !         gcc_unreachable ();
> !       }
>       }
> !   if (VECTOR_MODE_P (outermode))
> !     return gen_rtx_CONST_VECTOR (outermode, result_v);
> !   else
> !     return result_s;
>   }
>
>   /* Simplify SUBREG:OUTERMODE(OP:INNERMODE, BYTE)
> --- 6093,6534 ----
>     return 0;
>   }
>
> ! /* Try to calculate NUM_BYTES bytes of the target memory image of X,
> !    starting at byte FIRST_BYTE.  Return true on success and add the
> !    bytes to BYTES, such that each byte has BITS_PER_UNIT bits and such
> !    that the bytes follow target memory order.  Leave BYTES unmodified
> !    on failure.
>
> !    MODE is the mode of X.  The caller must reserve NUM_BYTES bytes in
> !    BYTES before calling this function.  */
>
> ! bool
> ! native_encode_rtx (machine_mode mode, rtx x, vec<target_unit> &bytes,
> !                  unsigned int first_byte, unsigned int num_bytes)
> ! {
> !   /* Check the mode is sensible.  */
> !   gcc_assert (GET_MODE (x) == VOIDmode
> !             ? is_a <scalar_int_mode> (mode)
> !             : mode == GET_MODE (x));
> !
> !   if (GET_CODE (x) == CONST_VECTOR)
> !     {
> !       /* CONST_VECTOR_ELT follows target memory order, so no shuffling
> !        is necessary.  The only complication is that MODE_VECTOR_BOOL
> !        vectors can have several elements per byte.  */
> !       unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode),
> !                                                  GET_MODE_NUNITS (mode));
> !       unsigned int elt = first_byte * BITS_PER_UNIT / elt_bits;
> !       if (elt_bits < BITS_PER_UNIT)
> !       {
> !         /* This is the only case in which elements can be smaller than
> !            a byte.  */
> !         gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
> !         for (unsigned int i = 0; i < num_bytes; ++i)
> !           {
> !             target_unit value = 0;
> !             for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits)
> !               {
> !                 value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & 1) << j;
> !                 elt += 1;
> !               }
> !             bytes.quick_push (value);
> !           }
> !         return true;
> !       }
>
> !       unsigned int start = bytes.length ();
> !       unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mode);
> !       /* Make FIRST_BYTE relative to ELT.  */
> !       first_byte %= elt_bytes;
> !       while (num_bytes > 0)
> !       {
> !         /* Work out how many bytes we want from element ELT.  */
> !         unsigned int chunk_bytes = MIN (num_bytes, elt_bytes - first_byte);
> !         if (!native_encode_rtx (GET_MODE_INNER (mode),
> !                                 CONST_VECTOR_ELT (x, elt), bytes,
> !                                 first_byte, chunk_bytes))
> !           {
> !             bytes.truncate (start);
> !             return false;
> !           }
> !         elt += 1;
> !         first_byte = 0;
> !         num_bytes -= chunk_bytes;
> !       }
> !       return true;
> !     }
>
> !   /* All subsequent cases are limited to scalars.  */
> !   scalar_mode smode;
> !   if (!is_a <scalar_mode> (mode, &smode))
> !     return false;
>
> !   /* Make sure that the region is in range.  */
> !   unsigned int end_byte = first_byte + num_bytes;
> !   unsigned int mode_bytes = GET_MODE_SIZE (smode);
> !   gcc_assert (end_byte <= mode_bytes);
>
> !   if (CONST_SCALAR_INT_P (x))
>       {
> !       /* The target memory layout is affected by both BYTES_BIG_ENDIAN
> !        and WORDS_BIG_ENDIAN.  Use the subreg machinery to get the lsb
> !        position of each byte.  */
> !       rtx_mode_t value (x, smode);
> !       for (unsigned int byte = first_byte; byte < end_byte; ++byte)
> !       {
> !         /* Always constant because the inputs are.  */
> !         unsigned int lsb
> !           = subreg_size_lsb (1, mode_bytes, byte).to_constant ();
> !         bytes.quick_push (wi::extract_uhwi (value, lsb, BITS_PER_UNIT));
> !       }
> !       return true;
>       }
> !
> !   if (CONST_DOUBLE_P (x))
>       {
> !       /* real_to_target produces an array of integers in target memory order.
> !        All integers before the last one have 32 bits; the last one may
> !        have 32 bits or fewer, depending on whether the mode bitsize
> !        is divisible by 32.  Each of these integers is then laid out
> !        in target memory as any other integer would be.  */
> !       long el32[MAX_BITSIZE_MODE_ANY_MODE / 32];
> !       real_to_target (el32, CONST_DOUBLE_REAL_VALUE (x), smode);
>
> !       /* The (maximum) number of target bytes per element of el32.  */
> !       unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT;
> !       gcc_assert (bytes_per_el32 != 0);
>
> !       /* Build up the integers in a similar way to the CONST_SCALAR_INT_P
> !        handling above.  */
> !       for (unsigned int byte = first_byte; byte < end_byte; ++byte)
>         {
> !         unsigned int index = byte / bytes_per_el32;
> !         unsigned int subbyte = byte % bytes_per_el32;
> !         unsigned int int_bytes = MIN (bytes_per_el32,
> !                                       mode_bytes - index * bytes_per_el32);
> !         /* Always constant because the inputs are.  */
> !         unsigned int lsb
> !           = subreg_size_lsb (1, int_bytes, subbyte).to_constant ();
> !         bytes.quick_push ((unsigned long) el32[index] >> lsb);
> !       }
> !       return true;
> !     }
>
> !   if (GET_CODE (x) == CONST_FIXED)
> !     {
> !       for (unsigned int byte = first_byte; byte < end_byte; ++byte)
> !       {
> !         /* Always constant because the inputs are.  */
> !         unsigned int lsb
> !           = subreg_size_lsb (1, mode_bytes, byte).to_constant ();
> !         unsigned HOST_WIDE_INT piece = CONST_FIXED_VALUE_LOW (x);
> !         if (lsb >= HOST_BITS_PER_WIDE_INT)
>             {
> !             lsb -= HOST_BITS_PER_WIDE_INT;
> !             piece = CONST_FIXED_VALUE_HIGH (x);
>             }
> !         bytes.quick_push (piece >> lsb);
> !       }
> !       return true;
> !     }
>
> !   return false;
> ! }
>
> ! /* Read a vector of mode MODE from the target memory image given by BYTES,
> !    starting at byte FIRST_BYTE.  The vector is known to be encodable using
> !    NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each,
> !    and BYTES is known to have enough bytes to supply NPATTERNS *
> !    NELTS_PER_PATTERN vector elements.  Each element of BYTES contains
> !    BITS_PER_UNIT bits and the bytes are in target memory order.
>
> !    Return the vector on success, otherwise return NULL_RTX.  */
> !
> ! rtx
> ! native_decode_vector_rtx (machine_mode mode, vec<target_unit> bytes,
> !                         unsigned int first_byte, unsigned int npatterns,
> !                         unsigned int nelts_per_pattern)
> ! {
> !   rtx_vector_builder builder (mode, npatterns, nelts_per_pattern);
> !
> !   unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode),
> !                                              GET_MODE_NUNITS (mode));
> !   if (elt_bits < BITS_PER_UNIT)
> !     {
> !       /* This is the only case in which elements can be smaller than a byte.
> !        Element 0 is always in the lsb of the containing byte.  */
> !       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
> !       for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> !       {
> !         unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits;
> !         unsigned int byte_index = bit_index / BITS_PER_UNIT;
> !         unsigned int lsb = bit_index % BITS_PER_UNIT;
> !         builder.quick_push (bytes[byte_index] & (1 << lsb)
> !                             ? CONST1_RTX (BImode)
> !                             : CONST0_RTX (BImode));
> !       }
> !     }
> !   else
> !     {
> !       for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> !       {
> !         rtx x = native_decode_rtx (GET_MODE_INNER (mode), bytes, first_byte);
> !         if (!x)
> !           return NULL_RTX;
> !         builder.quick_push (x);
> !         first_byte += elt_bits / BITS_PER_UNIT;
>         }
>       }
> +   return builder.build ();
> + }
>
> ! /* Read an rtx of mode MODE from the target memory image given by BYTES,
> !    starting at byte FIRST_BYTE.  Each element of BYTES contains BITS_PER_UNIT
> !    bits and the bytes are in target memory order.  The image has enough
> !    values to specify all bytes of MODE.
> !
> !    Return the rtx on success, otherwise return NULL_RTX.  */
> !
> ! rtx
> ! native_decode_rtx (machine_mode mode, vec<target_unit> bytes,
> !                  unsigned int first_byte)
> ! {
> !   if (VECTOR_MODE_P (mode))
>       {
> !       /* If we know at compile time how many elements there are,
> !        pull each element directly from BYTES.  */
> !       unsigned int nelts;
> !       if (GET_MODE_NUNITS (mode).is_constant (&nelts))
> !       return native_decode_vector_rtx (mode, bytes, first_byte, nelts, 1);
> !       return NULL_RTX;
>       }
>
> !   scalar_int_mode imode;
> !   if (is_a <scalar_int_mode> (mode, &imode)
> !       && GET_MODE_PRECISION (imode) <= MAX_BITSIZE_MODE_ANY_INT)
> !     {
> !       /* Pull the bytes msb first, so that we can use simple
> !        shift-and-insert wide_int operations.  */
> !       unsigned int size = GET_MODE_SIZE (imode);
> !       wide_int result (wi::zero (GET_MODE_PRECISION (imode)));
> !       for (unsigned int i = 0; i < size; ++i)
> !       {
> !         unsigned int lsb = (size - i - 1) * BITS_PER_UNIT;
> !         /* Always constant because the inputs are.  */
> !         unsigned int subbyte
> !           = subreg_size_offset_from_lsb (1, size, lsb).to_constant ();
> !         result <<= BITS_PER_UNIT;
> !         result |= bytes[first_byte + subbyte];
> !       }
> !       return immed_wide_int_const (result, imode);
> !     }
> !
> !   scalar_float_mode fmode;
> !   if (is_a <scalar_float_mode> (mode, &fmode))
> !     {
> !       /* We need to build an array of integers in target memory order.
> !        All integers before the last one have 32 bits; the last one may
> !        have 32 bits or fewer, depending on whether the mode bitsize
> !        is divisible by 32.  */
> !       long el32[MAX_BITSIZE_MODE_ANY_MODE / 32];
> !       unsigned int num_el32 = CEIL (GET_MODE_BITSIZE (fmode), 32);
> !       memset (el32, 0, num_el32 * sizeof (long));
> !
> !       /* The (maximum) number of target bytes per element of el32.  */
> !       unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT;
> !       gcc_assert (bytes_per_el32 != 0);
> !
> !       unsigned int mode_bytes = GET_MODE_SIZE (fmode);
> !       for (unsigned int byte = 0; byte < mode_bytes; ++byte)
> !       {
> !         unsigned int index = byte / bytes_per_el32;
> !         unsigned int subbyte = byte % bytes_per_el32;
> !         unsigned int int_bytes = MIN (bytes_per_el32,
> !                                       mode_bytes - index * bytes_per_el32);
> !         /* Always constant because the inputs are.  */
> !         unsigned int lsb
> !           = subreg_size_lsb (1, int_bytes, subbyte).to_constant ();
> !         el32[index] |= (unsigned long) bytes[first_byte + byte] << lsb;
> !       }
> !       REAL_VALUE_TYPE r;
> !       real_from_target (&r, el32, fmode);
> !       return const_double_from_real_value (r, fmode);
> !     }
> !
> !   if (ALL_SCALAR_FIXED_POINT_MODE_P (mode))
> !     {
> !       scalar_mode smode = as_a <scalar_mode> (mode);
> !       FIXED_VALUE_TYPE f;
> !       f.data.low = 0;
> !       f.data.high = 0;
> !       f.mode = smode;
> !
> !       unsigned int mode_bytes = GET_MODE_SIZE (smode);
> !       for (unsigned int byte = 0; byte < mode_bytes; ++byte)
> !       {
> !         /* Always constant because the inputs are.  */
> !         unsigned int lsb
> !           = subreg_size_lsb (1, mode_bytes, byte).to_constant ();
> !         unsigned HOST_WIDE_INT unit = bytes[first_byte + byte];
> !         if (lsb >= HOST_BITS_PER_WIDE_INT)
> !           f.data.high |= unit << (lsb - HOST_BITS_PER_WIDE_INT);
> !         else
> !           f.data.low |= unit << lsb;
> !       }
> !       return CONST_FIXED_FROM_FIXED_VALUE (f, mode);
> !     }
>
> !   return NULL_RTX;
> ! }
>
> ! /* Simplify a byte offset BYTE into CONST_VECTOR X.  The main purpose
> !    is to convert a runtime BYTE value into a constant one.  */
>
> ! static poly_uint64
> ! simplify_const_vector_byte_offset (rtx x, poly_uint64 byte)
> ! {
> !   /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes.  */
> !   machine_mode mode = GET_MODE (x);
> !   unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode),
> !                                              GET_MODE_NUNITS (mode));
> !   /* The number of bits needed to encode one element from each pattern.  */
> !   unsigned int sequence_bits = CONST_VECTOR_NPATTERNS (x) * elt_bits;
> !
> !   /* Identify the start point in terms of a sequence number and a byte offset
> !      within that sequence.  */
> !   poly_uint64 first_sequence;
> !   unsigned HOST_WIDE_INT subbit;
> !   if (can_div_trunc_p (byte * BITS_PER_UNIT, sequence_bits,
> !                      &first_sequence, &subbit))
>       {
> !       unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
> !       if (nelts_per_pattern == 1)
> !       /* This is a duplicated vector, so the value of FIRST_SEQUENCE
> !          doesn't matter.  */
> !       byte = subbit / BITS_PER_UNIT;
> !       else if (nelts_per_pattern == 2 && known_gt (first_sequence, 0U))
> !       {
> !         /* The subreg drops the first element from each pattern and
> !            only uses the second element.  Find the first sequence
> !            that starts on a byte boundary.  */
> !         subbit += least_common_multiple (sequence_bits, BITS_PER_UNIT);
> !         byte = subbit / BITS_PER_UNIT;
> !       }
>       }
> !   return byte;
> ! }
>
> ! /* Subroutine of simplify_subreg in which:
>
> !    - X is known to be a CONST_VECTOR
> !    - OUTERMODE is known to be a vector mode
>
> !    Try to handle the subreg by operating on the CONST_VECTOR encoding
> !    rather than on each individual element of the CONST_VECTOR.
>
> !    Return the simplified subreg on success, otherwise return NULL_RTX.  */
>
> ! static rtx
> ! simplify_const_vector_subreg (machine_mode outermode, rtx x,
> !                             machine_mode innermode, unsigned int first_byte)
> ! {
> !   /* Paradoxical subregs of vectors have dubious semantics.  */
> !   if (paradoxical_subreg_p (outermode, innermode))
> !     return NULL_RTX;
>
> !   /* We can only preserve the semantics of a stepped pattern if the new
> !      vector element is the same as the original one.  */
> !   if (CONST_VECTOR_STEPPED_P (x)
> !       && GET_MODE_INNER (outermode) != GET_MODE_INNER (innermode))
> !     return NULL_RTX;
>
> !   /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes.  */
> !   unsigned int x_elt_bits
> !     = vector_element_size (GET_MODE_BITSIZE (innermode),
> !                          GET_MODE_NUNITS (innermode));
> !   unsigned int out_elt_bits
> !     = vector_element_size (GET_MODE_BITSIZE (outermode),
> !                          GET_MODE_NUNITS (outermode));
> !
> !   /* The number of bits needed to encode one element from every pattern
> !      of the original vector.  */
> !   unsigned int x_sequence_bits = CONST_VECTOR_NPATTERNS (x) * x_elt_bits;
> !
> !   /* The number of bits needed to encode one element from every pattern
> !      of the result.  */
> !   unsigned int out_sequence_bits
> !     = least_common_multiple (x_sequence_bits, out_elt_bits);
> !
> !   /* Work out the number of interleaved patterns in the output vector
> !      and the number of encoded elements per pattern.  */
> !   unsigned int out_npatterns = out_sequence_bits / out_elt_bits;
> !   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
> !
> !   /* The encoding scheme requires the number of elements to be a multiple
> !      of the number of patterns, so that each pattern appears at least once
> !      and so that the same number of elements appear from each pattern.  */
> !   bool ok_p = multiple_p (GET_MODE_NUNITS (outermode), out_npatterns);
> !   unsigned int const_nunits;
> !   if (GET_MODE_NUNITS (outermode).is_constant (&const_nunits)
> !       && (!ok_p || out_npatterns * nelts_per_pattern > const_nunits))
> !     {
> !       /* Either the encoding is invalid, or applying it would give us
> !        more elements than we need.  Just encode each element directly.  */
> !       out_npatterns = const_nunits;
> !       nelts_per_pattern = 1;
> !     }
> !   else if (!ok_p)
> !     return NULL_RTX;
>
> !   /* Get enough bytes of X to form the new encoding.  */
> !   unsigned int buffer_bits = out_npatterns * nelts_per_pattern * out_elt_bits;
> !   unsigned int buffer_bytes = CEIL (buffer_bits, BITS_PER_UNIT);
> !   auto_vec<target_unit, 128> buffer (buffer_bytes);
> !   if (!native_encode_rtx (innermode, x, buffer, first_byte, buffer_bytes))
> !     return NULL_RTX;
>
> !   /* Reencode the bytes as OUTERMODE.  */
> !   return native_decode_vector_rtx (outermode, buffer, 0, out_npatterns,
> !                                  nelts_per_pattern);
> ! }
>
> ! /* Try to simplify a subreg of a constant by encoding the subreg region
> !    as a sequence of target bytes and reading them back in the new mode.
> !    Return the new value on success, otherwise return null.
>
> !    The subreg has outer mode OUTERMODE, inner mode INNERMODE, inner value X
> !    and byte offset FIRST_BYTE.  */
>
> ! static rtx
> ! simplify_immed_subreg (fixed_size_mode outermode, rtx x,
> !                      machine_mode innermode, unsigned int first_byte)
> ! {
> !   unsigned int buffer_bytes = GET_MODE_SIZE (outermode);
> !   auto_vec<target_unit, 128> buffer (buffer_bytes);
> !
> !   /* Paradoxical subregs read undefined values for bytes outside of the
> !      inner value.  For consistency, treat all the extra bytes as zero.  */
> !   unsigned int inner_bytes = buffer_bytes;
> !   if (paradoxical_subreg_p (outermode, innermode))
> !     {
> !       if (!GET_MODE_SIZE (innermode).is_constant (&inner_bytes))
> !       return NULL_RTX;
> !
> !       /* Add any leading bytes due to big-endian layout.  The number of
> !        bytes must be constant because both modes have constant size.  */
> !       unsigned int leading_bytes
> !       = -byte_lowpart_offset (outermode, innermode).to_constant ();
> !       buffer.quick_grow_cleared (leading_bytes);
>       }
> !
> !   if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes))
> !     return NULL_RTX;
> !
> !   /* Add any trailing zero bytes due to a paradoxical subreg.  */
> !   buffer.quick_grow_cleared (buffer_bytes);
> !
> !   return native_decode_rtx (outermode, buffer, 0);
>   }
>
>   /* Simplify SUBREG:OUTERMODE(OP:INNERMODE, BYTE)
> *************** simplify_subreg (machine_mode outermode,
> *** 6456,6461 ****
> --- 6557,6565 ----
>     if (outermode == innermode && known_eq (byte, 0U))
>       return op;
>
> +   if (GET_CODE (op) == CONST_VECTOR)
> +     byte = simplify_const_vector_byte_offset (op, byte);
> +
>     if (multiple_p (byte, GET_MODE_UNIT_SIZE (innermode)))
>       {
>         rtx elt;
> *************** simplify_subreg (machine_mode outermode,
> *** 6475,6504 ****
>         || CONST_FIXED_P (op)
>         || GET_CODE (op) == CONST_VECTOR)
>       {
> -       /* simplify_immed_subreg deconstructs OP into bytes and constructs
> -        the result from bytes, so it only works if the sizes of the modes
> -        and the value of the offset are known at compile time.  Cases that
> -        that apply to general modes and offsets should be handled here
> -        before calling simplify_immed_subreg.  */
> -       fixed_size_mode fs_outermode, fs_innermode;
>         unsigned HOST_WIDE_INT cbyte;
> !       if (is_a <fixed_size_mode> (outermode, &fs_outermode)
> !         && is_a <fixed_size_mode> (innermode, &fs_innermode)
> !         && byte.is_constant (&cbyte))
> !       return simplify_immed_subreg (fs_outermode, op, fs_innermode, cbyte,
> !                                     0, GET_MODE_SIZE (fs_innermode));
> !
> !       /* Handle constant-sized outer modes and variable-sized inner modes.  */
> !       unsigned HOST_WIDE_INT first_elem;
> !       if (GET_CODE (op) == CONST_VECTOR
> !         && is_a <fixed_size_mode> (outermode, &fs_outermode)
> !         && constant_multiple_p (byte, GET_MODE_UNIT_SIZE (innermode),
> !                                 &first_elem))
> !       return simplify_immed_subreg (fs_outermode, op, innermode, 0,
> !                                     first_elem,
> !                                     GET_MODE_SIZE (fs_outermode));
>
> !       return NULL_RTX;
>       }
>
>     /* Changing mode twice with SUBREG => just change it once,
> --- 6579,6599 ----
>         || CONST_FIXED_P (op)
>         || GET_CODE (op) == CONST_VECTOR)
>       {
>         unsigned HOST_WIDE_INT cbyte;
> !       if (byte.is_constant (&cbyte))
> !       {
> !         if (GET_CODE (op) == CONST_VECTOR && VECTOR_MODE_P (outermode))
> !           {
> !             rtx tmp = simplify_const_vector_subreg (outermode, op,
> !                                                     innermode, cbyte);
> !             if (tmp)
> !               return tmp;
> !           }
>
> !         fixed_size_mode fs_outermode;
> !         if (is_a <fixed_size_mode> (outermode, &fs_outermode))
> !           return simplify_immed_subreg (fs_outermode, op, innermode, cbyte);
> !       }
>       }
>
>     /* Changing mode twice with SUBREG => just change it once,
> *************** test_vec_merge (machine_mode mode)
> *** 7077,7082 ****
> --- 7172,7330 ----
>                  simplify_rtx (nvm));
>   }
>
> + /* Test subregs of integer vector constant X, trying elements in
> +    the range [ELT_BIAS, ELT_BIAS + constant_lower_bound (NELTS)),
> +    where NELTS is the number of elements in X.  Subregs involving
> +    elements [ELT_BIAS, ELT_BIAS + FIRST_VALID) are expected to fail.  */
> +
> + static void
> + test_vector_subregs_modes (rtx x, poly_uint64 elt_bias = 0,
> +                          unsigned int first_valid = 0)
> + {
> +   machine_mode inner_mode = GET_MODE (x);
> +   scalar_mode int_mode = GET_MODE_INNER (inner_mode);
> +
> +   for (unsigned int modei = 0; modei < NUM_MACHINE_MODES; ++modei)
> +     {
> +       machine_mode outer_mode = (machine_mode) modei;
> +       if (!VECTOR_MODE_P (outer_mode))
> +       continue;
> +
> +       unsigned int outer_nunits;
> +       if (GET_MODE_INNER (outer_mode) == int_mode
> +         && GET_MODE_NUNITS (outer_mode).is_constant (&outer_nunits)
> +         && multiple_p (GET_MODE_NUNITS (inner_mode), outer_nunits))
> +       {
> +         /* Test subregs in which the outer mode is a smaller,
> +            constant-sized vector of the same element type.  */
> +         unsigned int limit
> +           = constant_lower_bound (GET_MODE_NUNITS (inner_mode));
> +         for (unsigned int elt = 0; elt < limit; elt += outer_nunits)
> +           {
> +             rtx expected = NULL_RTX;
> +             if (elt >= first_valid)
> +               {
> +                 rtx_vector_builder builder (outer_mode, outer_nunits, 1);
> +                 for (unsigned int i = 0; i < outer_nunits; ++i)
> +                   builder.quick_push (CONST_VECTOR_ELT (x, elt + i));
> +                 expected = builder.build ();
> +               }
> +             poly_uint64 byte = (elt_bias + elt) * GET_MODE_SIZE (int_mode);
> +             ASSERT_RTX_EQ (expected,
> +                            simplify_subreg (outer_mode, x,
> +                                             inner_mode, byte));
> +           }
> +       }
> +       else if (known_eq (GET_MODE_SIZE (outer_mode),
> +                        GET_MODE_SIZE (inner_mode))
> +              && known_eq (elt_bias, 0U)
> +              && (GET_MODE_SIZE (inner_mode).is_constant ()
> +                  || !CONST_VECTOR_STEPPED_P (x)))
> +       {
> +         /* Try converting to OUTER_MODE and back.  */
> +         rtx outer_x = simplify_subreg (outer_mode, x, inner_mode, 0);
> +         ASSERT_TRUE (outer_x != NULL_RTX);
> +         ASSERT_RTX_EQ (x, simplify_subreg (inner_mode, outer_x,
> +                                            outer_mode, 0));
> +       }
> +     }
> +
> +   if (BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
> +     {
> +       /* Test each byte in the element range.  */
> +       unsigned int limit
> +       = constant_lower_bound (GET_MODE_SIZE (inner_mode));
> +       for (unsigned int i = 0; i < limit; ++i)
> +       {
> +         unsigned int elt = i / GET_MODE_SIZE (int_mode);
> +         rtx expected = NULL_RTX;
> +         if (elt >= first_valid)
> +           {
> +             unsigned int byte_shift = i % GET_MODE_SIZE (int_mode);
> +             if (BYTES_BIG_ENDIAN)
> +               byte_shift = GET_MODE_SIZE (int_mode) - byte_shift - 1;
> +             rtx_mode_t vec_elt (CONST_VECTOR_ELT (x, elt), int_mode);
> +             wide_int shifted_elt
> +               = wi::lrshift (vec_elt, byte_shift * BITS_PER_UNIT);
> +             expected = immed_wide_int_const (shifted_elt, QImode);
> +           }
> +         poly_uint64 byte = elt_bias * GET_MODE_SIZE (int_mode) + i;
> +         ASSERT_RTX_EQ (expected,
> +                        simplify_subreg (QImode, x, inner_mode, byte));
> +       }
> +     }
> + }
> +
> + /* Test constant subregs of integer vector mode INNER_MODE, using 1
> +    element per pattern.  */
> +
> + static void
> + test_vector_subregs_repeating (machine_mode inner_mode)
> + {
> +   poly_uint64 nunits = GET_MODE_NUNITS (inner_mode);
> +   unsigned int min_nunits = constant_lower_bound (nunits);
> +   scalar_mode int_mode = GET_MODE_INNER (inner_mode);
> +   unsigned int count = gcd (min_nunits, 8);
> +
> +   rtx_vector_builder builder (inner_mode, count, 1);
> +   for (unsigned int i = 0; i < count; ++i)
> +     builder.quick_push (gen_int_mode (8 - i, int_mode));
> +   rtx x = builder.build ();
> +
> +   test_vector_subregs_modes (x);
> +   if (!nunits.is_constant ())
> +     test_vector_subregs_modes (x, nunits - min_nunits);
> + }
> +
> + /* Test constant subregs of integer vector mode INNER_MODE, using 2
> +    elements per pattern.  */
> +
> + static void
> + test_vector_subregs_fore_back (machine_mode inner_mode)
> + {
> +   poly_uint64 nunits = GET_MODE_NUNITS (inner_mode);
> +   unsigned int min_nunits = constant_lower_bound (nunits);
> +   scalar_mode int_mode = GET_MODE_INNER (inner_mode);
> +   unsigned int count = gcd (min_nunits, 4);
> +
> +   rtx_vector_builder builder (inner_mode, count, 2);
> +   for (unsigned int i = 0; i < count; ++i)
> +     builder.quick_push (gen_int_mode (i, int_mode));
> +   for (unsigned int i = 0; i < count; ++i)
> +     builder.quick_push (gen_int_mode (-(int) i, int_mode));
> +   rtx x = builder.build ();
> +
> +   test_vector_subregs_modes (x);
> +   if (!nunits.is_constant ())
> +     test_vector_subregs_modes (x, nunits - min_nunits, count);
> + }
> +
> + /* Test constant subregs of integer vector mode INNER_MODE, using 3
> +    elements per pattern.  */
> +
> + static void
> + test_vector_subregs_stepped (machine_mode inner_mode)
> + {
> +   /* Build { 0, 1, 2, 3, ... }.  */
> +   scalar_mode int_mode = GET_MODE_INNER (inner_mode);
> +   rtx_vector_builder builder (inner_mode, 1, 3);
> +   for (unsigned int i = 0; i < 3; ++i)
> +     builder.quick_push (gen_int_mode (i, int_mode));
> +   rtx x = builder.build ();
> +
> +   test_vector_subregs_modes (x);
> + }
> +
> + /* Test constant subregs of integer vector mode INNER_MODE.  */
> +
> + static void
> + test_vector_subregs (machine_mode inner_mode)
> + {
> +   test_vector_subregs_repeating (inner_mode);
> +   test_vector_subregs_fore_back (inner_mode);
> +   test_vector_subregs_stepped (inner_mode);
> + }
> +
>   /* Verify some simplifications involving vectors.  */
>
>   static void
> *************** test_vector_ops ()
> *** 7091,7097 ****
>           test_vector_ops_duplicate (mode, scalar_reg);
>           if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>               && maybe_gt (GET_MODE_NUNITS (mode), 2))
> !           test_vector_ops_series (mode, scalar_reg);
>           test_vec_merge (mode);
>         }
>       }
> --- 7339,7348 ----
>           test_vector_ops_duplicate (mode, scalar_reg);
>           if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>               && maybe_gt (GET_MODE_NUNITS (mode), 2))
> !           {
> !             test_vector_ops_series (mode, scalar_reg);
> !             test_vector_subregs (mode);
> !           }
>           test_vec_merge (mode);
>         }
>       }

Re: Rework constant subreg folds and handle more variable-length cases

Reply via email to