These patches enhance the vec_extract built-in on modern PowerPC server systems. Currently, vec_extract is optimized for constant element numbers for vector double/vector long on any VSX system, and constant element numbers for vector char/vector short/vector int on ISA 3.0 (power9) systems.
If the vec_extract is not handled, the compiler currently stores the vector into memory, and then indexes the element via normal memory addressing. This creates a store-load hit. This patch and the successive patches will enable better code generation of vec_extract on 64-bit systems with direct move (power8) and above. This particular patch changes the infrastructure so that in the next patch, I can add support for extracting a variable element of vector double or vector long. This particular patch is just infrastructure, and does not change the code generation. In addition, I discovered a previous change for ISA 3.0 extraction spelled an instruction wrong, and it is fixed here. It turns out that I messed up the constraints, so the register allocator would never generate this instruction. This patch just uses the correct name, but it won't be until the next patch that the constraints will be fixed so it can be generated. I have tested this patch and there are no regressions. Can I apply this to the trunk? These sets of patches depend on the DImode in Altivec registers patches that have not been back ported to GCC 6.2, so it is for trunk only. The next patch will enhance vec_extract to to allow the element number to be variable for vec_extract of vector double/vector long on 64-bit systems with direct move using the VSLO instruction. The third patch will enhance vec_extract to better optimize extracting elements if the vector is in memory. Right now, the code only optimizes extracting element 0. The new patch will allow both constant and variable element numbers. The fourth patch will enhance vec_extract for the other vector types. I might split it up into two patches, one for vector float, and the other for vector char, vector short, and vector long. I built spec 2006 with all of the patches, and there were some benchmarks that generated a few changes, and a few benchmarks that generated a lot (gamess had over 500 places that were optimized). I ran a comparison between the old compiler and one with the patches installed on several of the benchmarks that showed the most changes. I did not see any performance changes on the benchmarks that I ran. I believe this is because vec_extract is typically generated at the end of vector reductions, and it does not account for much time in the whole benchmark. User written code that uses vec_extract would hopefully see speed improvements. 2016-07-27 Michael Meissner <meiss...@linux.vnet.ibm.com> * config/rs6000/vector.md (vec_extract<mode>): Change the calling signature of rs6000_expand_vector_extract so that the element number is a RTX instead of a constant integer. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_extract): Likewise. * config/rs6000/rs6000.c (rs6000_expand_vector_extract): Likewise. (altivec_expand_vec_ext_builtin): Likewise. * config/rs6000/altivec.md (reduc_plus_scal_<mode>): Likewise. * config/rs6000/vsx.md (vsx_extract_<mode>): Fix spelling of the MFVSRLD instruction. -- Michael Meissner, IBM IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/vector.md =================================================================== --- gcc/config/rs6000/vector.md (revision 238772) +++ gcc/config/rs6000/vector.md (working copy) @@ -858,8 +858,7 @@ (define_expand "vec_extract<mode>" (match_operand 2 "const_int_operand" "")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { - rs6000_expand_vector_extract (operands[0], operands[1], - INTVAL (operands[2])); + rs6000_expand_vector_extract (operands[0], operands[1], operands[2]); DONE; }) Index: gcc/config/rs6000/rs6000-protos.h =================================================================== --- gcc/config/rs6000/rs6000-protos.h (revision 238772) +++ gcc/config/rs6000/rs6000-protos.h (working copy) @@ -61,7 +61,7 @@ extern void convert_int_to_float128 (rtx extern void rs6000_expand_vector_init (rtx, rtx); extern void paired_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, int); -extern void rs6000_expand_vector_extract (rtx, rtx, int); +extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern bool altivec_expand_vec_perm_const (rtx op[4]); extern void altivec_expand_vec_perm_le (rtx op[4]); extern bool rs6000_expand_vec_perm_const (rtx op[4]); Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 238772) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -6911,35 +6911,35 @@ rs6000_expand_vector_set (rtx target, rt /* Extract field ELT from VEC into TARGET. */ void -rs6000_expand_vector_extract (rtx target, rtx vec, int elt) +rs6000_expand_vector_extract (rtx target, rtx vec, rtx elt) { machine_mode mode = GET_MODE (vec); machine_mode inner_mode = GET_MODE_INNER (mode); rtx mem; - if (VECTOR_MEM_VSX_P (mode)) + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (elt)) { switch (mode) { default: break; case V1TImode: - gcc_assert (elt == 0 && inner_mode == TImode); + gcc_assert (INTVAL (elt) == 0 && inner_mode == TImode); emit_move_insn (target, gen_lowpart (TImode, vec)); break; case V2DFmode: - emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt))); + emit_insn (gen_vsx_extract_v2df (target, vec, elt)); return; case V2DImode: - emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt))); + emit_insn (gen_vsx_extract_v2di (target, vec, elt)); return; case V4SFmode: - emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt))); + emit_insn (gen_vsx_extract_v4sf (target, vec, elt)); return; case V16QImode: if (TARGET_VEXTRACTUB) { - emit_insn (gen_vsx_extract_v16qi (target, vec, GEN_INT (elt))); + emit_insn (gen_vsx_extract_v16qi (target, vec, elt)); return; } else @@ -6947,7 +6947,7 @@ rs6000_expand_vector_extract (rtx target case V8HImode: if (TARGET_VEXTRACTUB) { - emit_insn (gen_vsx_extract_v8hi (target, vec, GEN_INT (elt))); + emit_insn (gen_vsx_extract_v8hi (target, vec, elt)); return; } else @@ -6955,7 +6955,7 @@ rs6000_expand_vector_extract (rtx target case V4SImode: if (TARGET_VEXTRACTUB) { - emit_insn (gen_vsx_extract_v4si (target, vec, GEN_INT (elt))); + emit_insn (gen_vsx_extract_v4si (target, vec, elt)); return; } else @@ -6963,13 +6963,16 @@ rs6000_expand_vector_extract (rtx target } } + gcc_assert (CONST_INT_P (elt)); + /* Allocate mode-sized buffer. */ mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); emit_move_insn (mem, vec); /* Add offset to field within buffer matching vector element. */ - mem = adjust_address_nv (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); + mem = adjust_address_nv (mem, inner_mode, + INTVAL (elt) * GET_MODE_SIZE (inner_mode)); emit_move_insn (target, adjust_address_nv (mem, inner_mode, 0)); } @@ -14658,14 +14661,18 @@ altivec_expand_vec_ext_builtin (tree exp { machine_mode tmode, mode0; tree arg0, arg1; - int elt; rtx op0; + rtx op1; arg0 = CALL_EXPR_ARG (exp, 0); arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); - elt = get_element_number (TREE_TYPE (arg0), arg1); + op1 = expand_normal (arg1); + + /* Call get_element_number to validate arg1 if it is a constant. */ + if (TREE_CODE (arg1) == INTEGER_CST) + (void) get_element_number (TREE_TYPE (arg0), arg1); tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); mode0 = TYPE_MODE (TREE_TYPE (arg0)); @@ -14676,7 +14683,7 @@ altivec_expand_vec_ext_builtin (tree exp if (optimize || !target || !register_operand (target, tmode)) target = gen_reg_rtx (tmode); - rs6000_expand_vector_extract (target, op0, elt); + rs6000_expand_vector_extract (target, op0, op1); return target; } Index: gcc/config/rs6000/vsx.md =================================================================== --- gcc/config/rs6000/vsx.md (revision 238772) +++ gcc/config/rs6000/vsx.md (working copy) @@ -2159,7 +2159,7 @@ (define_insn "vsx_extract_<mode>" else if (element == VECTOR_ELEMENT_MFVSRLD_64BIT && INT_REGNO_P (op0_regno) && TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE) - return "mfvsrdl %0,%x1"; + return "mfvsrld %0,%x1"; else if (VSX_REGNO_P (op0_regno)) { Index: gcc/config/rs6000/altivec.md =================================================================== --- gcc/config/rs6000/altivec.md (revision 238772) +++ gcc/config/rs6000/altivec.md (working copy) @@ -2781,7 +2781,7 @@ (define_expand "reduc_plus_scal_<mode>" emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); emit_insn (gen_altivec_vsum4s<VI_char>s (vtmp1, operands[1], vzero)); emit_insn (gen_altivec_vsumsws_direct (dest, vtmp1, vzero)); - rs6000_expand_vector_extract (operands[0], vtmp2, elt); + rs6000_expand_vector_extract (operands[0], vtmp2, GEN_INT (elt)); DONE; })