Spencer Abson <spencer.ab...@arm.com> writes:
> [...]
> +/* If REF describes the high half of a 128-bit vector, return this
> +   vector.  Otherwise, return NULL_TREE.  */
> +static tree
> +aarch64_v128_highpart_ref (const_tree ref)
> +{
> +  if (TREE_CODE (ref) != SSA_NAME)
> +    return NULL_TREE;
> +
> +  gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (ref));
> +  if (!stmt || gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
> +    return NULL_TREE;
> +
> +  /* Look for a BIT_FIELD_REF that denotes the most significant 64
> +     bits of a 128-bit vector.  */
> +  tree bf_ref = gimple_assign_rhs1 (stmt);
> +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> +
> +  if (bit_field_size (bf_ref).to_constant () != 64
> +      || bit_field_offset (bf_ref).to_constant () != offset)
> +    return NULL_TREE;

There should be a comment justifying the to_constants, but...

> +
> +  tree obj = TREE_OPERAND (bf_ref, 0);
> +  tree type = TREE_TYPE (obj);
> +
> +  if (VECTOR_TYPE_P (type) && tree_fits_uhwi_p (TYPE_SIZE (type))
> +      && tree_to_uhwi (TYPE_SIZE (type)) == 128)
> +    return obj;

...I think the fact that we only test this later suggests that the
to_constants might not be safe, or at least not future-proof.
I think we should instead use:

  if (maybe_ne (bit_field_size (bf_ref), 64)
      || maybe_ne (bit_field_offset (bf_ref), offset))
    return NULL_TREE;

> +
> +  return NULL_TREE;
> +}
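For readers following the thread: the BIT_FIELD_REF form being matched
here is what the vget_high_* intrinsics are lowered to during gimple
folding.  A minimal sketch, assuming little-endian and that lowering
(the function name get_high is just for illustration):

  #include <arm_neon.h>

  int16x4_t
  get_high (int16x8_t a)
  {
    /* In GIMPLE this becomes roughly
	 _1 = BIT_FIELD_REF <a_1(D), 64, 64>;
       i.e. the most significant 64 bits of the 128-bit vector A,
       which is the pattern aarch64_v128_highpart_ref accepts.  */
    return vget_high_s16 (a);
  }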
> +
> +/* Build and return a new VECTOR_CST of type OUT_TY using the
> +   elements of VEC_IN.  */

Might be worth saying "using repeated copies of the elements of VEC_IN".

> +static tree
> +aarch64_build_vector_cst (const_tree vec_in, tree out_ty)
> +{
> +  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST
> +	      && VECTOR_TYPE_P (out_ty));
> +  unsigned HOST_WIDE_INT nelts
> +    = VECTOR_CST_NELTS (vec_in).to_constant ();
> +
> +  tree_vector_builder vec_out (out_ty, nelts, 1);
> +  for (unsigned i = 0; i < nelts; i++)
> +    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));
> +
> +  return vec_out.build ();
> +}
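To make the intent of the next function concrete, here is an
illustrative before/after at the intrinsics level -- a sketch only,
assuming the vmull_s16/vmull_high_s16 pair is among the
LO_HI_PAIRINGS (the function name f is hypothetical):

  #include <arm_neon.h>

  int32x4_t
  f (int16x8_t a, int16x8_t b)
  {
    /* Lowpart builtin applied to two high-half extracts...  */
    return vmull_s16 (vget_high_s16 (a), vget_high_s16 (b));
    /* ...which the fold below rewrites to the equivalent of
	 return vmull_high_s16 (a, b);
       avoiding the separate high-half data movement.  */
  }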
> +
> +/* Try to fold STMT, a call to a lowpart-operating builtin, to
> +   its highpart-operating equivalent if doing so would save
> +   unnecessary data movement instructions.
> +
> +   Return the new call if so, otherwise nullptr.  */
> +static gcall *
> +aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt,
> +			    gimple_stmt_iterator *gsi)
> +{
> +  /* Punt until as late as possible:
> +     1) By folding away BIT_FIELD_REFs we remove information about the
> +	operands that may be useful to other optimizers.
> +
> +     2) For simplicity, we'd like the expression
> +
> +	  x = BIT_FIELD_REF<a, x, y>
> +
> +	to imply that A is not a VECTOR_CST.  This assumption is unlikely
> +	to hold before constant prop/folding.  */
> +  if (!(cfun->curr_properties & PROP_last_full_fold))
> +    return nullptr;
> +
> +  tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +  gcc_assert (builtin_hi != NULL_TREE);
> +
> +  /* Prefer to use the highpart builtin when at least one vector
> +     argument is a reference to the high half of a 128b vector, and
> +     all others are VECTOR_CSTs that we can extend to 128b.  */
> +  auto_vec<unsigned int, 2> vec_constants;
> +  auto_vec<unsigned int, 2> vec_highparts;
> +  /* The arguments and signature of the new call.  */
> +  auto_vec<tree, 4> call_args;
> +  auto_vec<tree, 4> call_types;
> +
> +  /* The interesting args are those that differ between the lo/hi
> +     builtins.  Walk the function signatures to find these.  */
> +  tree types_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi));
> +  tree types_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt));
> +  unsigned int argno = 0;
> +  while (types_lo != void_list_node && types_hi != void_list_node)
> +    {
> +      tree type_lo = TREE_VALUE (types_lo);
> +      tree type_hi = TREE_VALUE (types_hi);
> +      tree arg = gimple_call_arg (stmt, argno);
> +      if (!types_compatible_p (type_lo, type_hi))
> +	{
> +	  /* Check our assumptions about this pair.  */
> +	  gcc_assert (wi::to_widest (TYPE_SIZE (type_lo)) == 64
> +		      && wi::to_widest (TYPE_SIZE (type_hi)) == 128);
> +
> +	  tree vq = aarch64_v128_highpart_ref (arg);
> +	  if (vq && is_gimple_reg (vq))
> +	    {
> +	      vec_highparts.safe_push (argno);
> +	      arg = vq;
> +	    }
> +	  else if (TREE_CODE (arg) == VECTOR_CST)
> +	    vec_constants.safe_push (argno);
> +	  else
> +	    return nullptr;
> +	}
> +      call_args.safe_push (arg);
> +      call_types.safe_push (type_hi);
> +
> +      argno++;
> +      types_hi = TREE_CHAIN (types_hi);
> +      types_lo = TREE_CHAIN (types_lo);
> +    }
> +  gcc_assert (types_lo == void_list_node
> +	      && types_hi == void_list_node);
> +
> +  if (vec_highparts.is_empty ())
> +    return nullptr;
> +
> +  /* Build and return a new call to BUILTIN_HI.  */
> +  for (auto i : vec_constants)
> +    call_args[i] = aarch64_build_vector_cst (call_args[i],
> +					     call_types[i]);
> +  for (auto i : vec_highparts)
> +    {
> +      if (!types_compatible_p (TREE_TYPE (call_args[i]),
> +			       call_types[i]))

Formatting nit, sorry, but: redundant braces between the "for" and "if".

> +	{
> +	  tree vce_ssa = make_ssa_name (call_types[i]);
> +	  tree vce_expr = build1 (VIEW_CONVERT_EXPR,
> +				  call_types[i], call_args[i]);
> +	  gsi_insert_before (gsi,
> +			     gimple_build_assign (vce_ssa, vce_expr),
> +			     GSI_SAME_STMT);
> +	  call_args[i] = vce_ssa;
> +	}
> +    }
> +
> +  gcall *new_call
> +    = gimple_build_call_vec (builtin_hi, call_args);
> +  gimple_call_set_lhs (new_call, gimple_call_lhs (stmt));
> +  return new_call;
> +}
> +
> +#undef LO_HI_PAIR
> +#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
> +
>  /* Try to fold STMT, given that it's a call to the built-in function with
>     subcode FCODE.  Return the new statement on success and null on
>     failure.  */
> @@ -5190,6 +5369,10 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
>  	}
>        break;
>      }
> +      break;

This break is redundant; there's already one above.

> +    LO_HI_PAIRINGS
> +        new_stmt = aarch64_fold_lo_call_to_hi (fcode, stmt, gsi);
> +        break;

The last two lines should be indented by 6 columns rather than 8.

OK with those changes, thanks.

Richard