To get back to this...
Richard Sandiford <[email protected]> writes:
> Richard Guenther <[email protected]> writes:
>> On Thu, Mar 31, 2011 at 3:32 PM, Richard Sandiford
>> <[email protected]> wrote:
>>> This patch adds an array_mode_supported_p hook, which says whether
>>> MAX_FIXED_MODE_SIZE should be ignored for a given type of array.
>>> It follows on from the discussion here:
>>>
>>> http://gcc.gnu.org/ml/gcc/2011-03/msg00342.html
>>>
>>> The intended use of the hook is to allow small arrays of vectors
>>> to have a non-BLK mode, and hence to be stored in rtl registers.
>>> These arrays are used both in the ARM arm_neon.h API and in the
>>> optabs proposed in:
>>>
>>> http://gcc.gnu.org/ml/gcc/2011-03/msg00322.html
>>>
>>> The tail end of the thread was about the definition of TYPE_MODE:
>>>
>>> #define TYPE_MODE(NODE) \
>>> (TREE_CODE (TYPE_CHECK (NODE)) == VECTOR_TYPE \
>>> ? vector_type_mode (NODE) : (NODE)->type.mode)
>>>
>>> with this outcome:
>>>
>>> http://gcc.gnu.org/ml/gcc/2011-03/msg00470.html
>>>
>>> To summarise my take on it:
>>>
>>> - The current definition of TYPE_MODE isn't sufficient even for vector
>>> modes and vector_mode_supported_p, because non-vector types can have
>>> vector modes.
>>>
>>> - We should no longer treat types as having one mode everywhere.
>>> We should instead replace TYPE_MODE with a function that takes
>>> a context. Tests of things like vector_mode_supported_p would
>>> move from layout_type to this new function.
>>>
>>> I think this patch fits within that scheme. array_mode_supported_p
>>> would be treated in the same way as vector_mode_supported_p.
>>>
>>> I realise the ideal would be to get rid of TYPE_MODE first.
>>> But that's going to be a longer-term thing. Now that there's
>>> at least a plan, I'd like to press ahead with the array stuff
>>> on the basis that
>>>
>>> (a) although the new hook won't work with the "target" attribute,
>>> our current mode handling doesn't work in just the same way.
>>>
>>> (b) the new hook doesn't interfere with the plan.
>>>
>>> (c) getting good code from the intrinsics (and support for these
>>> instructions in the vectoriser) is going to be much more important
>>> to most ARM users than the ability to turn Neon on and off for
>>> individual functions in a TU.
>>>
>>> To give an example of the difference, the Neon code posted here:
>>>
>>> http://hilbert-space.de/?p=22
>>>
>>> produces this inner loop before the patch (but with
>>> http://gcc.gnu.org/ml/gcc-patches/2011-03/msg01996.html applied):
>>>
>>> .L3:
>>> vld3.8 {d16-d18}, [r1]!
>>> vstmia ip, {d16-d18}
>>> fldd d19, [sp, #24]
>>> adr r5, .L6
>>> ldmia r5, {r4-r5}
>>> fldd d16, [sp, #32]
>>> vmov d18, r4, r5 @ v8qi
>>> vmull.u8 q9, d19, d18
>>> adr r5, .L6+8
>>> ldmia r5, {r4-r5}
>>> vmov d17, r4, r5 @ v8qi
>>> vstmia sp, {d18-d19}
>>> vmlal.u8 q9, d16, d17
>>> fldd d16, [sp, #40]
>>> adr r5, .L6+16
>>> ldmia r5, {r4-r5}
>>> vmov d17, r4, r5 @ v8qi
>>> vmlal.u8 q9, d16, d17
>>> add r3, r3, #1
>>> vshrn.i16 d16, q9, #8
>>> cmp r3, r2
>>> vst1.8 {d16}, [r0]!
>>> bne .L3
>>>
>>> With both patches applied, the inner loop is:
>>>
>>> .L3:
>>> vld3.8 {d18-d20}, [r1]!
>>> vmull.u8 q8, d18, d21
>>> vmlal.u8 q8, d19, d22
>>> vmlal.u8 q8, d20, d23
>>> add r3, r3, #1
>>> vshrn.i16 d16, q8, #8
>>> cmp r3, r2
>>> vst1.8 {d16}, [r0]!
>>> bne .L3
>>>
>>> Tested on arm-linux-gnueabi. OK to install?
>>
>> It looks reasonable given the past discussion, but - can you move forward
>> with the Neon stuff a bit to see if it really fits? Or is this all
>> that is needed
>> for the load/store lane support as well (apart from vectorizer changes of
>> course).
>
> Yeah, I have a prototype that hacks up some C support for generating the
> (otherwise internal-only) load/store built-in functions that the vectoriser
> is suppsoed to generate. This patch is all that seems to be needed for the
> types and optabs generation to work in the natural way.
>
> I'm happy to leave it until the vectoriser stuff is in a more
> submittable state though.
The vectorisation stuff has now been approved and uses this hook to
detect whether interleaved loads & stores are supported. Also...
> Especially given:
>
>> Can you check the code generated by for example
>>
>> float foo(char *p)
>> {
>> float a[2];
>> int i;
>> ((char *)a)[0] = p[0];
>> ((char *)a)[1] = p[1];
>> ((char *)a)[2] = p[2];
>> ((char *)a)[3] = p[3];
>> ((char *)a)[4] = p[4];
>> ((char *)a)[5] = p[5];
>> ((char *)a)[6] = p[6];
>> ((char *)a)[7] = p[7];
>> return a[0] + a[1];
>> }
>>
>> for an array a that would get such a larger mode? Thus, check what
>> happens with partial defs of different types (just to avoid ICEs like the
>> ones Jakub was fixing yesterday).
>
> OK, I tried:
>
> #include "arm_neon.h"
>
> uint32x2_t foo(char *p)
> {
> uint32x2_t a[2];
> int i;
> ((char *)a)[0] = p[0];
> ((char *)a)[1] = p[1];
> ((char *)a)[2] = p[2];
> ((char *)a)[3] = p[3];
> ((char *)a)[4] = p[4];
> ((char *)a)[5] = p[5];
> ((char *)a)[6] = p[6];
> ((char *)a)[7] = p[7];
> ((char *)a)[8] = p[8];
> ((char *)a)[9] = p[9];
> ((char *)a)[10] = p[10];
> ((char *)a)[11] = p[11];
> ((char *)a)[12] = p[12];
> ((char *)a)[13] = p[13];
> ((char *)a)[14] = p[14];
> ((char *)a)[15] = p[15];
> return vadd_u32 (a[0], a[1]);
> }
>
> uint32x4_t bar(char *p, uint32x4_t *b)
> {
> uint32x4_t a[2];
> int i;
> ((char *)a)[0] = p[0];
> ((char *)a)[1] = p[1];
> ((char *)a)[2] = p[2];
> ((char *)a)[3] = p[3];
> ((char *)a)[4] = p[4];
> ((char *)a)[5] = p[5];
> ((char *)a)[6] = p[6];
> ((char *)a)[7] = p[7];
> ((char *)a)[8] = p[8];
> ((char *)a)[9] = p[9];
> ((char *)a)[10] = p[10];
> ((char *)a)[11] = p[11];
> ((char *)a)[12] = p[12];
> ((char *)a)[13] = p[13];
> ((char *)a)[14] = p[14];
> ((char *)a)[15] = p[15];
> ((char *)a)[16 + 0] = p[16 + 0];
> ((char *)a)[16 + 1] = p[16 + 1];
> ((char *)a)[16 + 2] = p[16 + 2];
> ((char *)a)[16 + 3] = p[16 + 3];
> ((char *)a)[16 + 4] = p[16 + 4];
> ((char *)a)[16 + 5] = p[16 + 5];
> ((char *)a)[16 + 6] = p[16 + 6];
> ((char *)a)[16 + 7] = p[16 + 7];
> ((char *)a)[16 + 8] = p[16 + 8];
> ((char *)a)[16 + 9] = p[16 + 9];
> ((char *)a)[16 + 10] = p[16 + 10];
> ((char *)a)[16 + 11] = p[16 + 11];
> ((char *)a)[16 + 12] = p[16 + 12];
> ((char *)a)[16 + 13] = p[16 + 13];
> ((char *)a)[16 + 14] = p[16 + 14];
> ((char *)a)[16 + 15] = p[16 + 15];
> return vaddq_u32 (a[0], a[1]);
> }
>
> It seemed to avoid the problem Jakub was seeing, but the second function
> hit the known const_int reload failure for these modes:
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46329
...I've just committed the fix for this PR. Thanks to everyone for
all the reviews.
Tested on x86_64-linux-gnu and arm-linux-gnueabi. Do the
target-independent bits look OK? How about the ARM bits?
Thanks,
Richard
gcc/
* hooks.h (hook_bool_mode_uhwi_false): Declare.
* hooks.c (hook_bool_mode_uhwi_false): New function.
* target.def (array_mode_supported_p): New hook.
* doc/tm.texi.in (TARGET_ARRAY_MODE_SUPPORTED_P): Add @hook.
* doc/tm.texi: Regenerate.
* stor-layout.c (mode_for_array): New function.
(layout_type): Use it.
* config/arm/arm.c (arm_array_mode_supported_p): New function.
(TARGET_ARRAY_MODE_SUPPORTED_P): Define.
Index: gcc/hooks.h
===================================================================
--- gcc/hooks.h 2011-04-21 10:47:30.000000000 +0100
+++ gcc/hooks.h 2011-04-21 10:47:48.000000000 +0100
@@ -36,6 +36,8 @@ extern bool hook_bool_mode_const_rtx_fal
extern bool hook_bool_mode_const_rtx_true (enum machine_mode, const_rtx);
extern bool hook_bool_mode_rtx_false (enum machine_mode, rtx);
extern bool hook_bool_mode_rtx_true (enum machine_mode, rtx);
+extern bool hook_bool_mode_uhwi_false (enum machine_mode,
+ unsigned HOST_WIDE_INT);
extern bool hook_bool_tree_false (tree);
extern bool hook_bool_const_tree_false (const_tree);
extern bool hook_bool_tree_true (tree);
Index: gcc/hooks.c
===================================================================
--- gcc/hooks.c 2011-04-21 10:47:30.000000000 +0100
+++ gcc/hooks.c 2011-04-21 10:47:48.000000000 +0100
@@ -117,6 +117,15 @@ hook_bool_mode_rtx_true (enum machine_mo
return true;
}
+/* Generic hook that takes (enum machine_mode, unsigned HOST_WIDE_INT)
+ and returns false. */
+bool
+hook_bool_mode_uhwi_false (enum machine_mode mode ATTRIBUTE_UNUSED,
+ unsigned HOST_WIDE_INT value ATTRIBUTE_UNUSED)
+{
+ return false;
+}
+
/* Generic hook that takes (FILE *, const char *) and does nothing. */
void
hook_void_FILEptr_constcharptr (FILE *a ATTRIBUTE_UNUSED, const char *b
ATTRIBUTE_UNUSED)
Index: gcc/target.def
===================================================================
--- gcc/target.def 2011-04-21 10:47:30.000000000 +0100
+++ gcc/target.def 2011-04-21 10:47:48.000000000 +0100
@@ -1565,6 +1565,38 @@ DEFHOOK
bool, (enum machine_mode mode),
hook_bool_mode_false)
+/* True if we should try to use a scalar mode to represent an array,
+ overriding the usual MAX_FIXED_MODE limit. */
+DEFHOOK
+(array_mode_supported_p,
+ "Return true if GCC should try to use a scalar mode to store an array\n\
+of @var{nelems} elements, given that each element has mode @var{mode}.\n\
+Returning true here overrides the usual @code{MAX_FIXED_MODE} limit\n\
+and allows GCC to use any defined integer mode.\n\
+\n\
+One use of this hook is to support vector load and store operations\n\
+that operate on several homogeneous vectors. For example, ARM NEON\n\
+has operations like:\n\
+\n\
+@smallexample\n\
+int8x8x3_t vld3_s8 (const int8_t *)\n\
+@end smallexample\n\
+\n\
+where the return type is defined as:\n\
+\n\
+@smallexample\n\
+typedef struct int8x8x3_t\n\
+@{\n\
+ int8x8_t val[3];\n\
+@} int8x8x3_t;\n\
+@end smallexample\n\
+\n\
+If this hook allows @code{val} to have a scalar mode, then\n\
+@code{int8x8x3_t} can have the same mode. GCC can then store\n\
+@code{int8x8x3_t}s in registers rather than forcing them onto the stack.",
+ bool, (enum machine_mode mode, unsigned HOST_WIDE_INT nelems),
+ hook_bool_mode_uhwi_false)
+
/* Compute cost of moving data from a register of class FROM to one of
TO, using MODE. */
DEFHOOK
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in 2011-04-21 10:47:30.000000000 +0100
+++ gcc/doc/tm.texi.in 2011-04-21 10:47:48.000000000 +0100
@@ -4263,6 +4263,8 @@ insns involving vector mode @var{mode}.
must have move patterns for this mode.
@end deftypefn
+@hook TARGET_ARRAY_MODE_SUPPORTED_P
+
@hook TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
Define this to return nonzero for machine modes for which the port has
small register classes. If this target hook returns nonzero for a given
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi 2011-04-21 10:47:30.000000000 +0100
+++ gcc/doc/tm.texi 2011-04-21 10:47:48.000000000 +0100
@@ -4277,6 +4277,34 @@ insns involving vector mode @var{mode}.
must have move patterns for this mode.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_ARRAY_MODE_SUPPORTED_P (enum machine_mode
@var{mode}, unsigned HOST_WIDE_INT @var{nelems})
+Return true if GCC should try to use a scalar mode to store an array
+of @var{nelems} elements, given that each element has mode @var{mode}.
+Returning true here overrides the usual @code{MAX_FIXED_MODE} limit
+and allows GCC to use any defined integer mode.
+
+One use of this hook is to support vector load and store operations
+that operate on several homogeneous vectors. For example, ARM NEON
+has operations like:
+
+@smallexample
+int8x8x3_t vld3_s8 (const int8_t *)
+@end smallexample
+
+where the return type is defined as:
+
+@smallexample
+typedef struct int8x8x3_t
+@{
+ int8x8_t val[3];
+@} int8x8x3_t;
+@end smallexample
+
+If this hook allows @code{val} to have a scalar mode, then
+@code{int8x8x3_t} can have the same mode. GCC can then store
+@code{int8x8x3_t}s in registers rather than forcing them onto the stack.
+@end deftypefn
+
@deftypefn {Target Hook} bool TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P (enum
machine_mode @var{mode})
Define this to return nonzero for machine modes for which the port has
small register classes. If this target hook returns nonzero for a given
Index: gcc/stor-layout.c
===================================================================
--- gcc/stor-layout.c 2011-04-21 10:47:30.000000000 +0100
+++ gcc/stor-layout.c 2011-04-21 10:47:48.000000000 +0100
@@ -546,6 +546,34 @@ get_mode_alignment (enum machine_mode mo
return MIN (BIGGEST_ALIGNMENT, MAX (1, mode_base_align[mode]*BITS_PER_UNIT));
}
+/* Return the natural mode of an array, given that it is SIZE bytes in
+ total and has elements of type ELEM_TYPE. */
+
+static enum machine_mode
+mode_for_array (tree elem_type, tree size)
+{
+ tree elem_size;
+ unsigned HOST_WIDE_INT int_size, int_elem_size;
+ bool limit_p;
+
+ /* One-element arrays get the component type's mode. */
+ elem_size = TYPE_SIZE (elem_type);
+ if (simple_cst_equal (size, elem_size))
+ return TYPE_MODE (elem_type);
+
+ limit_p = true;
+ if (host_integerp (size, 1) && host_integerp (elem_size, 1))
+ {
+ int_size = tree_low_cst (size, 1);
+ int_elem_size = tree_low_cst (elem_size, 1);
+ if (int_elem_size > 0
+ && int_size % int_elem_size == 0
+ && targetm.array_mode_supported_p (TYPE_MODE (elem_type),
+ int_size / int_elem_size))
+ limit_p = false;
+ }
+ return mode_for_size_tree (size, MODE_INT, limit_p);
+}
/* Subroutine of layout_decl: Force alignment required for the data type.
But if the decl itself wants greater alignment, don't override that. */
@@ -2040,14 +2068,8 @@ layout_type (tree type)
&& (TYPE_MODE (TREE_TYPE (type)) != BLKmode
|| TYPE_NO_FORCE_BLK (TREE_TYPE (type))))
{
- /* One-element arrays get the component type's mode. */
- if (simple_cst_equal (TYPE_SIZE (type),
- TYPE_SIZE (TREE_TYPE (type))))
- SET_TYPE_MODE (type, TYPE_MODE (TREE_TYPE (type)));
- else
- SET_TYPE_MODE (type, mode_for_size_tree (TYPE_SIZE (type),
- MODE_INT, 1));
-
+ SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type),
+ TYPE_SIZE (type)));
if (TYPE_MODE (type) != BLKmode
&& STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT
&& TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c 2011-04-21 10:47:30.000000000 +0100
+++ gcc/config/arm/arm.c 2011-04-21 10:47:48.000000000 +0100
@@ -243,6 +243,8 @@ static rtx arm_pic_static_addr (rtx orig
static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool arm_array_mode_supported_p (enum machine_mode,
+ unsigned HOST_WIDE_INT);
static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
static bool arm_class_likely_spilled_p (reg_class_t);
static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
@@ -399,6 +401,8 @@ #define TARGET_ADDRESS_COST arm_address_
#define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
+#undef TARGET_ARRAY_MODE_SUPPORTED_P
+#define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
@@ -22514,6 +22518,20 @@ arm_vector_mode_supported_p (enum machin
return false;
}
+/* Implements target hook array_mode_supported_p. */
+
+static bool
+arm_array_mode_supported_p (enum machine_mode mode,
+ unsigned HOST_WIDE_INT nelems)
+{
+ if (TARGET_NEON
+ && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+ && (nelems >= 2 && nelems <= 4))
+ return true;
+
+ return false;
+}
+
/* Use the option -mvectorize-with-neon-quad to override the use of doubleword
registers when autovectorizing for Neon, at least until multiple vector
widths are supported properly by the middle-end. */