On 17 May 2016 at 14:27, James Greenhalgh <james.greenha...@arm.com> wrote: > On Mon, May 16, 2016 at 10:09:31AM +0100, Jiong Wang wrote: >> AArch64 support vector multiply by element for V2DF, V2SF, V4SF, V2SI, >> V4SI, V4HI, V8HI. >> >> All above are well supported by "*aarch64_mul3_elt<mode>" pattern and >> "*aarch64_mul3_elt_<vswap_width_name><mode>" if there is lane size >> change. >> >> Above patterns are trying to match "(mul (vec_dup (vec_select)))" >> which is genuinely vector multiply by element. >> >> While vector multiply by element can also comes from "(mul (vec_dup >> (scalar" where the scalar value is already sitting in vector register >> then duplicated to other lanes, and there is no lane size change. >> >> We have "*aarch64_mul3_elt_to_128df" to match this already, but it's >> restricted for V2DF while this patch extends this support to more modes, >> for example vector integer operations. >> >> For the testcase included, the following codegen change will happen: >> >> >> - ldr w0, [x3, 160] >> - dup v1.2s, w0 >> - mul v1.2s, v1.2s, v2.2s >> + ldr s1, [x3, 160] >> + mul v1.2s, v0.2s, v1.s[0] >> >> OK for trunk? >> >> 2016-05-16 Jiong Wang<jiong.w...@arm.com> >> >> gcc/ >> * config/aarch64/aarch64-simd.md (*aarch64_mul3_elt_to_128df): Extend to >> all >> supported modes. Rename to "*aarch64_mul3_elt_from_dup". >> >> gcc/testsuite/ >> * /gcc.target/aarch64/simd/vmul_elem_1.c: New. > > > This ChangeLog formatting is incorrect. It should look like: > > gcc/ > > 2016-05-17 Jiong Wang <jiong.w...@arm.com> > > * config/aarch64/aarch64-simd.md (*aarch64_mul3_elt_to_128df): Extend > to all supported modes. Rename to... > (*aarch64_mul3_elt_from_dup): ...this. > > gcc/testsuite/ > > 2016-05-17 Jiong Wang <jiong.w...@arm.com> > > * gcc.target/aarch64/simd/vmul_elem_1.c: New. > > Otherwise, this patch is OK. >
Hi Jiong, The new testcase fails on aarch64_be, at execution time. Christophe. > Thanks, > James > >> diff --git a/gcc/config/aarch64/aarch64-simd.md >> b/gcc/config/aarch64/aarch64-simd.md >> index >> eb18defef15c24bf2334045e92bf7c34b989136d..7f338ff78fabccee868a4befbffed54c3e842dc9 >> 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -371,15 +371,15 @@ >> [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")] >> ) >> >> -(define_insn "*aarch64_mul3_elt_to_128df" >> - [(set (match_operand:V2DF 0 "register_operand" "=w") >> - (mult:V2DF >> - (vec_duplicate:V2DF >> - (match_operand:DF 2 "register_operand" "w")) >> - (match_operand:V2DF 1 "register_operand" "w")))] >> +(define_insn "*aarch64_mul3_elt_from_dup<mode>" >> + [(set (match_operand:VMUL 0 "register_operand" "=w") >> + (mult:VMUL >> + (vec_duplicate:VMUL >> + (match_operand:<VEL> 1 "register_operand" "<h_con>")) >> + (match_operand:VMUL 2 "register_operand" "w")))] >> "TARGET_SIMD" >> - "fmul\\t%0.2d, %1.2d, %2.d[0]" >> - [(set_attr "type" "neon_fp_mul_d_scalar_q")] >> + "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"; >> + [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")] >> ) >> >> (define_insn "aarch64_rsqrte_<mode>2" >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c >> b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c >> new file mode 100644 >> index >> 0000000000000000000000000000000000000000..290a4e9adbc5d9ce1335ca28120e437293776f30 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c >> @@ -0,0 +1,519 @@ >> +/* Test the vmul_n_f64 AArch64 SIMD intrinsic. */ >> + >> +/* { dg-do run } */ >> +/* { dg-options "-O2 --save-temps" } */ >> + >> +#include "arm_neon.h" >> + >> +extern void abort (void); >> + >> +#define A (132.4f) >> +#define B (-0.0f) >> +#define C (-34.8f) >> +#define D (289.34f) >> +float32_t expected2_1[2] = {A * A, B * A}; >> +float32_t expected2_2[2] = {A * B, B * B}; >> +float32_t expected4_1[4] = {A * A, B * A, C * A, D * A}; >> +float32_t expected4_2[4] = {A * B, B * B, C * B, D * B}; >> +float32_t expected4_3[4] = {A * C, B * C, C * C, D * C}; >> +float32_t expected4_4[4] = {A * D, B * D, C * D, D * D}; >> +float32_t _elemA = A; >> +float32_t _elemB = B; >> +float32_t _elemC = C; >> +float32_t _elemD = D; >> + >> +#define AD (1234.5) >> +#define BD (-0.0) >> +#define CD (71.3) >> +#define DD (-1024.4) >> +float64_t expectedd2_1[2] = {AD * CD, BD * CD}; >> +float64_t expectedd2_2[2] = {AD * DD, BD * DD}; >> +float64_t _elemdC = CD; >> +float64_t _elemdD = DD; >> + >> + >> +#define AS (1024) >> +#define BS (-31) >> +#define CS (0) >> +#define DS (655) >> +int32_t expecteds2_1[2] = {AS * AS, BS * AS}; >> +int32_t expecteds2_2[2] = {AS * BS, BS * BS}; >> +int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS}; >> +int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS}; >> +int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS}; >> +int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS}; >> +int32_t _elemsA = AS; >> +int32_t _elemsB = BS; >> +int32_t _elemsC = CS; >> +int32_t _elemsD = DS; >> + >> +#define AH ((int16_t) 0) >> +#define BH ((int16_t) -32) >> +#define CH ((int16_t) 102) >> +#define DH ((int16_t) -51) >> +#define EH ((int16_t) 71) >> +#define FH ((int16_t) -91) >> +#define GH ((int16_t) 48) >> +#define HH ((int16_t) 255) >> +int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH}; >> +int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH}; >> +int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH}; >> +int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH}; >> +int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH, >> + EH * AH, FH * AH, GH * AH, HH * AH}; >> +int16_t expectedh8_2[8] = {AH * BH, BH * BH, CH * BH, DH * BH, >> + EH * BH, FH * BH, GH * BH, HH * BH}; >> +int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH, >> + EH * CH, FH * CH, GH * CH, HH * CH}; >> +int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH, >> + EH * DH, FH * DH, GH * DH, HH * DH}; >> +int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH, >> + EH * EH, FH * EH, GH * EH, HH * EH}; >> +int16_t expectedh8_6[8] = {AH * FH, BH * FH, CH * FH, DH * FH, >> + EH * FH, FH * FH, GH * FH, HH * FH}; >> +int16_t expectedh8_7[8] = {AH * GH, BH * GH, CH * GH, DH * GH, >> + EH * GH, FH * GH, GH * GH, HH * GH}; >> +int16_t expectedh8_8[8] = {AH * HH, BH * HH, CH * HH, DH * HH, >> + EH * HH, FH * HH, GH * HH, HH * HH}; >> +int16_t _elemhA = AH; >> +int16_t _elemhB = BH; >> +int16_t _elemhC = CH; >> +int16_t _elemhD = DH; >> +int16_t _elemhE = EH; >> +int16_t _elemhF = FH; >> +int16_t _elemhG = GH; >> +int16_t _elemhH = HH; >> + >> +#define AUS (1024) >> +#define BUS (31) >> +#define CUS (0) >> +#define DUS (655) >> +uint32_t expectedus2_1[2] = {AUS * AUS, BUS * AUS}; >> +uint32_t expectedus2_2[2] = {AUS * BUS, BUS * BUS}; >> +uint32_t expectedus4_1[4] = {AUS * AUS, BUS * AUS, CUS * AUS, DUS * AUS}; >> +uint32_t expectedus4_2[4] = {AUS * BUS, BUS * BUS, CUS * BUS, DUS * BUS}; >> +uint32_t expectedus4_3[4] = {AUS * CUS, BUS * CUS, CUS * CUS, DUS * CUS}; >> +uint32_t expectedus4_4[4] = {AUS * DUS, BUS * DUS, CUS * DUS, DUS * DUS}; >> +uint32_t _elemusA = AUS; >> +uint32_t _elemusB = BUS; >> +uint32_t _elemusC = CUS; >> +uint32_t _elemusD = DUS; >> + >> +#define AUH ((uint16_t) 0) >> +#define BUH ((uint16_t) 32) >> +#define CUH ((uint16_t) 102) >> +#define DUH ((uint16_t) 51) >> +#define EUH ((uint16_t) 71) >> +#define FUH ((uint16_t) 91) >> +#define GUH ((uint16_t) 48) >> +#define HUH ((uint16_t) 255) >> +uint16_t expecteduh4_1[4] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH}; >> +uint16_t expecteduh4_2[4] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH}; >> +uint16_t expecteduh4_3[4] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH}; >> +uint16_t expecteduh4_4[4] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH}; >> +uint16_t expecteduh8_1[8] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH, >> + EUH * AUH, FUH * AUH, GUH * AUH, HUH * AUH}; >> +uint16_t expecteduh8_2[8] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH, >> + EUH * BUH, FUH * BUH, GUH * BUH, HUH * BUH}; >> +uint16_t expecteduh8_3[8] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH, >> + EUH * CUH, FUH * CUH, GUH * CUH, HUH * CUH}; >> +uint16_t expecteduh8_4[8] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH, >> + EUH * DUH, FUH * DUH, GUH * DUH, HUH * DUH}; >> +uint16_t expecteduh8_5[8] = {AUH * EUH, BUH * EUH, CUH * EUH, DUH * EUH, >> + EUH * EUH, FUH * EUH, GUH * EUH, HUH * EUH}; >> +uint16_t expecteduh8_6[8] = {AUH * FUH, BUH * FUH, CUH * FUH, DUH * FUH, >> + EUH * FUH, FUH * FUH, GUH * FUH, HUH * FUH}; >> +uint16_t expecteduh8_7[8] = {AUH * GUH, BUH * GUH, CUH * GUH, DUH * GUH, >> + EUH * GUH, FUH * GUH, GUH * GUH, HUH * GUH}; >> +uint16_t expecteduh8_8[8] = {AUH * HUH, BUH * HUH, CUH * HUH, DUH * HUH, >> + EUH * HUH, FUH * HUH, GUH * HUH, HUH * HUH}; >> +uint16_t _elemuhA = AUH; >> +uint16_t _elemuhB = BUH; >> +uint16_t _elemuhC = CUH; >> +uint16_t _elemuhD = DUH; >> +uint16_t _elemuhE = EUH; >> +uint16_t _elemuhF = FUH; >> +uint16_t _elemuhG = GUH; >> +uint16_t _elemuhH = HUH; >> + >> +void >> +check_v2sf (float32_t elemA, float32_t elemB) >> +{ >> + int32_t indx; >> + const float32_t vec32x2_buf[2] = {A, B}; >> + float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf); >> + float32x2_t vec32x2_res = vec32x2_src * elemA; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) >> &expected2_1[indx]) >> + abort (); >> + >> + vec32x2_res = vec32x2_src * elemB; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) >> &expected2_2[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, >> v\[0-9\]+\.s\\\[0\\\]" 2 } } */ >> +} >> + >> +void >> +check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t >> elemD) >> +{ >> + int32_t indx; >> + const float32_t vec32x4_buf[4] = {A, B, C, D}; >> + float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf); >> + float32x4_t vec32x4_res = vec32x4_src * elemA; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) >> &expected4_1[indx]) >> + abort (); >> + >> + vec32x4_res = vec32x4_src * elemB; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) >> &expected4_2[indx]) >> + abort (); >> + >> + vec32x4_res = vec32x4_src * elemC; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) >> &expected4_3[indx]) >> + abort (); >> + >> + vec32x4_res = vec32x4_src * elemD; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) >> &expected4_4[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, >> v\[0-9\]+\.s\\\[0\\\]" 4 } } */ >> +} >> + >> +void >> +check_v2df (float64_t elemdC, float64_t elemdD) >> +{ >> + int32_t indx; >> + const float64_t vec64x2_buf[2] = {AD, BD}; >> + float64x2_t vec64x2_src = vld1q_f64 (vec64x2_buf); >> + float64x2_t vec64x2_res = vec64x2_src * elemdC; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) >> &expectedd2_1[indx]) >> + abort (); >> + >> + vec64x2_res = vec64x2_src * elemdD; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) >> &expectedd2_2[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, >> v\[0-9\]+\.d\\\[0\\\]" 2 } } */ >> +} >> + >> +void >> +check_v2si (int32_t elemsA, int32_t elemsB) >> +{ >> + int32_t indx; >> + const int32_t vecs32x2_buf[2] = {AS, BS}; >> + int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf); >> + int32x2_t vecs32x2_res = vecs32x2_src * elemsA; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (vecs32x2_res[indx] != expecteds2_1[indx]) >> + abort (); >> + >> + vecs32x2_res = vecs32x2_src * elemsB; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (vecs32x2_res[indx] != expecteds2_2[indx]) >> + abort (); >> +} >> + >> +void >> +check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB) >> +{ >> + int indx; >> + const uint32_t vecus32x2_buf[2] = {AUS, BUS}; >> + uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf); >> + uint32x2_t vecus32x2_res = vecus32x2_src * elemusA; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (vecus32x2_res[indx] != expectedus2_1[indx]) >> + abort (); >> + >> + vecus32x2_res = vecus32x2_src * elemusB; >> + >> + for (indx = 0; indx < 2; indx++) >> + if (vecus32x2_res[indx] != expectedus2_2[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, >> v\[0-9\]+\.s\\\[0\\\]" 4 } } */ >> +} >> + >> +void >> +check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD) >> +{ >> + int32_t indx; >> + const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS}; >> + int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf); >> + int32x4_t vecs32x4_res = vecs32x4_src * elemsA; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecs32x4_res[indx] != expecteds4_1[indx]) >> + abort (); >> + >> + vecs32x4_res = vecs32x4_src * elemsB; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecs32x4_res[indx] != expecteds4_2[indx]) >> + abort (); >> + >> + vecs32x4_res = vecs32x4_src * elemsC; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecs32x4_res[indx] != expecteds4_3[indx]) >> + abort (); >> + >> + vecs32x4_res = vecs32x4_src * elemsD; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecs32x4_res[indx] != expecteds4_4[indx]) >> + abort (); >> +} >> + >> +void >> +check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC, >> + uint32_t elemusD) >> +{ >> + int indx; >> + const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS}; >> + uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf); >> + uint32x4_t vecus32x4_res = vecus32x4_src * elemusA; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecus32x4_res[indx] != expectedus4_1[indx]) >> + abort (); >> + >> + vecus32x4_res = vecus32x4_src * elemusB; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecus32x4_res[indx] != expectedus4_2[indx]) >> + abort (); >> + >> + vecus32x4_res = vecus32x4_src * elemusC; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecus32x4_res[indx] != expectedus4_3[indx]) >> + abort (); >> + >> + vecus32x4_res = vecus32x4_src * elemusD; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecus32x4_res[indx] != expectedus4_4[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, >> v\[0-9\]+\.s\\\[0\\\]" 8 } } */ >> +} >> + >> + >> +void >> +check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD) >> +{ >> + int32_t indx; >> + const int16_t vech16x4_buf[4] = {AH, BH, CH, DH}; >> + int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf); >> + int16x4_t vech16x4_res = vech16x4_src * elemhA; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vech16x4_res[indx] != expectedh4_1[indx]) >> + abort (); >> + >> + vech16x4_res = vech16x4_src * elemhB; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vech16x4_res[indx] != expectedh4_2[indx]) >> + abort (); >> + >> + vech16x4_res = vech16x4_src * elemhC; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vech16x4_res[indx] != expectedh4_3[indx]) >> + abort (); >> + >> + vech16x4_res = vech16x4_src * elemhD; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vech16x4_res[indx] != expectedh4_4[indx]) >> + abort (); >> +} >> + >> +void >> +check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC, >> + uint16_t elemuhD) >> +{ >> + int indx; >> + const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH}; >> + uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf); >> + uint16x4_t vecuh16x4_res = vecuh16x4_src * elemuhA; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecuh16x4_res[indx] != expecteduh4_1[indx]) >> + abort (); >> + >> + vecuh16x4_res = vecuh16x4_src * elemuhB; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecuh16x4_res[indx] != expecteduh4_2[indx]) >> + abort (); >> + >> + vecuh16x4_res = vecuh16x4_src * elemuhC; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecuh16x4_res[indx] != expecteduh4_3[indx]) >> + abort (); >> + >> + vecuh16x4_res = vecuh16x4_src * elemuhD; >> + >> + for (indx = 0; indx < 4; indx++) >> + if (vecuh16x4_res[indx] != expecteduh4_4[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.4h, v\[0-9\]+\.4h, >> v\[0-9\]+\.h\\\[0\\\]" 8 } } */ >> +} >> + >> +void >> +check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD, >> + int16_t elemhE, int16_t elemhF, int16_t elemhG, int16_t elemhH) >> +{ >> + int32_t indx; >> + const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH}; >> + int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf); >> + int16x8_t vech16x8_res = vech16x8_src * elemhA; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_1[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhB; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_2[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhC; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_3[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhD; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_4[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhE; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_5[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhF; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_6[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhG; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_7[indx]) >> + abort (); >> + >> + vech16x8_res = vech16x8_src * elemhH; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vech16x8_res[indx] != expectedh8_8[indx]) >> + abort (); >> +} >> + >> +void >> +check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC, >> + uint16_t elemuhD, uint16_t elemuhE, uint16_t elemuhF, >> + uint16_t elemuhG, uint16_t elemuhH) >> +{ >> + int indx; >> + const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, >> HUH}; >> + uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf); >> + uint16x8_t vecuh16x8_res = vecuh16x8_src * elemuhA; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_1[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhB; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_2[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhC; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_3[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhD; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_4[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhE; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_5[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhF; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_6[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhG; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_7[indx]) >> + abort (); >> + >> + vecuh16x8_res = vecuh16x8_src * elemuhH; >> + >> + for (indx = 0; indx < 8; indx++) >> + if (vecuh16x8_res[indx] != expecteduh8_8[indx]) >> + abort (); >> + >> +/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, >> v\[0-9\]+\.h\\\[0\\\]" 16 } } */ >> +} >> + >> +int >> +main (void) >> +{ >> + check_v2sf (_elemA, _elemB); >> + check_v4sf (_elemA, _elemB, _elemC, _elemD); >> + check_v2df (_elemdC, _elemdD); >> + check_v2si (_elemsA, _elemsB); >> + check_v4si (_elemsA, _elemsB, _elemsC, _elemsD); >> + check_v4hi (_elemhA, _elemhB, _elemhC, _elemhD); >> + check_v8hi (_elemhA, _elemhB, _elemhC, _elemhD, >> + _elemhE, _elemhF, _elemhG, _elemhH); >> + check_v2si_unsigned (_elemusA, _elemusB); >> + check_v4si_unsigned (_elemusA, _elemusB, _elemusC, _elemusD); >> + check_v4hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD); >> + check_v8hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD, >> + _elemuhE, _elemuhF, _elemuhG, _elemuhH); >> + >> + return 0; >> +} >> + >> >