On LoongArch, the xvshuf.{b/h/w/d} instructions can handle the case where the
low 128-bit half of the target vector is shuffled from the concatenation of
the low 128-bit halves of the two input vectors, and the high 128-bit half of
the target vector is likewise shuffled from the concatenation of the high
128-bit halves.  We therefore add recognition of this situation and use the
xvshuf instructions to optimize it.
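
For illustration, here is a small standalone sketch (not part of the patch;
NELT, match_xvshuf and the driver below are invented for this example) that
mirrors the new selector check and remapping on the E_V8SImode selector used
in the comments, where indices 0-7 select from op0 and 8-15 from op1:

/* Standalone sketch: mirrors loongarch_if_match_xvshuffle and the
   selector remapping for nelt = 8 (E_V8SImode).  */
#include <stdbool.h>
#include <stdio.h>

#define NELT 8

/* True if every low-half result element comes from a low 128-bit half
   of op0/op1 and every high-half element from a high 128-bit half.  */
static bool
match_xvshuf (const unsigned char *perm)
{
  for (int i = 0; i < NELT; i++)
    {
      unsigned char buf = perm[i];
      if (i < NELT / 2)
	{
	  if ((buf >= NELT / 2 && buf < NELT) || buf >= NELT + NELT / 2)
	    return false;
	}
      else if ((buf >= NELT && buf < NELT + NELT / 2) || buf < NELT / 2)
	return false;
    }
  return true;
}

int
main (void)
{
  unsigned char perm[NELT] = { 0, 9, 2, 11, 4, 13, 6, 15 };
  unsigned char remapped[NELT];

  if (!match_xvshuf (perm))
    return 1;

  for (int i = 0; i < NELT; i++)
    if (i < NELT / 2)
      remapped[i] = perm[i] >= NELT ? perm[i] - NELT / 2 : perm[i];
    else
      remapped[i] = perm[i] < NELT ? perm[i] - NELT / 2 : perm[i] % NELT;

  /* Prints 0 5 2 7 0 5 2 7, the remapped selector fed to xvshuf.w.  */
  for (int i = 0; i < NELT; i++)
    printf ("%d ", remapped[i]);
  printf ("\n");
  return 0;
}

As in the comments below, { 0, 9, 2, 11, 4, 13, 6, 15 } remaps to
{ 0, 5, 2, 7, 0, 5, 2, 7 }.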
gcc/ChangeLog:

	* config/loongarch/loongarch.cc (loongarch_if_match_xvshuffle):
	New function.
	(loongarch_expand_vec_perm_const): Add new condition.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vec_perm-verify-xvshuf.c: New test.
	* gcc.target/loongarch/vec_perm-xvshuf.c: New test.
---
 gcc/config/loongarch/loongarch.cc             |  69 ++++++++++++
 .../loongarch/vec_perm-verify-xvshuf.c        | 106 ++++++++++++++++++
 .../gcc.target/loongarch/vec_perm-xvshuf.c    |  17 +++
 3 files changed, 192 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index e9978370e8c..3ac6a74f15b 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -9318,6 +9318,34 @@ loongarch_is_elem_duplicate (struct expand_vec_perm_d *d)
   return result;
 }
 
+/* If the low 128-bit half of the target vector comes only from the low
+   128-bit halves of op0 and op1, and the high 128-bit half comes only
+   from the high 128-bit halves of op0 and op1, the corresponding
+   xvshuf.{h/w/d} instruction can be matched.  */
+static bool
+loongarch_if_match_xvshuffle (struct expand_vec_perm_d *d)
+{
+  for (int i = 0; i < d->nelt; i++)
+    {
+      unsigned char buf = d->perm[i];
+
+      if (i < d->nelt / 2)
+	{
+	  if ((buf >= d->nelt / 2 && buf < d->nelt)
+	      || buf >= (d->nelt + d->nelt / 2))
+	    return false;
+	}
+      else
+	{
+	  if ((buf >= d->nelt && buf < (d->nelt + d->nelt / 2))
+	      || buf < d->nelt / 2)
+	    return false;
+	}
+    }
+
+  return true;
+}
+
 /* In LASX, some permutation insn does not have the behavior that gcc expects
    when compiler wants to emit a vector permutation.
@@ -9570,6 +9598,47 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
 	  return true;
 	}
 
+      if (loongarch_if_match_xvshuffle (d))
+	{
+	  if (d->testing_p)
+	    return true;
+
+	  /* Selector example: E_V8SImode, { 0, 9, 2, 11, 4, 13, 6, 15 }.  */
+	  /* If the target low 128 bits take op1's low 128-bit elements
+	     {9, 11}, we need to subtract half of d->nelt (giving indices
+	     in [4, 7]) to form the 256-bit intermediate vector vec0.
+	     Similarly, if the target high 128 bits take op0's high
+	     128-bit elements {4, 6}, we need to subtract half of d->nelt
+	     (giving indices in [0, 3]) to form the 256-bit intermediate
+	     vector vec1.  In particular, if the target high 128 bits take
+	     op1's high 128-bit elements {13, 15}, we take them modulo
+	     d->nelt (giving indices in [4, 7]) to form vec1.  */
+	  for (i = 0; i < d->nelt; i += 1)
+	    {
+	      if (i < d->nelt / 2)
+		{
+		  if (d->perm[i] >= d->nelt)
+		    remapped[i] = d->perm[i] - d->nelt / 2;
+		  else
+		    remapped[i] = d->perm[i];
+		}
+	      else
+		{
+		  if (d->perm[i] < d->nelt)
+		    remapped[i] = d->perm[i] - d->nelt / 2;
+		  else
+		    remapped[i] = d->perm[i] % d->nelt;
+		}
+	    }
+
+	  /* Selector after: { 0, 5, 2, 7, 0, 5, 2, 7 }.  */
+	  for (i = 0; i < d->nelt; i += 1)
+	    rperm[i] = GEN_INT (remapped[i]);
+
+	  flag = true;
+	  goto expand_perm_const_end;
+	}
+
 expand_perm_const_end:
   if (flag)
     {
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c
new file mode 100644
index 00000000000..658dacfe340
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c
@@ -0,0 +1,106 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mlasx -w -fno-strict-aliasing" } */
+
+#include "./vector/simd_correctness_check.h"
+#include <lasxintrin.h>
+#define N 16
+
+typedef int TYPE;
+
+void
+foo (TYPE a[], TYPE b[], TYPE c[])
+{
+  for (int i = 0; i < N; i += 2)
+    {
+      c[i + 0] = a[i + 0] + b[i + 0];
+      c[i + 1] = a[i + 1] - b[i + 1];
+    }
+}
+
+__m256i
+change_to_256vec (TYPE c[], int offset)
+{
+  __m256i __m256i_op;
+  int type_bit_len = sizeof (TYPE) * 8;
+  long int tmp;
+
+  for (int i = offset; i < 256 / type_bit_len + offset; i += 2)
+    {
+      __m256i_op[(i - offset) / 2] = 0x0;
+      __m256i_op[(i - offset) / 2] |= c[i];
+      tmp = ((long int)c[i + 1] << type_bit_len);
+      __m256i_op[(i - offset) / 2] |= tmp;
+    }
+
+  return __m256i_op;
+}
+
+int
+main ()
+{
+  TYPE a[N], b[N], c[N];
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = i;
+      b[i] = N + i;
+    }
+
+  // c = {16,-16,20,-16,24,-16,28,-16,32,-16,36,-16,40,-16,44,-16};
+  foo (a, b, c);
+
+  __m256i ans1 = change_to_256vec (c, 0);
+  __m256i ans2 = change_to_256vec (c, N / 2);
+
+  __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+  __m256i __m256i_op3, __m256i_op4, __m256i_op5, __m256i_out2, __m256i_result2;
+
+  *((unsigned long *)&__m256i_op0[3]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op0[2]) = 0x0000000500000000;
+  *((unsigned long *)&__m256i_op0[1]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op0[0]) = 0x0000000500000000;
+
+  *((unsigned long *)&__m256i_op1[3]) = 0x0000001e0000001c;
+  *((unsigned long *)&__m256i_op1[2]) = 0x0000001a00000018;
+  *((unsigned long *)&__m256i_op1[1]) = 0x0000001600000014;
+  *((unsigned long *)&__m256i_op1[0]) = 0x0000001200000010;
+
+  *((unsigned long *)&__m256i_op2[3]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op2[2]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op2[1]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op2[0]) = 0xfffffff0fffffff0;
+
+  // __m256i_result = {16,-16,20,-16,24,-16,28,-16};
+  *((unsigned long *)&__m256i_result[3]) = 0xfffffff00000001c;
+  *((unsigned long *)&__m256i_result[2]) = 0xfffffff000000018;
+  *((unsigned long *)&__m256i_result[1]) = 0xfffffff000000014;
+  *((unsigned long *)&__m256i_result[0]) = 0xfffffff000000010;
+
+  __m256i_out = __lasx_xvshuf_w (__m256i_op0, __m256i_op2, __m256i_op1);
+  ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+  ASSERTEQ_64 (__LINE__, ans1, __m256i_out);
+
+  *((unsigned long *)&__m256i_op3[3]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op3[2]) = 0x0000000500000000;
+  *((unsigned long *)&__m256i_op3[1]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op3[0]) = 0x0000000500000000;
+
+  *((unsigned long *)&__m256i_op4[3]) = 0x0000002e0000002c;
+  *((unsigned long *)&__m256i_op4[2]) = 0x0000002a00000028;
+  *((unsigned long *)&__m256i_op4[1]) = 0x0000002600000024;
+  *((unsigned long *)&__m256i_op4[0]) = 0x0000002200000020;
+
+  *((unsigned long *)&__m256i_op5[3]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op5[2]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op5[1]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op5[0]) = 0xfffffff0fffffff0;
+
+  // __m256i_result2 = {32,-16,36,-16,40,-16,44,-16};
+  *((unsigned long *)&__m256i_result2[3]) = 0xfffffff00000002c;
+  *((unsigned long *)&__m256i_result2[2]) = 0xfffffff000000028;
+  *((unsigned long *)&__m256i_result2[1]) = 0xfffffff000000024;
+  *((unsigned long *)&__m256i_result2[0]) = 0xfffffff000000020;
+
+  __m256i_out2 = __lasx_xvshuf_w (__m256i_op3, __m256i_op5, __m256i_op4);
+  ASSERTEQ_64 (__LINE__, __m256i_result2, __m256i_out2);
+  ASSERTEQ_64 (__LINE__, ans2, __m256i_out2);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
new file mode 100644
index 00000000000..6b19c2c2fd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvshuf.w" } } */
+/* { dg-final { scan-assembler-not "xvperm.w" } } */
+/* { dg-final { scan-assembler-not "xvbitsel.v" } } */
+
+void
+foo (int a[], int b[], int c[])
+{
+  for (int i = 0; i < 100; i += 4)
+    {
+      c[i + 0] = a[i + 0] + b[i + 0];
+      c[i + 1] = a[i + 1] - b[i + 1];
+      c[i + 2] = a[i + 2] - b[i + 2];
+      c[i + 3] = a[i + 3] + b[i + 3];
+    }
+}
-- 
2.38.1