adream307 wrote:
I have checked this on actual AVX-512 hardware with the following program:
```cpp
#include <stdio.h>
#include <immintrin.h>
/* Print elements 0..7 of v in decimal, each followed by a comma, then end
 * the line with a newline. */
void printv8hu(__v8hu v) {
  int lane = 0;
  while (lane < 8) {
    printf("%d,", v[lane]);
    lane++;
  }
  printf("\n");
}
/* Print elements 0..15 of v in decimal, each followed by a comma, then end
 * the line with a newline. */
void printv16hu(__v16hu v) {
  int lane = 0;
  while (lane < 16) {
    printf("%d,", v[lane]);
    lane++;
  }
  printf("\n");
}
/* Run _mm_mpsadbw_epu8 and _mm256_mpsadbw_epu8 on fixed operands with a range
 * of immediate values and print every result, one vector per line.
 *
 * Three groups of eight lines are printed, in this order:
 *   1. the 128-bit intrinsic with immediates 0..7;
 *   2. the 256-bit intrinsic with immediates 0..7;
 *   3. the 256-bit intrinsic with immediates (0..7) << 3.
 *
 * The immediate is written out literally at each call because the intrinsics
 * are invoked with constant expressions throughout. */
int main() {
  /* Operands shared by every 128-bit call. */
  const __m128i lhs128 = (__m128i)(__v16qu){
      11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71};
  const __m128i rhs128 = (__m128i)(__v16qu){
      167, 173, 179, 181, 191, 193, 197, 199,
      211, 223, 227, 229, 233, 239, 241, 251};

  /* Operands shared by every 256-bit call. */
  const __m256i lhs256 = (__m256i)(__v32qu){
      2,  3,  5,  7,  11, 13,  17,  19,  23,  29,  31,  37,  41,  43,  47,  53,
      59, 61, 67, 71, 73, 79,  83,  89,  97,  101, 103, 107, 109, 113, 127, 131};
  const __m256i rhs256 = (__m256i)(__v32qu){
      83,  89,  97,  101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157,
      163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239,
      241, 251};

  /* Group 1: 128-bit, immediates 0..7. */
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 0));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 1));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 2));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 3));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 4));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 5));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 6));
  printv8hu((__v8hu)_mm_mpsadbw_epu8(lhs128, rhs128, 7));

  /* Group 2: 256-bit, immediates 0..7. */
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 0));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 1));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 2));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 4));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 5));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 6));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 7));

  /* Group 3: 256-bit, the same 0..7 values shifted left by three bits. */
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 0 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 1 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 2 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 3 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 4 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 5 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 6 << 3));
  printv16hu((__v16hu)_mm256_mpsadbw_epu8(lhs256, rhs256, 7 << 3));

  return 0;
}
```
This is the output:
```txt
640,628,612,598,580,562,548,532,
720,708,692,678,660,642,628,612,
830,818,802,788,770,752,738,722,
904,892,876,862,844,826,812,796,
580,562,548,532,516,498,480,460,
660,642,628,612,596,578,560,540,
770,752,738,722,706,688,670,650,
844,826,812,796,780,762,744,724,
353,344,334,322,310,298,282,268,442,428,410,394,376,352,330,310,
415,406,396,384,372,360,344,330,442,428,410,394,376,352,330,310,
517,508,498,486,474,462,446,432,442,428,410,394,376,352,330,310,
603,594,584,572,560,548,532,518,442,428,410,394,376,352,330,310,
310,298,282,268,250,232,218,202,442,428,410,394,376,352,330,310,
372,360,344,330,312,294,280,264,442,428,410,394,376,352,330,310,
474,462,446,432,414,396,382,366,442,428,410,394,376,352,330,310,
560,548,532,518,500,482,468,452,442,428,410,394,376,352,330,310,
353,344,334,322,310,298,282,268,442,428,410,394,376,352,330,310,
353,344,334,322,310,298,282,268,522,508,490,474,456,432,410,390,
353,344,334,322,310,298,282,268,632,618,600,584,566,542,520,500,
353,344,334,322,310,298,282,268,706,692,674,658,640,616,594,574,
353,344,334,322,310,298,282,268,376,352,330,310,292,280,268,244,
353,344,334,322,310,298,282,268,456,432,410,390,372,360,348,324,
353,344,334,322,310,298,282,268,566,542,520,500,482,470,458,434,
353,344,334,322,310,298,282,268,640,616,594,574,556,544,532,508,
```
https://github.com/llvm/llvm-project/pull/202257
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits