Issue |
144861
|
Summary |
[X86] bcmp with zero not vectorized
|
Labels |
backend:X86,
missed-optimization
|
Assignees |
|
Reporter |
nikic
|
A bcmp against an all-zero constant is lowered to a sequence of scalar 64-bit `or`s, while a bcmp against an all-ones constant is vectorized and uses `vptest`.
https://llvm.godbolt.org/z/c15xY8nKv
```llvm
@zeroes = private unnamed_addr constant [64 x i8] zeroinitializer, align 1
@ones = private unnamed_addr constant [64 x i8] c"\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF", align 1
declare i32 @bcmp(ptr, ptr, i64)
define zeroext i1 @test_zeroes(ptr %x) {
%bcmp = tail call i32 @bcmp(ptr %x, ptr @zeroes, i64 64)
%icmp = icmp eq i32 %bcmp, 0
ret i1 %icmp
}
define zeroext i1 @test_ones(ptr %x) {
%bcmp = tail call i32 @bcmp(ptr %x, ptr @ones, i64 64)
%icmp = icmp eq i32 %bcmp, 0
ret i1 %icmp
}
```
```
test_zeroes: # @test_zeroes
mov rax, qword ptr [rdi + 24]
mov rcx, qword ptr [rdi]
mov rdx, qword ptr [rdi + 8]
mov rsi, qword ptr [rdi + 16]
or rsi, qword ptr [rdi + 48]
or rcx, qword ptr [rdi + 32]
or rcx, rsi
or rax, qword ptr [rdi + 56]
or rdx, qword ptr [rdi + 40]
or rdx, rax
or rdx, rcx
sete al
ret
test_ones: # @test_ones
vmovdqu ymm0, ymmword ptr [rdi]
vpand ymm0, ymm0, ymmword ptr [rdi + 32]
vpcmpeqd ymm1, ymm1, ymm1
vptest ymm0, ymm1
setb al
vzeroupper
ret
```
The bcmp expansions look like this (https://llvm.godbolt.org/z/Tba34zYod):
```llvm
@zeroes = private unnamed_addr constant [64 x i8] zeroinitializer, align 1
@ones = private unnamed_addr constant [64 x i8] c"\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF", align 1
define zeroext i1 @test_zeroes(ptr %x) {
start:
%0 = load i256, ptr %x, align 1
%1 = getelementptr i8, ptr %x, i64 32
%2 = load i256, ptr %1, align 1
%3 = or i256 %0, %2
%4 = icmp ne i256 %3, 0
%5 = zext i1 %4 to i32
%6 = icmp eq i32 %5, 0
ret i1 %6
}
define zeroext i1 @test_ones(ptr %x) {
start:
%0 = load i256, ptr %x, align 1
%1 = xor i256 %0, -1
%2 = getelementptr i8, ptr %x, i64 32
%3 = load i256, ptr %2, align 1
%4 = xor i256 %3, -1
%5 = or i256 %1, %4
%6 = icmp ne i256 %5, 0
%7 = zext i1 %6 to i32
%8 = icmp eq i32 %7, 0
ret i1 %8
}
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs