[llvm] [clang] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 created 
https://github.com/llvm/llvm-project/pull/76558

Currently `fp128` math intrinsics are lowered to functions expecting `long 
double`, which is a problem when `long double` and `f128` do not have the same 
layout (e.g. `long double` on x86 is `f80`).

This patchset does the following:

1. Move `long double` layout logic from Clang to LLVM so we can use it on all 
targets
2. Default to lowering to `__float128` math calls rather than `long double` 
calls (`sinf128` instead of `sinl`)
3. Add logic to still emit `long double` calls on platforms where `long double` 
and `f128` have the same layout.

I still need to figure out how to support `-mlong-double-128` and similar 
flags, and need to add a test for a target where `ld == f128` such as aarch64. 
A quick review at this point would still be appreciated to make sure I am on 
the right track.

Fixes: https://github.com/llvm/llvm-project/issues/44744
Discourse discussion: 
https://discourse.llvm.org/t/fp128-math-functions-strange-results/72708
Initial patchset: https://reviews.llvm.org/D157836 / 
http://108.170.204.19/D157836


>From 2f0f7c829a3ba16e7f1fa5a434832e07c40c1f1c Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 llvm/test/CodeGen/X86/f128-arith.ll | 516 
 1 file changed, 516 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/f128-arith.ll

diff --git a/llvm/test/CodeGen/X86/f128-arith.ll 
b/llvm/test/CodeGen/X86/f128-arith.ll
new file mode 100644
index 00..6af56a78b2926f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/f128-arith.ll
@@ -0,0 +1,516 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-64
+;
+; Test lowering of fp128 intrinsics
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-32-LABEL: test_cbrtf128:
+; CHECK-32:calll llvm.cbrt.f128@PLT
+;
+; CHECK-64-LABEL: test_cbrtf128:
+; CHECK-64:jmp llvm.cbrt.f128@PLT # TAILCALL
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-32-LABEL: test_ceilf128:
+; CHECK-32:calll ceill
+;
+; CHECK-64-LABEL: test_ceilf128:
+; CHECK-64:jmp ceill@PLT # TAILCALL
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; CHECK-32-LABEL: test_copysignf128:
+; CHECK-32:   # %bb.0: # %start
+; CHECK-32-NEXT:pushl %ebx
+; CHECK-32-NEXT:.cfi_def_cfa_offset 8
+; CHECK-32-NEXT:pushl %edi
+; CHECK-32-NEXT:.cfi_def_cfa_offset 12
+; CHECK-32-NEXT:pushl %esi
+; CHECK-32-NEXT:.cfi_def_cfa_offset 16
+; CHECK-32-NEXT:.cfi_offset %esi, -16
+; CHECK-32-NEXT:.cfi_offset %edi, -12
+; CHECK-32-NEXT:.cfi_offset %ebx, -8
+; CHECK-32-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-32-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:movl {{[0-9]+}}(%esp), %edx
+; CHECK-32-NEXT:movl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT:movl $-2147483648, %edi # imm = 0x80000000
+; CHECK-32-NEXT:andl {{[0-9]+}}(%esp), %edi
+; CHECK-32-NEXT:movl $2147483647, %ebx # imm = 0x7FFFFFFF
+; CHECK-32-NEXT:andl {{[0-9]+}}(%esp), %ebx
+; CHECK-32-NEXT:orl %edi, %ebx
+; CHECK-32-NEXT:movl %ebx, 12(%eax)
+; CHECK-32-NEXT:movl %esi, 8(%eax)
+; CHECK-32-NEXT:movl %edx, 4(%eax)
+; CHECK-32-NEXT:movl %ecx, (%eax)
+; CHECK-32-NEXT:popl %esi
+; CHECK-32-NEXT:.cfi_def_cfa_offset 12
+; CHECK-32-NEXT:popl %edi
+; CHECK-32-NEXT:.cfi_def_cfa_offset 8
+; CHECK-32-NEXT:popl %ebx
+; CHECK-32-NEXT:.cfi_def_cfa_offset 4
+; CHECK-32-NEXT:retl $4
+;
+; CHECK-64-LABEL: test_copysignf128:
+; CHECK-64:   # %bb.0: # %start
+; CHECK-64-NEXT:andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-64-NEXT:andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-64-NEXT:orps %xmm1, %xmm0
+; CHECK-64-NEXT:retq
+; FIXME: calling long double rather than f128 function
+; FIXME: calling long double rather than f128 function
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.copysign.f128(fp128, fp128)
+
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-32-LABEL: test_cosf128:
+; CHECK-32:calll cosl
+;
+; CHECK-64-LABEL: test_cosf128:
+; CHECK-64:jmp cosl@PLT # TAILCALL
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

tgross35 wrote:

@efriedma-quic was looking at this on phabricator

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 7df4ef93989b1913d9200fbc29d6d04f9e59d51a Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..30efb8ef34918e
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[llvm] [clang] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[llvm] [clang] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2023-12-29 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-13 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 590f4920ceb1a80d711d39624b0249cd9ff774d2 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-13 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 946581e0c6a06be92b16d74199b58a72be4b76f3 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-13 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 90a465d0a7e9744a4a8043152016e500927a0d95 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[llvm] [clang] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-13 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 90a465d0a7e9744a4a8043152016e500927a0d95 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-13 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 90a465d0a7e9744a4a8043152016e500927a0d95 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-20 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 90a465d0a7e9744a4a8043152016e500927a0d95 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-20 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From e869ad1bc601d95b6364dc5619e79a06e8b0fc82 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[llvm] [clang] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-20 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From e869ad1bc601d95b6364dc5619e79a06e8b0fc82 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[llvm] [clang] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-20 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From e869ad1bc601d95b6364dc5619e79a06e8b0fc82 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 000..8a70786d97fe67d
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-20 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From e869ad1bc601d95b6364dc5619e79a06e8b0fc82 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/5] [IR] Add an xpassing test for `f128` intrinsic lowering

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 00..8a70786d97fe67
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-X86-N

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2024-01-21 Thread Trevor Gross via cfe-commits

tgross35 wrote:

I'm struggling a bit with how to handle ABI information since that affects 
layout (e.g. ARM aapcs), which I think explains most of the errors in 
https://buildkite.com/llvm-project/github-pull-requests/builds/31198#018d26e2-fd17-4e15-a1eb-08580c189056.
 This needs to be available at TargetLoweringBase::InitLibcalls, which calls 
[`getCLayouts`](https://github.com/llvm/llvm-project/blob/cb3bf7540cf9b797575c625318e47a33f7514fad/llvm/lib/TargetParser/Triple.cpp#L1945).

TargetMachine is available at that time, so would it be better to move CLayouts 
from Triple to TargetMachine? If so subclasses could be used rather than the if 
block, which more closely follows the Clang side.

Also, are there currently any module flags that make it to TargetLowering? 
I'm looking for a reference on how to get the -mlong-double-128 information.

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2024-10-23 Thread Trevor Gross via cfe-commits

tgross35 wrote:

> With this version, the fp16 values are passed to conversion functions as 
> integer, which seems to be the default. It is however a bit tricky to do this 
> and at the same time pass half values in FP registers.
>
> At this point I wonder for one thing if it would be better to pass FP16 
> values to the conversion functions as _Float16 instead? It seems this may be 
> possible to change in the configurations by looking at 
> COMPILER_RT_HAS_FLOAT16 / compiler-rt/lib/builtins/extendhfsf2.c / 
> fp_extend.h...
> 
> Not really sure if those conversion functions are supposed to be built and 
> only used for soft-promotion of fp16, or if there are any external 
> implications, for instance GCC compatibility.

My understanding is that in GCC, `__gnu_h2f_ieee`/`__gnu_f2h_ieee` is always 
`i32`<->`i16` (integer ABI), whereas `__extendhfsf2`/`__truncsfhf2` uses either 
`int16_t` or `_Float16` on a per-target basis as controlled by 
`__LIBGCC_HAS_HF_MODE__` (I don't know where this gets set). In LLVM 
compiler-rt, `COMPILER_RT_HAS_FLOAT16` is the control to do the same thing but 
it affects `extend`/`trunc` as well as `h2f`/`f2h`. I think the discrepancy 
works out here because if a target has `_Float16`, it will never be calling 
`__gnu_h2f_ieee`/`__gnu_f2h_ieee`.

From your first two sentences it sounds like `f16` is getting passed in an FP 
register but going 
FP->GPR->__gnu_h2f_ieee->FP->some_math_op->FP->__gnu_f2h_ieee->GPR->FP? I 
think it makes sense to either always pass `f16` as `i16` and avoid the FP 
registers, or make `_Float16` available so `COMPILER_RT_HAS_FLOAT16` can be 
used.

@uweigand mentioned figuring out an ABI for `_Float16`, is this possible? That 
seems like the best option.

A quick check seems to show that GCC 13 does not support `_Float16` on s390x, 
nor does the crossbuild `libgcc.a` provide `__gnu_h2f_ieee`, `__gnu_f2h_ieee`, 
`__extendhfsf2`, or `__truncsfhf2`. So I think LLVM will be the one to set the 
precedent here.

Note that there are some common issues with these conversions; it would 
probably be good to test against them if possible: 
https://github.com/llvm/llvm-project/issues/97981 and 
https://github.com/llvm/llvm-project/issues/97975

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2024-11-05 Thread Trevor Gross via cfe-commits

tgross35 wrote:

It seems like arguments aren't actually getting passed indirectly to libcalls. 
Simple test program:

```c
#include <stdint.h>
#include <stdio.h>

union ty128 {
  struct { uint64_t hi, lo; } u64x2;
  __float128 f128;
};

void f128_add(__float128 a, __float128 b) {
union ty128 cvt;
cvt.f128 = a * b;
printf("0x%016llx%016llx\n", cvt.u64x2.lo, cvt.u64x2.hi);
}

int main() {
__float128 fa, fb;

fa = 122134.345678901234;
fb = 78.9012345678901234;

f128_add(fa, fb);
}
```

Checking right before the call to `__multf3`.  Built with GCC, `rcx` points to a 
return slot on the stack, `rdx` and `r8` point to the two float values 
(0x400f... and 0x4005...):

```text
(gdb) x/2gx $rcx
0x7cb30:0x0007cbf0  0x0007cc85
(gdb) x/2gx $rdx
0x7cb20:0xb000  0x400fdd16587e6997
(gdb) x/2gx $r8
0x7cb10:0x8000  0x40053b9add3c0c73
```

Built using this patch, it looks like Clang is putting the float arguments in 
`$rcx` and `$rdx` and not leaving a return slot pointer:

```text
(gdb) x/2gx $rcx
0x5ffe70:   0xb000  0x400fdd16587e6997
(gdb) x/2gx $rdx
0x5ffe60:   0x8000  0x40053b9add3c0c73
(gdb) x/2gx $r8
0x622720:   0x00622988  0x006229a7
```

Is libcall ABI handled elsewhere?

https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2024-11-05 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 created 
https://github.com/llvm/llvm-project/pull/115052

Clang currently passes and returns `__float128` in vector registers on MinGW 
targets. However, the Windows x86-64 calling convention [1] states the 
following:

> __m128 types, arrays, and strings are never passed by immediate value. 
> Instead, a pointer is passed to memory allocated by the caller. Structs and 
> unions of size 8, 16, 32, or 64 bits, and __m64 types, are passed as if they 
> were integers of the same size. Structs or unions of other sizes are passed 
> as a pointer to memory allocated by the caller. For these aggregate types 
> passed as a pointer, including __m128, the caller-allocated temporary memory 
> must be 16-byte aligned.

Based on the above it sounds like `__float128` should be passed indirectly; 
this is what MinGW GCC already does, so change Clang to match. Passing by value 
causes problems with varargs. E.g. the below completes successfully when built 
with GCC but has a runtime crash when built with Clang:

```c
void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}
```

This patch fixes the above. It also resolves crashes when calling GCC-built 
f128 libcalls.

Regarding return values, the documentation states:

> A scalar return value that can fit into 64 bits, including the __m64 type, is 
> returned through RAX. Non-scalar types including floats, doubles, and vector 
> types such as __m128, __m128i, __m128d are returned in XMM0.

This makes it sound like it should be acceptable to return `__float128` in 
XMM0. However, GCC returns `__float128` on the stack, so do the same here to be 
consistent.

Clang's MSVC targets do not support `__float128` or `_Float128`, but these 
changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170

>From 432e8a66156f08d45ad691017255364cfb0fd947 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Duplicate `win64-i128.c` to `win64-fp128.c` and update with the current
behavior of `__float128`.
---
 clang/test/CodeGen/win64-fp128.c | 20 
 1 file changed, 20 insertions(+)
 create mode 100644 clang/test/CodeGen/win64-fp128.c

diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
new file mode 100644
index 00..33e2441ddf3146
--- /dev/null
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 noundef 
%b)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // movaps  xmm0, xmmword ptr [rax]
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From bacafd9466c37109d059f8de356b697934e493a3 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Always pass `fp128` arguments indirectly on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets. However, the Windows x86-64 calling convention [1] states
the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value causes problems with varargs. E.g. the below
completes successfully when built with GCC but has a runtime crash when
built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

This patch fixes the above. It also resolves crashes when calling
GCC-built f128 libcalls.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned 

[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2024-11-05 Thread Trevor Gross via cfe-commits

tgross35 wrote:

Cc @beetrees and @wesleywiser

https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2024-11-05 Thread Trevor Gross via cfe-commits


@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local void @fp128_ret(ptr dead_on_unwind noalias writable 
sret(fp128) align 16 %agg.result)
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local void @fp128_args(ptr dead_on_unwind noalias 
writable sret(fp128) align 16 %agg.result, ptr noundef %0, ptr noundef %1)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

tgross35 wrote:

This isn't sufficient to identify the varargs crash; I don't think that is 
possible without an asm or execution test. Should I add one somewhere?

https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2024-11-05 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 432e8a66156f08d45ad691017255364cfb0fd947 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Duplicate `win64-i128.c` to `win64-fp128.c` and update with the current
behavior of `__float128`.
---
 clang/test/CodeGen/win64-fp128.c | 20 
 1 file changed, 20 insertions(+)
 create mode 100644 clang/test/CodeGen/win64-fp128.c

diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
new file mode 100644
index 00..33e2441ddf3146
--- /dev/null
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 noundef 
%b)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // movaps  xmm0, xmmword ptr [rax]
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From 96f754f899254446130c6e02c4787eabed51e41d Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Always pass `fp128` arguments indirectly on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets. However, the Windows x86-64 calling convention [1] states
the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value causes problems with varargs. E.g. the below
completes successfully when built with GCC but has a runtime crash when
built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

This patch fixes the above. It also resolves crashes when calling
GCC-built f128 libcalls.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in XMM0. However, GCC returns `__float128` on the stack, so do the same
here to be consistent.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 5 +
 clang/test/CodeGen/win64-fp128.c  | 5 ++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index 7f73bf2a65266e..16656be14d8353 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3367,6 +3367,11 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
   return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
   llvm::Type::getInt64Ty(getVMContext()), 2));
 
+case BuiltinType::Float128:
+  // f128 is too large to fit in integer registers so the Windows ABI
+  // require it be passed on the stack. GCC does the same.
+  return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+
 default:
   break;
 }
diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
index 33e2441ddf3146..3a864e139d244f 100644
--- a/clang/test/CodeGen/win64-fp128.c
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -3,17 +3,16 @@
 // __float128 is unsupported on MSVC
 
 __float128 fp128_ret(void) { return 0; }
-// GNU64: define dso_local fp128 @fp128_ret()
+// GNU64: define dso_local void @fp128_ret(ptr dead_on_unwind noalias writable 
sret(fp128) align 16 %agg.result)
 
 __float128 fp128_args(__float128 a, __float128 b) { return a * b; }

[clang] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2024-10-23 Thread Trevor Gross via cfe-commits

tgross35 wrote:

> From what I can see in the libgcc sources, `__gnu_h2f_ieee`/`__gnu_f2h_ieee` 
> is indeed always `i32`<->`i16`, but it is only present on 32-bit ARM, no 
> other platforms. On AArch64, GCC will always use inline instructions to 
> perform the conversion. On 32-bit and 64-bit Intel, the compiler will use 
> inline instructions if AVX512-FP16 is available; if not, but SSE2 is 
> available, the compiler will use `__extendhfsf2`/`__truncsfhf2` with a 
> `HFmode` argument (this corresponds to `_Float16`, i.e. it is passed in SSE2 
> registers, not like an integer); if not even SSE2 is available, using the 
> type will result in an error.
>
> I never see `__extendhfsf2`/`__truncsfhf2` being used with `int16_t`, even in 
> principle, on any platform in libgcc. There is indeed a setting 
> `__LIBGCC_HAS_HF_MODE__` (controlled indirectly by the GCC target back-end's 
> `TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P` setting), but the only thing 
> that appears to be controlled by this flag is whether routines for complex 
> multiplication and division (`__mulhc3` / `__divhc3`) are being built. Am I 
> missing something here?
 
I think this is accurate, libgcc just appears to (reasonably) not provide any 
f16-related symbols on platforms where GCC doesn't support `_Float16`. LLVM 
does seem to use `__gnu_h2f_ieee` and `__gnu_f2h_ieee` though, on targets where 
Clang doesn't have `_Float16` (e.g. PowerPC, Wasm, x86-32 without SSE), which 
is why it shows up in the current state of this PR. Presumably this is 
`HasLegalHalfType`?

For that reason we just always provide the symbol [in rust's 
compiler-builtins](https://github.com/rust-lang/compiler-builtins/blob/adaef3265c560998a03b6902bd63d88009d3a132/src/float/trunc.rs#L139-L145)
 (though we let LLVM figure out that `f16` is `i16`).

> > @uweigand mentioned figuring out an ABI for `_Float16`, is this possible? 
> > That seems like the best option.
> 
> Yes, we're working on that. What we're planning to do is to have `_Float16` 
> be passed and returned in the same way as `float` and `double`, i.e. using 
> (part of) certain floating-point registers. These registers are available on 
> every SystemZ architecture level, so we would not have to guard their use 
> (like Intel does with the SSE2 registers).

That is great news, especially considering how problematic the 
target-feature-dependent ABI on x86-32 has been.


https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2024-11-20 Thread Trevor Gross via cfe-commits


@@ -255,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 }
 setLibcallName(RTLIB::MULO_I128, nullptr);
   }
+
+  if (TT.isSystemZ()) {
+setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+  }

tgross35 wrote:

Regarding how to build and link, they are in compiler-rt if that can be built 
https://github.com/llvm/llvm-project/blob/fa22100d57631bbb0a507dd27e3ebb24b1354623/compiler-rt/lib/builtins/truncsfhf2.c#L15.
 `__trunc` and `__extend` are what you want to emit here, I'm just not sure 
what exactly this file needs to do because it seems like `HasLegalHalfType` 
controls `__extend`/`__trunc` vs. `__gnu_` lowering somehow 
https://github.com/llvm/llvm-project/pull/109164#issuecomment-2433525551.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2024-11-19 Thread Trevor Gross via cfe-commits


@@ -255,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 }
 setLibcallName(RTLIB::MULO_I128, nullptr);
   }
+
+  if (TT.isSystemZ()) {
+setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+  }

tgross35 wrote:

Why do these names need to be set, aren't these the default?

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2024-11-19 Thread Trevor Gross via cfe-commits


@@ -513,11 +514,37 @@ SystemZTargetLowering::SystemZTargetLowering(const 
TargetMachine &TM,
   }
 
   // Handle floating-point types.
+  // Promote all f16 operations to float, with some exceptions below.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+setOperationAction(Opc, MVT::f16, Promote);
+  setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
+  for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
+setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+setTruncStoreAction(VT, MVT::f16, Expand);
+  }
+  setOperationAction(ISD::LOAD, MVT::f16, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Custom);
+  setOperationAction(ISD::STORE, MVT::f16, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
   for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
 MVT VT = MVT::SimpleValueType(I);
 if (isTypeLegal(VT)) {
+  // No special instructions for these.
+  setOperationAction(ISD::FSIN, VT, Expand);
+  setOperationAction(ISD::FCOS, VT, Expand);
+  setOperationAction(ISD::FSINCOS, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);
+  setOperationAction(ISD::FPOW, VT, Expand);

tgross35 wrote:

Just crosslinking that there is an effort to add f16 libcalls 
https://github.com/llvm/llvm-project/issues/95250 but I have no clue what the 
plan is as far as lowering to them.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2024-11-19 Thread Trevor Gross via cfe-commits


@@ -255,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 }
 setLibcallName(RTLIB::MULO_I128, nullptr);
   }
+
+  if (TT.isSystemZ()) {
+setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+  }

tgross35 wrote:

Hm, I see they default to the `__gnu_` functions in this file. Some targets 
(wasm, hexagon) manually set it to `__extendhfsf2` and `__truncsfhf2` in 
`*ISelLowering.cpp` but why do targets like x86 correctly lower to these as 
well without an override either in this file or in ISelLowering?

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-20 Thread Trevor Gross via cfe-commits


@@ -548,11 +543,28 @@ SystemZTargetLowering::SystemZTargetLowering(const 
TargetMachine &TM,
   }
 
   // Handle floating-point types.
+  if (!useSoftFloat()) {
+// Promote all f16 operations to float, with some exceptions below.
+for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+  setOperationAction(Opc, MVT::f16, Promote);

tgross35 wrote:

@nikic are you referring to https://github.com/llvm/llvm-project/issues/97975 
and https://github.com/llvm/llvm-project/issues/97981? It would probably be 
good to add a test against those issues here if there isn't already. 

Cc issue author @beetrees

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-21 Thread Trevor Gross via cfe-commits


@@ -548,11 +543,28 @@ SystemZTargetLowering::SystemZTargetLowering(const 
TargetMachine &TM,
   }
 
   // Handle floating-point types.
+  if (!useSoftFloat()) {
+// Promote all f16 operations to float, with some exceptions below.
+for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+  setOperationAction(Opc, MVT::f16, Promote);

tgross35 wrote:

For the second one, it should be sufficient to ensure that i16<->f16 
conversions are asm-only with no libcalls (it's possible this is tested 
somewhere and I'm just overlooking it in the large diff)

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-04 Thread Trevor Gross via cfe-commits


@@ -3,6 +3,18 @@
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
+; Test f16.
+declare half @llvm.fabs.f16(half %f)
+define half @f0(half %f) {
+; CHECK-LABEL: f0:
+; CHECK:  brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: lpdfr %f0, %f0
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK: br %r14

tgross35 wrote:

Similarly `fabs` could be `a & !SIGN_MASK`. It looks like aarch64 uses this, 
x86 still seems to extend then truncate for whatever reason.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-04 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-04 Thread Trevor Gross via cfe-commits


@@ -0,0 +1,65 @@
+; Test copysign intrinsics involving half.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare half @llvm.copysign.f16(half, half)
+declare float @llvm.copysign.f32(float, float)
+declare double @llvm.copysign.f64(double, double)
+
+; Test f16 copies.
+define half @f0(half %a, half %b) {
+; CHECK-LABEL: f0:
+; CHECK: brasl %r14, __extendhfsf2@PLT
+; CHECK: brasl %r14, __extendhfsf2@PLT
+; CHECK: cpsdr %f0, %f9, %f0
+; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: br %r14
+  %res = call half @llvm.copysign.f16(half %a, half %b)

tgross35 wrote:

(Nonblocker) a few other architectures use an asm lowering to avoid the calls 
here, it's just `(a & !MASK) | (b & MASK)` with `MASK = 1 << 15`.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-01 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-01 Thread Trevor Gross via cfe-commits


@@ -126,3 +138,23 @@ define void @f9(ptr %cptr, ptr %aptr, ptr %bptr) {
   store fp128 %c, ptr %cptr
   ret void
 }
+
+; Test f16 copies in which the sign comes from an f16.
+define half @f10(half %a, half %b) {
+; CHECK-LABEL: f10:
+; CHECK: brasl %r14, copysignh@PLT

tgross35 wrote:

I think `abs` and `copysign` are generally libcall-free, so that would be 
preferable here if possible.

Not related to that change but as a note, I don't think the `h` suffix exists 
for `math.h` routines. C23 does specify sized suffixes however, so e.g. 
`copysignf16` is valid and is provided by both [LLVM's 
`libc`](https://github.com/llvm/llvm-project/blob/749535ba2808e133682074f712ac6829335f8875/libc/src/math/generic/copysignf16.cpp#L16)
 and [Rust's 
builtins](https://docs.rs/compiler_builtins/latest/compiler_builtins/math/full_availability/fn.copysignf16.html).

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-01 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-16 Thread Trevor Gross via cfe-commits


@@ -0,0 +1,331 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=powerpc-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=powerpc64-unknown -verify-machineinstrs | FileCheck 
%s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: powerpc-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target

tgross35 wrote:

Todo: replace this with `%if` somehow so this test still runs if only a subset 
of architectures is available https://llvm.org/docs/TestingGuide.html

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From dafb3a378876db6b3bf505b425b386fd8f79c918 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/5] [IR] Add a test for `f128` libcall lowering (NFC)

`f128` intrinsic functions sometimes lower to `long double` library
calls when they instead need to be `f128` versions. Add a test
demonstrating current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 328 ++
 1 file changed, 328 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..dfbd1eaeda109
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target
+;
+; Verify that fp128 intrinsics only lower to `long double` calls (e.g. `sinl`)
+; on platforms where `f128` and `long double` have the same layout, and
+; otherwise lower to `f128` versions (e.g. `sinf128`).
+;
+; Targets include:
+; * x86, x64 (80-bit long double)
+; * aarch64 (long double == f128)
+; * riscv32 (long double == f64)
+; * s390x (long double == f128, hardware support)
+; * A couple assorted environments for x86
+;
+; FIXME: only targets where long double is `f128` should be using `USELD`, all
+; others need to be NOTLD. PowerPC should be added but it currently emits an
+; interesting blend of both (e.g. `acosl` but `ceilf128`).
+
+define fp128 @test_acosf128(fp128 %a) {
+; CHECK-LABEL:  test_acosf128:
+; CHECK-NOTLD:  acosf128
+; CHECK-USELD:  acosl
+; CHECK-S390X:  acosl
+start:
+  %0 = tail call fp128 @llvm.acos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_asinf128(fp128 %a) {
+; CHECK-LABEL:  test_asinf128:
+; CHECK-NOTLD:  asinf128
+; CHECK-USELD:  asinl
+; CHECK-S390X:  asinl
+start:
+  %0 = tail call fp128 @llvm.asin.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_atanf128(fp128 %a) {
+; CHECK-LABEL:  test_atanf128:
+; CHECK-NOTLD:  atanf128
+; CHECK-USELD:  atanl
+; CHECK-S390X:  atanl
+start:
+  %0 = tail call fp128 @llvm.atan.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-NOTLD:  ceilf128
+; CHECK-USELD:  ceill
+; CHECK-S390X:  ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; copysign should always get lowered to assembly
+; CHECK-LABEL:  test_copysignf128:
+; CHECK-NOT:copysignl
+; CHECK-NOT:copysignf128
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-LABEL:  test_cosf128:
+; CHECK-NOTLD:  cosf128
+; CHECK-USELD:  cosl
+; CHECK-S390X:  cosl
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp10f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp10f128
+; CHECK-USELD:  exp10l
+; CHECK-S390X:  exp10l
+start:
+  %0 = tail call fp128 @llvm.exp10.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp2f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp2f128
+; CHECK-USELD:  exp2l
+; CHECK-S390X:  exp2l
+start:
+  %0 = tail call fp128 @llvm.exp2.f128(fp128 %a)
+  ret fp128 %0
+}
+
+
+define fp128 @test_expf128(fp128 %a) {
+; CHECK-LABEL:  test_expf128:
+; CHECK-NOTLD:  expf128
+; CHECK-USELD:  expl
+; CHECK-S390X:  expl
+start:
+  %0 = tail call fp128 @llvm.exp.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_fabsf128(fp128 %a) {
+; fabs should always get lowered to assembly
+; CHECK-LABEL:  test_fabsf128:
+; CHECK-NOT:fabsl
+; CHECK-NOT:fabsf128
+start:
+  %0 = tail call fp128 @llvm.fabs.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_floorf128(fp128 %a) {
+; CHECK-LABEL:  test

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From dafb3a378876db6b3bf505b425b386fd8f79c918 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add a test for `f128` libcall lowering (NFC)

`f128` intrinsic functions sometimes lower to `long double` library
calls when they instead need to be `f128` versions. Add a test
demonstrating current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 328 ++
 1 file changed, 328 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..dfbd1eaeda109
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target
+;
+; Verify that fp128 intrinsics only lower to `long double` calls (e.g. `sinl`)
+; on platforms where `f128` and `long double` have the same layout, and
+; otherwise lower to `f128` versions (e.g. `sinf128`).
+;
+; Targets include:
+; * x86, x64 (80-bit long double)
+; * aarch64 (long double == f128)
+; * riscv32 (long double == f64)
+; * s390x (long double == f128, hardware support)
+; * A couple assorted environments for x86
+;
+; FIXME: only targets where long double is `f128` should be using `USELD`, all
+; others need to be NOTLD. PowerPC should be added but it currently emits an
+; interesting blend of both (e.g. `acosl` but `ceilf128`).
+
+define fp128 @test_acosf128(fp128 %a) {
+; CHECK-LABEL:  test_acosf128:
+; CHECK-NOTLD:  acosf128
+; CHECK-USELD:  acosl
+; CHECK-S390X:  acosl
+start:
+  %0 = tail call fp128 @llvm.acos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_asinf128(fp128 %a) {
+; CHECK-LABEL:  test_asinf128:
+; CHECK-NOTLD:  asinf128
+; CHECK-USELD:  asinl
+; CHECK-S390X:  asinl
+start:
+  %0 = tail call fp128 @llvm.asin.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_atanf128(fp128 %a) {
+; CHECK-LABEL:  test_atanf128:
+; CHECK-NOTLD:  atanf128
+; CHECK-USELD:  atanl
+; CHECK-S390X:  atanl
+start:
+  %0 = tail call fp128 @llvm.atan.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-NOTLD:  ceilf128
+; CHECK-USELD:  ceill
+; CHECK-S390X:  ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; copysign should always get lowered to assembly
+; CHECK-LABEL:  test_copysignf128:
+; CHECK-NOT:copysignl
+; CHECK-NOT:copysignf128
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-LABEL:  test_cosf128:
+; CHECK-NOTLD:  cosf128
+; CHECK-USELD:  cosl
+; CHECK-S390X:  cosl
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp10f128(fp128 %a) {
+; CHECK-LABEL:  test_exp10f128:
+; CHECK-NOTLD:  exp10f128
+; CHECK-USELD:  exp10l
+; CHECK-S390X:  exp10l
+start:
+  %0 = tail call fp128 @llvm.exp10.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp2f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp2f128
+; CHECK-USELD:  exp2l
+; CHECK-S390X:  exp2l
+start:
+  %0 = tail call fp128 @llvm.exp2.f128(fp128 %a)
+  ret fp128 %0
+}
+
+
+define fp128 @test_expf128(fp128 %a) {
+; CHECK-LABEL:  test_expf128:
+; CHECK-NOTLD:  expf128
+; CHECK-USELD:  expl
+; CHECK-S390X:  expl
+start:
+  %0 = tail call fp128 @llvm.exp.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_fabsf128(fp128 %a) {
+; fabs should always get lowered to assembly
+; CHECK-LABEL:  test_fabsf128:
+; CHECK-NOT:fabsl
+; CHECK-NOT:fabsf128
+start:
+  %0 = tail call fp128 @llvm.fabs.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_floorf128(fp128 %a) {
+; CHECK-LABEL:  test

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From dafb3a378876db6b3bf505b425b386fd8f79c918 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/7] [IR] Add a test for `f128` libcall lowering (NFC)

`f128` intrinsic functions sometimes lower to `long double` library
calls when they instead need to be `f128` versions. Add a test
demonstrating current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 328 ++
 1 file changed, 328 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..dfbd1eaeda109
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target
+;
+; Verify that fp128 intrinsics only lower to `long double` calls (e.g. `sinl`)
+; on platforms where `f128` and `long double` have the same layout, and
+; otherwise lower to `f128` versions (e.g. `sinf128`).
+;
+; Targets include:
+; * x86, x64 (80-bit long double)
+; * aarch64 (long double == f128)
+; * riscv32 (long double == f64)
+; * s390x (long double == f128, hardware support)
+; * A couple assorted environments for x86
+;
+; FIXME: only targets where long double is `f128` should be using `USELD`, all
+; others need to be NOTLD. PowerPC should be added but it currently emits an
+; interesting blend of both (e.g. `acosl` but `ceilf128`).
+
+define fp128 @test_acosf128(fp128 %a) {
+; CHECK-LABEL:  test_acosf128:
+; CHECK-NOTLD:  acosf128
+; CHECK-USELD:  acosl
+; CHECK-S390X:  acosl
+start:
+  %0 = tail call fp128 @llvm.acos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_asinf128(fp128 %a) {
+; CHECK-LABEL:  test_asinf128:
+; CHECK-NOTLD:  asinf128
+; CHECK-USELD:  asinl
+; CHECK-S390X:  asinl
+start:
+  %0 = tail call fp128 @llvm.asin.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_atanf128(fp128 %a) {
+; CHECK-LABEL:  test_atanf128:
+; CHECK-NOTLD:  atanf128
+; CHECK-USELD:  atanl
+; CHECK-S390X:  atanl
+start:
+  %0 = tail call fp128 @llvm.atan.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-NOTLD:  ceilf128
+; CHECK-USELD:  ceill
+; CHECK-S390X:  ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; copysign should always get lowered to assembly
+; CHECK-LABEL:  test_copysignf128:
+; CHECK-NOT:copysignl
+; CHECK-NOT:copysignf128
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-LABEL:  test_cosf128:
+; CHECK-NOTLD:  cosf128
+; CHECK-USELD:  cosl
+; CHECK-S390X:  cosl
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp10f128(fp128 %a) {
+; CHECK-LABEL:  test_exp10f128:
+; CHECK-NOTLD:  exp10f128
+; CHECK-USELD:  exp10l
+; CHECK-S390X:  exp10l
+start:
+  %0 = tail call fp128 @llvm.exp10.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp2f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp2f128
+; CHECK-USELD:  exp2l
+; CHECK-S390X:  exp2l
+start:
+  %0 = tail call fp128 @llvm.exp2.f128(fp128 %a)
+  ret fp128 %0
+}
+
+
+define fp128 @test_expf128(fp128 %a) {
+; CHECK-LABEL:  test_expf128:
+; CHECK-NOTLD:  expf128
+; CHECK-USELD:  expl
+; CHECK-S390X:  expl
+start:
+  %0 = tail call fp128 @llvm.exp.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_fabsf128(fp128 %a) {
+; fabs should always get lowered to assembly
+; CHECK-LABEL:  test_fabsf128:
+; CHECK-NOT:fabsl
+; CHECK-NOT:fabsf128
+start:
+  %0 = tail call fp128 @llvm.fabs.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_floorf128(fp128 %a) {
+; CHECK-LABEL:  test

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-26 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 9089a966f2107a2dfcad86fec746e9f2bd8a105b Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/5] [IR] Add a test for `f128` libcall lowering (NFC)

`f128` intrinsic functions sometimes lower to `long double` library
calls when they instead need to be `f128` versions. Add a test
demonstrating current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 328 ++
 1 file changed, 328 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..dfbd1eaeda109
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target
+;
+; Verify that fp128 intrinsics only lower to `long double` calls (e.g. `sinl`)
+; on platforms where `f128` and `long double` have the same layout, and
+; otherwise lower to `f128` versions (e.g. `sinf128`).
+;
+; Targets include:
+; * x86, x64 (80-bit long double)
+; * aarch64 (long double == f128)
+; * riscv32 (long double == f64)
+; * s390x (long double == f128, hardware support)
+; * A couple assorted environments for x86
+;
+; FIXME: only targets where long double is `f128` should be using `USELD`, all
+; others need to be NOTLD. PowerPC should be added but it currently emits an
+; interesting blend of both (e.g. `acosl` but `ceilf128`).
+
+define fp128 @test_acosf128(fp128 %a) {
+; CHECK-LABEL:  test_acosf128:
+; CHECK-NOTLD:  acosf128
+; CHECK-USELD:  acosl
+; CHECK-S390X:  acosl
+start:
+  %0 = tail call fp128 @llvm.acos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_asinf128(fp128 %a) {
+; CHECK-LABEL:  test_asinf128:
+; CHECK-NOTLD:  asinf128
+; CHECK-USELD:  asinl
+; CHECK-S390X:  asinl
+start:
+  %0 = tail call fp128 @llvm.asin.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_atanf128(fp128 %a) {
+; CHECK-LABEL:  test_atanf128:
+; CHECK-NOTLD:  atanf128
+; CHECK-USELD:  atanl
+; CHECK-S390X:  atanl
+start:
+  %0 = tail call fp128 @llvm.atan.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-NOTLD:  ceilf128
+; CHECK-USELD:  ceill
+; CHECK-S390X:  ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; copysign should always get lowered to assembly
+; CHECK-LABEL:  test_copysignf128:
+; CHECK-NOT:copysignl
+; CHECK-NOT:copysignf128
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-LABEL:  test_cosf128:
+; CHECK-NOTLD:  cosf128
+; CHECK-USELD:  cosl
+; CHECK-S390X:  cosl
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp10f128(fp128 %a) {
+; CHECK-LABEL:  test_exp10f128:
+; CHECK-NOTLD:  exp10f128
+; CHECK-USELD:  exp10l
+; CHECK-S390X:  exp10l
+start:
+  %0 = tail call fp128 @llvm.exp10.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp2f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp2f128
+; CHECK-USELD:  exp2l
+; CHECK-S390X:  exp2l
+start:
+  %0 = tail call fp128 @llvm.exp2.f128(fp128 %a)
+  ret fp128 %0
+}
+
+
+define fp128 @test_expf128(fp128 %a) {
+; CHECK-LABEL:  test_expf128:
+; CHECK-NOTLD:  expf128
+; CHECK-USELD:  expl
+; CHECK-S390X:  expl
+start:
+  %0 = tail call fp128 @llvm.exp.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_fabsf128(fp128 %a) {
+; fabs should always get lowered to assembly
+; CHECK-LABEL:  test_fabsf128:
+; CHECK-NOT:fabsl
+; CHECK-NOT:fabsf128
+start:
+  %0 = tail call fp128 @llvm.fabs.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_floorf128(fp128 %a) {
+; CHECK-LABEL:  test

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-26 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From dafb3a378876db6b3bf505b425b386fd8f79c918 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/3] [IR] Add a test for `f128` libcall lowering (NFC)

`f128` intrinsic functions sometimes lower to `long double` library
calls when they instead need to be `f128` versions. Add a test
demonstrating current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 328 ++
 1 file changed, 328 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..dfbd1eaeda109
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target
+;
+; Verify that fp128 intrinsics only lower to `long double` calls (e.g. `sinl`)
+; on platforms where `f128` and `long double` have the same layout, and
+; otherwise lower to `f128` versions (e.g. `sinf128`).
+;
+; Targets include:
+; * x86, x64 (80-bit long double)
+; * aarch64 (long double == f128)
+; * riscv32 (long double == f64)
+; * s390x (long double == f128, hardware support)
+; * A couple assorted environments for x86
+;
+; FIXME: only targets where long double is `f128` should be using `USELD`, all
+; others need to be NOTLD. PowerPC should be added but it currently emits an
+; interesting blend of both (e.g. `acosl` but `ceilf128`).
+
+define fp128 @test_acosf128(fp128 %a) {
+; CHECK-LABEL:  test_acosf128:
+; CHECK-NOTLD:  acosf128
+; CHECK-USELD:  acosl
+; CHECK-S390X:  acosl
+start:
+  %0 = tail call fp128 @llvm.acos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_asinf128(fp128 %a) {
+; CHECK-LABEL:  test_asinf128:
+; CHECK-NOTLD:  asinf128
+; CHECK-USELD:  asinl
+; CHECK-S390X:  asinl
+start:
+  %0 = tail call fp128 @llvm.asin.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_atanf128(fp128 %a) {
+; CHECK-LABEL:  test_atanf128:
+; CHECK-NOTLD:  atanf128
+; CHECK-USELD:  atanl
+; CHECK-S390X:  atanl
+start:
+  %0 = tail call fp128 @llvm.atan.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-NOTLD:  ceilf128
+; CHECK-USELD:  ceill
+; CHECK-S390X:  ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; copysign should always get lowered to assembly
+; CHECK-LABEL:  test_copysignf128:
+; CHECK-NOT:copysignl
+; CHECK-NOT:copysignf128
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-LABEL:  test_cosf128:
+; CHECK-NOTLD:  cosf128
+; CHECK-USELD:  cosl
+; CHECK-S390X:  cosl
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp10f128(fp128 %a) {
+; CHECK-LABEL:  test_exp10f128:
+; CHECK-NOTLD:  exp10f128
+; CHECK-USELD:  exp10l
+; CHECK-S390X:  exp10l
+start:
+  %0 = tail call fp128 @llvm.exp10.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp2f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp2f128
+; CHECK-USELD:  exp2l
+; CHECK-S390X:  exp2l
+start:
+  %0 = tail call fp128 @llvm.exp2.f128(fp128 %a)
+  ret fp128 %0
+}
+
+
+define fp128 @test_expf128(fp128 %a) {
+; CHECK-LABEL:  test_expf128:
+; CHECK-NOTLD:  expf128
+; CHECK-USELD:  expl
+; CHECK-S390X:  expl
+start:
+  %0 = tail call fp128 @llvm.exp.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_fabsf128(fp128 %a) {
+; fabs should always get lowered to assembly
+; CHECK-LABEL:  test_fabsf128:
+; CHECK-NOT:fabsl
+; CHECK-NOT:fabsf128
+start:
+  %0 = tail call fp128 @llvm.fabs.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_floorf128(fp128 %a) {
+; CHECK-LABEL:  test

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-04-21 Thread Trevor Gross via cfe-commits

tgross35 wrote:

Talked to arsenm on discord, long discussion starting around here 
https://discord.com/channels/636084430946959380/636732535434510338/1362207130559578185.
 The outcome is that this is effectively a target option and needs to be tied 
to the triple rather than per-module. Which makes sense and avoids the above 
problem.

So, I'll be doing the following:

1. Make LLVM assume that `sqrtf128` (and similar) libcalls are available by 
default
2. On 64-bit arm, loongarch, mips, risc-v, and s390x musl targets, use `sqrtl` 
instead
3. Add some way to make musl targets also use `sqrtf128`, like `-nolongdouble` 
in the target triple

This should work because calling `sqrtf128` is correct on _most_ platforms:
* On Windows, Apple, and 32-bit platforms, `long double` is `f64` so `sqrtf128` 
is the only correct call
* On x86, `long double` is the x87 80-bit float so `sqrtf128` is the only 
correct call
* On anything glibc, `sqrtf128` is an alias to `sqrtl` on platforms where that 
works, so `sqrtf128` can always be called
* That leaves 64-bit `musl` on platforms where `long double` is `f128` as the 
only platforms where `sqrtl` has to be called (otherwise calling `sqrtl` from C 
would get intercepted and relowered as `sqrtf128` for a linker error)

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass `fp128` arguments indirectly on Windows (PR #115052)

2025-02-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2025-02-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 60b07161e8668c2bc3ee5d7a4c470a90a7673178 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Duplicate `win64-i128.c` to `win64-fp128.c` and update with the current
behavior of `__float128`.
---
 clang/test/CodeGen/win64-fp128.c | 20 
 1 file changed, 20 insertions(+)
 create mode 100644 clang/test/CodeGen/win64-fp128.c

diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
new file mode 100644
index 0..33e2441ddf314
--- /dev/null
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 noundef 
%b)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // movaps  xmm0, xmmword ptr [rax]
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From a95f02c32fa5e6c9a02f81f1fece27e3839fd90c Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Always pass `fp128` arguments indirectly on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets. However, the Windows x86-64 calling convention [1] states
the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value also causes problems with varargs. E.g. the
below completes successfully when built with GCC but has a runtime crash
when built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

Change `f128` passing to use the stack, which is the same thing GCC
does, and matches `i128`.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. The above ABI
statement as well as consistency with `i128` (which is returned in xmm0)
mean that it would likely be better for GCC to change its return ABI to
match Clang rather than the other way around, so that portion is left
as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 5 +
 clang/test/CodeGen/win64-fp128.c  | 5 ++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..513a54c39e522 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3403,6 +3403,11 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
   return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
   llvm::Type::getInt64Ty(getVMContext()), 2));
 
+case BuiltinType::Float128:
+  // f128 is too large to fit in integer registers so the Windows ABI
+  // requires it be passed on the stack. GCC does the same.
+  return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+
 default:
   break;
 }
diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
index 33e2441ddf314..3a864e139d244 100644
--- a/clang/test/CodeGen/win64-fp128.c
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -3,17 +3,16 @@
 // __float128 is unsupported on MSVC
 
 __float128 fp128_ret(void) { return 0; }
-// GNU64: define dso_local fp1

[clang] [clang] Always pass `fp128` arguments indirectly on Windows (PR #115052)

2025-02-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-07 Thread Trevor Gross via cfe-commits

tgross35 wrote:

Nothing about the implementation stands out to me as wrong. It looks like the 
exponent isn't correct, I guess you could step through this portion 
https://github.com/llvm/llvm-project/blob/ae42f071032b29821beef6a337712580861c/compiler-rt/lib/builtins/fp_extend_impl.inc#L65-L66
 and double check the intermediate values? Our implementation is doing 
something similar except it adds in place rather than shifting back and forth, 
but I don't know why that would make a difference 
https://github.com/rust-lang/compiler-builtins/blob/7bec089672eb5cd83d7902edd59479527bc9d8d1/src/float/extend.rs#L42-L45.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-07 Thread Trevor Gross via cfe-commits

tgross35 wrote:

To clarify, the code is calling `__extendhfdf2` then `__truncdfhf2` from either 
libgcc or from compiler-rt with your patches, and the compiler-rt version is 
incorrect? Could you have it print the intermediate results as u16 hex (`Op0 + 
Op1`, `Res2`, `Res`)?

Not sure if you are testing only on s390x but there should probably be a unit 
test at 
https://github.com/llvm/llvm-project/tree/d90423e310482bdbc731242fa25dcb3dd44e69de/compiler-rt/test/builtins/Unit
 to see if things work on other platforms.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-04 Thread Trevor Gross via cfe-commits

tgross35 wrote:

Looks like GCC uses option 3 https://gcc.godbolt.org/z/fM1EbK6Mn.

If you only need something to test against locally, I was able to get our s390x 
rust dist built against your patches 
https://github.com/rust-lang-ci/rust/actions/runs/13668570835. 
`rust-std-s390x-unknown-linux-gnu/lib/rustlib/s390x-unknown-linux-gnu/lib/libcompiler_builtins-77321e818a95f60c.rlib`
 has all six conversion symbols, though I haven't been able to test their 
correctness on the platform.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-04 Thread Trevor Gross via cfe-commits

tgross35 wrote:

I think something like the following precedence would make sense for all 
targets:

1. Direct asm: Lower to assembly if hardware support is available
2. Indirect mixed: libcall f16->f32 then asm the second conversion (f32->f64 or 
f32->f128) if hardware is available. The advantage is only linking one libcall 
(extendhfsf)
3. Direct libcall: If f32->f64 or f32->f128 are not hardware supported, use 
extendhfsf, extendhfdf, or extendhftf to lower f16->f32, f16->f64, or f16->f128
4. Indirect libcalls: this is LLVM's current behavior without hard float, i.e. 
libcall f16->f32 and then libcall again for f32->f64 or f32->f128

Option 2 is probably best here. Does s390x have `+soft-float` or any features 
that toggle the availability of float conversion ops? In these cases I think 
option 3 is preferable to option 4. (Although since the other targets seem to 
fall back to option 4, maybe this is something that could be fixed globally?)

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-03-05 Thread Trevor Gross via cfe-commits

tgross35 wrote:

@rnk (or anyone) would you be able to land this?

https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-09 Thread Trevor Gross via cfe-commits

tgross35 wrote:

> > By the way, these sites are pretty helpful for double checking float reprs 
> > https://float.exposed https://weitz.de/ieee/.
> 
> I tried float.exposed but I couldn't really convert an f16 hex to a double 
> hex. Is it supposed to be able to do this?

It should, on the `half` page you can paste the int repr into "Raw Hexadecimal 
Integer Value", then click `float` or `double` at the top. I don't use it too 
much for that though, mostly exploding into sign/exponent/mantissa when 
debugging soft floats. 

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-02-28 Thread Trevor Gross via cfe-commits

tgross35 wrote:

Finally getting around to this after more than a year. @efriedma-quic as an 
alternative to the current implementation of duplicating `long double` layout 
information from Clang to LLVM, would it work if LLVM lowers to `*f128` calls 
but provides a module flag `fp128_use_long_double_libcalls` to prefer the `*l` 
versions? So if Clang or other frontends know that their `long double` is 
`_Float128`, it can select those libcalls.

The advantage is avoided code duplication and the logic is easier to follow. 
Also this avoids problems if linking a library built with an unexpected 
`-mlong-double-` configuration.

The disadvantage is that frontends that don't know about C's `long double` 
can't benefit from the more common `*l` symbols. I don't think this is too big 
of a problem though: it makes no difference with glibc (the f128 aliases have 
been around sufficiently long) or on any platforms where `long double` is not 
`_Float128`. And it is easy enough for frontends to set 
`fp128_use_long_double_libcalls` on a case-by-case basis if they know what math 
library is being used (e.g. aarch64 musl).

(I handle the f128 support for Rust and would much rather never think about 
`*l` symbols, I can alias them to `*f128` if needed or set the flag)

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 6afdfd07a22260914b45363870a7be54324bd736 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Add a test based on `win64-i128.c` with the current behavior of
`__float128`.
---
 clang/test/CodeGen/win-fp128.c | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 clang/test/CodeGen/win-fp128.c

diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c
new file mode 100644
index 0..ec77356201550
--- /dev/null
+++ b/clang/test/CodeGen/win-fp128.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=CHECK-GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// CHECK-GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// CHECK-GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 
noundef %b)
+
+void fp128_vararg(int a, ...) {
+  // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // CHECK-GNU64: load ptr, ptr
+  // CHECK-GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From 4f0768786f9308d9dea87cfc970b12858a15be7d Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Pass `fp128` indirectly and return in xmm0 on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets, which is LLVM's default ABI for `fp128`. However, the
Windows x86-64 calling convention [1] states the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly. Thus, change `f128` passing to use the stack and make the
return in xmm0 explicit. This is identical to `i128`, and passing is
the same as GCC.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. That above ABI
statement as well as consistency with `i128` (which is returned in xmm0)
mean that it would likely be better for GCC to change its return ABI to
match Clang rather than the other way around, so that portion is left
as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 5 +
 clang/test/CodeGen/win-fp128.c| 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..b36a6e1396653 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3390,6 +3390,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
 case BuiltinType::Int128:
 case BuiltinType::UInt128:
+case BuiltinType::Float128:
+  // 128-bit float and integer types share the same ABI.
+
   // If it's a parameter type, the normal ABI rule is that arguments larger
   // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
   // even though it isn't particularly efficient.
@@ -3400,6 +3403,8 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
   // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
   // Clang matches them for compatibility.
+  // NOTE: GCC actually returns f128 indirectly but will hopefully change.
+  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054#c8.
   return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
   llvm::Type::getInt64Ty(getVMContext()), 2));
 
diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c
index ec77356201550..328a7aaa7df57 100644
--- a/clang/test/CodeGen/win-fp128.c
+++ b/clang/test/CodeGen/win-fp128.c
@@ -3,10 +3,10 @@
 // __float128 is 

[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

tgross35 wrote:

I updated this PR to pass indirectly and make the xmm0 return explicit, 
identical to `i128`. This does not match GCC exactly since GCC's `__float128` 
returns on the stack, unfortunately meaning cross-implementation is still 
broken. I don't think that needs to block the changes to Clang here, however; I 
plan to send patches to GCC to return in xmm0, which will match the behavior in 
this PR.

This should be ready for a review. Libcall changes are in 
https://github.com/llvm/llvm-project/pull/128848, that should land at the same 
time.

https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass `fp128` arguments indirectly on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 60b07161e8668c2bc3ee5d7a4c470a90a7673178 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Duplicate `win64-i128.c` to `win64-fp128.c` and update with the current
behavior of `__float128`.
---
 clang/test/CodeGen/win64-fp128.c | 20 
 1 file changed, 20 insertions(+)
 create mode 100644 clang/test/CodeGen/win64-fp128.c

diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
new file mode 100644
index 0..33e2441ddf314
--- /dev/null
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 noundef 
%b)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // movaps  xmm0, xmmword ptr [rax]
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From afaf2cb8b8a99a064a8daa5e11c45c1e7c3ee770 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Pass `fp128` indirectly and return in xmm0 on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets, which is LLVM's default ABI for `fp128`. However, the
Windows x86-64 calling convention [1] states the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value also causes problems with varargs. E.g. the
below completes successfully when built with GCC but has a runtime crash
when built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

Change `f128` passing to use the stack and make the return in xmm0
explicit. This is identical to `i128`, and passing is the same as
GCC.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. That above ABI
statement as well as consistency with `i128` (which is returned in xmm0)
mean that it would likely be better for GCC to change its return ABI to
match Clang rather than the other way around, so that portion is left
as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 5 +
 clang/test/CodeGen/win64-fp128.c  | 5 ++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..145c60d89d79e 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3390,6 +3390,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
 case BuiltinType::Int128:
 case BuiltinType::UInt128:
+case BuiltinType::Float128:
+  // 128-bit float and integer types share the same ABI.
+  
   // If it's a parameter type, the normal ABI rule is that arguments larger
   // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
   // even though it isn't particularly efficient.
@@ -3400,6 +3403,8 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
   // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
   // Clang matches them for compatibility.
+  // NOTE: GCC actually 

[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 6afdfd07a22260914b45363870a7be54324bd736 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Add a test based on `win64-i128.c` with the current behavior of
`__float128`.
---
 clang/test/CodeGen/win-fp128.c | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 clang/test/CodeGen/win-fp128.c

diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c
new file mode 100644
index 0..ec77356201550
--- /dev/null
+++ b/clang/test/CodeGen/win-fp128.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=CHECK-GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// CHECK-GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// CHECK-GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 
noundef %b)
+
+void fp128_vararg(int a, ...) {
+  // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // CHECK-GNU64: load ptr, ptr
+  // CHECK-GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From d695313e585e6dd6fd13c9b0efb401271d9027d4 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Pass `fp128` indirectly and return in xmm0 on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets, which is LLVM's default ABI for `fp128`. However, the
Windows x86-64 calling convention [1] states the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value also causes problems with varargs. E.g. the
below completes successfully when built with GCC but has a runtime crash
when built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

Change `f128` passing to use the stack and make the return in xmm0
explicit. This is identical to `i128`, and passing is the same as
GCC.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. That above ABI
statement as well as consistency with `i128` (which is returned in xmm0)
mean that it would likely be better for GCC to change its return ABI to
match Clang rather than the other way around, so that portion is left
as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 5 +
 clang/test/CodeGen/win-fp128.c| 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..b36a6e1396653 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3390,6 +3390,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
 case BuiltinType::Int128:
 case BuiltinType::UInt128:
+case BuiltinType::Float128:
+  // 128-bit float and integer types share the same ABI.
+
   // If it's a parameter type, the normal ABI rule is that arguments larger
   // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
   // even though it isn't particularly efficient.
@@ -3400,6 +3403,8 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
   // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
   // Clang matches them for compatibility.
+  // NOTE: GCC actually returns f128 indirectly but will hopeful

[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 6afdfd07a22260914b45363870a7be54324bd736 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Add a test based on `win64-i128.c` with the current behavior of
`__float128`.
---
 clang/test/CodeGen/win-fp128.c | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 clang/test/CodeGen/win-fp128.c

diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c
new file mode 100644
index 0..ec77356201550
--- /dev/null
+++ b/clang/test/CodeGen/win-fp128.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=CHECK-GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// CHECK-GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// CHECK-GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 
noundef %b)
+
+void fp128_vararg(int a, ...) {
+  // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // CHECK-GNU64: load ptr, ptr
+  // CHECK-GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From 67124440d2f0c1320078dad2ac90caa7a431c188 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Pass `fp128` indirectly and return in xmm0 on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets, which is LLVM's default ABI for `fp128`. However, the
Windows x86-64 calling convention [1] states the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly. Thus, change `f128` passing to use the stack and make the
return in xmm0 explicit. This is identical to `i128`, and passing is
the same as GCC.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. That above ABI
statement as well as consistency with `i128` (which is returned in xmm0)
mean that it would likely be better for GCC to change its return ABI to
match Clang rather than the other way around, so that portion is left
as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

With [2] which should land around the same time, LLVM will also
implement this ABI so it is not technically necessary for Clang to make
a change here as well. This is still done in order to be consistent with
other types, and to allow calling convention-aware optimizations at all
available optimization layers (@rnk mentioned possible reuse of stack
arguments). An added benefit is readability of the LLVM IR since it more
accurately reflects what the lowered assembly does.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: https://github.com/llvm/llvm-project/pull/128848
---
 clang/lib/CodeGen/Targets/X86.cpp | 5 +
 clang/test/CodeGen/win-fp128.c| 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..b36a6e1396653 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3390,6 +3390,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
 case BuiltinType::Int128:
 case BuiltinType::UInt128:
+case BuiltinType::Float128:
+  // 128-bit float and integer types share the same ABI.
+
   // If it's a parameter type, the normal ABI rule is that arguments larger
   // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
   // even though it isn't particularly efficient.
@@ -3400,6 +3403,8 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
   // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
   // Clang mat

[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-02-28 Thread Trevor Gross via cfe-commits

tgross35 wrote:

Thanks for answering that question, I wouldn't have had a good answer outside 
of consistency. Does mid-level optimizations refer to optimizations done in 
Clang rather than in LLVM?

Somebody will need to land this for me, the two commits should come separate 
(first is NFC).

> Otherwise, I think this is good to go. I suppose Rust folks are watching this 
> PR, otherwise, I'd say loop them in.

For future reference that's mostly me for f16/f128, I'll update our frontend 
after this lands.

https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Pass fp128 indirectly and return in xmm0 on Windows (PR #115052)

2025-03-01 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/115052
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-02-26 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From f110337467d5a2b1f624eab507daa2bc854def17 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/4] [IR] Add an xpassing test for `f128` intrinsic lowering
 (nfc)

`f128` intrinsic functions lower to incorrect libc calls. Add a test
showing current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 610 ++
 1 file changed, 610 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..8a70786d97fe6
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,610 @@
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
+;
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-AARCH64
+; RUN: llc < %s -mtriple=riscv32-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-RISCV32
+; RUN: llc < %s -mtriple=s390x-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-X64
+;
+; Verify that fp128 intrinsics only lower to `long double` calls on platforms
+; where `f128` and `long double` have the same layout.
+;
+; We test on x86 and x64 which have 80-bit ld, as well as aarch64 (ld == f128),
+; riscv32 (ld == f64), and s390x (ld == f128 with different alignment from
+; x64/aarch64 f128).
+;
+; FIXME: these emit calls to long double functions but should emit f128 calls
+
+define fp128 @test_cbrtf128(fp128 %a) {
+; CHECK-LABEL:  test_cbrtf128:
+; CHECK-AARCH64:b llvm.cbrt.f128
+; CHECK-RISCV32:call llvm.cbrt.f128@plt
+; CHECK-S390X:  brasl {{%.*}} llvm.cbrt.f128@PLT
+; CHECK-X64:jmp llvm.cbrt.f128@PLT # TAILCALL
+; CHECK-X86:calll llvm.cbrt.f128@PLT
+start:
+  %0 = tail call fp128 @llvm.cbrt.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.cbrt.f128(fp128)
+
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-AARCH64:b ceill
+; CHECK-RISCV32:call ceill@plt
+; CHECK-S390X:  brasl {{%.*}} ceill@PLT
+; CHECK-X64:jmp ceill@PLT
+; CHECK-X86:calll ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+declare fp128 @llvm.ceil.f128(fp128)
+
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; No math library call here, so make sure the assembly does the correct thing.
+; This test is autogenerated
+; CHECK-LABEL:test_copysignf128:
+; CHECK-AARCH64-LABEL: test_copysignf128:
+; CHECK-AARCH64:   // %bb.0: // %start
+; CHECK-AARCH64-NEXT:stp q0, q1, [sp, #-32]!
+; CHECK-AARCH64-NEXT:.cfi_def_cfa_offset 32
+; CHECK-AARCH64-NEXT:ldrb w8, [sp, #15]
+; CHECK-AARCH64-NEXT:ldrb w9, [sp, #31]
+; CHECK-AARCH64-NEXT:bfxil w9, w8, #0, #7
+; CHECK-AARCH64-NEXT:strb w9, [sp, #15]
+; CHECK-AARCH64-NEXT:ldr q0, [sp], #32
+; CHECK-AARCH64-NEXT:ret
+;
+; CHECK-RISCV32-LABEL: test_copysignf128:
+; CHECK-RISCV32:   # %bb.0: # %start
+; CHECK-RISCV32-NEXT:lw a3, 0(a1)
+; CHECK-RISCV32-NEXT:lw a4, 4(a1)
+; CHECK-RISCV32-NEXT:lw a2, 12(a2)
+; CHECK-RISCV32-NEXT:lw a5, 12(a1)
+; CHECK-RISCV32-NEXT:lw a1, 8(a1)
+; CHECK-RISCV32-NEXT:lui a6, 524288
+; CHECK-RISCV32-NEXT:and a2, a2, a6
+; CHECK-RISCV32-NEXT:slli a5, a5, 1
+; CHECK-RISCV32-NEXT:srli a5, a5, 1
+; CHECK-RISCV32-NEXT:or a2, a5, a2
+; CHECK-RISCV32-NEXT:sw a1, 8(a0)
+; CHECK-RISCV32-NEXT:sw a4, 4(a0)
+; CHECK-RISCV32-NEXT:sw a3, 0(a0)
+; CHECK-RISCV32-NEXT:sw a2, 12(a0)
+; CHECK-RISCV32-NEXT:ret
+;
+; CHECK-S390X-LABEL: test_copysignf128:
+; CHECK-S390X:   # %bb.0: # %start
+; CHECK-S390X-NEXT:ld %f0, 0(%r3)
+; CHECK-S390X-NEXT:ld %f2, 8(%r3)
+; CHECK-S390X-NEXT:ld %f1, 0(%r4)
+; CHECK-S390X-NEXT:ld %f3, 8(%r4)
+; CHECK-S390X-NEXT:cpsdr %f0, %f1, %f0
+; CHECK-S390X-NEXT:std %f0, 0(%r2)
+; CHECK-S390X-NEXT:std %f2, 8(%r2)
+; CHECK-S390X-NEXT:br %r14
+;
+; CHECK-X86-LABEL: test_copysignf128:
+; CHECK-X86:   # %bb.0: # %start
+; CHECK-X86-NEXT:pushl %ebx
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 8
+; CHECK-X86-NEXT:pushl %edi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:pushl %esi
+; CHECK-X86-NEXT:.cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:.cfi_offset %esi, -16
+; CHECK-X86-NEXT:.cfi_offset %edi, -12
+; CHECK-X86-NEXT:.cfi_offset %ebx, -8
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:movl {{[0-9]+}}(%esp), %ecx
+; CHECK-

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-03-01 Thread Trevor Gross via cfe-commits

tgross35 wrote:

In either case, I need to have the module flags available pretty early and I'm 
not sure how to do that. Ideally they would be available when `TargetLowering` 
is constructed or sometime before it is used for lowering, but it only gets a 
`TargetMachine` as a parameter. All values in `TargetOptions` seem to be 
configured once and don't pay attention to module flags or take the module as a 
parameter - is there a reason for that? I'm wondering if `TargetMachine` is 
intended to be unchanging across different modules.

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-03-05 Thread Trevor Gross via cfe-commits

tgross35 wrote:

If there isn't any reason to be consistent with the other LLVM targets then 
agreed, using the direct libcalls seems better. The new library support could 
likely land separately, right? As long as the lowering is correct, considering 
this PR is already pretty expansive.

> To bring this back to the original topic of floating-point support in the 
> compiler runtime libraries: there are a small number of such routines today, 
> but those are not intended for use with soft-float (they do expect the normal 
> hard-float ABI, i.e. passing floating-point values in floating-point 
> registers), but rather implement a few corner-case operations (like int128 
> <-> float conversion) where we actually do not have hardware instructions. 
> Any new f16-related compiler runtime library routine would fall into this 
> category, _not_ the soft-float category.

From https://llvm.godbolt.org/z/sr6aW5e1e it looks like `+soft-float` means 
`f64` gets passed in integer registers; it doesn't seem to still use the float 
ABI (unless `%f` and `%r` refer to the same registers, I don't know much about 
s390x). This is what I expect, however, including passing `f16` as `u16` - as 
far as I know other targets do something similar.

> Yes, we support soft-float, and as I wrote earlier I am worried what happens 
> with soft-float and conversion functions. I saw that with soft-float, the 
> same conversion functions are called, but with args in integer registers, 
> while with hard-float they go in fp-registers. For that to work, there would 
> have to be another library with the same functions but with the soft-float 
> abi. I am not sure if compiler-rt could be built for soft-float, but perhaps 
> the gcc lib has them, in which case it seem reasonable to allow that ABI 
> change in the tests. The user just has to provide a soft-float library for 
> the conversion functions as well.

For reference, this is what we do for Rust. Some more details, as an aside:

We have started splitting `+soft-float` into separate targets since 
accidentally mixing softfloat and hardfloat is an ABI footgun (more 
[here](https://github.com/rust-lang/rust/issues/116344)). So kernels use e.g. 
`aarch64-unknown-none-softfloat`, `loongarch64-unknown-none-softfloat`, which, 
behind the scenes, is the same target but adds `+soft-float` everywhere. The 
difference is this lets us distribute a different version of our builtins 
library and libm between the softfloat and hardfloat version, which gives us 
functional float support OOTB. Not that you generally want the large-ish 
softfloat routines on embedded or kernel anyway, but it works if you do.

There isn't actually a softfloat s390x target currently but this will need to be 
added at some point for RfL support (it's trivial).

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-02-13 Thread Trevor Gross via cfe-commits

tgross35 wrote:

What is needed to move this forward?

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-02-19 Thread Trevor Gross via cfe-commits


@@ -255,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 }
 setLibcallName(RTLIB::MULO_I128, nullptr);
   }
+
+  if (TT.isSystemZ()) {
+setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+  }

tgross35 wrote:

This bit of code still seems unusual; did it get resolved now that compiler-rt 
has been updated for SystemZ?

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2025-02-26 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 60b07161e8668c2bc3ee5d7a4c470a90a7673178 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Duplicate `win64-i128.c` to `win64-fp128.c` and update with the current
behavior of `__float128`.
---
 clang/test/CodeGen/win64-fp128.c | 20 
 1 file changed, 20 insertions(+)
 create mode 100644 clang/test/CodeGen/win64-fp128.c

diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
new file mode 100644
index 0..33e2441ddf314
--- /dev/null
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 noundef 
%b)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // movaps  xmm0, xmmword ptr [rax]
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From 06ce65a1d1153ab0f5befdc2d66bdc5f2430b72f Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Always pass `fp128` arguments indirectly on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets. However, the Windows x86-64 calling convention [1] states
the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value causes problems with varargs. E.g. the below
completes successfully when built with GCC but has a runtime crash when
built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

This patch fixes the above. It also resolves crashes when calling
GCC-built f128 libcalls.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in XMM0. However, GCC returns `__float128` on the stack, so do the same
here to be consistent.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 3 ++-
 clang/test/CodeGen/win64-fp128.c  | 5 ++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..1628c981c9b14 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3390,6 +3390,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
 case BuiltinType::Int128:
 case BuiltinType::UInt128:
+case BuiltinType::Float128:
   // If it's a parameter type, the normal ABI rule is that arguments larger
   // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
   // even though it isn't particularly efficient.
@@ -3399,7 +3400,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 /*ByVal=*/false);
 
   // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
-  // Clang matches them for compatibility.
+  // Clang matches them for compatibility. f128 uses the same ABI.
   return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
   llvm::Type::getInt64Ty(getVMContext()), 2));
 
diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
index 33e2441ddf314..707d49ed9b7f1 100644
--- a/clang/test/CodeGen/win64-fp128.c
+++ b/clang/

[clang] [clang] Always pass fp128 arguments indirectly on Windows (PR #115052)

2025-02-26 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/115052

>From 60b07161e8668c2bc3ee5d7a4c470a90a7673178 Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 07:00:35 -0500
Subject: [PATCH 1/2] [clang] Add fp128 ABI tests for MinGW (NFC)

Duplicate `win64-i128.c` to `win64-fp128.c` and update with the current
behavior of `__float128`.
---
 clang/test/CodeGen/win64-fp128.c | 20 
 1 file changed, 20 insertions(+)
 create mode 100644 clang/test/CodeGen/win64-fp128.c

diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
new file mode 100644
index 0..33e2441ddf314
--- /dev/null
+++ b/clang/test/CodeGen/win64-fp128.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
+// RUN:| FileCheck %s --check-prefix=GNU64
+// __float128 is unsupported on MSVC
+
+__float128 fp128_ret(void) { return 0; }
+// GNU64: define dso_local fp128 @fp128_ret()
+
+__float128 fp128_args(__float128 a, __float128 b) { return a * b; }
+// GNU64: define dso_local fp128 @fp128_args(fp128 noundef %a, fp128 noundef 
%b)
+
+void fp128_vararg(int a, ...) {
+  // GNU64-LABEL: define dso_local void @fp128_vararg
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  __float128 i = __builtin_va_arg(ap, __float128);
+  // movaps  xmm0, xmmword ptr [rax]
+  // GNU64: load ptr, ptr
+  // GNU64: load fp128, ptr
+  __builtin_va_end(ap);
+}

>From 07e0c4d453f9c52683fe65201acef357107b331b Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Tue, 5 Nov 2024 05:53:10 -0500
Subject: [PATCH 2/2] [clang] Always pass `fp128` arguments indirectly on
 Windows

Clang currently passes and returns `__float128` in vector registers on
MinGW targets. However, the Windows x86-64 calling convention [1] states
the following:

__m128 types, arrays, and strings are never passed by immediate
value. Instead, a pointer is passed to memory allocated by the
caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
types, are passed as if they were integers of the same size. Structs
or unions of other sizes are passed as a pointer to memory allocated
by the caller. For these aggregate types passed as a pointer,
including __m128, the caller-allocated temporary memory must be
16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly; this is what MinGW GCC already does, so change Clang to
match. Passing by value causes problems with varargs. E.g. the below
completes successfully when built with GCC but has a runtime crash when
built with Clang:

void va_f128(int count, ...) {
va_list args;
va_start(args, count);
__float128 val = va_arg(args, __float128);
va_end(args);
}

int main() {
va_f128(0, 0.0);
}

This patch fixes the above. It also resolves crashes when calling
GCC-built f128 libcalls.

Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64
type, is returned through RAX. Non-scalar types including floats,
doubles, and vector types such as __m128, __m128i, __m128d are
returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in XMM0. However, GCC returns `__float128` on the stack, so do the same
here to be consistent.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

[1]: 
https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
---
 clang/lib/CodeGen/Targets/X86.cpp | 3 ++-
 clang/test/CodeGen/win64-fp128.c  | 5 ++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp 
b/clang/lib/CodeGen/Targets/X86.cpp
index b7a1374d5b399..dda564868e1af 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3390,6 +3390,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 
 case BuiltinType::Int128:
 case BuiltinType::UInt128:
+case BuiltinType::Float128:
   // If it's a parameter type, the normal ABI rule is that arguments larger
   // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
   // even though it isn't particularly efficient.
@@ -3399,7 +3400,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, 
unsigned &FreeSSERegs,
 /*ByVal=*/false);
 
   // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
-  // Clang matches them for compatibility.
+  // Clang matches them for compatibility. f128 uses the same ABI.
   return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
   llvm::Type::getInt64Ty(getVMContext()), 2));
 
diff --git a/clang/test/CodeGen/win64-fp128.c b/clang/test/CodeGen/win64-fp128.c
index 33e2441ddf314..707d49ed9b7f1 100644
--- a/clang/test/CodeGen/win64-fp128.c
+++ b/clang/test/CodeGen/win64

[clang] [llvm] [WIP] Correct lowering of `fp128` intrinsics (PR #76558)

2025-02-27 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 updated 
https://github.com/llvm/llvm-project/pull/76558

>From 31405591b5661156348ec7a45e66eb43e0ace15b Mon Sep 17 00:00:00 2001
From: Trevor Gross 
Date: Fri, 11 Aug 2023 22:16:01 -0400
Subject: [PATCH 1/6] [IR] Add a test for `f128` libcall lowering (nfc)

`f128` intrinsic functions sometimes lower to `long double` library
calls when they instead need to be `f128` versions. Add a test
demonstrating current behavior.
---
 .../CodeGen/Generic/f128-math-lowering.ll | 328 ++
 1 file changed, 328 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/f128-math-lowering.ll

diff --git a/llvm/test/CodeGen/Generic/f128-math-lowering.ll 
b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
new file mode 100644
index 0..dfbd1eaeda109
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/f128-math-lowering.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -mtriple=aarch64-unknown-none -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=riscv32 -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=s390x-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-S390X
+; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-USELD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-NOTLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-musl -verify-machineinstrs | 
FileCheck %s --check-prefix=CHECK-USELD
+;
+; REQUIRES: aarch64-registered-target
+; REQUIRES: riscv-registered-target
+; REQUIRES: systemz-registered-target
+; REQUIRES: x86-registered-target
+;
+; Verify that fp128 intrinsics only lower to `long double` calls (e.g. `sinl`)
+; on platforms where `f128` and `long double` have the same layout, and
+; otherwise lower to `f128` versions (e.g. `sinf128`).
+;
+; Targets include:
+; * x86, x64 (80-bit long double)
+; * aarch64 (long double == f128)
+; * riscv32 (long double == f64)
+; * s390x (long double == f128, hardware support)
+; * A couple assorted environments for x86
+;
+; FIXME: only targets where long double is `f128` should be using `USELD`, all
+; others need to be NOTLD. PowerPC should be added but it currently emits an
+; interesting blend of both (e.g. `acosl` but `ceilf128`).
+
+define fp128 @test_acosf128(fp128 %a) {
+; CHECK-LABEL:  test_acosf128:
+; CHECK-NOTLD:  acosf128
+; CHECK-USELD:  acosl
+; CHECK-S390X:  acosl
+start:
+  %0 = tail call fp128 @llvm.acos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_asinf128(fp128 %a) {
+; CHECK-LABEL:  test_asinf128:
+; CHECK-NOTLD:  asinf128
+; CHECK-USELD:  asinl
+; CHECK-S390X:  asinl
+start:
+  %0 = tail call fp128 @llvm.asin.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_atanf128(fp128 %a) {
+; CHECK-LABEL:  test_atanf128:
+; CHECK-NOTLD:  atanf128
+; CHECK-USELD:  atanl
+; CHECK-S390X:  atanl
+start:
+  %0 = tail call fp128 @llvm.atan.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_ceilf128(fp128 %a) {
+; CHECK-LABEL:  test_ceilf128:
+; CHECK-NOTLD:  ceilf128
+; CHECK-USELD:  ceill
+; CHECK-S390X:  ceill
+start:
+  %0 = tail call fp128 @llvm.ceil.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_copysignf128(fp128 %a, fp128 %b) {
+; copysign should always get lowered to assembly
+; CHECK-LABEL:  test_copysignf128:
+; CHECK-NOT:copysignl
+; CHECK-NOT:copysignf128
+start:
+  %0 = tail call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+  ret fp128 %0
+}
+
+define fp128 @test_cosf128(fp128 %a) {
+; CHECK-LABEL:  test_cosf128:
+; CHECK-NOTLD:  cosf128
+; CHECK-USELD:  cosl
+; CHECK-S390X:  cosl
+start:
+  %0 = tail call fp128 @llvm.cos.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp10f128(fp128 %a) {
+; CHECK-LABEL:  test_exp10f128:
+; CHECK-NOTLD:  exp10f128
+; CHECK-USELD:  exp10l
+; CHECK-S390X:  exp10l
+start:
+  %0 = tail call fp128 @llvm.exp10.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_exp2f128(fp128 %a) {
+; CHECK-LABEL:  test_exp2f128:
+; CHECK-NOTLD:  exp2f128
+; CHECK-USELD:  exp2l
+; CHECK-S390X:  exp2l
+start:
+  %0 = tail call fp128 @llvm.exp2.f128(fp128 %a)
+  ret fp128 %0
+}
+
+
+define fp128 @test_expf128(fp128 %a) {
+; CHECK-LABEL:  test_expf128:
+; CHECK-NOTLD:  expf128
+; CHECK-USELD:  expl
+; CHECK-S390X:  expl
+start:
+  %0 = tail call fp128 @llvm.exp.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_fabsf128(fp128 %a) {
+; fabs should always get lowered to assembly
+; CHECK-LABEL:  test_fabsf128:
+; CHECK-NOT:fabsl
+; CHECK-NOT:fabsf128
+start:
+  %0 = tail call fp128 @llvm.fabs.f128(fp128 %a)
+  ret fp128 %0
+}
+
+define fp128 @test_floorf128(fp128 %a) {
+; CHECK-LABEL:  test

[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)

2025-04-03 Thread Trevor Gross via cfe-commits


@@ -126,3 +138,23 @@ define void @f9(ptr %cptr, ptr %aptr, ptr %bptr) {
   store fp128 %c, ptr %cptr
   ret void
 }
+
+; Test f16 copies in which the sign comes from an f16.
+define half @f10(half %a, half %b) {
+; CHECK-LABEL: f10:
+; CHECK: brasl %r14, copysignh@PLT

tgross35 wrote:

This is kind of a weird test. Is it meant to ensure that LLVM can recognize 
calls to `libm` functions and replace them with assembly lowerings when that is 
more optimal? If that is the goal, then `copysignh` doesn't seem to be doing 
anything and maybe it should be a FIXME (it should be `copysignf16` anyway). If 
that isn't the goal, then I don't know why the other tests aren't calling 
`@llvm.copysign.f32` and similar.

https://github.com/llvm/llvm-project/pull/109164
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [llvm] Change `fp128` lowering to use `f128` functions by default (PR #76558)

2025-07-13 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [llvm] Change `fp128` lowering to use `f128` functions by default (PR #76558)

2025-07-14 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [llvm] Change `fp128` lowering to use `f128` functions by default (PR #76558)

2025-07-14 Thread Trevor Gross via cfe-commits


@@ -39,7 +39,7 @@ define double @f5(double %x, double %y) {
 
 define fp128 @f6(fp128 %x, fp128 %y) {
 ; CHECK-LABEL: f6:
-; CHECK: brasl %r14, powl@PLT
+; CHECK: brasl %r14, powf128@PLT

tgross35 wrote:

This has been updated to not change the current behavior on existing platforms, 
with the small exception of MIPS32 (due to a lack of ABI information).

https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [llvm] Change `fp128` lowering to use `f128` functions by default (PR #76558)

2025-07-14 Thread Trevor Gross via cfe-commits

https://github.com/tgross35 edited 
https://github.com/llvm/llvm-project/pull/76558
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits