[llvm-branch-commits] [llvm] [AMDGPU][SDAG] DAGCombine PTRADD -> disjoint OR (PR #146075)

2025-08-01 Thread Matt Arsenault via llvm-branch-commits


@@ -2767,6 +2767,19 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
 }
   }
 
+  // Transform (ptradd a, b) -> (or disjoint a, b) if it is equivalent and if
+  // that transformation can't block an offset folding at any use of the ptradd.
+  // This should be done late, after legalization, so that it doesn't block
+  // other ptradd combines that could enable more offset folding.
+  if (LegalOperations && DAG.haveNoCommonBitsSet(N0, N1)) {
+bool TransformCanBreakAddrMode = any_of(N->users(), [&](SDNode *User) {
+  return canFoldInAddressingMode(N, User, DAG, TLI);
+});
+
+if (!TransformCanBreakAddrMode)

arsenm wrote:

!any_of -> none_of 

https://github.com/llvm/llvm-project/pull/146075
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From a8ad72e17b59e727cd534f5042dfcc18204c41ed Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From 159959bda89857ab52cf656b1e3fca19bf662e79 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From a8ad72e17b59e727cd534f5042dfcc18204c41ed Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From 159959bda89857ab52cf656b1e3fca19bf662e79 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Shilei Tian via llvm-branch-commits

https://github.com/shiltian approved this pull request.


https://github.com/llvm/llvm-project/pull/151765
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [libcxx] [PATCH 3/7] [clang] improve NestedNameSpecifier: test changes (PR #148014)

2025-08-01 Thread Matheus Izvekov via llvm-branch-commits

https://github.com/mizvekov edited 
https://github.com/llvm/llvm-project/pull/148014
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-lld

Author: None (llvmbot)


Changes

Backport 283c47b4c5231a1baf528355f7119a73ac168968

Requested by: @brad0

---
Full diff: https://github.com/llvm/llvm-project/pull/151794.diff


3 Files Affected:

- (modified) lld/ELF/Arch/LoongArch.cpp (+117) 
- (added) lld/test/ELF/loongarch-pc-hi20-lo12-got.s (+145) 
- (modified) lld/test/ELF/loongarch-relax-pc-hi20-lo12.s (+6-4) 


``diff
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index a14553018fc36..8802c8c2e7f01 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+ const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const 
Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d$a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Although the optimization has been performed, the GOT entries still
+// exist, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+  const Relocation &rLo12, uint64_t secAddr) const 
{
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+  rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast(*rHi20.sym).section))
+return false;
+
+  // Check if the addends of both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe00) != PCALAU12I ||
+  (nextInsn & 0xffc0) != ldOpcode)
+return false;
+
+  // Check if both instructions use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x8000LL - 0x800;
+  const int64_t overflow = 0x8000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, 
rHi20.offset,
+ rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+ &sym};
+  uint64_t pageDelta =
+  getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20, pageDelta);
+  // addi.w/d $a0, $a0, %pc_lo12
+  write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn),
+  getJ5(nextInsn), 0));
+  relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64));
+  return true;
+}
+
 // During TLSDESC GD_TO_IE, the converted code sequence always includes an
 // instruction related to the Lo12 relocation (ld.[wd]). To obtain correct val
 // in `getRelocTargetVA`, expr of this instruction should be adjusted to
@@ -1172,6 +1246,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr 
expr) const {
   return expr;
 }
 
+static bool pairForGotRels(ArrayRef relocs) {
+  // Check if R_LARCH_GOT_PC_HI20 and R_LARCH_GOT_PC_LO12 always appear in
+  // pairs.
+  size_t i = 0;
+  const size_t size = relocs.size();
+  for (; i != size; ++i) {
+if (relocs[i].type == R_LARCH_GOT_PC_HI20) {
+  if (i + 1 < size && relocs[i + 1].type == R_LARCH_GOT_PC_LO12) {
+++i;
+continue;
+  }
+  if (relaxable(relocs, i) && i + 2 < size &&
+  relocs[i + 2].type == R_LARCH_GOT_PC_LO12) {
+i += 2;
+continue;
+  }
+  break;
+} e

[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-lld-elf

Author: None (llvmbot)


Changes

Backport 283c47b4c5231a1baf528355f7119a73ac168968

Requested by: @brad0

---
Full diff: https://github.com/llvm/llvm-project/pull/151794.diff


3 Files Affected:

- (modified) lld/ELF/Arch/LoongArch.cpp (+117) 
- (added) lld/test/ELF/loongarch-pc-hi20-lo12-got.s (+145) 
- (modified) lld/test/ELF/loongarch-relax-pc-hi20-lo12.s (+6-4) 


``diff
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index a14553018fc36..8802c8c2e7f01 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+ const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const 
Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d$a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Although the optimization has been performed, the GOT entries still
+// exist, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+  const Relocation &rLo12, uint64_t secAddr) const 
{
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+  rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast(*rHi20.sym).section))
+return false;
+
+  // Check if the addends of both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe00) != PCALAU12I ||
+  (nextInsn & 0xffc0) != ldOpcode)
+return false;
+
+  // Check if both instructions use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x8000LL - 0x800;
+  const int64_t overflow = 0x8000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, 
rHi20.offset,
+ rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+ &sym};
+  uint64_t pageDelta =
+  getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20, pageDelta);
+  // addi.w/d $a0, $a0, %pc_lo12
+  write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn),
+  getJ5(nextInsn), 0));
+  relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64));
+  return true;
+}
+
 // During TLSDESC GD_TO_IE, the converted code sequence always includes an
 // instruction related to the Lo12 relocation (ld.[wd]). To obtain correct val
 // in `getRelocTargetVA`, expr of this instruction should be adjusted to
@@ -1172,6 +1246,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr 
expr) const {
   return expr;
 }
 
+static bool pairForGotRels(ArrayRef relocs) {
+  // Check if R_LARCH_GOT_PC_HI20 and R_LARCH_GOT_PC_LO12 always appear in
+  // pairs.
+  size_t i = 0;
+  const size_t size = relocs.size();
+  for (; i != size; ++i) {
+if (relocs[i].type == R_LARCH_GOT_PC_HI20) {
+  if (i + 1 < size && relocs[i + 1].type == R_LARCH_GOT_PC_LO12) {
+++i;
+continue;
+  }
+  if (relaxable(relocs, i) && i + 2 < size &&
+  relocs[i + 2].type == R_LARCH_GOT_PC_LO12) {
+i += 2;
+continue;
+  }
+  break;
+   

[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/151794

Backport 283c47b4c5231a1baf528355f7119a73ac168968

Requested by: @brad0

>From 5fae51953f170e5aa7686c6a1a550fcaddb58ad9 Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang 
Date: Fri, 1 Aug 2025 14:45:46 +0800
Subject: [PATCH] [lld][LoongArch] GOT indirection to PC relative optimization
 (#123743)

In LoongArch, we try GOT indirection to PC relative optimization in
normal or medium code model, whether or not with R_LARCH_RELAX
relocation.

From:
* pcalau12i $a0, %got_pc_hi20(sym_got)
* ld.w/d $a0, $a0, %got_pc_lo12(sym_got)

To:
* pcalau12i $a0, %pc_hi20(sym)
* addi.w/d $a0, $a0, %pc_lo12(sym)

If the original code sequence can be relaxed into a single instruction
`pcaddi`, this patch will not be taken (see
https://github.com/llvm/llvm-project/pull/123566).
The optimization related to GOT is split into two locations because the
`relax()` function is part of an iteration fixed-point algorithm. We
should minimize it to achieve better linker performance.

Note: Although the optimization has been performed, the GOT entries
still exist, similarly to AArch64. Eliminating the entries will
increase code complexity.

(cherry picked from commit 283c47b4c5231a1baf528355f7119a73ac168968)
---
 lld/ELF/Arch/LoongArch.cpp  | 117 
 lld/test/ELF/loongarch-pc-hi20-lo12-got.s   | 145 
 lld/test/ELF/loongarch-relax-pc-hi20-lo12.s |  10 +-
 3 files changed, 268 insertions(+), 4 deletions(-)
 create mode 100644 lld/test/ELF/loongarch-pc-hi20-lo12-got.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index a14553018fc36..8802c8c2e7f01 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+ const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const 
Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d$a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Although the optimization has been performed, the GOT entries still
+// exist, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+  const Relocation &rLo12, uint64_t secAddr) const 
{
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+  rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast(*rHi20.sym).section))
+return false;
+
+  // Check if the addends of the both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe00) != PCALAU12I ||
+  (nextInsn & 0xffc0) != ldOpcode)
+return false;
+
+  // Check if use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x8000LL - 0x800;
+  const int64_t overflow = 0x8000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, 
rHi20.offset,
+ rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+ &sym};
+  uint64_t pageDelta =
+  getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20

[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot milestoned 
https://github.com/llvm/llvm-project/pull/151794
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:

@SixWeining What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/151794
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot updated 
https://github.com/llvm/llvm-project/pull/151794

>From 604c7750a86d8e33fd3e1f4402752829370398f2 Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang 
Date: Fri, 1 Aug 2025 14:45:46 +0800
Subject: [PATCH] [lld][LoongArch] GOT indirection to PC relative optimization
 (#123743)

In LoongArch, we try GOT indirection to PC relative optimization in
normal or medium code model, whether or not with R_LARCH_RELAX
relocation.

From:
* pcalau12i $a0, %got_pc_hi20(sym_got)
* ld.w/d $a0, $a0, %got_pc_lo12(sym_got)

To:
* pcalau12i $a0, %pc_hi20(sym)
* addi.w/d $a0, $a0, %pc_lo12(sym)

If the original code sequence can be relaxed into a single instruction
`pcaddi`, this patch will not be taken (see
https://github.com/llvm/llvm-project/pull/123566).
The optimization related to GOT is split into two locations because the
`relax()` function is part of an iteration fixed-point algorithm. We
should minimize it to achieve better linker performance.

Note: Although the optimization has been performed, the GOT entries
still exist, similarly to AArch64. Eliminating the entries will
increase code complexity.

(cherry picked from commit 283c47b4c5231a1baf528355f7119a73ac168968)
---
 lld/ELF/Arch/LoongArch.cpp  | 117 
 lld/test/ELF/loongarch-pc-hi20-lo12-got.s   | 145 
 lld/test/ELF/loongarch-relax-pc-hi20-lo12.s |  10 +-
 3 files changed, 268 insertions(+), 4 deletions(-)
 create mode 100644 lld/test/ELF/loongarch-pc-hi20-lo12-got.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index a14553018fc36..8802c8c2e7f01 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+ const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const 
Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d$a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Although the optimization has been performed, the GOT entries still
+// exist, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+  const Relocation &rLo12, uint64_t secAddr) const 
{
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+  rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast(*rHi20.sym).section))
+return false;
+
+  // Check if the addends of the both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe00) != PCALAU12I ||
+  (nextInsn & 0xffc0) != ldOpcode)
+return false;
+
+  // Check if use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x8000LL - 0x800;
+  const int64_t overflow = 0x8000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, 
rHi20.offset,
+ rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+ &sym};
+  uint64_t pageDelta =
+  getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20, pageDelta);
+  // addi.w/d $a0, $a0, %pc_lo12
+  write32le(loc + 4, ins

[llvm-branch-commits] [lld] release/21.x: [lld][LoongArch] GOT indirection to PC relative optimization (#123743) (PR #151794)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot updated 
https://github.com/llvm/llvm-project/pull/151794

>From 223b08df6cfd7530be58258282bc0bb1b006bbeb Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang 
Date: Fri, 1 Aug 2025 14:45:46 +0800
Subject: [PATCH] [lld][LoongArch] GOT indirection to PC relative optimization
 (#123743)

In LoongArch, we try GOT indirection to PC relative optimization in
normal or medium code model, whether or not with R_LARCH_RELAX
relocation.

From:
* pcalau12i $a0, %got_pc_hi20(sym_got)
* ld.w/d $a0, $a0, %got_pc_lo12(sym_got)

To:
* pcalau12i $a0, %pc_hi20(sym)
* addi.w/d $a0, $a0, %pc_lo12(sym)

If the original code sequence can be relaxed into a single instruction
`pcaddi`, this patch will not be taken (see
https://github.com/llvm/llvm-project/pull/123566).
The optimization related to GOT is split into two locations because the
`relax()` function is part of an iteration fixed-point algorithm. We
should minimize it to achieve better linker performance.

Note: Although the optimization has been performed, the GOT entries
still exist, similarly to AArch64. Eliminating the entries will
increase code complexity.

(cherry picked from commit 283c47b4c5231a1baf528355f7119a73ac168968)
---
 lld/ELF/Arch/LoongArch.cpp  | 117 
 lld/test/ELF/loongarch-pc-hi20-lo12-got.s   | 145 
 lld/test/ELF/loongarch-relax-pc-hi20-lo12.s |  10 +-
 3 files changed, 268 insertions(+), 4 deletions(-)
 create mode 100644 lld/test/ELF/loongarch-pc-hi20-lo12-got.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index a14553018fc36..8802c8c2e7f01 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+ const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -1155,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const 
Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d$a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Although the optimization has been performed, the GOT entries still
+// exist, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+  const Relocation &rLo12, uint64_t secAddr) const 
{
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+  rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast(*rHi20.sym).section))
+return false;
+
+  // Check if the addends of the both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe00) != PCALAU12I ||
+  (nextInsn & 0xffc0) != ldOpcode)
+return false;
+
+  // Check if use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x8000LL - 0x800;
+  const int64_t overflow = 0x8000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, 
rHi20.offset,
+ rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+ &sym};
+  uint64_t pageDelta =
+  getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20, pageDelta);
+  // addi.w/d $a0, $a0, %pc_lo12
+  write32le(loc + 4, ins

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Stanislav Mekhanoshin (rampitec)


Changes



---

Patch is 52.83 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151765.diff


10 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+9) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+106) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+14-5) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+9) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+6) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+21) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll 
(+385) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+54) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+54) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+54) 


``diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT: 

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:



@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)


Changes



---

Patch is 52.83 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151765.diff


10 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+9) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+106) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+14-5) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+9) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+6) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+21) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll 
(+385) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+54) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+54) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+54) 


``diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec ready_for_review 
https://github.com/llvm/llvm-project/pull/151765
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

rampitec wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack
> [on Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/151765?utm_source=stack-comment-downstack-mergeability-warning).
> [Learn more](https://graphite.dev/docs/merge-pull-requests)

* **#151765** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/151765?utm_source=stack-comment-icon) 👈 [(View in Graphite)](https://app.graphite.dev/github/pr/llvm/llvm-project/151765?utm_source=stack-comment-view-in-graphite)
* **#151758** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/151758?utm_source=stack-comment-icon)
* **#151749** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/151749?utm_source=stack-comment-icon)
* `main`




This stack of pull requests is managed by [Graphite](https://graphite.dev?utm-source=stack-comment). Learn
more about [stacking](https://stacking.dev/?utm_source=stack-comment).


https://github.com/llvm/llvm-project/pull/151765
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From a04047c742d86f7c47045eb79319f814ed846cb4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From b0154731dc024d812de24ba138270b3cdf2b0d4b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From b0154731dc024d812de24ba138270b3cdf2b0d4b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From a04047c742d86f7c47045eb79319f814ed846cb4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] release/21.x: [clang] Avoid inheriting [[noreturn]] in explicit function template specializations (#150003) (PR #151752)

2025-08-01 Thread via llvm-branch-commits

github-actions[bot] wrote:

⚠️ We detected that you are using a GitHub private e-mail address to contribute 
to the repo. Please turn off [Keep my email addresses 
private](https://github.com/settings/emails) setting in your account. See 
[LLVM Developer 
Policy](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) and [LLVM 
Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it)
 for more information.

https://github.com/llvm/llvm-project/pull/151752
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf16 implementation to header-only in src/__support/math folder. (PR #151779)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix created 
https://github.com/llvm/llvm-project/pull/151779

None

>From 135f49e154cc7e484ede397df8ff1591b4bc59eb Mon Sep 17 00:00:00 2001
From: bassiounix 
Date: Sat, 2 Aug 2025 02:04:26 +0300
Subject: [PATCH] [libc][math] Refactor atanhf16 implementation to header-only
 in src/__support/math folder.

---
 libc/shared/math.h|   1 +
 libc/shared/math/atanhf16.h   |  28 +++
 libc/src/__support/math/CMakeLists.txt|  15 ++
 libc/src/__support/math/atanhf16.h| 234 ++
 libc/src/math/generic/CMakeLists.txt  |  12 +-
 libc/src/math/generic/atanhf16.cpp|  86 +--
 libc/src/math/generic/common_constants.cpp|  78 --
 libc/src/math/generic/common_constants.h  |   8 -
 libc/src/math/generic/explogxf.h  |  43 
 libc/test/shared/CMakeLists.txt   |   1 +
 libc/test/shared/shared_math_test.cpp |   1 +
 .../llvm-project-overlay/libc/BUILD.bazel |  22 ++
 12 files changed, 305 insertions(+), 224 deletions(-)
 create mode 100644 libc/shared/math/atanhf16.h
 create mode 100644 libc/src/__support/math/atanhf16.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
index ddf219ece8ff1..7fb736b78efa5 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -29,6 +29,7 @@
 #include "math/atanf.h"
 #include "math/atanf16.h"
 #include "math/atanhf.h"
+#include "math/atanhf16.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/atanhf16.h b/libc/shared/math/atanhf16.h
new file mode 100644
index 0..b7b5d77ae98c8
--- /dev/null
+++ b/libc/shared/math/atanhf16.h
@@ -0,0 +1,28 @@
+//===-- Shared atanhf16 function *- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANHF16_H
+#define LLVM_LIBC_SHARED_MATH_ATANHF16_H
+
+#include "shared/libc_common.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/math/atanhf16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanhf16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANHF16_H
diff --git a/libc/src/__support/math/CMakeLists.txt 
b/libc/src/__support/math/CMakeLists.txt
index 500dd9de2c555..9631ab5be7d3b 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -286,6 +286,21 @@ add_header_library(
 libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  atanhf16
+  HDRS
+atanhf16.h
+  DEPENDS
+libc.src.__support.FPUtil.fenv_impl
+libc.src.__support.FPUtil.fp_bits
+libc.src.__support.FPUtil.polyeval
+libc.src.__support.FPUtil.cast
+libc.src.__support.FPUtil.except_value_utils
+libc.src.__support.FPUtil.multiply_add
+libc.src.__support.macros.config
+libc.src.__support.macros.optimization
+)
+
 add_header_library(
   asinf
   HDRS
diff --git a/libc/src/__support/math/atanhf16.h 
b/libc/src/__support/math/atanhf16.h
new file mode 100644
index 0..9146e1e31b815
--- /dev/null
+++ b/libc/src/__support/math/atanhf16.h
@@ -0,0 +1,234 @@
+//===-- Implementation header for atanhf16 --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+namespace atanhf16_internal {
+
+// Lookup table for logf(f) = logf(1 + n*2^(-7)) where n = 0..127,
+// computed and stored as float precision constants.
+// Generated by Sollya with the following commands:
+//   display = hexadecimal;
+//   for n from 0 to 127 do { print(single(1 / (1 + n / 128.0))); };
+static constexpr float ONE_OVER_F_FLOAT[128] = {
+0x1p0f, 0x1.fc07fp-1f,  0x1.f81f82p-1f, 0x1.f4465ap-1f,
+0x1.f07c2p-1f,  0x1.ecc07cp-1f, 0x1.e9131ap-1f, 0x1.e573acp-1f,
+0x1.e1e1e2p-1

[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf16 implementation to header-only in src/__support/math folder. (PR #151779)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

bassiounix wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151779?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#151779** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151779?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151779?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#151399** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151399?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#151012** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151012?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150993** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150993?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150968** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150968?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150868** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150868?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150854** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150854?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150852** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150852?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150849** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150849?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#150843** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/150843?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`




This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/151779
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/21.x: [clang] Avoid inheriting [[noreturn]] in explicit function template specializations (#150003) (PR #151752)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: None (llvmbot)


Changes

Backport 22fef005225b129d73ade4ed995fc0ec0c7be044

Requested by: @mstorsjo

---
Full diff: https://github.com/llvm/llvm-project/pull/151752.diff


3 Files Affected:

- (modified) clang/lib/Sema/SemaDecl.cpp (+8) 
- (modified) clang/lib/Sema/SemaDeclAttr.cpp (+7) 
- (modified) clang/test/SemaCXX/wreturn-always-throws.cpp (+20-1) 


``diff
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 14403e65e8f42..bb412ef6788e7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3267,6 +3267,14 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old,
 if (isa(I) || isa(I))
   continue;
 
+if (isa(I)) {
+  if (auto *FD = dyn_cast(New)) {
+if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+  continue; // Don't propagate inferred noreturn attributes to explicit
+// specializations.
+  }
+}
+
 if (mergeDeclAttribute(*this, New, I, LocalAMK))
   foundAny = true;
   }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index eff5f9568236a..a7897bdfe6e0f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1970,6 +1970,13 @@ void clang::inferNoReturnAttr(Sema &S, const Decl *D) {
   if (!FD)
 return;
 
+  // Skip explicit specializations here as they may have
+  // a user-provided definition that may deliberately differ from the primary
+  // template. If an explicit specialization truly never returns, the user
+  // should explicitly mark it with [[noreturn]].
+  if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+return;
+
   auto *NonConstFD = const_cast(FD);
   DiagnosticsEngine &Diags = S.getDiagnostics();
   if (Diags.isIgnored(diag::warn_falloff_nonvoid, FD->getLocation()) &&
diff --git a/clang/test/SemaCXX/wreturn-always-throws.cpp 
b/clang/test/SemaCXX/wreturn-always-throws.cpp
index addcadd1183dc..df7689f7063cc 100644
--- a/clang/test/SemaCXX/wreturn-always-throws.cpp
+++ b/clang/test/SemaCXX/wreturn-always-throws.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type 
-verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type 
-Winvalid-noreturn -verify %s
 // expected-no-diagnostics
 
 namespace std {
@@ -44,3 +44,22 @@ void testTemplates() {
   throwErrorTemplate("ERROR");
   (void)ensureZeroTemplate(42);
 }
+
+// Ensure that explicit specialization of a member function does not inherit
+// the warning from the primary template.
+
+template
+struct S {
+  void f();
+  void g();
+};
+
+template
+void S::f() { throw 0; } 
+template<>
+void S::f() {}
+
+template 
+void S::g() {}  
+template<> 
+void S::g() { throw 0; }

``




https://github.com/llvm/llvm-project/pull/151752
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Shilei Tian via llvm-branch-commits

https://github.com/shiltian approved this pull request.


https://github.com/llvm/llvm-project/pull/151758
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lld] ELF: Introduce R_AARCH64_FUNCINIT64 relocation type. (PR #133531)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133531

>From 96e7da9a083888683c2ba00d97f886fd748ea10b Mon Sep 17 00:00:00 2001
From: Peter Collingbourne 
Date: Wed, 9 Apr 2025 20:30:57 -0700
Subject: [PATCH 1/2] Undo unnecessary change

Created using spr 1.3.6-beta.1
---
 lld/ELF/SyntheticSections.cpp | 2 +-
 lld/ELF/SyntheticSections.h   | 2 +-
 lld/ELF/Writer.cpp| 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 8cab71c4c8d94..106749e90a82b 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1696,7 +1696,7 @@ void 
RelocationBaseSection::addAddendOnlyRelocIfNonPreemptible(
  sym, 0, R_ABS, addendRelType);
 }
 
-void RelocationBaseSection::mergeRels(Ctx &ctx) {
+void RelocationBaseSection::mergeRels() {
   size_t newSize = relocs.size();
   for (const auto &v : relocsVec)
 newSize += v.size();
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 7a85906e8601d..2dd4b80eb85bf 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -553,7 +553,7 @@ class RelocationBaseSection : public SyntheticSection {
   }
   size_t getSize() const override { return relocs.size() * this->entsize; }
   size_t getRelativeRelocCount() const { return numRelativeRelocs; }
-  void mergeRels(Ctx &ctx);
+  void mergeRels();
   void partitionRels();
   void finalizeContents() override;
 
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 67004055f1af5..28b24f90716b8 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2076,7 +2076,7 @@ template  void 
Writer::finalizeSections() {
 // symbol table section (dynSymTab) must be the first one.
 for (Partition &part : ctx.partitions) {
   if (part.relaDyn) {
-part.relaDyn->mergeRels(ctx);
+part.relaDyn->mergeRels();
 // Compute DT_RELACOUNT to be used by part.dynamic.
 part.relaDyn->partitionRels();
 finalizeSynthetic(ctx, part.relaDyn.get());

>From b33113cee03cd0b68b7e8c7e98bfa56dfbe02a46 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne 
Date: Fri, 1 Aug 2025 14:03:45 -0700
Subject: [PATCH 2/2] Fix build

Created using spr 1.3.6-beta.1
---
 lld/ELF/Relocations.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 2573cf5c9fbbf..e2f594f3d2bba 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1175,9 +1175,8 @@ void RelocationScanner::processAux(RelExpr expr, RelType 
type, uint64_t offset,
<< " cannot be used against ifunc symbol '" << &sym << "'";
   printLocation(diag, *sec, sym, offset);
 } else {
-  part.relaDyn->addReloc({ctx.target->iRelativeRel, sec, offset,
-  DynamicReloc::AddendOnlyWithTargetVA, sym,
-  addend, R_ABS});
+  part.relaDyn->addReloc({ctx.target->iRelativeRel, sec, offset, false,
+  sym, addend, R_ABS});
   return;
 }
   }

___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/151647


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/151647


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] CodeGen: Optionally emit PAuth relocations as IRELATIVE relocations. (PR #133533)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133533


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] CodeGen: Optionally emit PAuth relocations as IRELATIVE relocations. (PR #133533)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133533


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add IR and codegen support for deactivation symbols. (PR #133536)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133536


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add IR and codegen support for deactivation symbols. (PR #133536)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133536


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Handle ISD::PTRADD in various special cases (PR #145330)

2025-08-01 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/145330
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/21.x: [clang-format] Google Style: disable DerivePointerAlignment. (#149602) (PR #151797)

2025-08-01 Thread Owen Pan via llvm-branch-commits

https://github.com/owenca approved this pull request.


https://github.com/llvm/llvm-project/pull/151797
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [SDAG][AMDGPU] Allow opting in to OOB-generating PTRADD transforms (PR #146074)

2025-08-01 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/146074
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec created 
https://github.com/llvm/llvm-project/pull/151758

None

>From 0676b6855c6321383f656bb7059d7022c8e951c4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast 

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)


Changes



---

Patch is 50.90 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151758.diff


10 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+9) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+94) 
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+12-2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+9) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+6) 
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+23) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll (+403) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+54) 
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+54) 
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+54) 


```diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT1_ADDR]] to ptr
+// CHECK-NEXT:[[SCALE_ADDR_ASCAS

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

rampitec wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151758?utm_source=stack-comment-downstack-mergeability-warning"
> >on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#151758** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151758?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151758?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#151749** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/151749?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`




This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/151758
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec ready_for_review 
https://github.com/llvm/llvm-project/pull/151758
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf16 implementation to header-only in src/__support/math folder. (PR #151779)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-libc

Author: Muhammad Bassiouni (bassiounix)


Changes

Part of #147386

in preparation for: 
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450

---

Patch is 27.14 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151779.diff


12 Files Affected:

- (modified) libc/shared/math.h (+1) 
- (added) libc/shared/math/atanhf16.h (+28) 
- (modified) libc/src/__support/math/CMakeLists.txt (+15) 
- (added) libc/src/__support/math/atanhf16.h (+234) 
- (modified) libc/src/math/generic/CMakeLists.txt (+1-11) 
- (modified) libc/src/math/generic/atanhf16.cpp (+2-84) 
- (modified) libc/src/math/generic/common_constants.cpp (-78) 
- (modified) libc/src/math/generic/common_constants.h (-8) 
- (modified) libc/src/math/generic/explogxf.h (-43) 
- (modified) libc/test/shared/CMakeLists.txt (+1) 
- (modified) libc/test/shared/shared_math_test.cpp (+1) 
- (modified) utils/bazel/llvm-project-overlay/libc/BUILD.bazel (+22) 


```diff
diff --git a/libc/shared/math.h b/libc/shared/math.h
index ddf219ece8ff1..7fb736b78efa5 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -29,6 +29,7 @@
 #include "math/atanf.h"
 #include "math/atanf16.h"
 #include "math/atanhf.h"
+#include "math/atanhf16.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/atanhf16.h b/libc/shared/math/atanhf16.h
new file mode 100644
index 0..b7b5d77ae98c8
--- /dev/null
+++ b/libc/shared/math/atanhf16.h
@@ -0,0 +1,28 @@
+//===-- Shared atanhf16 function *- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANHF16_H
+#define LLVM_LIBC_SHARED_MATH_ATANHF16_H
+
+#include "shared/libc_common.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/math/atanhf16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanhf16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANHF16_H
diff --git a/libc/src/__support/math/CMakeLists.txt 
b/libc/src/__support/math/CMakeLists.txt
index 500dd9de2c555..9631ab5be7d3b 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -286,6 +286,21 @@ add_header_library(
 libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  atanhf16
+  HDRS
+atanhf16.h
+  DEPENDS
+libc.src.__support.FPUtil.fenv_impl
+libc.src.__support.FPUtil.fp_bits
+libc.src.__support.FPUtil.polyeval
+libc.src.__support.FPUtil.cast
+libc.src.__support.FPUtil.except_value_utils
+libc.src.__support.FPUtil.multiply_add
+libc.src.__support.macros.config
+libc.src.__support.macros.optimization
+)
+
 add_header_library(
   asinf
   HDRS
diff --git a/libc/src/__support/math/atanhf16.h 
b/libc/src/__support/math/atanhf16.h
new file mode 100644
index 0..9146e1e31b815
--- /dev/null
+++ b/libc/src/__support/math/atanhf16.h
@@ -0,0 +1,234 @@
+//===-- Implementation header for atanhf16 --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+namespace atanhf16_internal {
+
+// Lookup table for logf(f) = logf(1 + n*2^(-7)) where n = 0..127,
+// computed and stored as float precision constants.
+// Generated by Sollya with the following commands:
+//   display = hexadecimal;
+//   for n from 0 to 127 do { print(single(1 / (1 + n / 128.0))); };
+static constexpr float ONE_OVER_F_FLOAT[128] = {
+0x1p0f, 0x1.fc07fp-1f,  0x1.f81f82p-1f, 0x1.f4465ap-1f,
+0x1.f07c2p-1f,  0x1.ecc07cp-1f, 0x1.e9131ap-1f, 0x1.e573acp-1f,
+0x1.e1e1e2p-1f, 0x1.de5d6ep-1f, 0x1.dae608p-1f, 0x1.d77b66p-1f,
+0x1.d41d42p-1f, 0x1.d0cb58p-1f, 

[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf16 implementation to header-only in src/__support/math folder. (PR #151779)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix edited 
https://github.com/llvm/llvm-project/pull/151779
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf16 implementation to header-only in src/__support/math folder. (PR #151779)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix ready_for_review 
https://github.com/llvm/llvm-project/pull/151779
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_perm_pk16_* instructions (PR #151773)

2025-08-01 Thread Shilei Tian via llvm-branch-commits

https://github.com/shiltian approved this pull request.


https://github.com/llvm/llvm-project/pull/151773
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From e3bd008acb79a59c3561228e97aedc8d2dfeac00 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_perm_pk16_* instructions (PR #151773)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151773

>From dead6ea1ef30c5dba70e2709faad18293ae3895f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 14:09:42 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_perm_pk16_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  4 ++
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |  2 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 55 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 12 
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  3 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  3 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |  2 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  |  1 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 15 +
 llvm/lib/TargetParser/TargetParser.cpp|  1 +
 .../CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll | 66 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 45 +
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 45 +
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 45 +
 15 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9196f5583e45f5..a2e109b416b9d6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -745,6 +745,10 @@ TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "", 
"nc", "gfx1250-insts,wave
 TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "", "nc", 
"gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", 
"gfx1250-insts,wavefrontsize32")
 
+TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", 
"tensor-cvt-lut-insts")
+TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", 
"tensor-cvt-lut-insts")
+TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", 
"tensor-cvt-lut-insts")
+
 // GFX1250 WMMA builtins
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, 
"V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, 
"V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index df71ead39f48ce..9ae947985e5457 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -108,7 +108,7 @@
 // GFX1153: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
+// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_perm_pk16_* instructions (PR #151773)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151773

>From dead6ea1ef30c5dba70e2709faad18293ae3895f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 14:09:42 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_perm_pk16_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  4 ++
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |  2 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 55 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 12 
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  3 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  3 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |  2 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  |  1 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 15 +
 llvm/lib/TargetParser/TargetParser.cpp|  1 +
 .../CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll | 66 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 45 +
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s| 45 +
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 45 +
 15 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9196f5583e45f5..a2e109b416b9d6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -745,6 +745,10 @@ TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "", 
"nc", "gfx1250-insts,wave
 TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "", "nc", 
"gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", 
"gfx1250-insts,wavefrontsize32")
 
+TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", 
"tensor-cvt-lut-insts")
+TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", 
"tensor-cvt-lut-insts")
+TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", 
"tensor-cvt-lut-insts")
+
 // GFX1250 WMMA builtins
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, 
"V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, 
"V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index df71ead39f48ce..9ae947985e5457 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -108,7 +108,7 @@
 // GFX1153: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
+// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From e3bd008acb79a59c3561228e97aedc8d2dfeac00 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From 984ab04422e121769f5ddb74088b37c6f319aa68 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151765

>From ef7297724fd0898979cbafc71cf56b6bc3305b0b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCA

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From 984ab04422e121769f5ddb74088b37c6f319aa68 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits


@@ -461,6 +465,198 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
   return Changed;
 }
 
+namespace {
+
+enum class PointerEncoding {
+  Rotate,
+  PACCopyable,
+  PACNonCopyable,
+};
+
+bool expandProtectedFieldPtr(Function &Intr) {
+  Module &M = *Intr.getParent();
+
+  std::set<GlobalValue *> DSsToDeactivate;
+  std::set<Instruction *> LoadsStores;
+
+  Type *Int8Ty = Type::getInt8Ty(M.getContext());
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  PointerType *PtrTy = PointerType::get(M.getContext(), 0);
+
+  Function *SignIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_sign, {});
+  Function *AuthIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_auth, {});
+
+  auto *EmuFnTy = FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false);
+  FunctionCallee EmuSignIntr = M.getOrInsertFunction("__emupac_pacda", 
EmuFnTy);
+  FunctionCallee EmuAuthIntr = M.getOrInsertFunction("__emupac_autda", 
EmuFnTy);
+
+  auto CreateSign = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(SignIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuSignIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto CreateAuth = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(AuthIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuAuthIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto GetDeactivationSymbol = [&](CallInst *Call) -> GlobalValue * {
+if (auto Bundle =
+Call->getOperandBundle(LLVMContext::OB_deactivation_symbol))
+  return cast<GlobalValue>(Bundle->Inputs[0]);
+return nullptr;
+  };
+
+  for (User *U : Intr.users()) {
+auto *Call = cast<CallInst>(U);
+auto *DS = GetDeactivationSymbol(Call);
+std::set<PHINode *> VisitedPhis;
+
+std::function<void(Instruction *)> FindLoadsStores;
+FindLoadsStores = [&](Instruction *I) {
+  for (Use &U : I->uses()) {
+if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
+  if (isa<PointerType>(LI->getType())) {
+LoadsStores.insert(LI);
+continue;
+  }
+}
+if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
+  if (U.getOperandNo() == 1 &&
+  isa<PointerType>(SI->getValueOperand()->getType())) {
+LoadsStores.insert(SI);
+continue;
+  }
+}
+if (auto *P = dyn_cast<PHINode>(U.getUser())) {
+  if (VisitedPhis.insert(P).second)
+FindLoadsStores(P);
+  continue;
+}
+// Comparisons against null cannot be used to recover the original
+// pointer so we allow them.
+if (auto *CI = dyn_cast<ICmpInst>(U.getUser())) {

pcc wrote:

Also noticed that this is missing a test, added.

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits


@@ -37,6 +39,8 @@
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 #include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
 
+#include 

pcc wrote:

Done

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits


@@ -461,6 +465,198 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
   return Changed;
 }
 
+namespace {
+
+enum class PointerEncoding {
+  Rotate,
+  PACCopyable,
+  PACNonCopyable,
+};
+
+bool expandProtectedFieldPtr(Function &Intr) {
+  Module &M = *Intr.getParent();
+
+  std::set DSsToDeactivate;
+  std::set LoadsStores;
+
+  Type *Int8Ty = Type::getInt8Ty(M.getContext());
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  PointerType *PtrTy = PointerType::get(M.getContext(), 0);
+
+  Function *SignIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_sign, {});
+  Function *AuthIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_auth, {});
+
+  auto *EmuFnTy = FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false);
+  FunctionCallee EmuSignIntr = M.getOrInsertFunction("__emupac_pacda", 
EmuFnTy);
+  FunctionCallee EmuAuthIntr = M.getOrInsertFunction("__emupac_autda", 
EmuFnTy);
+
+  auto CreateSign = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(SignIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuSignIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto CreateAuth = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(AuthIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuAuthIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto GetDeactivationSymbol = [&](CallInst *Call) -> GlobalValue * {
+if (auto Bundle =
+Call->getOperandBundle(LLVMContext::OB_deactivation_symbol))
+  return cast(Bundle->Inputs[0]);
+return nullptr;
+  };
+
+  for (User *U : Intr.users()) {
+auto *Call = cast(U);
+auto *DS = GetDeactivationSymbol(Call);
+std::set VisitedPhis;
+
+std::function FindLoadsStores;

pcc wrote:

Was able to remove the recursion after removing phi handling, so this is just 
inline now.

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits


@@ -0,0 +1,46 @@
+; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck 
--check-prefixes=CHECK,NOPAUTH %s
+; RUN: opt -passes=pre-isel-intrinsic-lowering -mattr=+pauth -S < %s | 
FileCheck --check-prefixes=CHECK,PAUTH %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: @ds1 = external global i8
+@ds1 = external global i8
+; CHECK: @ds2 = external global i8
+@ds2 = external global i8
+; CHECK: @ds3 = hidden alias i8, inttoptr (i64 3573751839 to ptr)
+@ds3 = external global i8
+
+; CHECK: define ptr @f1
+define ptr @f1(ptr %ptrptr) {
+  ; CHECK: %ptr = load ptr, ptr %ptrptr, align 8
+  ; CHECK: %1 = ptrtoint ptr %ptr to i64
+  ; NOPAUTH: %2 = call i64 @__emupac_autda(i64 %1, i64 1) [ 
"deactivation-symbol"(ptr @ds1) ]
+  ; PAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 1) [ 
"deactivation-symbol"(ptr @ds1) ]
+  ; CHECK: %3 = inttoptr i64 %2 to ptr
+  ; CHECK: ret ptr %3
+  %protptrptr = call ptr @llvm.protected.field.ptr(ptr %ptrptr, i64 1, i1 
true) [ "deactivation-symbol"(ptr @ds1) ]

pcc wrote:

Added

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits


@@ -31161,3 +31161,57 @@ This intrinsic is assumed to execute in the default 
:ref:`floating-point
 environment <floatenv>` *except* for the rounding mode.
 This intrinsic is not supported on all targets. Some targets may not support
 all rounding modes.
+
+'``llvm.protected.field.ptr``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+  declare ptr @llvm.protected.field.ptr(ptr ptr, i64 disc, i1 
use_hw_encoding)
+
+Overview:
+"""""""""
+
+The '``llvm.protected.field.ptr``' intrinsic returns a pointer to the
+storage location of a pointer that has special properties as described
+below.
+
+Arguments:
+""""""""""
+
+The first argument is the pointer specifying the location to store the
+pointer. The second argument is the discriminator, which is used as an
+input for the pointer encoding. The third argument specifies whether to
+use a target-specific mechanism to encode the pointer.
+
+Semantics:
+""""""""""
+
+This intrinsic returns a pointer which may be used to store a
+pointer at the specified address that is encoded using the specified
+discriminator. Stores via the pointer will cause the stored pointer to be
+blended with the second argument before being stored. The blend operation
+shall be either a weak but cheap and target-independent operation (if
+the third argument is 0) or a stronger target-specific operation (if the
+third argument is 1). When loading from the pointer, the inverse operation
+is done on the loaded pointer after it is loaded. Specifically, when the
+third argument is 1, the pointer is signed (using pointer authentication
+instructions or emulated PAC if not supported by the hardware) using
+the struct address before being stored, and authenticated after being
+loaded. Note that it is currently unsupported to have the third argument
+be 1 on targets other than AArch64. When the third argument is 0, it is
+rotated left by 16 bits and the discriminator is subtracted before being
+stored, and the discriminator is added and the pointer is rotated right
+by 16 bits after being loaded.
+
+If the pointer is used otherwise than for loading or storing (e.g. its

pcc wrote:

Done

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits




pcc wrote:

I tried it, but it wanted to delete the checks for the deactivation symbols. Do 
you know if there is a way to prevent this?
```
diff --git 
a/llvm/test/Transforms/PreISelIntrinsicLowering/protected-field-pointer.ll 
b/llvm/test/Transforms/PreISelIntrinsicLowering/protected-field-pointer.ll
index cb7e695bfd12..0b17b544e4d8 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/protected-field-pointer.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/protected-field-pointer.ll
@@ -1,89 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
 ; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck 
--check-prefixes=CHECK,NOPAUTH %s
 ; RUN: opt -passes=pre-isel-intrinsic-lowering -mattr=+pauth -S < %s | 
FileCheck --check-prefixes=CHECK,PAUTH %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; CHECK: @ds1 = external global i8
 @ds1 = external global i8
-; CHECK: @ds2 = external global i8
 @ds2 = external global i8
-; CHECK: @ds3 = external global i8
 @ds3 = external global i8
-; CHECK: @ds4 = external global i8
 @ds4 = external global i8
-; CHECK: @ds5 = external global i8
 @ds5 = external global i8
-; CHECK: @ds6 = hidden alias i8, inttoptr (i64 3573751839 to ptr)
 @ds6 = external global i8
 
```

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits


@@ -461,6 +465,198 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
   return Changed;
 }
 
+namespace {
+
+enum class PointerEncoding {
+  Rotate,
+  PACCopyable,
+  PACNonCopyable,
+};
+
+bool expandProtectedFieldPtr(Function &Intr) {
+  Module &M = *Intr.getParent();
+
+  std::set DSsToDeactivate;
+  std::set LoadsStores;
+
+  Type *Int8Ty = Type::getInt8Ty(M.getContext());
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  PointerType *PtrTy = PointerType::get(M.getContext(), 0);
+
+  Function *SignIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_sign, {});
+  Function *AuthIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_auth, {});
+
+  auto *EmuFnTy = FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false);
+  FunctionCallee EmuSignIntr = M.getOrInsertFunction("__emupac_pacda", 
EmuFnTy);
+  FunctionCallee EmuAuthIntr = M.getOrInsertFunction("__emupac_autda", 
EmuFnTy);
+
+  auto CreateSign = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(SignIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuSignIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto CreateAuth = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(AuthIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuAuthIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto GetDeactivationSymbol = [&](CallInst *Call) -> GlobalValue * {
+if (auto Bundle =
+Call->getOperandBundle(LLVMContext::OB_deactivation_symbol))
+  return cast(Bundle->Inputs[0]);
+return nullptr;
+  };
+
+  for (User *U : Intr.users()) {
+auto *Call = cast(U);
+auto *DS = GetDeactivationSymbol(Call);
+std::set VisitedPhis;
+
+std::function FindLoadsStores;
+FindLoadsStores = [&](Instruction *I) {
+  for (Use &U : I->uses()) {
+if (auto *LI = dyn_cast(U.getUser())) {
+  if (isa(LI->getType())) {
+LoadsStores.insert(LI);
+continue;
+  }
+}
+if (auto *SI = dyn_cast(U.getUser())) {
+  if (U.getOperandNo() == 1 &&
+  isa(SI->getValueOperand()->getType())) {
+LoadsStores.insert(SI);
+continue;
+  }
+}
+if (auto *P = dyn_cast(U.getUser())) {

pcc wrote:

Right. I checked whether this phi handling is actually necessary given #151649 
(by building Fleetbench with/without the phi code removed and checking the 
number of defined deactivation symbols), and it turned out not to be, so I 
removed it, which allowed the code to be simplified significantly.

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add deactivation symbol operand to ConstantPtrAuth. (PR #133537)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133537


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add deactivation symbol operand to ConstantPtrAuth. (PR #133537)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/133537


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] AlwaysInliner: A new inlining algorithm to interleave alloca promotion with inlines. (PR #145613)

2025-08-01 Thread Amara Emerson via llvm-branch-commits




aemerson wrote:

Sure, but it's not the real motivating case; it's an example of how the 
different phase ordering can trigger issues. What about the actual inliner 
change proposed here?

https://github.com/llvm/llvm-project/pull/145613
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf implementation to header-only in src/__support/math folder. (PR #151399)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix updated 
https://github.com/llvm/llvm-project/pull/151399

>From 4c79b81484f25c99e8e840d5b994264e48a8b962 Mon Sep 17 00:00:00 2001
From: bassiounix 
Date: Thu, 31 Jul 2025 00:41:13 +0300
Subject: [PATCH] [libc][math] Refactor atanhf implementation to header-only in
 src/__support/math folder.

---
 libc/shared/math.h|  1 +
 libc/shared/math/atanhf.h | 23 ++
 libc/src/__support/math/CMakeLists.txt| 11 +++
 libc/src/__support/math/atanhf.h  | 76 +++
 libc/src/math/generic/CMakeLists.txt  |  5 +-
 libc/src/math/generic/atanhf.cpp  | 56 +-
 libc/test/shared/CMakeLists.txt   |  1 +
 libc/test/shared/shared_math_test.cpp |  1 +
 .../llvm-project-overlay/libc/BUILD.bazel | 20 +++--
 9 files changed, 129 insertions(+), 65 deletions(-)
 create mode 100644 libc/shared/math/atanhf.h
 create mode 100644 libc/src/__support/math/atanhf.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
index 6cb583c08dedd..ddf219ece8ff1 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -28,6 +28,7 @@
 #include "math/atan2f128.h"
 #include "math/atanf.h"
 #include "math/atanf16.h"
+#include "math/atanhf.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/atanhf.h b/libc/shared/math/atanhf.h
new file mode 100644
index 0..763fb3e00a659
--- /dev/null
+++ b/libc/shared/math/atanhf.h
@@ -0,0 +1,23 @@
+//===-- Shared atanhf function --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANHF_H
+#define LLVM_LIBC_SHARED_MATH_ATANHF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/atanhf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanhf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANHF_H
diff --git a/libc/src/__support/math/CMakeLists.txt 
b/libc/src/__support/math/CMakeLists.txt
index caafdc2cbf1d6..500dd9de2c555 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -275,6 +275,17 @@ add_header_library(
 libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  atanhf
+  HDRS
+atanhf.h
+  DEPENDS
+.acoshf_utils
+libc.src.__support.FPUtil.fp_bits
+libc.src.__support.FPUtil.fenv_impl
+libc.src.__support.macros.optimization
+)
+
 add_header_library(
   asinf
   HDRS
diff --git a/libc/src/__support/math/atanhf.h b/libc/src/__support/math/atanhf.h
new file mode 100644
index 0..b3ee5bbb4d408
--- /dev/null
+++ b/libc/src/__support/math/atanhf.h
@@ -0,0 +1,76 @@
+//===-- Implementation header for atanhf *- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float atanhf(float x) {
+  using namespace acoshf_internal;
+  using FPBits = typename fputil::FPBits<float>;
+
+  FPBits xbits(x);
+  Sign sign = xbits.sign();
+  uint32_t x_abs = xbits.abs().uintval();
+
+  // |x| >= 1.0
+  if (LIBC_UNLIKELY(x_abs >= 0x3F80'0000U)) {
+if (xbits.is_nan()) {
+  if (xbits.is_signaling_nan()) {
+fputil::raise_except_if_required(FE_INVALID);
+return FPBits::quiet_nan().get_val();
+  }
+  return x;
+}
+// |x| == 1.0
+if (x_abs == 0x3F80'0000U) {
+  fputil::set_errno_if_required(ERANGE);
+  fputil::raise_except_if_required(FE_DIVBYZERO);
+  return FPBits::inf(sign).get_val();
+} else {
+  fputil::set_errno_if_required(EDOM);
+  fputil::raise_except_if_required(FE_INVALID);
+  return FPBits::quiet_nan().get_val();
+}
+  }
+
+  // |x| < ~0.10
+  if (LIBC_UNLIKELY(x_abs <= 0x3dcc'0000U)) {
+// |x| <= 2^-26
+if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
+  return static_cast<float>(LIBC_UNLIKELY(x_abs == 0)
+? x
+: (x + 0x1.5555555555555p-2 * x * x * x));
+}
+
+double xdbl = x;
+double x2 

[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf implementation to header-only in src/__support/math folder. (PR #151399)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix updated 
https://github.com/llvm/llvm-project/pull/151399

>From 4c79b81484f25c99e8e840d5b994264e48a8b962 Mon Sep 17 00:00:00 2001
From: bassiounix 
Date: Thu, 31 Jul 2025 00:41:13 +0300
Subject: [PATCH] [libc][math] Refactor atanhf implementation to header-only in
 src/__support/math folder.

---
 libc/shared/math.h|  1 +
 libc/shared/math/atanhf.h | 23 ++
 libc/src/__support/math/CMakeLists.txt| 11 +++
 libc/src/__support/math/atanhf.h  | 76 +++
 libc/src/math/generic/CMakeLists.txt  |  5 +-
 libc/src/math/generic/atanhf.cpp  | 56 +-
 libc/test/shared/CMakeLists.txt   |  1 +
 libc/test/shared/shared_math_test.cpp |  1 +
 .../llvm-project-overlay/libc/BUILD.bazel | 20 +++--
 9 files changed, 129 insertions(+), 65 deletions(-)
 create mode 100644 libc/shared/math/atanhf.h
 create mode 100644 libc/src/__support/math/atanhf.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
index 6cb583c08dedd..ddf219ece8ff1 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -28,6 +28,7 @@
 #include "math/atan2f128.h"
 #include "math/atanf.h"
 #include "math/atanf16.h"
+#include "math/atanhf.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/atanhf.h b/libc/shared/math/atanhf.h
new file mode 100644
index 0..763fb3e00a659
--- /dev/null
+++ b/libc/shared/math/atanhf.h
@@ -0,0 +1,23 @@
+//===-- Shared atanhf function --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANHF_H
+#define LLVM_LIBC_SHARED_MATH_ATANHF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/atanhf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanhf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANHF_H
diff --git a/libc/src/__support/math/CMakeLists.txt 
b/libc/src/__support/math/CMakeLists.txt
index caafdc2cbf1d6..500dd9de2c555 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -275,6 +275,17 @@ add_header_library(
 libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  atanhf
+  HDRS
+atanhf.h
+  DEPENDS
+.acoshf_utils
+libc.src.__support.FPUtil.fp_bits
+libc.src.__support.FPUtil.fenv_impl
+libc.src.__support.macros.optimization
+)
+
 add_header_library(
   asinf
   HDRS
diff --git a/libc/src/__support/math/atanhf.h b/libc/src/__support/math/atanhf.h
new file mode 100644
index 0..b3ee5bbb4d408
--- /dev/null
+++ b/libc/src/__support/math/atanhf.h
@@ -0,0 +1,76 @@
+//===-- Implementation header for atanhf *- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float atanhf(float x) {
+  using namespace acoshf_internal;
+  using FPBits = typename fputil::FPBits<float>;
+
+  FPBits xbits(x);
+  Sign sign = xbits.sign();
+  uint32_t x_abs = xbits.abs().uintval();
+
+  // |x| >= 1.0
+  if (LIBC_UNLIKELY(x_abs >= 0x3F80'0000U)) {
+if (xbits.is_nan()) {
+  if (xbits.is_signaling_nan()) {
+fputil::raise_except_if_required(FE_INVALID);
+return FPBits::quiet_nan().get_val();
+  }
+  return x;
+}
+// |x| == 1.0
+if (x_abs == 0x3F80'0000U) {
+  fputil::set_errno_if_required(ERANGE);
+  fputil::raise_except_if_required(FE_DIVBYZERO);
+  return FPBits::inf(sign).get_val();
+} else {
+  fputil::set_errno_if_required(EDOM);
+  fputil::raise_except_if_required(FE_INVALID);
+  return FPBits::quiet_nan().get_val();
+}
+  }
+
+  // |x| < ~0.10
+  if (LIBC_UNLIKELY(x_abs <= 0x3dcc'0000U)) {
+// |x| <= 2^-26
+if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
+  return static_cast<float>(LIBC_UNLIKELY(x_abs == 0)
+? x
+: (x + 0x1.5555555555555p-2 * x * x * x));
+}
+
+double xdbl = x;
+double x2 

[llvm-branch-commits] [clang] release/21.x: [clang-format] Google Style: disable DerivePointerAlignment. (#149602) (PR #151797)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot milestoned 
https://github.com/llvm/llvm-project/pull/151797
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/21.x: [clang-format] Google Style: disable DerivePointerAlignment. (#149602) (PR #151797)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/151797

Backport 9281797a577b7954521fb9192d41e457ca2ca42e

Requested by: @owenca

>From f9fde763bd2b3faf2814346856378cdeb235506e Mon Sep 17 00:00:00 2001
From: James Y Knight 
Date: Fri, 25 Jul 2025 11:55:50 -0400
Subject: [PATCH] [clang-format] Google Style: disable DerivePointerAlignment.
 (#149602)

The [Google C++ Style
Guide](https://google.github.io/styleguide/cppguide.html#Pointer_and_Reference_Expressions)
is being changed to specify that spaces should go after the
asterisk/ampersand, rather than permitting either before or after on a
file-by-file basis.

The new requirement is:
> When referring to a pointer or reference (variable declarations or
> definitions, arguments, return types, template parameters, etc.),
> you must not place a space before the asterisk/ampersand. Use a
> space to separate the type from the declared name (if present).

The [Google ObjC
style](https://google.github.io/styleguide/objcguide.html) is silent on
this matter, but the de-facto style is not being modified at this time.
So, keep DerivePointerAlignment enabled for ObjC language mode.

(cherry picked from commit 9281797a577b7954521fb9192d41e457ca2ca42e)
---
 clang/lib/Format/Format.cpp   |  2 +-
 clang/unittests/Format/FormatTest.cpp | 22 +-
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 0e92c93ea1dde..513fcfcd41258 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -1749,7 +1749,6 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind 
Language) {
   GoogleStyle.AttributeMacros.push_back("absl_nullable");
   GoogleStyle.AttributeMacros.push_back("absl_nullability_unknown");
   GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
-  GoogleStyle.DerivePointerAlignment = true;
   GoogleStyle.IncludeStyle.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   GoogleStyle.IncludeStyle.IncludeCategories = {{"^", 2, 0, false},
 {"^<.*\\.h>", 1, 0, false},
@@ -1858,6 +1857,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind 
Language) {
   } else if (Language == FormatStyle::LK_ObjC) {
 GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
 GoogleStyle.ColumnLimit = 100;
+GoogleStyle.DerivePointerAlignment = true;
 // "Regroup" doesn't work well for ObjC yet (main header heuristic,
 // relationship between ObjC standard library headers and other heades,
 // #imports, etc.)
diff --git a/clang/unittests/Format/FormatTest.cpp 
b/clang/unittests/Format/FormatTest.cpp
index f9eedcb8f12af..e161f61e8864e 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -8571,10 +8571,10 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
"operator<<(const SomeLogType 
&other);");
   verifyGoogleFormat(
   "SomeLoogType operator>>(\n"
-  "const SomeLogType &a, const SomeLogType &b);");
+  "const SomeLogType& a, const SomeLogType& b);");
   verifyGoogleFormat(
   "SomeLoogType operator<<(\n"
-  "const SomeLogType &a, const SomeLogType &b);");
+  "const SomeLogType& a, const SomeLogType& b);");
 
   verifyFormat("void (\n"
"int aaa = 1);");
@@ -8583,7 +8583,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyGoogleFormat(
   "typename aa::aaa\n"
   "aa::aaa(\n"
-  "bool *aa, bool *aa) {}");
+  "bool* aa, bool* aa) {}");
   verifyGoogleFormat("template \n"
  "aa\n"
  "aaa::a(\n"
@@ -12891,27 +12891,31 @@ TEST_F(FormatTest, UnderstandsEllipsis) {
 }
 
 TEST_F(FormatTest, AdaptivelyFormatsPointersAndReferences) {
+  auto Style = getGoogleStyle();
+  EXPECT_FALSE(Style.DerivePointerAlignment);
+  Style.DerivePointerAlignment = true;
+
   verifyFormat("int *a;\n"
"int *a;\n"
"int *a;",
"int *a;\n"
"int* a;\n"
"int *a;",
-   getGoogleStyle());
+   Style);
   verifyFormat("int* a;\n"
"int* a;\n"
"int* a;",
"int* a;\n"
"int* a;\n"
"int *a;",
-   getGoogleStyle());
+   Style);
   verifyFormat("int *a;\n"
"int *a;\n"
"int *a;",
"int *a;\n"
"int * a;\n"
"int *  a;",
-   getGoogleStyle());
+   Style)

[llvm-branch-commits] [clang] release/21.x: [clang-format] Google Style: disable DerivePointerAlignment. (#149602) (PR #151797)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-clang-format

Author: None (llvmbot)


Changes

Backport 9281797a577b7954521fb9192d41e457ca2ca42e

Requested by: @owenca

---
Full diff: https://github.com/llvm/llvm-project/pull/151797.diff


2 Files Affected:

- (modified) clang/lib/Format/Format.cpp (+1-1) 
- (modified) clang/unittests/Format/FormatTest.cpp (+13-9) 


```diff
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 0e92c93ea1dde..513fcfcd41258 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -1749,7 +1749,6 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind 
Language) {
   GoogleStyle.AttributeMacros.push_back("absl_nullable");
   GoogleStyle.AttributeMacros.push_back("absl_nullability_unknown");
   GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
-  GoogleStyle.DerivePointerAlignment = true;
   GoogleStyle.IncludeStyle.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   GoogleStyle.IncludeStyle.IncludeCategories = {{"^", 2, 0, false},
 {"^<.*\\.h>", 1, 0, false},
@@ -1858,6 +1857,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind 
Language) {
   } else if (Language == FormatStyle::LK_ObjC) {
 GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
 GoogleStyle.ColumnLimit = 100;
+GoogleStyle.DerivePointerAlignment = true;
 // "Regroup" doesn't work well for ObjC yet (main header heuristic,
 // relationship between ObjC standard library headers and other heades,
 // #imports, etc.)
diff --git a/clang/unittests/Format/FormatTest.cpp 
b/clang/unittests/Format/FormatTest.cpp
index f9eedcb8f12af..e161f61e8864e 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -8571,10 +8571,10 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
"operator<<(const SomeLogType 
&other);");
   verifyGoogleFormat(
   "SomeLoogType operator>>(\n"
-  "const SomeLogType &a, const SomeLogType &b);");
+  "const SomeLogType& a, const SomeLogType& b);");
   verifyGoogleFormat(
   "SomeLoogType operator<<(\n"
-  "const SomeLogType &a, const SomeLogType &b);");
+  "const SomeLogType& a, const SomeLogType& b);");
 
   verifyFormat("void (\n"
"int aaa = 1);");
@@ -8583,7 +8583,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyGoogleFormat(
   "typename aa::aaa\n"
   "aa::aaa(\n"
-  "bool *aa, bool *aa) {}");
+  "bool* aa, bool* aa) {}");
   verifyGoogleFormat("template \n"
  "aa\n"
  "aaa::a(\n"
@@ -12891,27 +12891,31 @@ TEST_F(FormatTest, UnderstandsEllipsis) {
 }
 
 TEST_F(FormatTest, AdaptivelyFormatsPointersAndReferences) {
+  auto Style = getGoogleStyle();
+  EXPECT_FALSE(Style.DerivePointerAlignment);
+  Style.DerivePointerAlignment = true;
+
   verifyFormat("int *a;\n"
"int *a;\n"
"int *a;",
"int *a;\n"
"int* a;\n"
"int *a;",
-   getGoogleStyle());
+   Style);
   verifyFormat("int* a;\n"
"int* a;\n"
"int* a;",
"int* a;\n"
"int* a;\n"
"int *a;",
-   getGoogleStyle());
+   Style);
   verifyFormat("int *a;\n"
"int *a;\n"
"int *a;",
"int *a;\n"
"int * a;\n"
"int *  a;",
-   getGoogleStyle());
+   Style);
   verifyFormat("auto x = [] {\n"
"  int *a;\n"
"  int *a;\n"
@@ -12920,7 +12924,7 @@ TEST_F(FormatTest, 
AdaptivelyFormatsPointersAndReferences) {
"auto x=[]{int *a;\n"
"int * a;\n"
"int *  a;};",
-   getGoogleStyle());
+   Style);
 }
 
 TEST_F(FormatTest, UnderstandsRvalueReferences) {
@@ -13056,7 +13060,7 @@ TEST_F(FormatTest, FormatsCasts) {
   verifyFormat("virtual void foo(char &) const;");
   verifyFormat("virtual void foo(int *a, char *) const;");
   verifyFormat("int a = sizeof(int *) + b;");
-  verifyGoogleFormat("int a = alignof(int *) + b;");
+  verifyGoogleFormat("int a = alignof(int*) + b;");
   verifyFormat("bool b = f(g) && c;");
   verifyFormat("typedef void (*f)(int i) func;");
   verifyFormat("void operator++(int) noexcept;");
@@ -25425,7 +25429,7 @@ TEST_F(FormatTest, AtomicQualifier) {
   verifyFormat("struct foo {\n"
"  int a1;\n"
"  _Atomic(a) a2;\n"
- 

[llvm-branch-commits] [clang] release/21.x: [clang-format] Google Style: disable DerivePointerAlignment. (#149602) (PR #151797)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:

@AaronBallman What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/151797
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From 0c185e2740d4b61e0c29354d3a46192847effc5f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec updated 
https://github.com/llvm/llvm-project/pull/151758

>From 0c185e2740d4b61e0c29354d3a46192847effc5f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |  94 
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  14 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  23 +
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll| 403 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 718 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, 
"V8yUiUiIUi", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global 
bfloat8 *outy8, uint2 src2,
   *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF16_ADDR]] to ptr
+// CHECK-NEXT:[[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr ad

[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions (PR #151765)

2025-08-01 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec created 
https://github.com/llvm/llvm-project/pull/151765

None

>From b7f9023286f26c807a3363d4188e237359c5e213 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin 
Date: Fri, 1 Aug 2025 13:10:57 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 106 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  19 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |   6 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  21 +
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s |  54 +++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s|  54 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt |  54 +++
 10 files changed, 712 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e117e993fc572..9196f5583e45f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, 
"V2UiV8ff", "nc", "gfx
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", 
"nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", 
"gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", 
"nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", 
"fp8e5m3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 150c6ce0b76ee..177df6c1e555a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 
srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-NEXT:[[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-NEXT:[[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, 
addrspace(5)
+// CHECK-NEXT:[[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:[[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:[[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:[[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT2_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH8_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCF8_ADDR]] to ptr
+// CHECK-NEXT:[[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT3_ADDR]] to ptr
+// CHECK-NEXT:[[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRCH16_ADDR]] to ptr
+// CHECK-NEXT:[[SRCF16_ADD

[llvm-branch-commits] [clang] release/21.x: [clang] Avoid inheriting [[noreturn]] in explicit function template specializations (#150003) (PR #151752)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/151752

Backport 22fef005225b129d73ade4ed995fc0ec0c7be044

Requested by: @mstorsjo

>From eab41145d4ed4b2ce95708559c178fc629c9655b Mon Sep 17 00:00:00 2001
From: Samarth Narang <[email protected]>
Date: Wed, 23 Jul 2025 21:04:05 -0400
Subject: [PATCH] [clang] Avoid inheriting [[noreturn]] in explicit function
 template specializations (#150003)

This patch fixes incorrect behavior in Clang where [[noreturn]] (either
spelled or inferred) was being inherited by explicit specializations of
function templates or member function templates, even when those
specializations returned normally.

Follow up on https://github.com/llvm/llvm-project/pull/145166

(cherry picked from commit 22fef005225b129d73ade4ed995fc0ec0c7be044)
---
 clang/lib/Sema/SemaDecl.cpp  |  8 
 clang/lib/Sema/SemaDeclAttr.cpp  |  7 +++
 clang/test/SemaCXX/wreturn-always-throws.cpp | 21 +++-
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 14403e65e8f42..bb412ef6788e7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3267,6 +3267,14 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old,
 if (isa(I) || isa(I))
   continue;
 
+if (isa(I)) {
+  if (auto *FD = dyn_cast(New)) {
+if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+  continue; // Don't propagate inferred noreturn attributes to explicit
+// specializations.
+  }
+}
+
 if (mergeDeclAttribute(*this, New, I, LocalAMK))
   foundAny = true;
   }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index eff5f9568236a..a7897bdfe6e0f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1970,6 +1970,13 @@ void clang::inferNoReturnAttr(Sema &S, const Decl *D) {
   if (!FD)
 return;
 
+  // Skip explicit specializations here as they may have
+  // a user-provided definition that may deliberately differ from the primary
+  // template. If an explicit specialization truly never returns, the user
+  // should explicitly mark it with [[noreturn]].
+  if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+return;
+
   auto *NonConstFD = const_cast(FD);
   DiagnosticsEngine &Diags = S.getDiagnostics();
   if (Diags.isIgnored(diag::warn_falloff_nonvoid, FD->getLocation()) &&
diff --git a/clang/test/SemaCXX/wreturn-always-throws.cpp 
b/clang/test/SemaCXX/wreturn-always-throws.cpp
index addcadd1183dc..df7689f7063cc 100644
--- a/clang/test/SemaCXX/wreturn-always-throws.cpp
+++ b/clang/test/SemaCXX/wreturn-always-throws.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type 
-verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type 
-Winvalid-noreturn -verify %s
 // expected-no-diagnostics
 
 namespace std {
@@ -44,3 +44,22 @@ void testTemplates() {
   throwErrorTemplate("ERROR");
   (void)ensureZeroTemplate(42);
 }
+
+// Ensure that explicit specialization of a member function does not inherit
+// the warning from the primary template.
+
+template
+struct S {
+  void f();
+  void g();
+};
+
+template
+void S::f() { throw 0; } 
+template<>
+void S::f() {}
+
+template 
+void S::g() {}  
+template<> 
+void S::g() { throw 0; }

___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/21.x: [clang] Avoid inheriting [[noreturn]] in explicit function template specializations (#150003) (PR #151752)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot milestoned 
https://github.com/llvm/llvm-project/pull/151752
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] release/21.x: [clang] Avoid inheriting [[noreturn]] in explicit function template specializations (#150003) (PR #151752)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:

@alexfh What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/151752
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf implementation to header-only in src/__support/math folder. (PR #151399)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix updated 
https://github.com/llvm/llvm-project/pull/151399

>From 909f8e78ebe538dc929bbfa2d80c7e79df6a0194 Mon Sep 17 00:00:00 2001
From: bassiounix 
Date: Thu, 31 Jul 2025 00:41:13 +0300
Subject: [PATCH] [libc][math] Refactor atanhf implementation to header-only in
 src/__support/math folder.

---
 libc/shared/math.h|  1 +
 libc/shared/math/atanhf.h | 23 ++
 libc/src/__support/math/CMakeLists.txt| 11 +++
 libc/src/__support/math/atanhf.h  | 76 +++
 libc/src/math/generic/CMakeLists.txt  |  5 +-
 libc/src/math/generic/atanhf.cpp  | 56 +-
 libc/test/shared/CMakeLists.txt   |  1 +
 libc/test/shared/shared_math_test.cpp |  1 +
 .../llvm-project-overlay/libc/BUILD.bazel | 20 +++--
 9 files changed, 129 insertions(+), 65 deletions(-)
 create mode 100644 libc/shared/math/atanhf.h
 create mode 100644 libc/src/__support/math/atanhf.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
index 6cb583c08dedd..ddf219ece8ff1 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -28,6 +28,7 @@
 #include "math/atan2f128.h"
 #include "math/atanf.h"
 #include "math/atanf16.h"
+#include "math/atanhf.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/atanhf.h b/libc/shared/math/atanhf.h
new file mode 100644
index 0..763fb3e00a659
--- /dev/null
+++ b/libc/shared/math/atanhf.h
@@ -0,0 +1,23 @@
+//===-- Shared atanhf function --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANHF_H
+#define LLVM_LIBC_SHARED_MATH_ATANHF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/atanhf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanhf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANHF_H
diff --git a/libc/src/__support/math/CMakeLists.txt 
b/libc/src/__support/math/CMakeLists.txt
index caafdc2cbf1d6..500dd9de2c555 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -275,6 +275,17 @@ add_header_library(
 libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  atanhf
+  HDRS
+atanhf.h
+  DEPENDS
+.acoshf_utils
+libc.src.__support.FPUtil.fp_bits
+libc.src.__support.FPUtil.fenv_impl
+libc.src.__support.macros.optimization
+)
+
 add_header_library(
   asinf
   HDRS
diff --git a/libc/src/__support/math/atanhf.h b/libc/src/__support/math/atanhf.h
new file mode 100644
index 0..b3ee5bbb4d408
--- /dev/null
+++ b/libc/src/__support/math/atanhf.h
@@ -0,0 +1,76 @@
+//===-- Implementation header for atanhf *- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float atanhf(float x) {
+  using namespace acoshf_internal;
+  using FPBits = typename fputil::FPBits;
+
+  FPBits xbits(x);
+  Sign sign = xbits.sign();
+  uint32_t x_abs = xbits.abs().uintval();
+
+  // |x| >= 1.0
+  if (LIBC_UNLIKELY(x_abs >= 0x3F80'U)) {
+if (xbits.is_nan()) {
+  if (xbits.is_signaling_nan()) {
+fputil::raise_except_if_required(FE_INVALID);
+return FPBits::quiet_nan().get_val();
+  }
+  return x;
+}
+// |x| == 1.0
+if (x_abs == 0x3F80'U) {
+  fputil::set_errno_if_required(ERANGE);
+  fputil::raise_except_if_required(FE_DIVBYZERO);
+  return FPBits::inf(sign).get_val();
+} else {
+  fputil::set_errno_if_required(EDOM);
+  fputil::raise_except_if_required(FE_INVALID);
+  return FPBits::quiet_nan().get_val();
+}
+  }
+
+  // |x| < ~0.10
+  if (LIBC_UNLIKELY(x_abs <= 0x3dcc'U)) {
+// |x| <= 2^-26
+if (LIBC_UNLIKELY(x_abs <= 0x3280'U)) {
+  return static_cast(LIBC_UNLIKELY(x_abs == 0)
+? x
+: (x + 0x1.5p-2 * x * x * x));
+}
+
+double xdbl = x;
+double x2 

[llvm-branch-commits] [libc] [llvm] [libc][math] Refactor atanhf implementation to header-only in src/__support/math folder. (PR #151399)

2025-08-01 Thread Muhammad Bassiouni via llvm-branch-commits

https://github.com/bassiounix updated 
https://github.com/llvm/llvm-project/pull/151399

>From 909f8e78ebe538dc929bbfa2d80c7e79df6a0194 Mon Sep 17 00:00:00 2001
From: bassiounix 
Date: Thu, 31 Jul 2025 00:41:13 +0300
Subject: [PATCH] [libc][math] Refactor atanhf implementation to header-only in
 src/__support/math folder.

---
 libc/shared/math.h|  1 +
 libc/shared/math/atanhf.h | 23 ++
 libc/src/__support/math/CMakeLists.txt| 11 +++
 libc/src/__support/math/atanhf.h  | 76 +++
 libc/src/math/generic/CMakeLists.txt  |  5 +-
 libc/src/math/generic/atanhf.cpp  | 56 +-
 libc/test/shared/CMakeLists.txt   |  1 +
 libc/test/shared/shared_math_test.cpp |  1 +
 .../llvm-project-overlay/libc/BUILD.bazel | 20 +++--
 9 files changed, 129 insertions(+), 65 deletions(-)
 create mode 100644 libc/shared/math/atanhf.h
 create mode 100644 libc/src/__support/math/atanhf.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
index 6cb583c08dedd..ddf219ece8ff1 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -28,6 +28,7 @@
 #include "math/atan2f128.h"
 #include "math/atanf.h"
 #include "math/atanf16.h"
+#include "math/atanhf.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/atanhf.h b/libc/shared/math/atanhf.h
new file mode 100644
index 0..763fb3e00a659
--- /dev/null
+++ b/libc/shared/math/atanhf.h
@@ -0,0 +1,23 @@
+//===-- Shared atanhf function --*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANHF_H
+#define LLVM_LIBC_SHARED_MATH_ATANHF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/atanhf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanhf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANHF_H
diff --git a/libc/src/__support/math/CMakeLists.txt 
b/libc/src/__support/math/CMakeLists.txt
index caafdc2cbf1d6..500dd9de2c555 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -275,6 +275,17 @@ add_header_library(
 libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  atanhf
+  HDRS
+atanhf.h
+  DEPENDS
+.acoshf_utils
+libc.src.__support.FPUtil.fp_bits
+libc.src.__support.FPUtil.fenv_impl
+libc.src.__support.macros.optimization
+)
+
 add_header_library(
   asinf
   HDRS
diff --git a/libc/src/__support/math/atanhf.h b/libc/src/__support/math/atanhf.h
new file mode 100644
index 0..b3ee5bbb4d408
--- /dev/null
+++ b/libc/src/__support/math/atanhf.h
@@ -0,0 +1,76 @@
+//===-- Implementation header for atanhf *- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANHF_H
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float atanhf(float x) {
+  using namespace acoshf_internal;
+  using FPBits = typename fputil::FPBits;
+
+  FPBits xbits(x);
+  Sign sign = xbits.sign();
+  uint32_t x_abs = xbits.abs().uintval();
+
+  // |x| >= 1.0
+  if (LIBC_UNLIKELY(x_abs >= 0x3F80'U)) {
+if (xbits.is_nan()) {
+  if (xbits.is_signaling_nan()) {
+fputil::raise_except_if_required(FE_INVALID);
+return FPBits::quiet_nan().get_val();
+  }
+  return x;
+}
+// |x| == 1.0
+if (x_abs == 0x3F80'U) {
+  fputil::set_errno_if_required(ERANGE);
+  fputil::raise_except_if_required(FE_DIVBYZERO);
+  return FPBits::inf(sign).get_val();
+} else {
+  fputil::set_errno_if_required(EDOM);
+  fputil::raise_except_if_required(FE_INVALID);
+  return FPBits::quiet_nan().get_val();
+}
+  }
+
+  // |x| < ~0.10
+  if (LIBC_UNLIKELY(x_abs <= 0x3dcc'U)) {
+// |x| <= 2^-26
+if (LIBC_UNLIKELY(x_abs <= 0x3280'U)) {
+  return static_cast(LIBC_UNLIKELY(x_abs == 0)
+? x
+: (x + 0x1.5p-2 * x * x * x));
+}
+
+double xdbl = x;
+double x2 

[llvm-branch-commits] [libc++] Introduce __force_nonstandard_layout base class for pointer field protection (PR #151652)

2025-08-01 Thread Peter Collingbourne via llvm-branch-commits

pcc wrote:

> IIUC, the goal here is for the compiler to be able to apply e.g. pointer 
> authentication on fields of these structs automatically. It can't do so if 
> they are standard layout types, because then users are technically allowed to 
> poke into the binary representation of these types a bit too much. Did I get 
> that right?

That's right. For example, the standard would require that a standard-layout 
`std::unique_ptr<int>` has the same representation as `int *` (assuming the 
obvious implementation), and it was not considered practical to change the 
representation of all pointers for compatibility reasons.

> Let's take for example std::shared_ptr. Before your patch, it is a standard 
> layout class. However, users technically can't take advantage of that, 
> because the only things that they would be allowed to do is inspect the 
> common initial subsequence of std::shared_ptr, use offsetof with it, or cast 
> a std::shared_ptr<...>* to its first member. None of that applies, because 
> all of its members are private (and reserved names anyway). Hence, making it 
> explicitly non-standard-layout should not be a breaking change for users. Do 
> I follow your line of thinking correctly?

If the user knows the layout of std::shared_ptr, I think they could access the 
private fields via the common initial subsequence. In practice it seems highly 
unlikely that code would do this. So this could technically break the source 
level API, but that seems unlikely, and in the millions of lines of internal 
source code that I tested this on, I did not find any practical breakage caused 
by this.

> Are there any other affordances provided by standard layout types that I 
> would have missed? Either in terms of API (i.e. what users can do with a SL 
> type), or ABI (e.g. the calling convention changes for SL types like it does 
> for trivial types)? It's important to have a solid grasp of this before we 
> can consider moving forward with this (and decide how to best do it). If this 
> e.g. breaks the ABI in any way, we'll need to have a completely different 
> conversation.

Theoretically a change to standard-layout-ness for the standard types could 
break some ABI somewhere (the x86_64 and AArch64 psABIs don't mention standard 
layout, but maybe some other psABI does, and at least I suppose that some code 
somewhere could be doing things like making ABI-breaking decisions based on 
`std::is_standard_layout`), but this break only happens if the user opts into 
PFP (note that `_LIBCPP_MAYBE_FORCE_NONSTANDARD_LAYOUT` expands to nothing if 
PFP is disabled) so there will be no break for existing users.

> Also, while we're talking about ABI, I assume that in your use case (which I 
> perhaps naively imagine to be applying ptrauth to struct fields), I guess 
> you'd be taking an ABI break to enable that new compiler feature, right?

That's right, and I'd like to take advantage of the ABI break to make some 
things work better for PFP (like this standard-layout change) when it is 
enabled. PFP is still experimental, so we may consider making further changes 
to the libc++ ABI when it is enabled.

> I do agree with what @philnik777 said in that other review -- I think we need 
> additional context and a RFC to understand what's your underlying plan, what 
> the constraints are and evaluate what is the impact to libc++.

Sure. I already have 
https://discourse.llvm.org/t/rfc-structure-protection-a-family-of-uaf-mitigation-techniques/8
 which covers the feature as a whole (the "On standard layout types" section is 
most relevant for this PR) but I could try to write a separate RFC that focuses 
on the libc++ aspects.

https://github.com/llvm/llvm-project/pull/151652
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Nikita Popov via llvm-branch-commits




nikic wrote:

Use update_test_checks.py.

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -461,6 +465,198 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
   return Changed;
 }
 
+namespace {
+
+enum class PointerEncoding {
+  Rotate,
+  PACCopyable,
+  PACNonCopyable,
+};
+
+bool expandProtectedFieldPtr(Function &Intr) {
+  Module &M = *Intr.getParent();
+
+  std::set DSsToDeactivate;
+  std::set LoadsStores;
+
+  Type *Int8Ty = Type::getInt8Ty(M.getContext());
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  PointerType *PtrTy = PointerType::get(M.getContext(), 0);
+
+  Function *SignIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_sign, {});
+  Function *AuthIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_auth, {});
+
+  auto *EmuFnTy = FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false);
+  FunctionCallee EmuSignIntr = M.getOrInsertFunction("__emupac_pacda", 
EmuFnTy);
+  FunctionCallee EmuAuthIntr = M.getOrInsertFunction("__emupac_autda", 
EmuFnTy);
+
+  auto CreateSign = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(SignIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuSignIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto CreateAuth = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(AuthIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuAuthIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto GetDeactivationSymbol = [&](CallInst *Call) -> GlobalValue * {
+if (auto Bundle =
+Call->getOperandBundle(LLVMContext::OB_deactivation_symbol))
+  return cast(Bundle->Inputs[0]);
+return nullptr;
+  };
+
+  for (User *U : Intr.users()) {
+auto *Call = cast(U);
+auto *DS = GetDeactivationSymbol(Call);
+std::set VisitedPhis;
+
+std::function FindLoadsStores;
+FindLoadsStores = [&](Instruction *I) {
+  for (Use &U : I->uses()) {
+if (auto *LI = dyn_cast(U.getUser())) {
+  if (isa(LI->getType())) {
+LoadsStores.insert(LI);
+continue;
+  }
+}
+if (auto *SI = dyn_cast(U.getUser())) {
+  if (U.getOperandNo() == 1 &&
+  isa(SI->getValueOperand()->getType())) {
+LoadsStores.insert(SI);
+continue;
+  }
+}
+if (auto *P = dyn_cast(U.getUser())) {

nikic wrote:

Phi handling not tested?

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -37,6 +39,8 @@
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 #include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
 
+#include 

nikic wrote:

Do not use `std::set` unless you actually need an ordered set. Based on usage, 
you want SmallPtrSet.

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -0,0 +1,46 @@
+; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck 
--check-prefixes=CHECK,NOPAUTH %s
+; RUN: opt -passes=pre-isel-intrinsic-lowering -mattr=+pauth -S < %s | 
FileCheck --check-prefixes=CHECK,PAUTH %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: @ds1 = external global i8
+@ds1 = external global i8
+; CHECK: @ds2 = external global i8
+@ds2 = external global i8
+; CHECK: @ds3 = hidden alias i8, inttoptr (i64 3573751839 to ptr)
+@ds3 = external global i8
+
+; CHECK: define ptr @f1
+define ptr @f1(ptr %ptrptr) {
+  ; CHECK: %ptr = load ptr, ptr %ptrptr, align 8
+  ; CHECK: %1 = ptrtoint ptr %ptr to i64
+  ; NOPAUTH: %2 = call i64 @__emupac_autda(i64 %1, i64 1) [ 
"deactivation-symbol"(ptr @ds1) ]
+  ; PAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 1) [ 
"deactivation-symbol"(ptr @ds1) ]
+  ; CHECK: %3 = inttoptr i64 %2 to ptr
+  ; CHECK: ret ptr %3
+  %protptrptr = call ptr @llvm.protected.field.ptr(ptr %ptrptr, i64 1, i1 
true) [ "deactivation-symbol"(ptr @ds1) ]

nikic wrote:

Missing tests for use_hw=false?

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -461,6 +465,198 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
   return Changed;
 }
 
+namespace {
+
+enum class PointerEncoding {
+  Rotate,
+  PACCopyable,
+  PACNonCopyable,
+};
+
+bool expandProtectedFieldPtr(Function &Intr) {
+  Module &M = *Intr.getParent();
+
+  std::set DSsToDeactivate;
+  std::set LoadsStores;
+
+  Type *Int8Ty = Type::getInt8Ty(M.getContext());
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  PointerType *PtrTy = PointerType::get(M.getContext(), 0);
+
+  Function *SignIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_sign, {});
+  Function *AuthIntr =
+  Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ptrauth_auth, {});
+
+  auto *EmuFnTy = FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false);
+  FunctionCallee EmuSignIntr = M.getOrInsertFunction("__emupac_pacda", 
EmuFnTy);
+  FunctionCallee EmuAuthIntr = M.getOrInsertFunction("__emupac_autda", 
EmuFnTy);
+
+  auto CreateSign = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(SignIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuSignIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto CreateAuth = [&](IRBuilder<> &B, Value *Val, Value *Disc,
+   OperandBundleDef DSBundle) {
+Function *F = B.GetInsertBlock()->getParent();
+Attribute FSAttr = F->getFnAttribute("target-features");
+if (FSAttr.isValid() && FSAttr.getValueAsString().contains("+pauth"))
+  return B.CreateCall(AuthIntr, {Val, B.getInt32(2), Disc}, DSBundle);
+return B.CreateCall(EmuAuthIntr, {Val, Disc}, DSBundle);
+  };
+
+  auto GetDeactivationSymbol = [&](CallInst *Call) -> GlobalValue * {
+if (auto Bundle =
+Call->getOperandBundle(LLVMContext::OB_deactivation_symbol))
+  return cast(Bundle->Inputs[0]);
+return nullptr;
+  };
+
+  for (User *U : Intr.users()) {
+auto *Call = cast(U);
+auto *DS = GetDeactivationSymbol(Call);
+std::set VisitedPhis;
+
+std::function FindLoadsStores;

nikic wrote:

Do not use std::function for recursion. Make this a separate static function 
instead.

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Add llvm.protected.field.ptr intrinsic and pre-ISel lowering. (PR #151647)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -31161,3 +31161,57 @@ This intrinsic is assumed to execute in the default 
:ref:`floating-point
 environment <floatenv>` *except* for the rounding mode.
 This intrinsic is not supported on all targets. Some targets may not support
 all rounding modes.
+
+'``llvm.protected.field.ptr``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+  declare ptr @llvm.protected.field.ptr(ptr ptr, i64 disc, i1 
use_hw_encoding)
+
+Overview:
+"""""""""
+
+The '``llvm.protected.field.ptr``' intrinsic returns a pointer to the
+storage location of a pointer that has special properties as described
+below.
+
+Arguments:
+""""""""""
+
+The first argument is the pointer specifying the location to store the
+pointer. The second argument is the discriminator, which is used as an
+input for the pointer encoding. The third argument specifies whether to
+use a target-specific mechanism to encode the pointer.
+
+Semantics:
+""""""""""
+
+This intrinsic returns a pointer which may be used to store a
+pointer at the specified address that is encoded using the specified
+discriminator. Stores via the pointer will cause the stored pointer to be
+blended with the second argument before being stored. The blend operation
+shall be either a weak but cheap and target-independent operation (if
+the third argument is 0) or a stronger target-specific operation (if the
+third argument is 1). When loading from the pointer, the inverse operation
+is done on the loaded pointer after it is loaded. Specifically, when the
+third argument is 1, the pointer is signed (using pointer authentication
+instructions or emulated PAC if not supported by the hardware) using
+the struct address before being stored, and authenticated after being
+loaded. Note that it is currently unsupported to have the third argument
+be 1 on targets other than AArch64. When the third argument is 0, it is
+rotated left by 16 bits and the discriminator is subtracted before being
+stored, and the discriminator is added and the pointer is rotated right
+by 16 bits after being loaded.
+
+If the pointer is used otherwise than for loading or storing (e.g. its

nikic wrote:

```suggestion
If the pointer is used other than for loading or storing (e.g. its
```

https://github.com/llvm/llvm-project/pull/151647
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/21.x: [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (#150911) (PR #151680)

2025-08-01 Thread Florian Hahn via llvm-branch-commits

https://github.com/fhahn milestoned 
https://github.com/llvm/llvm-project/pull/151680
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/21.x: [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (#150911) (PR #151680)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Florian Hahn (fhahn)


Changes



Back-ports additional tests (eb9febb4a6b0, dc697de12792), refactoring 
(43c9c14577db) and functional change (18f1369297f4) in a single PR.

https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive tail 
duplication for computed-gotos in both pre- and post-regalloc tail duplication.

In some cases, performing tail-duplication too early can lead to worse results, 
especially if we duplicate blocks with a number of phi nodes.

This is causing a ~3% performance regression in some workloads using Python 
3.12.

This patch updates TailDup to delay aggressive tail-duplication for computed 
gotos to after register allocation.

This means we can keep the non-duplicated version for a bit longer throughout 
the backend, which should reduce compile-time as well as allowing a number of 
optimizations and simplifications to trigger before drastically expanding the 
CFG.

For the case in https://github.com/llvm/llvm-project/issues/106846, I get the 
same performance with and without this patch on Skylake.

PR: https://github.com/llvm/llvm-project/pull/150911

---

Patch is 22.42 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151680.diff


5 Files Affected:

- (modified) llvm/include/llvm/CodeGen/MachineBasicBlock.h (+5-4) 
- (modified) llvm/lib/CodeGen/TailDuplicator.cpp (+11-7) 
- (added) llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll (+143) 
- (renamed) llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir (+23-21) 
- (added) llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir (+128) 


``diff
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h 
b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 938d71dd030e8..9e3d9196cc184 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -323,10 +323,11 @@ class MachineBasicBlock
   const MachineFunction *getParent() const { return xParent; }
   MachineFunction *getParent() { return xParent; }
 
-  /// Returns true if the original IR terminator is an `indirectbr`. This
-  /// typically corresponds to a `goto` in C, rather than jump tables.
-  bool terminatorIsComputedGoto() const {
-return back().isIndirectBranch() &&
+  /// Returns true if the original IR terminator is an `indirectbr` with
+  /// successor blocks. This typically corresponds to a `goto` in C, rather 
than
+  /// jump tables.
+  bool terminatorIsComputedGotoWithSuccessors() const {
+return back().isIndirectBranch() && !succ_empty() &&
llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
  return Succ->isIRBlockAddressTaken();
});
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp 
b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57fdc165a..5d720fbbf1c61 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -604,12 +604,21 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   bool HasComputedGoto = false;
   if (!TailBB.empty()) {
 HasIndirectbr = TailBB.back().isIndirectBranch();
-HasComputedGoto = TailBB.terminatorIsComputedGoto();
+HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
   }
 
   if (HasIndirectbr && PreRegAlloc)
 MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits when the block has computed-gotos and running after
+  // register allocation. NB. This basically unfactors computed gotos that were
+  // factored early on in the compilation process to speed up edge based data
+  // flow. If we do not unfactor them again, it can seriously pessimize code
+  // with many computed jumps in the source code, such as interpreters.
+  // Therefore we do not restrict the computed gotos.
+  if (HasComputedGoto && !PreRegAlloc)
+MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
@@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  // NB. This basically unfactors computed gotos that were factored early on in
-  // the compilation process to speed up edge based data flow. If we do not
-  // unfactor them again, it can seriously pessimize code with many computed
-  // jumps in the source code, such as interpreters. Therefore we do not
-  // restrict the computed gotos.
-  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+  if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
   TailBB.succ_size() > TailDupSuccSize) {
 // If TailBB or any of its successors contains a phi, w

[llvm-branch-commits] [llvm] release/21.x: [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (#150911) (PR #151680)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: Florian Hahn (fhahn)


Changes



Back-ports additional tests (eb9febb4a6b0, dc697de12792), refactoring 
(43c9c14577db) and functional change (18f1369297f4) in a single PR.

https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive tail 
duplication for computed-gotos in both pre- and post-regalloc tail duplication.

In some cases, performing tail-duplication too early can lead to worse results, 
especially if we duplicate blocks with a number of phi nodes.

This is causing a ~3% performance regression in some workloads using Python 
3.12.

This patch updates TailDup to delay aggressive tail-duplication for computed 
gotos to after register allocation.

This means we can keep the non-duplicated version for a bit longer throughout 
the backend, which should reduce compile-time as well as allowing a number of 
optimizations and simplifications to trigger before drastically expanding the 
CFG.

For the case in https://github.com/llvm/llvm-project/issues/106846, I get the 
same performance with and without this patch on Skylake.

PR: https://github.com/llvm/llvm-project/pull/150911

---

Patch is 22.42 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/151680.diff


5 Files Affected:

- (modified) llvm/include/llvm/CodeGen/MachineBasicBlock.h (+5-4) 
- (modified) llvm/lib/CodeGen/TailDuplicator.cpp (+11-7) 
- (added) llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll (+143) 
- (renamed) llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir (+23-21) 
- (added) llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir (+128) 


``diff
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h 
b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 938d71dd030e8..9e3d9196cc184 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -323,10 +323,11 @@ class MachineBasicBlock
   const MachineFunction *getParent() const { return xParent; }
   MachineFunction *getParent() { return xParent; }
 
-  /// Returns true if the original IR terminator is an `indirectbr`. This
-  /// typically corresponds to a `goto` in C, rather than jump tables.
-  bool terminatorIsComputedGoto() const {
-return back().isIndirectBranch() &&
+  /// Returns true if the original IR terminator is an `indirectbr` with
+  /// successor blocks. This typically corresponds to a `goto` in C, rather 
than
+  /// jump tables.
+  bool terminatorIsComputedGotoWithSuccessors() const {
+return back().isIndirectBranch() && !succ_empty() &&
llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
  return Succ->isIRBlockAddressTaken();
});
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp 
b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57fdc165a..5d720fbbf1c61 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -604,12 +604,21 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   bool HasComputedGoto = false;
   if (!TailBB.empty()) {
 HasIndirectbr = TailBB.back().isIndirectBranch();
-HasComputedGoto = TailBB.terminatorIsComputedGoto();
+HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
   }
 
   if (HasIndirectbr && PreRegAlloc)
 MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits when the block has computed-gotos and running after
+  // register allocation. NB. This basically unfactors computed gotos that were
+  // factored early on in the compilation process to speed up edge based data
+  // flow. If we do not unfactor them again, it can seriously pessimize code
+  // with many computed jumps in the source code, such as interpreters.
+  // Therefore we do not restrict the computed gotos.
+  if (HasComputedGoto && !PreRegAlloc)
+MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
@@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  // NB. This basically unfactors computed gotos that were factored early on in
-  // the compilation process to speed up edge based data flow. If we do not
-  // unfactor them again, it can seriously pessimize code with many computed
-  // jumps in the source code, such as interpreters. Therefore we do not
-  // restrict the computed gotos.
-  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+  if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
   TailBB.succ_size() > TailDupSuccSize) {
 // If TailBB or any of its successors contains a phi, we ma

[llvm-branch-commits] [llvm] release/21.x: [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (#150911) (PR #151680)

2025-08-01 Thread Florian Hahn via llvm-branch-commits

https://github.com/fhahn created 
https://github.com/llvm/llvm-project/pull/151680



Back-ports additional tests (eb9febb4a6b0, dc697de12792), refactoring 
(43c9c14577db) and functional change (18f1369297f4) in a single PR.

https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive tail 
duplication for computed-gotos in both pre- and post-regalloc tail duplication.

In some cases, performing tail-duplication too early can lead to worse results, 
especially if we duplicate blocks with a number of phi nodes.

This is causing a ~3% performance regression in some workloads using Python 
3.12.

This patch updates TailDup to delay aggressive tail-duplication for computed 
gotos to after register allocation.

This means we can keep the non-duplicated version for a bit longer throughout 
the backend, which should reduce compile-time as well as allowing a number of 
optimizations and simplifications to trigger before drastically expanding the 
CFG.

For the case in https://github.com/llvm/llvm-project/issues/106846, I get the 
same performance with and without this patch on Skylake.

PR: https://github.com/llvm/llvm-project/pull/150911

>From 0a2665fb1b6cd3e0c8a3b70ac941cb07d2f561c3 Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Mon, 28 Jul 2025 09:37:00 +0100
Subject: [PATCH] [TailDup] Delay aggressive computed-goto taildup to after
 RegAlloc. (#150911)

Back-ports additional tests (eb9febb4a6b0, dc697de12792), refactoring
(43c9c14577db) and functional change (18f1369297f4) in a single PR.

https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive
tail duplication for computed-gotos in both pre- and post-regalloc tail
duplication.

In some cases, performing tail-duplication too early can lead to worse
results, especially if we duplicate blocks with a number of phi nodes.

This is causing a ~3% performance regression in some workloads using
Python 3.12.

This patch updates TailDup to delay aggressive tail-duplication for
computed gotos to after register allocation.

This means we can keep the non-duplicated version for a bit longer
throughout the backend, which should reduce compile-time as well as
allowing a number of optimizations and simplifications to trigger before
drastically expanding the CFG.

For the case in https://github.com/llvm/llvm-project/issues/106846, I
get the same performance with and without this patch on Skylake.

PR: https://github.com/llvm/llvm-project/pull/150911
---
 llvm/include/llvm/CodeGen/MachineBasicBlock.h |   9 +-
 llvm/lib/CodeGen/TailDuplicator.cpp   |  18 ++-
 .../AArch64/late-taildup-computed-goto.ll | 143 ++
 ...o.mir => early-tail-dup-computed-goto.mir} |  44 +++---
 .../X86/late-tail-dup-computed-goto.mir   | 128 
 5 files changed, 310 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
 rename llvm/test/CodeGen/X86/{tail-dup-computed-goto.mir => 
early-tail-dup-computed-goto.mir} (93%)
 create mode 100644 llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir

diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h 
b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 938d71dd030e8..9e3d9196cc184 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -323,10 +323,11 @@ class MachineBasicBlock
   const MachineFunction *getParent() const { return xParent; }
   MachineFunction *getParent() { return xParent; }
 
-  /// Returns true if the original IR terminator is an `indirectbr`. This
-  /// typically corresponds to a `goto` in C, rather than jump tables.
-  bool terminatorIsComputedGoto() const {
-return back().isIndirectBranch() &&
+  /// Returns true if the original IR terminator is an `indirectbr` with
+  /// successor blocks. This typically corresponds to a `goto` in C, rather 
than
+  /// jump tables.
+  bool terminatorIsComputedGotoWithSuccessors() const {
+return back().isIndirectBranch() && !succ_empty() &&
llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
  return Succ->isIRBlockAddressTaken();
});
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp 
b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57fdc165a..5d720fbbf1c61 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -604,12 +604,21 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   bool HasComputedGoto = false;
   if (!TailBB.empty()) {
 HasIndirectbr = TailBB.back().isIndirectBranch();
-HasComputedGoto = TailBB.terminatorIsComputedGoto();
+HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
   }
 
   if (HasIndirectbr && PreRegAlloc)
 MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits when the block has computed-gotos and running after
+  // register allocation. NB. This basically unfactors computed gotos that were
+  // factored early on in the compilat

[llvm-branch-commits] [llvm] release/21.x: [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (#150911) (PR #151680)

2025-08-01 Thread Florian Hahn via llvm-branch-commits

fhahn wrote:

For context, this fixes a regression in the Python interpreter on AArch64 
compared to 20.1.0.

https://github.com/llvm/llvm-project/pull/151680
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [OpenMP][clang] 6.0: num_threads strict (part 3: codegen) (PR #146405)

2025-08-01 Thread Alexey Bataev via llvm-branch-commits


@@ -1963,8 +1963,12 @@ void 
OMPClausePrinter::VisitOMPSeverityClause(OMPSeverityClause *Node) {
 }
 
 void OMPClausePrinter::VisitOMPMessageClause(OMPMessageClause *Node) {
-  OS << "message(\""
- << cast(Node->getMessageString())->getString() << "\")";
+  OS << "message(";
+  if (StringLiteral *SL = dyn_cast(Node->getMessageString()))
+OS << "\"" << SL->getString() << "\"";

alexey-bataev wrote:

Why do you need this special processing?

https://github.com/llvm/llvm-project/pull/146405
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [OpenMP][clang] 6.0: num_threads strict (part 3: codegen) (PR #146405)

2025-08-01 Thread Alexey Bataev via llvm-branch-commits


@@ -1260,21 +1260,30 @@ void 
CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
   NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
 
 assert(IfCondVal && "Expected a value");
+RuntimeFunction FnID = OMPRTL___kmpc_parallel_51;
 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-llvm::Value *Args[] = {
-RTLoc,
-getThreadID(CGF, Loc),
-IfCondVal,
-NumThreadsVal,
-llvm::ConstantInt::get(CGF.Int32Ty, -1),
-FnPtr,
-ID,
-Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
-   CGF.VoidPtrPtrTy),
-llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
-CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-CGM.getModule(), OMPRTL___kmpc_parallel_51),
-Args);
+llvm::SmallVector Args(
+{RTLoc, getThreadID(CGF, Loc), IfCondVal, NumThreadsVal,
+ llvm::ConstantInt::get(CGF.Int32Ty, -1), FnPtr, ID,
+ Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
+CGF.VoidPtrPtrTy),
+ llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())});
+if (NumThreadsModifier == OMPC_NUMTHREADS_strict) {
+  FnID = OMPRTL___kmpc_parallel_60;
+  // OpenMP 6.0, 10.4: "If no severity clause is specified then the effect
+  // is as if sev-level is fatal."
+  Args.append(
+  {llvm::ConstantInt::get(CGM.Int32Ty, true),
+   llvm::ConstantInt::get(CGM.Int32Ty,
+  Severity == OMPC_SEVERITY_warning ? 1 : 2)});
+  if (Message)
+
Args.push_back(CGF.EmitStringLiteralLValue(cast(Message))
+   .getPointer(CGF));

alexey-bataev wrote:

This needs to be fixed

https://github.com/llvm/llvm-project/pull/146405
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] [OpenMP][clang] 6.0: num_threads strict (part 3: codegen) (PR #146405)

2025-08-01 Thread Alexey Bataev via llvm-branch-commits


@@ -16445,11 +16450,18 @@ OMPClause *SemaOpenMP::ActOnOpenMPMessageClause(Expr 
*ME,
 SourceLocation LParenLoc,
 SourceLocation EndLoc) {
   assert(ME && "NULL expr in Message clause");
-  if (!isa(ME)) {
+  QualType Type = ME->getType();
+  if ((!Type->isPointerType() && !Type->isArrayType()) ||
+  !Type->getPointeeOrArrayElementType()->isAnyCharacterType()) {
 Diag(ME->getBeginLoc(), diag::warn_clause_expected_string)
 << getOpenMPClauseNameForDiag(OMPC_message);
 return nullptr;
   }
+
+  // Convert array type to pointer type if needed.
+  if (Type->isArrayType())
+ME = SemaRef.DefaultFunctionArrayConversion(ME).get();

alexey-bataev wrote:

I don't think you need this check; there is a similar function, 
`DefaultLvalueConversion`, which will do the required conversion without extra 
checks.

https://github.com/llvm/llvm-project/pull/146405
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] SROA: Recognize llvm.protected.field.ptr intrinsics. (PR #151650)

2025-08-01 Thread Nikita Popov via llvm-branch-commits




nikic wrote:

Use update_test_checks.py.

Please also add a test where the alloca is split but not promoted.

https://github.com/llvm/llvm-project/pull/151650
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] SROA: Recognize llvm.protected.field.ptr intrinsics. (PR #151650)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -0,0 +1,41 @@
+; RUN: opt -passes=sroa -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"

nikic wrote:

Is this triple necessary? If not, drop it, otherwise add REQUIRES.

https://github.com/llvm/llvm-project/pull/151650
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Utils: Inhibit load/store folding through phis for llvm.protected.field.ptr. (PR #151649)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -3846,10 +3846,7 @@ bool llvm::canReplaceOperandWithVariable(const 
Instruction *I, unsigned OpIdx) {
   if (Op->getType()->isMetadataTy())
 return false;
 
-  // swifterror pointers can only be used by a load, store, or as a swifterror
-  // argument; swifterror pointers are not allowed to be used in select or phi
-  // instructions.
-  if (Op->isSwiftError())
+  if (!shouldFoldLoadStoreWithPointerOperandThroughPhi(Op))

nikic wrote:

This is going to apply not just to loads and stores.

https://github.com/llvm/llvm-project/pull/151649
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Utils: Inhibit load/store folding through phis for llvm.protected.field.ptr. (PR #151649)

2025-08-01 Thread Nikita Popov via llvm-branch-commits




nikic wrote:

Use update_test_checks.py.

https://github.com/llvm/llvm-project/pull/151649
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] Utils: Inhibit load/store folding through phis for llvm.protected.field.ptr. (PR #151649)

2025-08-01 Thread Nikita Popov via llvm-branch-commits


@@ -0,0 +1,33 @@
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+; Test that no optimization run at -O2 moves the loads into the exit block,
+; as this causes unnecessary address escapes with pointer field protection.
+
+target triple = "aarch64-unknown-linux-gnu"

nikic wrote:

Needs to either drop the triple (more likely) or add REQUIRES.

https://github.com/llvm/llvm-project/pull/151649
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lldb] [llvm] Backport Wasm Debugging changes to the LLVM 21.x Release (PR #151559)

2025-08-01 Thread David Spickett via llvm-branch-commits

DavidSpickett wrote:

I reviewed a lot of the changes on main but was not involved in writing them or 
the work that they are being written for (Swift compiled to WASM, I believe).

The release note is the best place to start to understand it:
> * LLDB can now set breakpoints, show backtraces, and display variables when
>  debugging Wasm with supported runtimes (WAMR and V8).

We had *some* wasm support before, but as I understand it, it was not usable.
These changes do not amount to full wasm support, but LLDB will now do
something useful with the runtimes mentioned. These are new features, but with
a limited scope.

So for backport risk, there are:
* New tests. The nice thing is these tests are simulated targets so they run 
everywhere. On Linaro's Linux and Windows bots, there have been no problems so 
far. The new tests have no new dependencies either.
* Some changes to internal DWARF handling functions. Debug information is well 
tested, so I don't consider this a risk.
* Lots of new code specifically for WASM. This code is built unconditionally 
but does not bring in any new dependencies to LLDB. At runtime, this code is 
doing nothing unless you opt into connecting to a WASM target (in fact, due to 
a bug, the user has to type out the wasm plugin name, you can't accidentally do 
this).

Is this a relatively large set of new features? Yes.
Is it quite isolated? Yes.
Does it change any user-facing commands or LLDB API? No.

On that basis I'm ok with backporting this.

If we want a second opinion, @labath did not review these changes on main, so 
might have a different take.

https://github.com/llvm/llvm-project/pull/151559
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Improve StructurizeCFG pass performance by using SSAUpdaterBulk. (PR #150937)

2025-08-01 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/150937
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lld] [llvm] release/21.x: [DTLTO][LLD][ELF] Support bitcode members of thin archives (#149425) (PR #151674)

2025-08-01 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-lld

Author: None (llvmbot)


Changes

Backport 673476d96bed306be6ed81a8174f481a9a4b2934

Requested by: @bd1976bris

---
Full diff: https://github.com/llvm/llvm-project/pull/151674.diff


5 Files Affected:

- (modified) cross-project-tests/CMakeLists.txt (+3-2) 
- (added) cross-project-tests/dtlto/ld-archive-thin.test (+97) 
- (modified) cross-project-tests/lit.cfg.py (+2) 
- (modified) lld/ELF/InputFiles.cpp (+49-10) 
- (added) lld/test/ELF/dtlto/archive-thin.test (+65) 


``diff
diff --git a/cross-project-tests/CMakeLists.txt b/cross-project-tests/CMakeLists.txt
index b4b1f47626073..192db87043177 100644
--- a/cross-project-tests/CMakeLists.txt
+++ b/cross-project-tests/CMakeLists.txt
@@ -19,11 +19,12 @@ set(CROSS_PROJECT_TEST_DEPS
   FileCheck
   check-gdb-llvm-support
   count
-  llvm-dwarfdump
+  llvm-ar
   llvm-config
+  llvm-dwarfdump
   llvm-objdump
-  split-file
   not
+  split-file
   )
 
 if ("clang" IN_LIST LLVM_ENABLE_PROJECTS)
diff --git a/cross-project-tests/dtlto/ld-archive-thin.test b/cross-project-tests/dtlto/ld-archive-thin.test
new file mode 100644
index 0000000000000..979da5423962e
--- /dev/null
+++ b/cross-project-tests/dtlto/ld-archive-thin.test
@@ -0,0 +1,97 @@
+REQUIRES: ld.lld,llvm-ar
+
+## Test that a DTLTO link succeeds and outputs the expected set of files
+## correctly when thin archives are present.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+## Compile bitcode. -O2 is required for cross-module importing.
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c \
+RUN:   foo.c bar.c dog.c cat.c start.c
+
+## Generate thin archives.
+RUN: llvm-ar rcs foo.a foo.o --thin
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../bar.o".
+RUN: mkdir lib
+RUN: llvm-ar rcs lib/bar.a bar.o --thin
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: llvm-ar rcs dog.a %t/lib/../dog.o --thin
+## The bitcode member of cat.a will not be used in the link.
+RUN: llvm-ar rcs cat.a cat.o --thin
+RUN: llvm-ar rcs start.a start.o --thin
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+
+RUN: %clang --target=x86_64-linux-gnu -flto=thin -fuse-ld=lld %t/foo.a %t/lib/bar.a ../start.a %t/cat.a \
+RUN:   -Wl,--whole-archive ../dog.a \
+RUN:   -fthinlto-distributor=%python \
+RUN:   -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--save-temps -nostdlib -Werror
+
+## Check that the required output files have been created.
+RUN: ls | sort | FileCheck %s
+
+## No files are expected before.
+CHECK-NOT: {{.}}
+
+## JSON jobs description.
+CHECK: {{^}}a.[[PID:[a-zA-Z0-9_]+]].dist-file.json{{$}}
+
+## Native output object files and individual summary index files.
+CHECK: {{^}}bar.3.[[PID]].native.o{{$}}
+CHECK: {{^}}bar.3.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o.thinlto.bc{{$}}
+
+## No files are expected after.
+CHECK-NOT: {{.}}
+
+
+## It is important that cross-module inlining occurs for this test to show that Clang can
+## successfully load the bitcode file dependencies recorded in the summary indices.
+## Explicitly check that the expected importing has occurred.
+
+RUN: llvm-dis start.4.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis dog.1.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,DOG,START
+
+RUN: llvm-dis foo.2.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis bar.3.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+FOO-DAG:   foo.o
+BAR-DAG:   bar.o
+DOG-DAG:   dog.o
+START-DAG: start.o
+
+
+#--- foo.c
+extern int bar(int), _start(int);
+__attribute__((retain)) int foo(int x) { return x + bar(x) + _start(x); }
+
+#--- bar.c
+extern int foo(int), _start(int);
+__attribute__((retain)) int bar(int x) { return x + foo(x) + _start(x); }
+
+#--- dog.c
+extern int foo(int), bar(int), _start(int);
+__attribute__((retain)) int dog(int x) { return x + foo(x) + bar(x) + _start(x); }
+
+#--- cat.c
+__attribute__((retain)) void cat(int x) {}
+
+#--- start.c
+extern int foo(int), bar(int);
+__attribute__((retain)) int _start(int x) { return x + foo(x) + bar(x); }
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index 31c93923ac9ed..ac27753472646 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -107,6 +107,8 @@ def get_required_attr(config, attr_name):
 if lldb_path is not None:
 

[llvm-branch-commits] [lld] [llvm] release/21.x: [DTLTO][LLD][ELF] Support bitcode members of thin archives (#149425) (PR #151674)

2025-08-01 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/151674

Backport 673476d96bed306be6ed81a8174f481a9a4b2934

Requested by: @bd1976bris

>From a23810b87d0b7e908ef5d0e0955b54370ad03a85 Mon Sep 17 00:00:00 2001
From: bd1976bris 
Date: Fri, 1 Aug 2025 09:38:46 +0100
Subject: [PATCH] [DTLTO][LLD][ELF] Support bitcode members of thin archives
 (#149425)

This patch adds support for bitcode members of thin archives to DTLTO
(https://llvm.org/docs/DTLTO.html) in ELF LLD.

For DTLTO, bitcode identifiers must be valid paths to bitcode files on
disk. Clang does not support archive inputs for ThinLTO backend
compilations. This patch adjusts the identifier for bitcode members of
thin archives in DTLTO links so that it is the path to the member file
on disk, allowing such members to be supported in DTLTO.

This patch is sufficient to allow for self-hosting an LLVM build with
DTLTO when thin archives are used.

Note: Bitcode members of non-thin archives remain unsupported. This will
be addressed in a future change.

Testing:
- LLD lit test coverage has been added to check that the identifier is
adjusted appropriately.
- A cross-project lit test has been added to show that a DTLTO link can
succeed when linking bitcode members of thin archives.

For the design discussion of the DTLTO feature, see: #126654.

(cherry picked from commit 673476d96bed306be6ed81a8174f481a9a4b2934)
---
 cross-project-tests/CMakeLists.txt|  5 +-
 .../dtlto/ld-archive-thin.test| 97 +++
 cross-project-tests/lit.cfg.py|  2 +
 lld/ELF/InputFiles.cpp| 59 +--
 lld/test/ELF/dtlto/archive-thin.test  | 65 +
 5 files changed, 216 insertions(+), 12 deletions(-)
 create mode 100644 cross-project-tests/dtlto/ld-archive-thin.test
 create mode 100644 lld/test/ELF/dtlto/archive-thin.test

diff --git a/cross-project-tests/CMakeLists.txt b/cross-project-tests/CMakeLists.txt
index b4b1f47626073..192db87043177 100644
--- a/cross-project-tests/CMakeLists.txt
+++ b/cross-project-tests/CMakeLists.txt
@@ -19,11 +19,12 @@ set(CROSS_PROJECT_TEST_DEPS
   FileCheck
   check-gdb-llvm-support
   count
-  llvm-dwarfdump
+  llvm-ar
   llvm-config
+  llvm-dwarfdump
   llvm-objdump
-  split-file
   not
+  split-file
   )
 
 if ("clang" IN_LIST LLVM_ENABLE_PROJECTS)
diff --git a/cross-project-tests/dtlto/ld-archive-thin.test b/cross-project-tests/dtlto/ld-archive-thin.test
new file mode 100644
index 0000000000000..979da5423962e
--- /dev/null
+++ b/cross-project-tests/dtlto/ld-archive-thin.test
@@ -0,0 +1,97 @@
+REQUIRES: ld.lld,llvm-ar
+
+## Test that a DTLTO link succeeds and outputs the expected set of files
+## correctly when thin archives are present.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+## Compile bitcode. -O2 is required for cross-module importing.
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c \
+RUN:   foo.c bar.c dog.c cat.c start.c
+
+## Generate thin archives.
+RUN: llvm-ar rcs foo.a foo.o --thin
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../bar.o".
+RUN: mkdir lib
+RUN: llvm-ar rcs lib/bar.a bar.o --thin
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: llvm-ar rcs dog.a %t/lib/../dog.o --thin
+## The bitcode member of cat.a will not be used in the link.
+RUN: llvm-ar rcs cat.a cat.o --thin
+RUN: llvm-ar rcs start.a start.o --thin
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+
+RUN: %clang --target=x86_64-linux-gnu -flto=thin -fuse-ld=lld %t/foo.a %t/lib/bar.a ../start.a %t/cat.a \
+RUN:   -Wl,--whole-archive ../dog.a \
+RUN:   -fthinlto-distributor=%python \
+RUN:   -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--save-temps -nostdlib -Werror
+
+## Check that the required output files have been created.
+RUN: ls | sort | FileCheck %s
+
+## No files are expected before.
+CHECK-NOT: {{.}}
+
+## JSON jobs description.
+CHECK: {{^}}a.[[PID:[a-zA-Z0-9_]+]].dist-file.json{{$}}
+
+## Native output object files and individual summary index files.
+CHECK: {{^}}bar.3.[[PID]].native.o{{$}}
+CHECK: {{^}}bar.3.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o.thinlto.bc{{$}}
+
+## No files are expected after.
+CHECK-NOT: {{.}}
+
+
+## It is important that cross-module inlining occurs for this test to show that Clang can
+## successfully load the bitcode file dependencies recorded in the summary indices.
+## Explicitly check that the expected importing has occurr

  1   2   3   >