[clang] [llvm] Support branch hint (PR #97721)

2024-07-04 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created https://github.com/llvm/llvm-project/pull/97721

For more details about this feature, please refer to the latest Intel 64 and IA-32
Architectures Optimization Reference Manual, Volume 1:
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

>From 3c75e22504416afae288723aff34120d88b100db Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 4 Jul 2024 15:43:12 +0800
Subject: [PATCH] Support branch hint

For more details about this feature, please refer to latest Intel 64 and
IA-32 Architectures Optimization Reference Manual Volume 1:
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html
---
 clang/lib/Basic/Targets/X86.cpp|  3 +
 clang/lib/Basic/Targets/X86.h  |  1 +
 llvm/lib/Target/X86/X86.td | 13 +++-
 llvm/lib/Target/X86/X86MCInstLower.cpp | 24 +++
 llvm/test/CodeGen/X86/branch-hint.ll   | 95 ++
 5 files changed, 133 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/branch-hint.ll

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 276d492955207..1f6fc842ddd95 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -457,6 +457,8 @@ bool 
X86TargetInfo::handleTargetFeatures(std::vector &Features,
   HasCF = true;
 } else if (Feature == "+zu") {
   HasZU = true;
+} else if (Feature == "+branch-hint") {
+  HasBranchHint = true;
 }
 
 X86SSEEnum Level = llvm::StringSwitch(Feature)
@@ -1292,6 +1294,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
   .Case("nf", HasNF)
   .Case("cf", HasCF)
   .Case("zu", HasZU)
+  .Case("branch-hint", HasBranchHint)
   .Default(false);
 }
 
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 5ce4953251bc3..a70711f4ae2bb 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -174,6 +174,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public 
TargetInfo {
   bool HasCF = false;
   bool HasZU = false;
   bool HasInlineAsmUseGPR32 = false;
+  bool HasBranchHint = false;
 
 protected:
   llvm::X86::CPUKind CPU = llvm::X86::CK_None;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 68b78c7c44771..fdd7d5f1ee0e7 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -749,6 +749,11 @@ def TuningUseGLMDivSqrtCosts
 : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
 "Use Goldmont specific floating point div/sqrt costs">;
 
+// Starting with Redwood Cove architecture, the branch has branch taken hint
+// (i.e., instruction prefix 3EH).
+def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
+"Target has branch hint feature">;
+
 
//===--===//
 // X86 CPU Families
 // TODO: Remove these - use general tuning features to determine codegen.
@@ -1124,6 +1129,8 @@ def ProcessorFeatures {
   FeaturePREFETCHI];
   list GNRFeatures =
 !listconcat(SPRFeatures, GNRAdditionalFeatures);
+  list GNRAdditionalTuning = [TuningBranchHint];
+  list GNRTuning = !listconcat(SPRTuning, 
GNRAdditionalTuning);
 
   // Graniterapids D
   list GNRDAdditionalFeatures = [FeatureAMXCOMPLEX];
@@ -1815,12 +1822,12 @@ def : ProcModel<"pantherlake", AlderlakePModel,
 def : ProcModel<"clearwaterforest", AlderlakePModel,
 ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
 def : ProcModel<"graniterapids", SapphireRapidsModel,
-ProcessorFeatures.GNRFeatures, ProcessorFeatures.SPRTuning>;
+ProcessorFeatures.GNRFeatures, ProcessorFeatures.GNRTuning>;
 def : ProcModel<"emeraldrapids", SapphireRapidsModel,
-ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
+ProcessorFeatures.SPRFeatures, ProcessorFeatures.GNRTuning>;
 foreach P = ["graniterapids-d", "graniterapids_d"] in {
 def : ProcModel;
+ProcessorFeatures.GNRDFeatures, ProcessorFeatures.GNRTuning>;
 }
 
 // AMD CPUs.
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp 
b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 00f58f9432e4d..34d95573585c9 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
@@ -54,6 +55,14 @@
 
 using namespace llvm;
 
+static cl::opt EnableBranchHint("branch-hi

[clang] [llvm] [X86] Support branch hint (PR #97721)

2024-07-04 Thread Feng Zou via cfe-commits




fzou1 wrote:

Thanks. It's simpler. What's the metadata for "!prof !0" and "!prof !1"?

https://github.com/llvm/llvm-project/pull/97721
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86] Support branch hint (PR #97721)

2024-07-04 Thread Feng Zou via cfe-commits


@@ -749,6 +749,11 @@ def TuningUseGLMDivSqrtCosts
 : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
 "Use Goldmont specific floating point div/sqrt costs">;
 
+// Starting with Redwood Cove architecture, the branch has branch taken hint
+// (i.e., instruction prefix 3EH).
+def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",

fzou1 wrote:

@KanRobert, thank you for helping with this.

https://github.com/llvm/llvm-project/pull/97721
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86] Support branch hint (PR #97721)

2024-07-04 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated https://github.com/llvm/llvm-project/pull/97721

>From 3c75e22504416afae288723aff34120d88b100db Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 4 Jul 2024 15:43:12 +0800
Subject: [PATCH 1/2] Support branch hint

For more details about this feature, please refer to latest Intel 64 and
IA-32 Architectures Optimization Reference Manual Volume 1:
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html
---
 clang/lib/Basic/Targets/X86.cpp|  3 +
 clang/lib/Basic/Targets/X86.h  |  1 +
 llvm/lib/Target/X86/X86.td | 13 +++-
 llvm/lib/Target/X86/X86MCInstLower.cpp | 24 +++
 llvm/test/CodeGen/X86/branch-hint.ll   | 95 ++
 5 files changed, 133 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/branch-hint.ll

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 276d492955207a..1f6fc842ddd955 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -457,6 +457,8 @@ bool 
X86TargetInfo::handleTargetFeatures(std::vector &Features,
   HasCF = true;
 } else if (Feature == "+zu") {
   HasZU = true;
+} else if (Feature == "+branch-hint") {
+  HasBranchHint = true;
 }
 
 X86SSEEnum Level = llvm::StringSwitch(Feature)
@@ -1292,6 +1294,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
   .Case("nf", HasNF)
   .Case("cf", HasCF)
   .Case("zu", HasZU)
+  .Case("branch-hint", HasBranchHint)
   .Default(false);
 }
 
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 5ce4953251bc34..a70711f4ae2bb2 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -174,6 +174,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public 
TargetInfo {
   bool HasCF = false;
   bool HasZU = false;
   bool HasInlineAsmUseGPR32 = false;
+  bool HasBranchHint = false;
 
 protected:
   llvm::X86::CPUKind CPU = llvm::X86::CK_None;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 68b78c7c44771f..fdd7d5f1ee0e73 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -749,6 +749,11 @@ def TuningUseGLMDivSqrtCosts
 : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
 "Use Goldmont specific floating point div/sqrt costs">;
 
+// Starting with Redwood Cove architecture, the branch has branch taken hint
+// (i.e., instruction prefix 3EH).
+def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
+"Target has branch hint feature">;
+
 
//===--===//
 // X86 CPU Families
 // TODO: Remove these - use general tuning features to determine codegen.
@@ -1124,6 +1129,8 @@ def ProcessorFeatures {
   FeaturePREFETCHI];
   list GNRFeatures =
 !listconcat(SPRFeatures, GNRAdditionalFeatures);
+  list GNRAdditionalTuning = [TuningBranchHint];
+  list GNRTuning = !listconcat(SPRTuning, 
GNRAdditionalTuning);
 
   // Graniterapids D
   list GNRDAdditionalFeatures = [FeatureAMXCOMPLEX];
@@ -1815,12 +1822,12 @@ def : ProcModel<"pantherlake", AlderlakePModel,
 def : ProcModel<"clearwaterforest", AlderlakePModel,
 ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
 def : ProcModel<"graniterapids", SapphireRapidsModel,
-ProcessorFeatures.GNRFeatures, ProcessorFeatures.SPRTuning>;
+ProcessorFeatures.GNRFeatures, ProcessorFeatures.GNRTuning>;
 def : ProcModel<"emeraldrapids", SapphireRapidsModel,
-ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
+ProcessorFeatures.SPRFeatures, ProcessorFeatures.GNRTuning>;
 foreach P = ["graniterapids-d", "graniterapids_d"] in {
 def : ProcModel;
+ProcessorFeatures.GNRDFeatures, ProcessorFeatures.GNRTuning>;
 }
 
 // AMD CPUs.
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp 
b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 00f58f9432e4d7..34d95573585c90 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
@@ -54,6 +55,14 @@
 
 using namespace llvm;
 
+static cl::opt EnableBranchHint("branch-hint",
+  cl::desc("Enable branch hint."),
+  cl::init(false), cl::Hidden);
+static cl::opt BranchHintProbabilityThreshold(
+"branch-hint-probability-threshold",
+cl::desc("The probabili

[clang] [llvm] [X86] Support branch hint (PR #97721)

2024-07-09 Thread Feng Zou via cfe-commits


@@ -1815,12 +1822,12 @@ def : ProcModel<"pantherlake", AlderlakePModel,
 def : ProcModel<"clearwaterforest", AlderlakePModel,
 ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
 def : ProcModel<"graniterapids", SapphireRapidsModel,
-ProcessorFeatures.GNRFeatures, ProcessorFeatures.SPRTuning>;
+ProcessorFeatures.GNRFeatures, ProcessorFeatures.GNRTuning>;
 def : ProcModel<"emeraldrapids", SapphireRapidsModel,
-ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
+ProcessorFeatures.SPRFeatures, ProcessorFeatures.GNRTuning>;

fzou1 wrote:

Nice catch. I'll correct it.

https://github.com/llvm/llvm-project/pull/97721
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-FP8 (PR #113850)

2024-10-29 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/113850

>From fd570cb8d41f5f94b61d515985245fc81aab633e Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 24 Oct 2024 21:56:48 +0800
Subject: [PATCH 1/3] Support AMX-FP8

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |  1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  6 +++
 clang/include/clang/Driver/Options.td |  2 +
 clang/lib/Basic/Targets/X86.cpp   |  6 +++
 clang/lib/Basic/Targets/X86.h |  1 +
 clang/lib/Headers/CMakeLists.txt  |  1 +
 clang/lib/Headers/amxfp8intrin.h  | 24 
 clang/lib/Headers/immintrin.h |  4 ++
 clang/lib/Sema/SemaX86.cpp|  4 ++
 clang/test/CodeGen/X86/amx_fp8.c  | 27 +
 clang/test/CodeGen/X86/amx_fp8_errors.c   | 10 +
 clang/test/CodeGen/X86/amx_fp8_inline_asm.c   | 32 +++
 llvm/include/llvm/IR/IntrinsicsX86.td | 17 
 .../llvm/TargetParser/X86TargetParser.def |  1 +
 llvm/lib/Target/X86/X86.td|  3 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp   | 23 +++
 llvm/lib/Target/X86/X86InstrAMX.td| 39 +++
 llvm/lib/Target/X86/X86InstrPredicates.td |  1 +
 llvm/lib/TargetParser/Host.cpp|  4 ++
 llvm/lib/TargetParser/X86TargetParser.cpp |  1 +
 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll   | 20 ++
 .../Disassembler/X86/AMX/x86-64-amx-fp8.txt   | 34 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s | 33 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s   | 33 
 24 files changed, 327 insertions(+)
 create mode 100644 clang/lib/Headers/amxfp8intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_fp8.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6a95337815174b..da0ab888ce200d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -642,6 +642,7 @@ X86 Support
 
 - Supported intrinsics for ``MOVRS AND AVX10.2``.
   * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
+- Support ISA of ``AMX-FP8``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index e1e613560167ac..68904ae8abcd15 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -155,6 +155,12 @@ TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, 
"SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
+// AMX FP8
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+
 // RAO-INT
 TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
 TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 5df6ddd5e6a0c5..bbada0834526d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6290,6 +6290,8 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, 
Group;
 def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group;
 def mamx_int8 : Flag<["-"], "mamx-int8">, Group;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group;
+def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group;
+def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d067ec218b5270..b95261c39a5993 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -420,6 +420,8 @@ bool 
X86TargetInfo::handleTargetFeatures(std::vector &Features,
   HasAMXTILE = true;
 } else if (Feature == "+amx-complex") {
   HasAMXCOMPLEX = true;
+} else if (Feature == "+amx-fp8") {
+  HasAMXFP8 = true;
 } else if (Feature == "+cmpccxadd") {
   HasCMPCCXADD = true;
 } else if (Feature == "+raoint") {
@@ -939,6 +941,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions 
&Opts,
 Builder.defineMacro("__AMX_FP16__");
   

[clang] [llvm] [X86][AMX] Support AMX-FP8 (PR #113850)

2024-10-29 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/113850

>From fd570cb8d41f5f94b61d515985245fc81aab633e Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 24 Oct 2024 21:56:48 +0800
Subject: [PATCH 1/5] Support AMX-FP8

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |  1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  6 +++
 clang/include/clang/Driver/Options.td |  2 +
 clang/lib/Basic/Targets/X86.cpp   |  6 +++
 clang/lib/Basic/Targets/X86.h |  1 +
 clang/lib/Headers/CMakeLists.txt  |  1 +
 clang/lib/Headers/amxfp8intrin.h  | 24 
 clang/lib/Headers/immintrin.h |  4 ++
 clang/lib/Sema/SemaX86.cpp|  4 ++
 clang/test/CodeGen/X86/amx_fp8.c  | 27 +
 clang/test/CodeGen/X86/amx_fp8_errors.c   | 10 +
 clang/test/CodeGen/X86/amx_fp8_inline_asm.c   | 32 +++
 llvm/include/llvm/IR/IntrinsicsX86.td | 17 
 .../llvm/TargetParser/X86TargetParser.def |  1 +
 llvm/lib/Target/X86/X86.td|  3 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp   | 23 +++
 llvm/lib/Target/X86/X86InstrAMX.td| 39 +++
 llvm/lib/Target/X86/X86InstrPredicates.td |  1 +
 llvm/lib/TargetParser/Host.cpp|  4 ++
 llvm/lib/TargetParser/X86TargetParser.cpp |  1 +
 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll   | 20 ++
 .../Disassembler/X86/AMX/x86-64-amx-fp8.txt   | 34 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s | 33 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s   | 33 
 24 files changed, 327 insertions(+)
 create mode 100644 clang/lib/Headers/amxfp8intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_fp8.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6a95337815174b..da0ab888ce200d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -642,6 +642,7 @@ X86 Support
 
 - Supported intrinsics for ``MOVRS AND AVX10.2``.
   * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
+- Support ISA of ``AMX-FP8``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index e1e613560167ac..68904ae8abcd15 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -155,6 +155,12 @@ TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, 
"SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
+// AMX FP8
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+
 // RAO-INT
 TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
 TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 5df6ddd5e6a0c5..bbada0834526d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6290,6 +6290,8 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, 
Group;
 def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group;
 def mamx_int8 : Flag<["-"], "mamx-int8">, Group;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group;
+def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group;
+def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d067ec218b5270..b95261c39a5993 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -420,6 +420,8 @@ bool 
X86TargetInfo::handleTargetFeatures(std::vector &Features,
   HasAMXTILE = true;
 } else if (Feature == "+amx-complex") {
   HasAMXCOMPLEX = true;
+} else if (Feature == "+amx-fp8") {
+  HasAMXFP8 = true;
 } else if (Feature == "+cmpccxadd") {
   HasCMPCCXADD = true;
 } else if (Feature == "+raoint") {
@@ -939,6 +941,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions 
&Opts,
 Builder.defineMacro("__AMX_FP16__");
   

[clang] [llvm] [X86][AMX] Support AMX-FP8 (PR #113850)

2024-10-29 Thread Feng Zou via cfe-commits


@@ -0,0 +1,83 @@
+/*===- amxfp8intrin.h - AMX intrinsics -*- C++ 
-*===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXFP8INTRIN_H
+#define __AMXFP8INTRIN_H
+#ifdef __x86_64__
+
+
+/// Compute dot-product of brain-float8 (BF8) or hybrid-float8 (HF8)
+///floating-point pairs in tiles \a a and \a b, accumulating the
+///intermediate single-precision (32-bit) floating-point elements with
+///elements in \a dst, and store the 32-bit result back to tile \a dst.

fzou1 wrote:

Okay. I'll copy the descriptions from the instruction set extension document for
the file and for each intrinsic.

https://github.com/llvm/llvm-project/pull/113850
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -34,9 +34,31 @@ class ShapeT {
 if (MRI)
   deduceImm(MRI);
   }
+  // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
+  // and ImmShapes. Due to the most case is only one shape (just simply use
+  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
+  // Shapes to keep the speed and code simplicity.
+  // TODO: The upper solution is a temporary way to minimize current tile
+  // register allocation code changes. It can not handle both Reg shape and
+  // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
+  // is imm shape). Refine me when we have more mult-tile shape instructions!
+  ShapeT(ArrayRef ShapesOperands,
+ const MachineRegisterInfo *MRI = nullptr)
+  : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
+ColImm(InvalidImmShape) {
+assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");
+
+for (auto *Shape : ShapesOperands)
+  Shapes.push_back(Shape);
+
+if (MRI)
+  deduceImm(MRI);
+  }
   ShapeT()
   : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
 ColImm(InvalidImmShape) {}
+  // TODO: We need to extern cmp operator for muti-shapes if

fzou1 wrote:

muti->multi

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ 
-*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * 
===---===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE   
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride)   
\
+  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride) 
\
+  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride)   
\
+  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride) 
\
+  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);

fzou1 wrote:

Remove the extra underscore: __tile_transposed -> _tile_transposed.

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-FP8 (PR #113850)

2024-10-27 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created 
https://github.com/llvm/llvm-project/pull/113850

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

>From fd570cb8d41f5f94b61d515985245fc81aab633e Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 24 Oct 2024 21:56:48 +0800
Subject: [PATCH] Support AMX-FP8

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |  1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  6 +++
 clang/include/clang/Driver/Options.td |  2 +
 clang/lib/Basic/Targets/X86.cpp   |  6 +++
 clang/lib/Basic/Targets/X86.h |  1 +
 clang/lib/Headers/CMakeLists.txt  |  1 +
 clang/lib/Headers/amxfp8intrin.h  | 24 
 clang/lib/Headers/immintrin.h |  4 ++
 clang/lib/Sema/SemaX86.cpp|  4 ++
 clang/test/CodeGen/X86/amx_fp8.c  | 27 +
 clang/test/CodeGen/X86/amx_fp8_errors.c   | 10 +
 clang/test/CodeGen/X86/amx_fp8_inline_asm.c   | 32 +++
 llvm/include/llvm/IR/IntrinsicsX86.td | 17 
 .../llvm/TargetParser/X86TargetParser.def |  1 +
 llvm/lib/Target/X86/X86.td|  3 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp   | 23 +++
 llvm/lib/Target/X86/X86InstrAMX.td| 39 +++
 llvm/lib/Target/X86/X86InstrPredicates.td |  1 +
 llvm/lib/TargetParser/Host.cpp|  4 ++
 llvm/lib/TargetParser/X86TargetParser.cpp |  1 +
 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll   | 20 ++
 .../Disassembler/X86/AMX/x86-64-amx-fp8.txt   | 34 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s | 33 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s   | 33 
 24 files changed, 327 insertions(+)
 create mode 100644 clang/lib/Headers/amxfp8intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_fp8.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6a95337815174b..da0ab888ce200d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -642,6 +642,7 @@ X86 Support
 
 - Supported intrinsics for ``MOVRS AND AVX10.2``.
   * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
+- Support ISA of ``AMX-FP8``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index e1e613560167ac..68904ae8abcd15 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -155,6 +155,12 @@ TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, 
"SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
+// AMX FP8
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+
 // RAO-INT
 TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
 TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 5df6ddd5e6a0c5..bbada0834526d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6290,6 +6290,8 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, 
Group;
 def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group;
 def mamx_int8 : Flag<["-"], "mamx-int8">, Group;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group;
+def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group;
+def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d067ec218b5270..b95261c39a5993 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -420,6 +420,8 @@ bool 
X86TargetInfo::handleTargetFeatures(std::vector &Features,
   HasAMXTILE = true;
 } else if (Feature == "+amx-complex") {
   HasAMXCOMPLEX = true;
+} else if (Feature == "+amx-fp8") {
+  HasAMXFP8 = true;
 } else if (Feature == "+cmpccxadd") {
   HasCMPCCXADD = true;
 } else if (Feature == "+raoint") {
@@ -939,6 +941,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions 

[clang] [llvm] [X86][AMX] Support AMX-FP8 (PR #113850)

2024-10-27 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/113850

>From fd570cb8d41f5f94b61d515985245fc81aab633e Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 24 Oct 2024 21:56:48 +0800
Subject: [PATCH 1/2] Support AMX-FP8

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |  1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  6 +++
 clang/include/clang/Driver/Options.td |  2 +
 clang/lib/Basic/Targets/X86.cpp   |  6 +++
 clang/lib/Basic/Targets/X86.h |  1 +
 clang/lib/Headers/CMakeLists.txt  |  1 +
 clang/lib/Headers/amxfp8intrin.h  | 24 
 clang/lib/Headers/immintrin.h |  4 ++
 clang/lib/Sema/SemaX86.cpp|  4 ++
 clang/test/CodeGen/X86/amx_fp8.c  | 27 +
 clang/test/CodeGen/X86/amx_fp8_errors.c   | 10 +
 clang/test/CodeGen/X86/amx_fp8_inline_asm.c   | 32 +++
 llvm/include/llvm/IR/IntrinsicsX86.td | 17 
 .../llvm/TargetParser/X86TargetParser.def |  1 +
 llvm/lib/Target/X86/X86.td|  3 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp   | 23 +++
 llvm/lib/Target/X86/X86InstrAMX.td| 39 +++
 llvm/lib/Target/X86/X86InstrPredicates.td |  1 +
 llvm/lib/TargetParser/Host.cpp|  4 ++
 llvm/lib/TargetParser/X86TargetParser.cpp |  1 +
 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll   | 20 ++
 .../Disassembler/X86/AMX/x86-64-amx-fp8.txt   | 34 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s | 33 
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s   | 33 
 24 files changed, 327 insertions(+)
 create mode 100644 clang/lib/Headers/amxfp8intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_fp8.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6a95337815174b..da0ab888ce200d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -642,6 +642,7 @@ X86 Support
 
 - Supported intrinsics for ``MOVRS AND AVX10.2``.
   * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
+- Support ISA of ``AMX-FP8``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index e1e613560167ac..68904ae8abcd15 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -155,6 +155,12 @@ TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
+// AMX FP8
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+
 // RAO-INT
 TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
 TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 5df6ddd5e6a0c5..bbada0834526d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6290,6 +6290,8 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group<m_x86_Features_Group>;
 def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group<m_x86_Features_Group>;
 def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
+def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group<m_x86_Features_Group>;
+def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group<m_x86_Features_Group>;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d067ec218b5270..b95261c39a5993 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -420,6 +420,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
   HasAMXTILE = true;
 } else if (Feature == "+amx-complex") {
   HasAMXCOMPLEX = true;
+} else if (Feature == "+amx-fp8") {
+  HasAMXFP8 = true;
 } else if (Feature == "+cmpccxadd") {
   HasCMPCCXADD = true;
 } else if (Feature == "+raoint") {
@@ -939,6 +941,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
 Builder.defineMacro("__AMX_FP16__");
   

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-28 Thread Feng Zou via cfe-commits


@@ -34,9 +34,31 @@ class ShapeT {
 if (MRI)
   deduceImm(MRI);
   }
+  // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
+  // and ImmShapes. Due to the most case is only one shape (just simply use
+  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vertor

fzou1 wrote:

vertor -> vector?

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-28 Thread Feng Zou via cfe-commits


@@ -568,6 +568,131 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
 MI.setDesc(TII->get(Opc));
 return true;
   }
+  // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding
+  // AMX instruction to support it. So, split it to 2 load instructions:
+  // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" -->
+  // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" +
+  // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment"
+  case X86::PTILEPAIRLOAD: {
+int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
+Register TReg = MBBI->getOperand(0).getReg();
+bool DstIsDead = MBBI->getOperand(0).isDead();
+Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
+Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
+unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
+
+MachineInstrBuilder MIBLo =
+BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
+.addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead));
+MachineInstrBuilder MIBHi =
+BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
+.addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead));
+
+for (int i = 0; i < X86::AddrNumOperands; ++i) {
+  MIBLo.add(MBBI->getOperand(1 + i));
+  if (i == X86::AddrDisp)
+MIBHi.addImm(Disp + TmmSize);
+  else
+MIBHi.add(MBBI->getOperand(1 + i));
+}
+
+// Make sure the first stride reg used in first tileload is alive.
+MachineOperand &Stride =
+MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg);
+Stride.setIsKill(false);
+
+// Split the memory operand, adjusting the offset and size for the halves.
+MachineMemOperand *OldMMO = MBBI->memoperands().front();
+MachineFunction *MF = MBB.getParent();
+MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
+MachineMemOperand *MMOHi =
+MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
+
+MIBLo.setMemRefs(MMOLo);
+MIBHi.setMemRefs(MMOHi);
+
+// Delete the pseudo.
+MBB.erase(MBBI);
+return true;
+  }
+  // Smilar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no

fzou1 wrote:

Smilar -> Similar

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-28 Thread Feng Zou via cfe-commits


@@ -80,28 +80,41 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
 INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
 "Fast Tile Register Configure", false, false)
 
-static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
   // There is no phi instruction after register allocation.
   assert(MI.isPHI() == false);
   // The instruction must have 3 operands: tile def, row, col.
   // It should be AMX pseudo instruction that have shape operand.
   if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
   !MI.isPseudo())
-return false;
+return 0;
   MachineOperand &MO = MI.getOperand(0);
 
   if (MO.isReg()) {
 Register Reg = MO.getReg();
-// FIXME it may be used after Greedy RA and the physical
+// FIXME: It may be used after Greedy RA and the physical
 // register is not rewritten yet.
-if (Reg.isVirtual() &&
-MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
-  return true;
+if (Reg.isVirtual()) {

fzou1 wrote:

Can this piece of code be replaced with calling getTileDefNum?

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-10 Thread Feng Zou via cfe-commits


@@ -6101,6 +6101,25 @@ let TargetPrefix = "x86" in {
   Intrinsic<[llvm_v16i32_ty],
 [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty],
 []>;
+
+  def int_x86_tmmultf32ps : ClangBuiltin<"__builtin_ia32_tmmultf32ps">,
+  Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+  def int_x86_ttmmultf32ps : ClangBuiltin<"__builtin_ia32_ttmmultf32ps">,
+  Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

fzou1 wrote:

Done

https://github.com/llvm/llvm-project/pull/115625
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-10 Thread Feng Zou via cfe-commits


@@ -151,6 +151,7 @@ set(x86_files
   amxfp16intrin.h
   amxfp8intrin.h
   amxintrin.h
+  amxtf32intrin.h

fzou1 wrote:

Sorry. Forgot to add it. Done. Thanks.

https://github.com/llvm/llvm-project/pull/115625
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-10 Thread Feng Zou via cfe-commits


@@ -660,6 +660,10 @@ _storebe_i64(void * __P, long long __D) {
 #include 
 #endif
 
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
+#include <amxtf32intrin.h>
+#endif
+

fzou1 wrote:

Added.

https://github.com/llvm/llvm-project/pull/115625
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-10 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/115625

>From b1d9799b99b45b5af2b63868c4c3b139dbf9378c Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Sat, 26 Oct 2024 18:44:32 +0800
Subject: [PATCH 1/4] [X86][AMX] Support AMX-TF32

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |   1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  15 +-
 clang/include/clang/Driver/Options.td |   2 +
 clang/lib/Basic/Targets/X86.cpp   |   6 +
 clang/lib/Basic/Targets/X86.h |   1 +
 clang/lib/Headers/CMakeLists.txt  |   1 +
 clang/lib/Headers/amxtf32intrin.h | 194 ++
 clang/lib/Headers/immintrin.h |   4 +
 clang/lib/Sema/SemaX86.cpp|   2 +
 clang/test/CodeGen/X86/amx_tf32.c |  17 ++
 clang/test/CodeGen/X86/amx_tf32_api.c |  27 +++
 clang/test/CodeGen/X86/amx_tf32_errors.c  |  23 +++
 clang/test/CodeGen/X86/amx_tf32_inline_asm.c  |  18 ++
 clang/test/Driver/x86-target-features.c   |   7 +
 clang/test/Preprocessor/x86_target_features.c |   9 +
 llvm/include/llvm/IR/IntrinsicsX86.td |  19 ++
 .../llvm/TargetParser/X86TargetParser.def |   1 +
 llvm/lib/Target/X86/X86.td|   3 +
 llvm/lib/Target/X86/X86ExpandPseudo.cpp   |  11 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  22 ++
 llvm/lib/Target/X86/X86InstrAMX.td|  52 +
 llvm/lib/Target/X86/X86InstrPredicates.td |   1 +
 llvm/lib/Target/X86/X86LowerAMXType.cpp   |  20 +-
 llvm/lib/Target/X86/X86RegisterInfo.cpp   |   4 +-
 llvm/lib/TargetParser/Host.cpp|   1 +
 llvm/lib/TargetParser/X86TargetParser.cpp |   1 +
 llvm/test/CodeGen/X86/amx-tf32-internal.ll|  47 +
 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll  |  23 +++
 .../Disassembler/X86/AMX/x86-64-amx-tf32.txt  |  19 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s|  17 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s  |  17 ++
 31 files changed, 578 insertions(+), 7 deletions(-)
 create mode 100644 clang/lib/Headers/amxtf32intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_tf32.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_api.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-internal.ll
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3424e0e6f34c9..e235a04f78112b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -740,6 +740,7 @@ X86 Support
 - Support ISA of ``AMX-FP8``.
 - Support ISA of ``AMX-TRANSPOSE``.
 - Support ISA of ``AMX-AVX512``.
+- Support ISA of ``AMX-TF32``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..25c10d39df32e2 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n",
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
+TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512")
 
-TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")
-
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
@@ -185,6 +184,14 @@ TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdph

[clang] [llvm] [X86][AMX] Add AMX FP8 new APIs (PR #115829)

2024-11-11 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created 
https://github.com/llvm/llvm-project/pull/115829

This is a follow-up to #113850.

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

>From 9fd6e9e598423b6cc58a25fe70cc12a846483be5 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 7 Nov 2024 11:56:17 +0800
Subject: [PATCH] [X86][AMX] Add AMX FP8 new APIs

This is a follow-up to #113850.

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/include/clang/Basic/BuiltinsX86_64.def |   4 +
 clang/lib/Headers/amxfp8intrin.h | 175 ---
 clang/test/CodeGen/X86/amx_fp8_api.c |  36 
 llvm/include/llvm/IR/IntrinsicsX86.td|  25 +++
 llvm/lib/Target/X86/X86ExpandPseudo.cpp  |  18 +-
 llvm/lib/Target/X86/X86InstrAMX.td   |  31 
 llvm/lib/Target/X86/X86RegisterInfo.cpp  |   6 +-
 7 files changed, 272 insertions(+), 23 deletions(-)
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_api.c

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 25c10d39df32e2..8653fc217bdddb 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -141,6 +141,10 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "a
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
 TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
 
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h
index 0f5ddc87e5a752..4ada936a5d40af 100644
--- a/clang/lib/Headers/amxfp8intrin.h
+++ b/clang/lib/Headers/amxfp8intrin.h
@@ -15,81 +15,214 @@
 #define __AMXFP8INTRIN_H
 #ifdef __x86_64__
 
-/// Peform the dot product of a BF8 value \a a by a BF8 value \a b accumulating
-/// into a Single Precision (FP32) source/dest \a dst.
+#define __DEFAULT_FN_ATTRS_FP8 \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+   _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
+/// accumulating into a Single Precision (FP32) source/dest \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// \code
-/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b)
+/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///   temp1[(dst.colsb / 4 - 1) : 0] = 0
+///   FOR k := 0 TO src1.colsb / 4 - 1
+/// FOR n := 0 TO dst.colsb / 4 - 1
+///   temp1[n] +=
+/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
+/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
+/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
+/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
+/// ENDFOR
+///   ENDFOR
+///   FOR n := 0 TO dst.colsb / 4 - 1
+/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
+///   ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
 /// \endcode
 ///
 /// This intrinsic corresponds to the \c TDPBF8PS instruction.
 ///
 /// \param dst
 ///The destination tile. Max size is 1024 Bytes.
-/// \param a
+/// \param src1
 ///The 1st source tile. Max size is 1024 Bytes.
-/// \param b
+/// \param src2
 ///The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b))
+__DEFAULT_FN_ATTRS_FP8 static void
+__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
+  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbhf8ps_internal(m,

[clang] [llvm] [X86][AMX] Add AMX FP8 new APIs (PR #115829)

2024-11-12 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/115829

>From 9fd6e9e598423b6cc58a25fe70cc12a846483be5 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Thu, 7 Nov 2024 11:56:17 +0800
Subject: [PATCH 1/2] [X86][AMX] Add AMX FP8 new APIs

This is a follow-up to #113850.

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/include/clang/Basic/BuiltinsX86_64.def |   4 +
 clang/lib/Headers/amxfp8intrin.h | 175 ---
 clang/test/CodeGen/X86/amx_fp8_api.c |  36 
 llvm/include/llvm/IR/IntrinsicsX86.td|  25 +++
 llvm/lib/Target/X86/X86ExpandPseudo.cpp  |  18 +-
 llvm/lib/Target/X86/X86InstrAMX.td   |  31 
 llvm/lib/Target/X86/X86RegisterInfo.cpp  |   6 +-
 7 files changed, 272 insertions(+), 23 deletions(-)
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_api.c

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 25c10d39df32e2..8653fc217bdddb 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -141,6 +141,10 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "a
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
 TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
 
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h
index 0f5ddc87e5a752..4ada936a5d40af 100644
--- a/clang/lib/Headers/amxfp8intrin.h
+++ b/clang/lib/Headers/amxfp8intrin.h
@@ -15,81 +15,214 @@
 #define __AMXFP8INTRIN_H
 #ifdef __x86_64__
 
-/// Peform the dot product of a BF8 value \a a by a BF8 value \a b accumulating
-/// into a Single Precision (FP32) source/dest \a dst.
+#define __DEFAULT_FN_ATTRS_FP8 \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+   _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
+/// accumulating into a Single Precision (FP32) source/dest \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// \code
-/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b)
+/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///   temp1[(dst.colsb / 4 - 1) : 0] = 0
+///   FOR k := 0 TO src1.colsb / 4 - 1
+/// FOR n := 0 TO dst.colsb / 4 - 1
+///   temp1[n] +=
+/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
+/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
+/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
+/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
+/// ENDFOR
+///   ENDFOR
+///   FOR n := 0 TO dst.colsb / 4 - 1
+/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
+///   ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
 /// \endcode
 ///
 /// This intrinsic corresponds to the \c TDPBF8PS instruction.
 ///
 /// \param dst
 ///The destination tile. Max size is 1024 Bytes.
-/// \param a
+/// \param src1
 ///The 1st source tile. Max size is 1024 Bytes.
-/// \param b
+/// \param src2
 ///The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b))
+__DEFAULT_FN_ATTRS_FP8 static void
+__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
+  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);
+}
 
-/// Perform the dot product of a BF8 value \a a by an 

[clang] [llvm] [X86][AMX] Add AMX FP8 new APIs (PR #115829)

2024-11-12 Thread Feng Zou via cfe-commits


@@ -15,81 +15,214 @@
 #define __AMXFP8INTRIN_H
 #ifdef __x86_64__
 
-/// Peform the dot product of a BF8 value \a a by a BF8 value \a b accumulating
-/// into a Single Precision (FP32) source/dest \a dst.
+#define __DEFAULT_FN_ATTRS_FP8 \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+   _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
+/// accumulating into a Single Precision (FP32) source/dest \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// \code
-/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b)
+/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///   temp1[(dst.colsb / 4 - 1) : 0] = 0
+///   FOR k := 0 TO src1.colsb / 4 - 1
+/// FOR n := 0 TO dst.colsb / 4 - 1
+///   temp1[n] +=
+/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
+/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
+/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
+/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
+/// ENDFOR
+///   ENDFOR
+///   FOR n := 0 TO dst.colsb / 4 - 1
+/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
+///   ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
 /// \endcode
 ///
 /// This intrinsic corresponds to the \c TDPBF8PS instruction.
 ///
 /// \param dst
 ///The destination tile. Max size is 1024 Bytes.
-/// \param a
+/// \param src1
 ///The 1st source tile. Max size is 1024 Bytes.
-/// \param b
+/// \param src2
 ///The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b))
+__DEFAULT_FN_ATTRS_FP8 static void
+__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
+  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);
+}
 
-/// Perform the dot product of a BF8 value \a a by an HF8 value \a b
+/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2
 /// accumulating into a Single Precision (FP32) source/dest \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// \code
-/// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b)
+/// void __tile_dpbhf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///   temp1[(dst.colsb / 4 - 1) : 0] = 0
+///   FOR k := 0 TO src1.colsb / 4 - 1
+/// FOR n := 0 TO dst.colsb / 4 - 1
+///   temp1[n] +=
+/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
+/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
+/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
+/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
+/// ENDFOR
+///   ENDFOR
+///   FOR n := 0 TO dst.colsb / 4 - 1
+/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
+///   ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
 /// \endcode
 ///
 /// This intrinsic corresponds to the \c TDPBHF8PS instruction.
 ///
 /// \param dst
 ///The destination tile. Max size is 1024 Bytes.
-/// \param a
+/// \param src1
 ///The 1st source tile. Max size is 1024 Bytes.
-/// \param b
+/// \param src2
 ///The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b))
+__DEFAULT_FN_ATTRS_FP8 static void
+__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
+  dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
+  src1.tile, src2.tile);
+}
 
-/// Perform the dot product of an HF8 value \a a by a BF8 value \a b
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
+_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform the dot product of an HF8 

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits


@@ -0,0 +1,94 @@
+/*===- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE 
===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+"Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_FP16TRANSPOSEINTRIN_H
+#define __AMX_FP16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-fp16,amx-transpose")))
+
+/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
+///tiles \a a and \a b, accumulating the intermediate single-precision
+///(32-bit) floating-point elements with elements in \a dst, and store the
+///32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO (a.colsb / 4) - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+0])
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+1])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps(dst, a, b)

fzou1 wrote:

ditto

https://github.com/llvm/llvm-project/pull/115660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits


@@ -0,0 +1,301 @@
+/*===- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE 
--===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error 
\
+"Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> 
instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS 
\
+  __attribute__((__always_inline__, __nodebug__,   
\
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The imaginary part of the \a a 
element
+///is multiplied with the real part of the corresponding \a b element, and
+///the real part of the \a a element is multiplied with the imaginary part
+///of the corresponding \a b elements. The two accumulated results are
+///added, and then accumulated into the corresponding row and column of
+///\a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+1])
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+0])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The real part of the \a a element is
+///multiplied with the real part of the corresponding \a b element, and the
+///negated imaginary part of the \a a element is multiplied with the
+///imaginary part of the corresponding \a b elements. The two accumulated
+///results are added, and then accumulated into the corresponding row and
+///column of \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+0])
+///tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+1])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///containing complex elements and accumulate the results into a packe

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits


@@ -0,0 +1,94 @@
+/*===- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE 
===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error 
\
+"Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS 
\
+  __attribute__((__always_inline__, __nodebug__,   
\
+ __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///tiles \a a and \a b, accumulating the intermediate single-precision
+///(32-bit) floating-point elements with elements in \a dst, and store the
+///32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO (a.colsb / 4) - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+///FP32(b.row[k].bf16[2*n+0])
+///tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+///FP32(b.row[k].bf16[2*n+1])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)

fzou1 wrote:

__builtin_ia32_ttdpbf16ps(dst, a, b) -> __builtin_ia32_ttdpbf16ps((dst), (a), 
(b))

https://github.com/llvm/llvm-project/pull/115660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits


@@ -0,0 +1,301 @@
+/*===- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE 
--===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error 
\
+"Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> 
instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS 
\
+  __attribute__((__always_inline__, __nodebug__,   
\
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The imaginary part of the \a a 
element
+///is multiplied with the real part of the corresponding \a b element, and
+///the real part of the \a a element is multiplied with the imaginary part
+///of the corresponding \a b elements. The two accumulated results are
+///added, and then accumulated into the corresponding row and column of
+///\a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+1])
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+0])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)

fzou1 wrote:

ditto

https://github.com/llvm/llvm-project/pull/115660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits


@@ -0,0 +1,301 @@
+/*===- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE 
--===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error 
\
+"Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> 
instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS 
\
+  __attribute__((__always_inline__, __nodebug__,   
\
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The imaginary part of the \a a 
element
+///is multiplied with the real part of the corresponding \a b element, and
+///the real part of the \a a element is multiplied with the imaginary part
+///of the corresponding \a b elements. The two accumulated results are
+///added, and then accumulated into the corresponding row and column of
+///\a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+1])
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+0])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The real part of the \a a element is
+///multiplied with the real part of the corresponding \a b element, and the
+///negated imaginary part of the \a a element is multiplied with the
+///imaginary part of the corresponding \a b elements. The two accumulated
+///results are added, and then accumulated into the corresponding row and
+///column of \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+0])
+///tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+1])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)

fzou1 wrote:

ditto

https://github.com/llvm/llvm-project/pull/115660
___
cfe-commits ma

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits


@@ -0,0 +1,301 @@
+/*===- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE 
--===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+
+#ifndef __IMMINTRIN_H
+#error 
\
+"Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> 
instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS 
\
+  __attribute__((__always_inline__, __nodebug__,   
\
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The imaginary part of the \a a 
element
+///is multiplied with the real part of the corresponding \a b element, and
+///the real part of the \a a element is multiplied with the imaginary part
+///of the corresponding \a b elements. The two accumulated results are
+///added, and then accumulated into the corresponding row and column of
+///\a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+1])
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+0])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///accumulate the results into a packed single precision tile. Each dword
+///element in input tiles \a a and \a b is interpreted as a complex number
+///with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///of (transposed column of \a a, column of \a b), it performs a set of
+///multiplication and accumulations on all corresponding complex numbers
+///(one from \a a and one from \a b). The real part of the \a a element is
+///multiplied with the real part of the corresponding \a b element, and the
+///negated imaginary part of the \a a element is multiplied with the
+///imaginary part of the corresponding \a b elements. The two accumulated
+///results are added, and then accumulated into the corresponding row and
+///column of \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///tmp := dst.row[m]
+///FOR k := 0 TO a.rows - 1
+///FOR n := 0 TO (dst.colsb / 4) - 1
+///tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 
FP32(b.row[k].fp16[2*n+0])
+///tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * 
FP32(b.row[k].fp16[2*n+1])
+///ENDFOR
+///ENDFOR
+///write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param a
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///containing complex elements and accumulate the results into a packe

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -919,23 +1017,66 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
   return true;
 }
 
+static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx,
+   bool IsRow) {
+  if (!isAMXIntrinsic(Inst))
+return nullptr;
+
+  auto *II = cast<IntrinsicInst>(Inst);
+  if (IsRow)
+return II->getOperand(0);
+
+  assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
+  return II->getOperand(ShapeIdx + 1);
+}
+
 // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
 // store <256 x i32> %43, <256 x i32>* %p, align 64
 // -->
 // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p,
 //   i64 64, x86_amx %42)
 bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
   Value *Tile = Cast->getOperand(0);
-  // TODO: If it is cast intrinsic or phi node, we can propagate the
-  // shape information through def-use chain.
-  if (!isAMXIntrinsic(Tile))
+
+  assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");
+
+  // TODO: Specially handle the mult-use case.

fzou1 wrote:

mult->multi. The same below.

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ 
-*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * 
===---===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE   
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride)   
\
+  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride) 
\
+  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride)   
\
+  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride) 
\
+  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param src
+///The 1st source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+///
+/// FOR i := 0 TO (dst.rows-1)
+///tmp[511:0] := 0
+///FOR j := 0 TO (dst.colsb/4-1)
+///tmp.dword[j] := src.row[j].dword[i]
+///ENDFOR
+///dst.row[i] := tmp
+/// ENDFOR
+///
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
+unsigned short row, unsigned short col0, unsigned short col1,
+_tile1024i *dst0, _tile1024i *dst1, const void *base,
+__SIZE_TYPE__ stride) {
+  // Use __tile1024i_1024a* to escape the alignment check in
+  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a 
*)dst0,
+  (_tile1024i_1024a *)dst1, base,
+  (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE 
_tile_2rpntlvwz0t1_internal(
+unsigned short row, unsigned short col0, unsigned short col1,
+_tile1024i *dst0, _tile1024i *dst1, const void *base,
+__SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz0t1_internal(
+  row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, 
base,
+  (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
+unsigned short row, unsigned short col0, unsigned short col1,
+_tile1024i *dst0, _tile1024i *dst1, const void *base,
+__SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a 
*)dst0,
+  (_tile1024i_1024a *)dst1, base,
+  (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE 
_tile_2rpntlvwz1t1_internal(
+unsigned short row, unsigned short col0, unsigned short col1,
+_tile1024i *dst0, _tile1024i *dst1, const void *base,
+__SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1t1_internal(
+  row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, 
base,
+  (__SIZE_TYPE__)(stride));
+}
+
+// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
+_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
+  return __builtin_ia32_ttransposed_internal(m, n, src);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the 

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -34,9 +34,31 @@ class ShapeT {
 if (MRI)
   deduceImm(MRI);
   }
+  // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)

fzou1 wrote:

mult -> multiple

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -121,12 +137,96 @@ static Instruction 
*getFirstNonAllocaInTheEntryBlock(Function &F) {
   llvm_unreachable("No terminator in the entry block!");
 }
 
-static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+class ShapeCalculator {
+private:
+  TargetMachine *TM = nullptr;
+
+  // In AMX intrinsics we let Shape = {Row, Col}, but the
+  // RealCol = Col / ElementSize. We may use the RealCol
+  // as a new Row for other new created AMX intrinsics.
+  std::map<Value *, Value *> Col2Row, Row2Col;
+
+public:
+  ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {}
+  std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
+  std::pair<Value *, Value *> getShape(PHINode *Phi);
+  Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
+  Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity);
+};
+
+Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
+  unsigned Granularity) {
+  if (Col2Row.count(V))
+return Col2Row[V];
+  IRBuilder<> Builder(II);
+  Value *RealRow = nullptr;
+  if (isa<ConstantInt>(V))
+RealRow =
+Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) / Granularity);
+  else if (isa<Instruction>(V)) {
+// When it is not a const value and it is not a function argument, we
+// create Row after the definition of V instead of
+// before II. For example, II is %118, we try to getshape for %117:
+//   %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
+//   i32> %115).
+//   %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
+//   %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
+//   %117).
+// If we create %row = udiv i16 %106, 4 before %118(aka. II), then its
+// definition is after its user(new tileload for %117).
+// So, the best choice is to create %row right after the definition of
+// %106.
+Builder.SetInsertPoint(cast<Instruction>(V));
+RealRow = Builder.CreateUDiv(V, Builder.getInt16(4));
+cast<Instruction>(RealRow)->moveAfter(cast<Instruction>(V));
+  } else {
+// When it is not a const value and it is a function argument, we create
+// Row at the entry bb.
+IRBuilder<> NewBuilder(
+getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
+RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity));
+  }
+  Col2Row[V] = RealRow;
+  return RealRow;
+}
+
+Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
+  unsigned Granularity) {
+  if (Row2Col.count(V))
+return Row2Col[V];
+  IRBuilder<> Builder(II);
+  Value *RealCol = nullptr;
+  if (isa<ConstantInt>(V))
+RealCol =
+Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity);
+  else if (isa<Instruction>(V)) {
+Builder.SetInsertPoint(cast<Instruction>(V));
+RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity));
+cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
+  } else {
+// When it is not a const value and it is a function argument, we create
+// Row at the entry bb.

fzou1 wrote:

Row -> Column

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -16920,6 +16920,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned 
BuiltinID,
 // instruction, but it will create a memset that won't be optimized away.
 return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
   }
+  // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
+  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+Intrinsic::ID IID;
+switch (BuiltinID) {
+default:
+  llvm_unreachable("Unsupported intrinsic!");
+case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+  IID = Intrinsic::x86_t2rpntlvwz0_internal;
+  break;
+case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+  IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
+  break;
+case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+  IID = Intrinsic::x86_t2rpntlvwz1_internal;
+  break;
+case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+  IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
+  break;
+}
+
+// Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
+Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
+ {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
+
+auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
+assert(PtrTy && "arg3 must be of pointer type");
+QualType PtreeTy = PtrTy->getPointeeType();
+llvm::Type *TyPtee = ConvertType(PtreeTy);
+
+// Bitcast amx type (x86_amx) to vector type (256 x i32)
+// Then store tile0 into DstPtr0
+Value *T0 = Builder.CreateExtractValue(Call, 0);
+Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+   {TyPtee}, {T0});
+Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
+
+// Then store tile1 into DstPtr1
+Value *T1 = Builder.CreateExtractValue(Call, 1);
+Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+   {TyPtee}, {T1});
+Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
+
+// Note: Here we escape directly use x86_tilestored64_internal to store
+// the results due to it can't make sure the Mem writen scope. This may

fzou1 wrote:

writen -> written

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ 
-*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * 
===---===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE   
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride)   
\
+  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride) 
\
+  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride)   
\
+  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride) 
\
+  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+///The destination tile. Max size is 1024 Bytes.
+/// \param src
+///The 1st source tile. Max size is 1024 Bytes.

fzou1 wrote:

Removed "1st" from description since there is only 1 source tile.

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -80,28 +80,41 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
 INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
 "Fast Tile Register Configure", false, false)
 
-static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
   // There is no phi instruction after register allocation.
   assert(MI.isPHI() == false);
   // The instruction must have 3 operands: tile def, row, col.
   // It should be AMX pseudo instruction that have shape operand.
   if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
   !MI.isPseudo())
-return false;
+return 0;
   MachineOperand &MO = MI.getOperand(0);
 
   if (MO.isReg()) {
 Register Reg = MO.getReg();
-// FIXME it may be used after Greedy RA and the physical
+// FIXME: It may be used after Greedy RA and the physical
 // register is not rewritten yet.
-if (Reg.isVirtual() &&
-MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
-  return true;
+if (Reg.isVirtual()) {

fzou1 wrote:

It's okay.

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-10-31 Thread Feng Zou via cfe-commits


@@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand {
 Inst.addOperand(MCOperand::createReg(Reg));
   }
 
+  bool isTILEPair() const {
+return Kind == Register &&
+   X86MCRegisterClasses[X86::TILERegClassID].contains(getReg());

fzou1 wrote:

Should X86::TILERegClassID be X86::TILEPAIRRegClassID?

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-11-01 Thread Feng Zou via cfe-commits

https://github.com/fzou1 deleted 
https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)

2024-11-01 Thread Feng Zou via cfe-commits

https://github.com/fzou1 approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/113532
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-09 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created 
https://github.com/llvm/llvm-project/pull/115625

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

>From b1d9799b99b45b5af2b63868c4c3b139dbf9378c Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Sat, 26 Oct 2024 18:44:32 +0800
Subject: [PATCH] [X86][AMX] Support AMX-TF32

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |   1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  15 +-
 clang/include/clang/Driver/Options.td |   2 +
 clang/lib/Basic/Targets/X86.cpp   |   6 +
 clang/lib/Basic/Targets/X86.h |   1 +
 clang/lib/Headers/CMakeLists.txt  |   1 +
 clang/lib/Headers/amxtf32intrin.h | 194 ++
 clang/lib/Headers/immintrin.h |   4 +
 clang/lib/Sema/SemaX86.cpp|   2 +
 clang/test/CodeGen/X86/amx_tf32.c |  17 ++
 clang/test/CodeGen/X86/amx_tf32_api.c |  27 +++
 clang/test/CodeGen/X86/amx_tf32_errors.c  |  23 +++
 clang/test/CodeGen/X86/amx_tf32_inline_asm.c  |  18 ++
 clang/test/Driver/x86-target-features.c   |   7 +
 clang/test/Preprocessor/x86_target_features.c |   9 +
 llvm/include/llvm/IR/IntrinsicsX86.td |  19 ++
 .../llvm/TargetParser/X86TargetParser.def |   1 +
 llvm/lib/Target/X86/X86.td|   3 +
 llvm/lib/Target/X86/X86ExpandPseudo.cpp   |  11 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  22 ++
 llvm/lib/Target/X86/X86InstrAMX.td|  52 +
 llvm/lib/Target/X86/X86InstrPredicates.td |   1 +
 llvm/lib/Target/X86/X86LowerAMXType.cpp   |  20 +-
 llvm/lib/Target/X86/X86RegisterInfo.cpp   |   4 +-
 llvm/lib/TargetParser/Host.cpp|   1 +
 llvm/lib/TargetParser/X86TargetParser.cpp |   1 +
 llvm/test/CodeGen/X86/amx-tf32-internal.ll|  47 +
 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll  |  23 +++
 .../Disassembler/X86/AMX/x86-64-amx-tf32.txt  |  19 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s|  17 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s  |  17 ++
 31 files changed, 578 insertions(+), 7 deletions(-)
 create mode 100644 clang/lib/Headers/amxtf32intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_tf32.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_api.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-internal.ll
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3424e0e6f34c9..e235a04f78112b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -740,6 +740,7 @@ X86 Support
 - Support ISA of ``AMX-FP8``.
 - Support ISA of ``AMX-TRANSPOSE``.
 - Support ISA of ``AMX-AVX512``.
+- Support ISA of ``AMX-TF32``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..25c10d39df32e2 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, 
"V32yUsUsV256iUi", "n",
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, 
"V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
+TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, 
"V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", 
"n", "amx-avx512,avx10
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", 
"amx-avx512,avx10.2-512")
 
-TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", 
"cmpccxadd")
-
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
@@ -185,6 +184,14 @@ TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", 
"n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc",

[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-09 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/115625

>From b1d9799b99b45b5af2b63868c4c3b139dbf9378c Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Sat, 26 Oct 2024 18:44:32 +0800
Subject: [PATCH 1/2] [X86][AMX] Support AMX-TF32

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |   1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  15 +-
 clang/include/clang/Driver/Options.td |   2 +
 clang/lib/Basic/Targets/X86.cpp   |   6 +
 clang/lib/Basic/Targets/X86.h |   1 +
 clang/lib/Headers/CMakeLists.txt  |   1 +
 clang/lib/Headers/amxtf32intrin.h | 194 ++
 clang/lib/Headers/immintrin.h |   4 +
 clang/lib/Sema/SemaX86.cpp|   2 +
 clang/test/CodeGen/X86/amx_tf32.c |  17 ++
 clang/test/CodeGen/X86/amx_tf32_api.c |  27 +++
 clang/test/CodeGen/X86/amx_tf32_errors.c  |  23 +++
 clang/test/CodeGen/X86/amx_tf32_inline_asm.c  |  18 ++
 clang/test/Driver/x86-target-features.c   |   7 +
 clang/test/Preprocessor/x86_target_features.c |   9 +
 llvm/include/llvm/IR/IntrinsicsX86.td |  19 ++
 .../llvm/TargetParser/X86TargetParser.def |   1 +
 llvm/lib/Target/X86/X86.td|   3 +
 llvm/lib/Target/X86/X86ExpandPseudo.cpp   |  11 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  22 ++
 llvm/lib/Target/X86/X86InstrAMX.td|  52 +
 llvm/lib/Target/X86/X86InstrPredicates.td |   1 +
 llvm/lib/Target/X86/X86LowerAMXType.cpp   |  20 +-
 llvm/lib/Target/X86/X86RegisterInfo.cpp   |   4 +-
 llvm/lib/TargetParser/Host.cpp|   1 +
 llvm/lib/TargetParser/X86TargetParser.cpp |   1 +
 llvm/test/CodeGen/X86/amx-tf32-internal.ll|  47 +
 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll  |  23 +++
 .../Disassembler/X86/AMX/x86-64-amx-tf32.txt  |  19 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s|  17 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s  |  17 ++
 31 files changed, 578 insertions(+), 7 deletions(-)
 create mode 100644 clang/lib/Headers/amxtf32intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_tf32.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_api.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-internal.ll
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3424e0e6f34c9..e235a04f78112b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -740,6 +740,7 @@ X86 Support
 - Support ISA of ``AMX-FP8``.
 - Support ISA of ``AMX-TRANSPOSE``.
 - Support ISA of ``AMX-AVX512``.
+- Support ISA of ``AMX-TF32``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..25c10d39df32e2 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, 
"V32yUsUsV256iUi", "n",
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, 
"V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
+TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, 
"V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", 
"n", "amx-avx512,avx10
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", 
"amx-avx512,avx10.2-512")
 
-TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", 
"cmpccxadd")
-
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
@@ -185,6 +184,14 @@ TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", 
"n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdph

[clang] [llvm] [X86][AMX] Support AMX-TF32 (PR #115625)

2024-11-09 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/115625

>From b1d9799b99b45b5af2b63868c4c3b139dbf9378c Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Sat, 26 Oct 2024 18:44:32 +0800
Subject: [PATCH 1/3] [X86][AMX] Support AMX-TF32

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst   |   1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  15 +-
 clang/include/clang/Driver/Options.td |   2 +
 clang/lib/Basic/Targets/X86.cpp   |   6 +
 clang/lib/Basic/Targets/X86.h |   1 +
 clang/lib/Headers/CMakeLists.txt  |   1 +
 clang/lib/Headers/amxtf32intrin.h | 194 ++
 clang/lib/Headers/immintrin.h |   4 +
 clang/lib/Sema/SemaX86.cpp|   2 +
 clang/test/CodeGen/X86/amx_tf32.c |  17 ++
 clang/test/CodeGen/X86/amx_tf32_api.c |  27 +++
 clang/test/CodeGen/X86/amx_tf32_errors.c  |  23 +++
 clang/test/CodeGen/X86/amx_tf32_inline_asm.c  |  18 ++
 clang/test/Driver/x86-target-features.c   |   7 +
 clang/test/Preprocessor/x86_target_features.c |   9 +
 llvm/include/llvm/IR/IntrinsicsX86.td |  19 ++
 .../llvm/TargetParser/X86TargetParser.def |   1 +
 llvm/lib/Target/X86/X86.td|   3 +
 llvm/lib/Target/X86/X86ExpandPseudo.cpp   |  11 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  22 ++
 llvm/lib/Target/X86/X86InstrAMX.td|  52 +
 llvm/lib/Target/X86/X86InstrPredicates.td |   1 +
 llvm/lib/Target/X86/X86LowerAMXType.cpp   |  20 +-
 llvm/lib/Target/X86/X86RegisterInfo.cpp   |   4 +-
 llvm/lib/TargetParser/Host.cpp|   1 +
 llvm/lib/TargetParser/X86TargetParser.cpp |   1 +
 llvm/test/CodeGen/X86/amx-tf32-internal.ll|  47 +
 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll  |  23 +++
 .../Disassembler/X86/AMX/x86-64-amx-tf32.txt  |  19 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s|  17 ++
 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s  |  17 ++
 31 files changed, 578 insertions(+), 7 deletions(-)
 create mode 100644 clang/lib/Headers/amxtf32intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_tf32.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_api.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_tf32_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-internal.ll
 create mode 100644 llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3424e0e6f34c9..e235a04f78112b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -740,6 +740,7 @@ X86 Support
 - Support ISA of ``AMX-FP8``.
 - Support ISA of ``AMX-TRANSPOSE``.
 - Support ISA of ``AMX-AVX512``.
+- Support ISA of ``AMX-TF32``.
 
 Arm and AArch64 Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..25c10d39df32e2 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, 
"V32yUsUsV256iUi", "n",
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", 
"amx-avx512,avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, 
"V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
+TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, 
"V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", 
"n", "amx-avx512,avx10
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", 
"amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", 
"amx-avx512,avx10.2-512")
 
-TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", 
"cmpccxadd")
-
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
@@ -185,6 +184,14 @@ TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", 
"n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdph

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-12 Thread Feng Zou via cfe-commits


@@ -275,6 +276,27 @@ std::pair<Value *, Value *> 
ShapeCalculator::getShape(IntrinsicInst *II,
 Col = II->getArgOperand(1);
 break;
   }
+  case Intrinsic::x86_ttdpbf16ps_internal:
+  case Intrinsic::x86_ttdpfp16ps_internal:
+  case Intrinsic::x86_ttcmmimfp16ps_internal:
+  case Intrinsic::x86_ttcmmrlfp16ps_internal:
+  case Intrinsic::x86_tconjtcmmimfp16ps_internal: {

fzou1 wrote:

You may rebase the branch and merge the code with AMX-TF32 intrinsic 
(https://github.com/llvm/llvm-project/pull/115625/files#diff-69bcebccbbe093fbfcf4393d7115a823206ee71a8b98b78d83d12642e1afcc5aR279)

https://github.com/llvm/llvm-project/pull/115660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Add AMX FP8 new APIs (PR #115829)

2024-11-12 Thread Feng Zou via cfe-commits

fzou1 wrote:

> Missing IR test?

Sorry. Added. Thanks.

https://github.com/llvm/llvm-project/pull/115829
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-06 Thread Feng Zou via cfe-commits


@@ -0,0 +1,381 @@
+/*===- amxavx512intrin.h - AMXAVX512 
===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512  
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+///the int32 source elements to fp32. The row of the tile is selected by an
+///32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.f32[i] := 
CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
+///
+/// \param tsrc
+///The 1st source tile. Max size is 1024 Bytes.

fzou1 wrote:

Remove "1st" since there is only one source tile. The following occurrences 
should be updated in the same way.

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-06 Thread Feng Zou via cfe-commits


@@ -0,0 +1,381 @@
+/*===- amxavx512intrin.h - AMXAVX512 
===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512  
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+///the int32 source elements to fp32. The row of the tile is selected by an

fzou1 wrote:

an -> a.

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-06 Thread Feng Zou via cfe-commits


@@ -0,0 +1,381 @@
+/*===- amxavx512intrin.h - AMXAVX512 
===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512  
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+///the int32 source elements to fp32. The row of the tile is selected by an
+///32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.f32[i] := 
CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
+///
+/// \param tsrc
+///The 1st source tile. Max size is 1024 Bytes.
+/// \param row
+///The row of the source tile
+#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+///the fp32 source elements to bf16. It places the resulting bf16 elements
+///in the high 16 bits within each dword. The row of the tile is selected
+///by an 32b GPR.

fzou1 wrote:

an -> a. The following "an 32b" should be updated to "a 32b" too.

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-06 Thread Feng Zou via cfe-commits


@@ -559,12 +559,68 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
 return true;
   }
   case X86::PTILELOADDV:
-  case X86::PTILELOADDT1V: {
+  case X86::PTILELOADDT1V:
+  case X86::PTCVTROWD2PSrreV:
+  case X86::PTCVTROWD2PSrriV:
+  case X86::PTCVTROWPS2PBF16HrreV:
+  case X86::PTCVTROWPS2PBF16HrriV:
+  case X86::PTCVTROWPS2PBF16LrreV:
+  case X86::PTCVTROWPS2PBF16LrriV:
+  case X86::PTCVTROWPS2PHHrreV:
+  case X86::PTCVTROWPS2PHHrriV:
+  case X86::PTCVTROWPS2PHLrreV:
+  case X86::PTCVTROWPS2PHLrriV:
+  case X86::PTILEMOVROWrreV:
+  case X86::PTILEMOVROWrriV: {
 for (unsigned i = 2; i > 0; --i)
   MI.removeOperand(i);
-unsigned Opc = Opcode == X86::PTILELOADDV
-   ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
-   : GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
+unsigned Opc;
+switch (Opcode) {
+case X86::PTILELOADDV:
+  Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
+  break;
+case X86::PTILELOADDT1V:
+  Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
+  break;
+case X86::PTCVTROWD2PSrreV:
+  Opc = X86::TCVTROWD2PSrre;
+  break;
+case X86::PTCVTROWD2PSrriV:
+  Opc = X86::TCVTROWD2PSrri;
+  break;
+case X86::PTCVTROWPS2PBF16HrreV:
+  Opc = X86::TCVTROWPS2PBF16Hrre;
+  break;
+case X86::PTCVTROWPS2PBF16HrriV:
+  Opc = X86::TCVTROWPS2PBF16Hrri;
+  break;
+case X86::PTCVTROWPS2PBF16LrreV:
+  Opc = X86::TCVTROWPS2PBF16Lrre;
+  break;
+case X86::PTCVTROWPS2PBF16LrriV:
+  Opc = X86::TCVTROWPS2PBF16Lrri;
+  break;
+case X86::PTCVTROWPS2PHHrreV:
+  Opc = X86::TCVTROWPS2PHHrre;
+  break;
+case X86::PTCVTROWPS2PHHrriV:
+  Opc = X86::TCVTROWPS2PHHrri;
+  break;
+case X86::PTCVTROWPS2PHLrreV:
+  Opc = X86::TCVTROWPS2PHLrre;
+  break;
+case X86::PTCVTROWPS2PHLrriV:
+  Opc = X86::TCVTROWPS2PHLrri;
+  break;
+case X86::PTILEMOVROWrreV:
+  Opc = X86::TILEMOVROWrre;
+  break;
+case X86::PTILEMOVROWrriV:
+  Opc = X86::TILEMOVROWrri;
+  break;
+default:
+  llvm_unreachable("Impossible Opcode!");

fzou1 wrote:

Better to change "Impossible" to "Invalid".

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-06 Thread Feng Zou via cfe-commits


@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, 
"vUsUsUsV256i*V256i*vC*z",
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", 
"n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, 
"vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", 
"amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", 
"amx-avx512")

fzou1 wrote:

Does the "avx10.2-512" feature need to be added for the intrinsics here and elsewhere?

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-07 Thread Feng Zou via cfe-commits

https://github.com/fzou1 commented:

LGTM, except that the last place is probably missing the avx10.2-512 dependency.

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-07 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-07 Thread Feng Zou via cfe-commits


@@ -369,3 +369,150 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
 }
   }
 } // HasAMXTILE, HasAMXTRANSPOSE
+
+multiclass m_tcvtrowd2ps {
+  let Predicates = [HasAMXAVX512, In64BitMode] in {

fzou1 wrote:

Should HasAVX10_2_512 be added on lines 374, 390 and 454?

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-07 Thread Feng Zou via cfe-commits


@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, 
"vUsUsUsV256i*V256i*vC*z",
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", 
"n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, 
"vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", 
"amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", 
"amx-avx512")

fzou1 wrote:

Is it necessary to add the avx10.2-512 feature for these internal APIs? With that, 
we could detect errors for the new APIs when the AVX10.2-512 target feature is absent.

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-07 Thread Feng Zou via cfe-commits


@@ -0,0 +1,381 @@
+/*===- amxavx512intrin.h - AMXAVX512 
===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*======
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512  
\
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))

fzou1 wrote:

If AVX10.2-512 feature dependency is needed for internal APIs, we may create 
another attribute with AVX10.2-512 and add it to internal APIs.

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

2024-11-08 Thread Feng Zou via cfe-commits

https://github.com/fzou1 approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/114070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][MC,LLD][NFC] Rename R_X86_64_REX2_GOTPCRELX (PR #116737)

2024-11-18 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/116737

>From c1716f030d8503b5a4742447ef8883d900521c34 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 19 Nov 2024 11:19:17 +0800
Subject: [PATCH 1/2] [X86][MC,LLD][NFC] Rename R_X86_64_REX2_GOTPCRELX to
 R_X86_64_CODE_4_GOTPCRELX

This is to align with GCC/binutils and ABI.
GCC/binutils: 
https://github.com/bminor/binutils-gdb/commit/3d5a60de52556f6a53d71d7e607c6696450ae3e4
and 
https://github.com/bminor/binutils-gdb/commit/4a54cb06585f568031dfd291d0fe45979ad75e98

ABI: 
https://gitlab.com/x86-psABIs/x86-64-ABI/-/commit/357de358ba68eb779822dfcbb45f7ee2d9d09193
---
 lld/ELF/Arch/X86_64.cpp   | 13 ++--
 lld/test/ELF/x86-64-gotpc-no-relax-err.s  |  2 +-
 lld/test/ELF/x86-64-gotpc-relax-nopic.s   |  2 +-
 lld/test/ELF/x86-64-gotpc-relax.s |  2 +-
 .../llvm/BinaryFormat/ELFRelocs/x86_64.def|  2 +-
 llvm/lib/MC/MCTargetOptionsCommandFlags.cpp   |  2 +-
 .../X86/MCTargetDesc/X86ELFObjectWriter.cpp   |  4 ++--
 .../X86/MCTargetDesc/X86MCCodeEmitter.cpp |  2 +-
 llvm/test/MC/ELF/relocation-alias.s   |  2 +-
 llvm/test/MC/X86/gotpcrelx.s  | 20 +--
 llvm/test/MC/X86/reloc-directive-elf-64.s |  6 +++---
 11 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index e9267bd4128d18..2dcce5c224d5d6 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -394,7 +394,7 @@ RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
   case R_X86_64_GOTPCREL:
   case R_X86_64_GOTPCRELX:
   case R_X86_64_REX_GOTPCRELX:
-  case R_X86_64_REX2_GOTPCRELX:
+  case R_X86_64_CODE_4_GOTPCRELX:
   case R_X86_64_GOTTPOFF:
 return R_GOT_PC;
   case R_X86_64_GOTOFF64:
@@ -738,7 +738,7 @@ int64_t X86_64::getImplicitAddend(const uint8_t *buf, 
RelType type) const {
   case R_X86_64_GOTPCREL:
   case R_X86_64_GOTPCRELX:
   case R_X86_64_REX_GOTPCRELX:
-  case R_X86_64_REX2_GOTPCRELX:
+  case R_X86_64_CODE_4_GOTPCRELX:
   case R_X86_64_PC32:
   case R_X86_64_GOTTPOFF:
   case R_X86_64_PLT32:
@@ -821,7 +821,7 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, 
uint64_t val) const {
 break;
   case R_X86_64_GOTPCRELX:
   case R_X86_64_REX_GOTPCRELX:
-  case R_X86_64_REX2_GOTPCRELX:
+  case R_X86_64_CODE_4_GOTPCRELX:
 if (rel.expr != R_GOT_PC) {
   relaxGot(loc, rel, val);
 } else {
@@ -873,13 +873,13 @@ void X86_64::relocate(uint8_t *loc, const Relocation 
&rel, uint64_t val) const {
 
 RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
 const uint8_t *loc) const {
-  // Only R_X86_64_[REX_]|[REX2_]GOTPCRELX can be relaxed. GNU as may emit
+  // Only R_X86_64_[REX_]|[CODE_4_]GOTPCRELX can be relaxed. GNU as may emit
   // GOTPCRELX with addend != -4. Such an instruction does not load the full 
GOT
   // entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip),
   // %rax (addend=0) loads the high 32 bits of the GOT entry.
   if (!ctx.arg.relax || addend != -4 ||
   (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX &&
-   type != R_X86_64_REX2_GOTPCRELX))
+   type != R_X86_64_CODE_4_GOTPCRELX))
 return R_GOT_PC;
   const uint8_t op = loc[-2];
   const uint8_t modRm = loc[-1];
@@ -1002,7 +1002,8 @@ static void relaxGot(uint8_t *loc, const Relocation &rel, 
uint64_t val) {
 // We are relaxing a rip relative to an absolute, so compensate
 // for the old -4 addend.
 assert(!rel.sym->file || !rel.sym->file->ctx.arg.isPic);
-relaxGotNoPic(loc, val + 4, op, modRm, rel.type == 
R_X86_64_REX2_GOTPCRELX);
+relaxGotNoPic(loc, val + 4, op, modRm,
+  rel.type == R_X86_64_CODE_4_GOTPCRELX);
 return;
   }
 
diff --git a/lld/test/ELF/x86-64-gotpc-no-relax-err.s 
b/lld/test/ELF/x86-64-gotpc-no-relax-err.s
index 4280c8fd1dc97e..8452090e2c35a0 100644
--- a/lld/test/ELF/x86-64-gotpc-no-relax-err.s
+++ b/lld/test/ELF/x86-64-gotpc-no-relax-err.s
@@ -13,7 +13,7 @@
 # CHECK-NEXT: error: {{.*}}:(.text+0x9): relocation R_X86_64_REX_GOTPCRELX out 
of range: 2147483659 is not in [-2147483648, 2147483647]; references 
'__stop_data'
 # CHECK-NEXT: >>> defined in 
 # CHECK-EMPTY:
-# CHECK-NEXT: error: {{.*}}:(.text+0x11): relocation R_X86_64_REX2_GOTPCRELX 
out of range: 2147483651 is not in [-2147483648, 2147483647]; references 
'__stop_data'
+# CHECK-NEXT: error: {{.*}}:(.text+0x11): relocation R_X86_64_CODE_4_GOTPCRELX 
out of range: 2147483651 is not in [-2147483648, 2147483647]; references 
'__stop_data'
 # CHECK-NEXT: >>> defined in 
 
 #--- a.s
diff --git a/lld/test/ELF/x86-64-gotpc-relax-nopic.s 
b/lld/test/ELF/x86-64-gotpc-relax-nopic.s
index e3cd93d1d57962..be55c7d7006fe5 100644
--- a/lld/test/ELF/x86-64-gotpc-relax-nopic.s
+++ b/lld/test/ELF/x86-64-gotpc-relax-nopic.s
@@ -134,7 +134,7 @@ _start:
   xorqbar@GOTPCREL(%rip), %r8
   testq   %r15, bar@GOTPCREL(%r

[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

2024-11-13 Thread Feng Zou via cfe-commits

https://github.com/fzou1 approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/115660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [X86][AMX-AVX512][NFC] Remove P from intrinsic and instruction name (PR #123270)

2025-01-17 Thread Feng Zou via cfe-commits

https://github.com/fzou1 approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/123270
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-21 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created 
https://github.com/llvm/llvm-project/pull/136660

Introduce an option (-mapx-relax-relocations) to control the emission of the 
new APX relocations. It's off by default to keep backward compatibility with 
old versions of ld and other linkers without APX support. EGPR and NDD are 
also suppressed to prevent instructions from being updated incorrectly by old 
versions of linkers.

>From a98371ac23728289b7e1b9a1516141158bf7c1e0 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 22 Apr 2025 11:05:29 +0800
Subject: [PATCH] [X86] Implement disabling APX relocations and EPGR/NDD instrs
 for them

Introduce an option (-mapx-relax-relocations) to control the emission of the
new APX relocations. It's off by default to keep backward compatibility with
older version of ld and other linkers without APX support. And EGPR and NDD are
also suppressed to avoid the instructions updated incorrectly by older version
of linker.
---
 clang/CMakeLists.txt  |   3 +
 clang/cmake/caches/Fuchsia-stage2.cmake   |   1 +
 clang/cmake/caches/Fuchsia.cmake  |   1 +
 clang/include/clang/Basic/CodeGenOptions.def  |   1 +
 clang/include/clang/Config/config.h.cmake |   3 +
 clang/include/clang/Driver/Options.td |   3 +
 clang/include/clang/Driver/ToolChain.h|   3 +
 clang/lib/CodeGen/BackendUtil.cpp |   1 +
 clang/lib/Driver/ToolChain.cpp|   4 +
 clang/lib/Driver/ToolChains/Clang.cpp |  10 ++
 clang/test/Driver/relax.s |  15 +-
 clang/tools/driver/cc1as_main.cpp |   5 +
 lld/ELF/Arch/X86_64.cpp   |   6 +-
 lld/test/ELF/tls-opt.s| 111 ++
 lld/test/ELF/x86-64-gotpc-no-relax-err.s  |   9 +-
 lld/test/ELF/x86-64-gotpc-relax-nopic.s   |  35 +++--
 lld/test/ELF/x86-64-gotpc-relax.s |  91 +++
 lld/test/ELF/x86-64-tlsdesc-gd.s  |  37 -
 llvm/include/llvm/MC/MCTargetOptions.h|   2 +
 .../llvm/MC/MCTargetOptionsCommandFlags.h |   2 +
 llvm/lib/LTO/LTO.cpp  |   1 +
 llvm/lib/MC/MCTargetOptionsCommandFlags.cpp   |  10 ++
 llvm/lib/Target/X86/CMakeLists.txt|   1 +
 .../lib/Target/X86/MCTargetDesc/X86BaseInfo.h |  22 +++
 .../X86/MCTargetDesc/X86ELFObjectWriter.cpp   |  20 ++-
 llvm/lib/Target/X86/X86.h |   1 +
 llvm/lib/Target/X86/X86InstrInfo.cpp  |  17 +-
 .../X86/X86SuppressEGPRAndNDDForReloc.cpp | 145 ++
 llvm/lib/Target/X86/X86TargetMachine.cpp  |   2 +
 llvm/test/CodeGen/X86/O0-pipeline.ll  |   1 +
 llvm/test/CodeGen/X86/apx/tls-desc.ll |  96 
 llvm/test/CodeGen/X86/opt-pipeline.ll |   3 +
 llvm/test/MC/ELF/relocation-alias.s   |   8 +-
 llvm/test/MC/X86/elf-reloc-tls.s  |  16 +-
 llvm/test/MC/X86/gotpcrelx.s  |  49 +-
 llvm/test/MC/X86/tlsdesc-64.s |   8 +-
 llvm/tools/gold/gold-plugin.cpp   |   2 +
 37 files changed, 635 insertions(+), 110 deletions(-)
 create mode 100644 llvm/lib/Target/X86/X86SuppressEGPRAndNDDForReloc.cpp
 create mode 100644 llvm/test/CodeGen/X86/apx/tls-desc.ll

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index c3f30e2a8e9c0..fa3063d6be9a7 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -219,6 +219,9 @@ set(ENABLE_LINKER_BUILD_ID OFF CACHE BOOL "pass --build-id 
to ld")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL
 "enable x86 relax relocations by default")
 
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL
+"Enable x86 APX relax relocations by default")
+
 set(PPC_LINUX_DEFAULT_IEEELONGDOUBLE OFF CACHE BOOL
 "Enable IEEE binary128 as default long double format on PowerPC Linux.")
 
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake 
b/clang/cmake/caches/Fuchsia-stage2.cmake
index 99890b8246ad7..22eef24b611a8 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -49,6 +49,7 @@ set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 
 set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "")
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL "")
 
 # TODO(#67176): relative-vtables doesn't play well with different default
 # visibilities. Making everything hidden visibility causes other complications
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 83336589da305..550d94d1ec6a5 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -89,6 +89,7 @@ set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 
 set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "")
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL "")
 
 set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
 set(LLVM_ENABLE_BACKTRACES ON CACHE BOOL "")
diff --git a/clang/include/clang/Basic/CodeGe

[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-22 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > Introduce an option (-mapx-relax-relocations) to control the emission of 
> > the new APX relocations. It's off by default to keep backward compatibility 
> > with old version of ld and other linkers without APX support. And EGPR and 
> > NDD are also suppressed to avoid the instructions updated incorrectly by 
> > old version of linkers.
> 
> Not understand this. IIUC, either we should not emit APX instructions at all, 
> which is controlled by `-m[no-]apxf`, or we should not relax the all the 
> relocations, which is controlled by `-Wl,--no-relax`.

If the APX relocation types are emitted by MC, they cannot be recognized by old 
versions of linkers. This makes APX features unavailable on existing Linux OSes 
that have old versions of linkers. `--no-relax` only disables the GOT 
optimization in the linker; it cannot resolve the link error caused by the 
unsupported relocation types mentioned above.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-22 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/136660

>From a98371ac23728289b7e1b9a1516141158bf7c1e0 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 22 Apr 2025 11:05:29 +0800
Subject: [PATCH 1/2] [X86] Implement disabling APX relocations and EPGR/NDD
 instrs for them

Introduce an option (-mapx-relax-relocations) to control the emission of the
new APX relocations. It's off by default to keep backward compatibility with
older version of ld and other linkers without APX support. And EGPR and NDD are
also suppressed to avoid the instructions updated incorrectly by older version
of linker.
---
 clang/CMakeLists.txt  |   3 +
 clang/cmake/caches/Fuchsia-stage2.cmake   |   1 +
 clang/cmake/caches/Fuchsia.cmake  |   1 +
 clang/include/clang/Basic/CodeGenOptions.def  |   1 +
 clang/include/clang/Config/config.h.cmake |   3 +
 clang/include/clang/Driver/Options.td |   3 +
 clang/include/clang/Driver/ToolChain.h|   3 +
 clang/lib/CodeGen/BackendUtil.cpp |   1 +
 clang/lib/Driver/ToolChain.cpp|   4 +
 clang/lib/Driver/ToolChains/Clang.cpp |  10 ++
 clang/test/Driver/relax.s |  15 +-
 clang/tools/driver/cc1as_main.cpp |   5 +
 lld/ELF/Arch/X86_64.cpp   |   6 +-
 lld/test/ELF/tls-opt.s| 111 ++
 lld/test/ELF/x86-64-gotpc-no-relax-err.s  |   9 +-
 lld/test/ELF/x86-64-gotpc-relax-nopic.s   |  35 +++--
 lld/test/ELF/x86-64-gotpc-relax.s |  91 +++
 lld/test/ELF/x86-64-tlsdesc-gd.s  |  37 -
 llvm/include/llvm/MC/MCTargetOptions.h|   2 +
 .../llvm/MC/MCTargetOptionsCommandFlags.h |   2 +
 llvm/lib/LTO/LTO.cpp  |   1 +
 llvm/lib/MC/MCTargetOptionsCommandFlags.cpp   |  10 ++
 llvm/lib/Target/X86/CMakeLists.txt|   1 +
 .../lib/Target/X86/MCTargetDesc/X86BaseInfo.h |  22 +++
 .../X86/MCTargetDesc/X86ELFObjectWriter.cpp   |  20 ++-
 llvm/lib/Target/X86/X86.h |   1 +
 llvm/lib/Target/X86/X86InstrInfo.cpp  |  17 +-
 .../X86/X86SuppressEGPRAndNDDForReloc.cpp | 145 ++
 llvm/lib/Target/X86/X86TargetMachine.cpp  |   2 +
 llvm/test/CodeGen/X86/O0-pipeline.ll  |   1 +
 llvm/test/CodeGen/X86/apx/tls-desc.ll |  96 
 llvm/test/CodeGen/X86/opt-pipeline.ll |   3 +
 llvm/test/MC/ELF/relocation-alias.s   |   8 +-
 llvm/test/MC/X86/elf-reloc-tls.s  |  16 +-
 llvm/test/MC/X86/gotpcrelx.s  |  49 +-
 llvm/test/MC/X86/tlsdesc-64.s |   8 +-
 llvm/tools/gold/gold-plugin.cpp   |   2 +
 37 files changed, 635 insertions(+), 110 deletions(-)
 create mode 100644 llvm/lib/Target/X86/X86SuppressEGPRAndNDDForReloc.cpp
 create mode 100644 llvm/test/CodeGen/X86/apx/tls-desc.ll

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index c3f30e2a8e9c0..fa3063d6be9a7 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -219,6 +219,9 @@ set(ENABLE_LINKER_BUILD_ID OFF CACHE BOOL "pass --build-id 
to ld")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL
 "enable x86 relax relocations by default")
 
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL
+"Enable x86 APX relax relocations by default")
+
 set(PPC_LINUX_DEFAULT_IEEELONGDOUBLE OFF CACHE BOOL
 "Enable IEEE binary128 as default long double format on PowerPC Linux.")
 
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake 
b/clang/cmake/caches/Fuchsia-stage2.cmake
index 99890b8246ad7..22eef24b611a8 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -49,6 +49,7 @@ set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 
 set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "")
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL "")
 
 # TODO(#67176): relative-vtables doesn't play well with different default
 # visibilities. Making everything hidden visibility causes other complications
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 83336589da305..550d94d1ec6a5 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -89,6 +89,7 @@ set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 
 set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "")
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL "")
 
 set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
 set(LLVM_ENABLE_BACKTRACES ON CACHE BOOL "")
diff --git a/clang/include/clang/Basic/CodeGenOptions.def 
b/clang/include/clang/Basic/CodeGenOptions.def
index c5990fb248689..875facd6cfc63 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -201,6 +201,7 @@ CODEGENOPT(UniqueInternalLinkageNames, 1, 0) ///< Internal 
Linkage symbols get u
 CODEGENOPT(Spl

[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-27 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Add pass to suppress EPGR/NDD instructions for relocations (PR #136660)

2025-04-27 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Add pass to suppress EPGR/NDD instructions for relocations (PR #136660)

2025-04-27 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-24 Thread Feng Zou via cfe-commits

fzou1 wrote:

Any comments?

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -912,9 +912,9 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, 
uint64_t val) const {
   case R_X86_64_CODE_4_GOTPC32_TLSDESC:
   case R_X86_64_TLSDESC_CALL:
   case R_X86_64_TLSGD:
-if (rel.expr == R_RELAX_TLS_GD_TO_LE) {
+if (rel.expr == R_RELAX_TLS_GD_TO_LE && ctx.arg.relax) {

fzou1 wrote:

This allows tests or user programs that use APX EGPR/NDD/NF to pass with the 
"-no-relax" option.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -912,9 +912,9 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, 
uint64_t val) const {
   case R_X86_64_CODE_4_GOTPC32_TLSDESC:
   case R_X86_64_TLSDESC_CALL:
   case R_X86_64_TLSGD:
-if (rel.expr == R_RELAX_TLS_GD_TO_LE) {
+if (rel.expr == R_RELAX_TLS_GD_TO_LE && ctx.arg.relax) {

fzou1 wrote:

Use --no-relax to suppress TLS relaxation as well as GOTPCREL optimization.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -1,44 +1,84 @@
 // REQUIRES: x86
-// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
+// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o 
-x86-apx-relax-relocations=true
 // RUN: ld.lld %t.o -o %t1
 // RUN: llvm-readobj -r %t1 | FileCheck --check-prefix=NORELOC %s
-// RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t1 | FileCheck 
--check-prefix=DISASM %s
+// RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t1 | FileCheck 
--check-prefixes=DISASM,APXRELAX %s
+
+// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
+// RUN: ld.lld %t.o -o %t1 --no-relax

fzou1 wrote:

There are instructions using APX EGPR or NDD/NF in the test, and the 
R_X86_64_GOTTPOFF relocation is emitted, so we add this to suppress link 
relaxation for the TLS relocation in the LLD linker. I wonder if we need to 
emit a warning or error in MC for this case (if users write such assembly code).

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -0,0 +1,173 @@
+//===- X86SuppressAPXForReloc.cpp - Suppress APX features for relocations 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+/// \file
+///
+/// This pass is added to suppress APX features for relocations. It's used
+/// together with disabling emitting APX relocation types for backward
+/// compatibility with old version of linker (like before LD 2.43). It can 
avoid
+/// the instructions updated incorrectly by old version of linker if the
+/// instructions are with APX EGPR/NDD/NF features + the relocations other than
+/// APX ones (like GOTTPOFF).
+///
+//===--===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-suppress-apx-for-relocation"
+
+static cl::opt X86EnableAPXForRelocation(
+"x86-enable-apx-for-relocation",
+cl::desc("Enable APX features (EGPR, NDD and NF) for instructions with "
+ "relocations on x86-64 ELF"),
+cl::init(false));
+
+namespace {
+class X86SuppressAPXForRelocationPass : public MachineFunctionPass {
+public:
+  X86SuppressAPXForRelocationPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+return "X86 Suppress APX features for relocation";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  static char ID;
+};
+} // namespace
+
+char X86SuppressAPXForRelocationPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86SuppressAPXForRelocationPass, DEBUG_TYPE,
+  "X86 Suppress APX features for relocation", false, false)
+INITIALIZE_PASS_END(X86SuppressAPXForRelocationPass, DEBUG_TYPE,
+"X86 Suppress APX features for relocation", false, false)
+
+FunctionPass *llvm::createX86SuppressAPXForRelocationPass() {
+  return new X86SuppressAPXForRelocationPass();
+}
+
+static void suppressEGPRRegClass(MachineFunction &MF, MachineInstr &MI,
+ unsigned int OpNum) {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  auto Reg = MI.getOperand(OpNum).getReg();
+  if (!Reg.isVirtual()) {
+assert(!X86II::isApxExtendedReg(Reg) && "APX EGPR is used unexpectedly.");
+return;
+  }
+
+  auto *RC = MRI->getRegClass(Reg);
+  auto *NewRC = X86II::constrainRegClassToNonRex2(RC);
+  MRI->setRegClass(Reg, NewRC);
+}
+
+bool X86SuppressAPXForRelocationPass::runOnMachineFunction(
+MachineFunction &MF) {
+  if (MF.getTarget().Options.MCOptions.X86APXRelaxRelocations ||
+  X86EnableAPXForRelocation)
+return false;
+  const X86Subtarget &ST = MF.getSubtarget();
+  if (!ST.hasEGPR() && !ST.hasNDD() && !ST.hasNF())

fzou1 wrote:

There are NF instructions for CODE_6_GOTTPOFF.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -89,6 +89,7 @@ set(sources
   GISel/X86InstructionSelector.cpp
   GISel/X86LegalizerInfo.cpp
   GISel/X86RegisterBankInfo.cpp
+  X86SuppressAPXForReloc.cpp

fzou1 wrote:

Will do

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -89,6 +89,7 @@ set(sources
   GISel/X86InstructionSelector.cpp
   GISel/X86LegalizerInfo.cpp
   GISel/X86RegisterBankInfo.cpp
+  X86SuppressAPXForReloc.cpp

fzou1 wrote:

Will do

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits


@@ -0,0 +1,173 @@
+//===- X86SuppressAPXForReloc.cpp - Suppress APX features for relocations 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+/// \file
+///
+/// This pass is added to suppress APX features for relocations. It's used
+/// together with disabling emitting APX relocation types for backward
+/// compatibility with old version of linker (like before LD 2.43). It can 
avoid
+/// the instructions updated incorrectly by old version of linker if the
+/// instructions are with APX EGPR/NDD/NF features + the relocations other than
+/// APX ones (like GOTTPOFF).
+///
+//===--===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-suppress-apx-for-relocation"
+
+static cl::opt X86EnableAPXForRelocation(
+"x86-enable-apx-for-relocation",
+cl::desc("Enable APX features (EGPR, NDD and NF) for instructions with "
+ "relocations on x86-64 ELF"),
+cl::init(false));
+
+namespace {
+class X86SuppressAPXForRelocationPass : public MachineFunctionPass {
+public:
+  X86SuppressAPXForRelocationPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+return "X86 Suppress APX features for relocation";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  static char ID;
+};
+} // namespace
+
+char X86SuppressAPXForRelocationPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86SuppressAPXForRelocationPass, DEBUG_TYPE,
+  "X86 Suppress APX features for relocation", false, false)
+INITIALIZE_PASS_END(X86SuppressAPXForRelocationPass, DEBUG_TYPE,
+"X86 Suppress APX features for relocation", false, false)
+
+FunctionPass *llvm::createX86SuppressAPXForRelocationPass() {
+  return new X86SuppressAPXForRelocationPass();
+}
+
+static void suppressEGPRRegClass(MachineFunction &MF, MachineInstr &MI,
+ unsigned int OpNum) {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  auto Reg = MI.getOperand(OpNum).getReg();
+  if (!Reg.isVirtual()) {
+assert(!X86II::isApxExtendedReg(Reg) && "APX EGPR is used unexpectedly.");
+return;
+  }
+
+  auto *RC = MRI->getRegClass(Reg);
+  auto *NewRC = X86II::constrainRegClassToNonRex2(RC);
+  MRI->setRegClass(Reg, NewRC);
+}
+
+bool X86SuppressAPXForRelocationPass::runOnMachineFunction(
+MachineFunction &MF) {
+  if (MF.getTarget().Options.MCOptions.X86APXRelaxRelocations ||
+  X86EnableAPXForRelocation)
+return false;
+  const X86Subtarget &ST = MF.getSubtarget();
+  if (!ST.hasEGPR() && !ST.hasNDD() && !ST.hasNF())

fzou1 wrote:

There are NF instructions for CODE_6_GOTTPOFF.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits

https://github.com/fzou1 deleted 
https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-25 Thread Feng Zou via cfe-commits

https://github.com/fzou1 deleted 
https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-22 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > > > Introduce an option (-mapx-relax-relocations) to control the emission 
> > > > of the new APX relocations. It's off by default to keep backward 
> > > > compatibility with old version of ld and other linkers without APX 
> > > > support. And EGPR and NDD are also suppressed to avoid the instructions 
> > > > updated incorrectly by old version of linkers.
> > > 
> > > 
> > > Not understand this. IIUC, either we should not emit APX instructions at 
> > > all, which is controlled by `-m[no-]apxf`, or we should not relax the all 
> > > the relocations, which is controlled by `-Wl,--no-relax`.
> > 
> > 
> > If the APX relocation types are emitted in MC, they cannot be recognized by 
> > old version of linkers. It leads to APX features unavailable on existing 
> > Linux OSes with old version of linkers. `--no-relax` just disables the GOT 
> > optimization in linker, and it cannot resolve the link error of unsupported 
> > relocation type as mentioned above.
> 
> But if you just old relocations for APX instructions and `no-relax` is not 
> used. Wouldn't the old linker do the relaxation incorrectly and silently?

Yes, the old linker would do the wrong thing. That is why we added a pass to 
suppress EGPR and NDD/NF instructions for that case.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86] Implement disabling APX relocations and EPGR/NDD instrs for relocations (PR #136660)

2025-04-22 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/136660

>From a98371ac23728289b7e1b9a1516141158bf7c1e0 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 22 Apr 2025 11:05:29 +0800
Subject: [PATCH 1/3] [X86] Implement disabling APX relocations and EPGR/NDD
 instrs for them

Introduce an option (-mapx-relax-relocations) to control the emission of the
new APX relocations. It's off by default to keep backward compatibility with
older version of ld and other linkers without APX support. And EGPR and NDD are
also suppressed to avoid the instructions updated incorrectly by older version
of linker.
---
 clang/CMakeLists.txt  |   3 +
 clang/cmake/caches/Fuchsia-stage2.cmake   |   1 +
 clang/cmake/caches/Fuchsia.cmake  |   1 +
 clang/include/clang/Basic/CodeGenOptions.def  |   1 +
 clang/include/clang/Config/config.h.cmake |   3 +
 clang/include/clang/Driver/Options.td |   3 +
 clang/include/clang/Driver/ToolChain.h|   3 +
 clang/lib/CodeGen/BackendUtil.cpp |   1 +
 clang/lib/Driver/ToolChain.cpp|   4 +
 clang/lib/Driver/ToolChains/Clang.cpp |  10 ++
 clang/test/Driver/relax.s |  15 +-
 clang/tools/driver/cc1as_main.cpp |   5 +
 lld/ELF/Arch/X86_64.cpp   |   6 +-
 lld/test/ELF/tls-opt.s| 111 ++
 lld/test/ELF/x86-64-gotpc-no-relax-err.s  |   9 +-
 lld/test/ELF/x86-64-gotpc-relax-nopic.s   |  35 +++--
 lld/test/ELF/x86-64-gotpc-relax.s |  91 +++
 lld/test/ELF/x86-64-tlsdesc-gd.s  |  37 -
 llvm/include/llvm/MC/MCTargetOptions.h|   2 +
 .../llvm/MC/MCTargetOptionsCommandFlags.h |   2 +
 llvm/lib/LTO/LTO.cpp  |   1 +
 llvm/lib/MC/MCTargetOptionsCommandFlags.cpp   |  10 ++
 llvm/lib/Target/X86/CMakeLists.txt|   1 +
 .../lib/Target/X86/MCTargetDesc/X86BaseInfo.h |  22 +++
 .../X86/MCTargetDesc/X86ELFObjectWriter.cpp   |  20 ++-
 llvm/lib/Target/X86/X86.h |   1 +
 llvm/lib/Target/X86/X86InstrInfo.cpp  |  17 +-
 .../X86/X86SuppressEGPRAndNDDForReloc.cpp | 145 ++
 llvm/lib/Target/X86/X86TargetMachine.cpp  |   2 +
 llvm/test/CodeGen/X86/O0-pipeline.ll  |   1 +
 llvm/test/CodeGen/X86/apx/tls-desc.ll |  96 
 llvm/test/CodeGen/X86/opt-pipeline.ll |   3 +
 llvm/test/MC/ELF/relocation-alias.s   |   8 +-
 llvm/test/MC/X86/elf-reloc-tls.s  |  16 +-
 llvm/test/MC/X86/gotpcrelx.s  |  49 +-
 llvm/test/MC/X86/tlsdesc-64.s |   8 +-
 llvm/tools/gold/gold-plugin.cpp   |   2 +
 37 files changed, 635 insertions(+), 110 deletions(-)
 create mode 100644 llvm/lib/Target/X86/X86SuppressEGPRAndNDDForReloc.cpp
 create mode 100644 llvm/test/CodeGen/X86/apx/tls-desc.ll

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index c3f30e2a8e9c0..fa3063d6be9a7 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -219,6 +219,9 @@ set(ENABLE_LINKER_BUILD_ID OFF CACHE BOOL "pass --build-id 
to ld")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL
 "enable x86 relax relocations by default")
 
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL
+"Enable x86 APX relax relocations by default")
+
 set(PPC_LINUX_DEFAULT_IEEELONGDOUBLE OFF CACHE BOOL
 "Enable IEEE binary128 as default long double format on PowerPC Linux.")
 
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake 
b/clang/cmake/caches/Fuchsia-stage2.cmake
index 99890b8246ad7..22eef24b611a8 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -49,6 +49,7 @@ set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 
 set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "")
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL "")
 
 # TODO(#67176): relative-vtables doesn't play well with different default
 # visibilities. Making everything hidden visibility causes other complications
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 83336589da305..550d94d1ec6a5 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -89,6 +89,7 @@ set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "")
 
 set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "")
 set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "")
+set(ENABLE_X86_APX_RELAX_RELOCATIONS OFF CACHE BOOL "")
 
 set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
 set(LLVM_ENABLE_BACKTRACES ON CACHE BOOL "")
diff --git a/clang/include/clang/Basic/CodeGenOptions.def 
b/clang/include/clang/Basic/CodeGenOptions.def
index c5990fb248689..875facd6cfc63 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -201,6 +201,7 @@ CODEGENOPT(UniqueInternalLinkageNames, 1, 0) ///< Internal 
Linkage symbols get u
 CODEGENOPT(Spl

[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-28 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > Any comments?
> 
> Still not understand the requirement...

I've updated the commit message with an example. Please take a look.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-29 Thread Feng Zou via cfe-commits


@@ -1257,6 +1259,26 @@ inline bool isX86_64ExtendedReg(MCRegister Reg) {
   return false;
 }
 
+inline const TargetRegisterClass *
+constrainRegClassToNonRex2(const TargetRegisterClass *RC) {

fzou1 wrote:

I wrote it based on the canUseApxExtendedReg function. Updated.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-29 Thread Feng Zou via cfe-commits

https://github.com/fzou1 closed https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-29 Thread Feng Zou via cfe-commits

fzou1 wrote:

Sorry. The test failure has been fixed in
https://github.com/llvm/llvm-project/pull/137794.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-29 Thread Feng Zou via cfe-commits


@@ -0,0 +1,236 @@
+//===- X86SuppressAPXForReloc.cpp - Suppress APX features for relocations 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+/// \file
+///
+/// This pass is added to suppress APX features for relocations. It's used to
+/// keep backward compatibility with old version of linker having no APX
+/// support. It can be removed after APX support is included in the default
+/// linker on OS.
+///
+//===--===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-suppress-apx-for-relocation"
+
+cl::opt X86EnableAPXForRelocation(
+"x86-enable-apx-for-relocation",
+cl::desc("Enable APX features (EGPR, NDD and NF) for instructions with "
+ "relocations on x86-64 ELF"),
+cl::init(false));
+
+namespace {
+class X86SuppressAPXForRelocationPass : public MachineFunctionPass {
+public:
+  X86SuppressAPXForRelocationPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+return "X86 Suppress APX features for relocation";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  static char ID;
+};
+} // namespace
+
+char X86SuppressAPXForRelocationPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86SuppressAPXForRelocationPass, DEBUG_TYPE,
+  "X86 Suppress APX features for relocation", false, false)
+INITIALIZE_PASS_END(X86SuppressAPXForRelocationPass, DEBUG_TYPE,
+"X86 Suppress APX features for relocation", false, false)
+
+FunctionPass *llvm::createX86SuppressAPXForRelocationPass() {
+  return new X86SuppressAPXForRelocationPass();
+}
+
+static void suppressEGPRRegClass(MachineFunction &MF, MachineInstr &MI,
+ unsigned int OpNum) {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  Register Reg = MI.getOperand(OpNum).getReg();
+  if (!Reg.isVirtual()) {
+assert(!X86II::isApxExtendedReg(Reg) && "APX EGPR is used unexpectedly.");
+return;
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+  const TargetRegisterClass *NewRC = X86II::constrainRegClassToNonRex2(RC);
+  MRI->setRegClass(Reg, NewRC);
+}
+
+static bool handleInstructionWithEGPR(MachineFunction &MF,
+  const X86Subtarget &ST) {
+  if (!ST.hasEGPR())
+return false;
+
+  auto suppressEGPRInInstrWithReloc = [&](MachineInstr &MI,
+  ArrayRef OpNoArray) {
+int MemOpNo = X86II::getMemoryOperandNo(MI.getDesc().TSFlags) +
+  X86II::getOperandBias(MI.getDesc());
+const MachineOperand &MO = MI.getOperand(X86::AddrDisp + MemOpNo);
+if (MO.getTargetFlags() == X86II::MO_GOTTPOFF ||
+MO.getTargetFlags() == X86II::MO_GOTPCREL) {
+  LLVM_DEBUG(dbgs() << "Transform instruction with relocation type:\n  "
+<< MI);
+  for (unsigned OpNo : OpNoArray)
+suppressEGPRRegClass(MF, MI, OpNo);
+  LLVM_DEBUG(dbgs() << "to:\n  " << MI << "\n");
+}
+  };
+
+  for (MachineBasicBlock &MBB : MF) {
+for (MachineInstr &MI : MBB) {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+// For GOTPC32_TLSDESC, it's emitted with physical register (EAX/RAX) 
in
+// X86AsmPrinter::LowerTlsAddr, and there is no corresponding target
+// flag for it, so we don't need to handle LEA64r with TLSDESC and EGPR
+// in this pass (before emitting assembly).
+  case X86::TEST32mr:
+  case X86::TEST64mr: {
+suppressEGPRInInstrWithReloc(MI, {5});

fzou1 wrote:

Yes. The rip-relative addressing instruction looks like
"binop name@GOTPCREL(%rip), %reg". The base register is rip and there is no
index register.


https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-28 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > Suppress EGPR/NDD instructions for relocations to avoid APX relocation 
> > types emitted. This is to keep backward compatibility with old version of 
> > ld and other linkers without APX support. If there are APX relocation 
> > types, old version of linkers would raise "unsupported relocation type" 
> > error. Example:
> > ```
> > $ llvm-mc -filetype=obj -o got.o -triple=x86_64-unknown-linux got.s
> > $ ld got.o -o got.exe
> > ld: got.o: unsupported relocation type 0x2b
> > ...
> > 
> > $ cat got.s
> > ...
> > movq foo@GOTPCREL(%rip), %r16
> > 
> > $ llvm-objdump -dr got.o
> > ...
> > 1: d5 48 8b 05 00 00 00 00   movq(%rip), %r16
> > 0005:  R_X86_64_CODE_4_GOTPCRELXfoo-0x4
> > ```
> 
> Could you add who is the user for such request: LLVM + APX + old BFD linker? 
> The code looks like a dirty workaround. I expect a real user.

Agreed. But without this, users cannot try APX features with LLVM plus the old
built-in linker on RHEL 9, which is expected to reach EOL in 2032.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-28 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > > Any comments?
> > 
> > 
> > Still not understand the requirement...
> 
> I'm confused by the test cases too. I understood the code as we replace the 
> EGPR or NDD instructions with non-APX ones, which meets my expection. But the 
> option and tests are in MC phase, which means instructions and registers 
> cannot be changed any more. I think it's not correct to just change the 
> relocation type without changing instructions.

I've reverted the changes that disabled the relocation type. A pass has been
added to fix the unsupported-relocation-type issue by avoiding emitting EGPR
and NDD for instructions with relocations. NF instructions are assumed not to
be emitted before this pass, so an assertion is added there. The NF
optimizations have also been disabled for instructions with relocations.
Thanks @phoebewang

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-28 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > > > Suppress EGPR/NDD instructions for relocations to avoid APX relocation 
> > > > types emitted. This is to keep backward compatibility with old version 
> > > > of ld and other linkers without APX support. If there are APX 
> > > > relocation types, old version of linkers would raise "unsupported 
> > > > relocation type" error. Example:
> > > > ```
> > > > $ llvm-mc -filetype=obj -o got.o -triple=x86_64-unknown-linux got.s
> > > > $ ld got.o -o got.exe
> > > > ld: got.o: unsupported relocation type 0x2b
> > > > ...
> > > > 
> > > > $ cat got.s
> > > > ...
> > > > movq foo@GOTPCREL(%rip), %r16
> > > > 
> > > > $ llvm-objdump -dr got.o
> > > > ...
> > > > 1: d5 48 8b 05 00 00 00 00   movq(%rip), %r16
> > > > 0005:  R_X86_64_CODE_4_GOTPCRELXfoo-0x4
> > > > ```
> > > 
> > > 
> > > Could you add who is the user for such request: LLVM + APX + old BFD 
> > > linker? The code looks like a dirty workaround. I expect a real user.
> > 
> > 
> > Agree. But without this, users cannot try APX features with LLVM + old 
> > builtin linker on RHEL9 OS which is expected to be EOL in 2032.
> 
> Okay, can you add this info into description of this PR?

Updated. Thanks.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-04-28 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lld] [llvm] [X86][APX] Suppress EGPR/NDD instructions for relocations (PR #136660)

2025-05-13 Thread Feng Zou via cfe-commits

fzou1 wrote:

> We should avoid introducing a new cmake config for changes like this. You 
> could utilize clang config file to specify the option.
> 
> lld change should have been separated

Sorry for the late reply. The related CMake and LLD changes have been reverted.
Only a backend pass and an option were added to suppress APX features for
relocations. Thanks.

https://github.com/llvm/llvm-project/pull/136660
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-21 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created 
https://github.com/llvm/llvm-project/pull/140874

This is to expose these options to clang-cl on Windows. And add help text for 
these options.

>From d683302d4768dfb5c618aa1b09553f3b22142aed Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 20 May 2025 22:10:07 +0800
Subject: [PATCH] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group

This is to expose these options to clang-cl on Windows.
And add help text for these options.
---
 clang/include/clang/Driver/Options.td |  9 +++--
 clang/test/Driver/cl-x86-flags.c  | 50 +++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 9a4253113488d..ead729342173b 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6679,8 +6679,13 @@ def mapx_features_EQ : CommaJoined<["-"], 
"mapx-features=">, Group, 
Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">,  Visibility<[ClangOption, 
CLOption, FlangOption]>;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, 
Group,
 HelpText<"Disable features of APX">, 
Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">, Visibility<[ClangOption, 
CLOption, FlangOption]>;
-def mapxf : Flag<["-"], "mapxf">, Alias, 
AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
-def mno_apxf : Flag<["-"], "mno-apxf">, Alias, 
AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
+def mapxf : Flag<["-"], "mapxf">, Alias,
+AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>,
+Group, HelpText<"Enable all features of 
APX">;
+def mno_apxf : Flag<["-"], "mno-apxf">, Alias,
+   
AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>,
+   Group,
+   HelpText<"Disable all features of APX">;
 def mapx_inline_asm_use_gpr32 : Flag<["-"], "mapx-inline-asm-use-gpr32">, 
Group,
 HelpText<"Enable use of GPR32 in inline 
assembly for APX">;
 } // let Flags = [TargetSpecific]
diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 51b16f0ce3546..23fe96d604604 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -135,3 +135,53 @@
 
 void f(void) {
 }
+
+
+// RUN: not %clang_cl -### --target=i386 -mapx-features=ndd %s 2>&1 | 
FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386 -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
+// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386'
+// NON-APX-NOT:  error: {{.*}} -mapx-features=
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=NO-APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf -mapxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf -mno-apxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NO-APXF %s
+//
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" 
"-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" 
"-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" 
"-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" 
"-target-feature" "-nf" "-target-feature" "-cf" "-target-feature" "-zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=EGPR %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=push2pop2 %s -### 
-o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ppx %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=PPX %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ndd %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NDD %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ccmp %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=CCMP %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=nf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=cf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=CF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=zu %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=ZU %s
+// EGPR: "-target-feature" "+egpr"
+// PUSH2POP2: "-target-feature" "+push2pop2"
+// PPX: "-target-feature" "+ppx"
+// NDD: "-target-feature" "+ndd"
+// CCMP: "-target-feature" "+ccmp"
+// NF: "-target-feature" "+nf"
+// CF: "-target-feature" "+cf"
+// ZU: "-target-feature" "+zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr,ndd %s -### 

[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-25 Thread Feng Zou via cfe-commits

https://github.com/fzou1 closed https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Fix LIT test failure on Solaris OS + AMD64 CPU. (PR #141486)

2025-05-26 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/141486

>From 5f839ebc0d21f393b30821b1df44d066e4b11d2b Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Mon, 26 May 2025 20:16:34 +0800
Subject: [PATCH 1/2] [Driver][X86] Fix LIT test failure on Solaris OS + AMD64
 CPU.

The LIT test is introduced by https://github.com/llvm/llvm-project/pull/140874.
---
 clang/test/Driver/cl-x86-flags.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 0210b5f3cd3f1..fc9a3befe7164 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -137,10 +137,10 @@ void f(void) {
 }
 
 
-// RUN: not %clang_cl -### --target=i386 -mapx-features=ndd %s 2>&1 | 
FileCheck --check-prefix=NON-APX %s
-// RUN: not %clang_cl -### --target=i386 -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
-// RUN: %clang_cl -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
-// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386'
+// RUN: not %clang_cl -### --target=i386-pc-windows -mapx-features=ndd %s 2>&1 
| FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386-pc-windows -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386-pc-windows -mno-apxf %s 2>&1 > /dev/null
+// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386-pc-windows{{.*}}'
 // NON-APX-NOT:  error: {{.*}} -mapx-features=
 
 // RUN: %clang_cl --target=x86_64-pc-windows -mapxf %s -### 2>&1 | FileCheck 
-check-prefix=APXF %s

>From 9b16c4cc21fc0f593248a47d7625f3d1640571c7 Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 27 May 2025 10:42:56 +0800
Subject: [PATCH 2/2] Fix the issue by using "--" to separate options from
 input file names.

---
 clang/test/Driver/cl-x86-flags.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index fc9a3befe7164..1e6418207e642 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -137,14 +137,14 @@ void f(void) {
 }
 
 
-// RUN: not %clang_cl -### --target=i386-pc-windows -mapx-features=ndd %s 2>&1 
| FileCheck --check-prefix=NON-APX %s
-// RUN: not %clang_cl -### --target=i386-pc-windows -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
-// RUN: %clang_cl -### --target=i386-pc-windows -mno-apxf %s 2>&1 > /dev/null
+// RUN: not %clang_cl -### --target=i386-pc-windows -mapx-features=ndd -- 2>&1 
%s | FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386-pc-windows -mapxf -- 2>&1 %s | 
FileCheck --check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386-pc-windows -mno-apxf -- 2>&1 %s > 
/dev/null
 // NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386-pc-windows{{.*}}'
 // NON-APX-NOT:  error: {{.*}} -mapx-features=
 
-// RUN: %clang_cl --target=x86_64-pc-windows -mapxf %s -### 2>&1 | FileCheck 
-check-prefix=APXF %s
-// RUN: %clang_cl --target=x86_64-pc-windows -mapxf -mno-apxf %s -### 2>&1 | 
FileCheck -check-prefix=NO-APXF %s
-// RUN: %clang_cl --target=x86_64-pc-windows 
-mapx-features=egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu %s -### 2>&1 | FileCheck 
-check-prefix=APXF %s
+// RUN: %clang_cl --target=x86_64-pc-windows -mapxf -### -- 2>&1 %s | 
FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl --target=x86_64-pc-windows -mapxf -mno-apxf -### -- 2>&1 %s 
| FileCheck -check-prefix=NO-APXF %s
+// RUN: %clang_cl --target=x86_64-pc-windows 
-mapx-features=egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu -### -- 2>&1 %s | FileCheck 
-check-prefix=APXF %s
 // APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" 
"-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" 
"-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
 // NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" 
"-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" 
"-target-feature" "-nf" "-target-feature" "-cf" "-target-feature" "-zu"

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Fix LIT test failure on Solaris/MacOS (PR #141486)

2025-05-26 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/141486
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Fix LIT test failure on Solaris/MacOS (PR #141486)

2025-05-26 Thread Feng Zou via cfe-commits

https://github.com/fzou1 closed https://github.com/llvm/llvm-project/pull/141486
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-27 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > Looks like this breaks tests on macOS: 
> > http://45.33.8.238/macm1/107398/step_6.txt
> > Please take a look and revert for now if it takes a while to fix.
> 
> Thank you. I've reproduced this issue on MacOS and updated PR #141486 to fix 
> it.

@nico / @rorth, PR #141486 has been merged. Please take a look. Thanks.

https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-26 Thread Feng Zou via cfe-commits

fzou1 wrote:

> Looks like this breaks tests on macOS: 
> http://45.33.8.238/macm1/107398/step_6.txt
> 
> Please take a look and revert for now if it takes a while to fix.

Thank you. I've reproduced this issue on MacOS and updated PR #141486 to fix it.

https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Fix LIT test failure on Solaris/MacOS (PR #141486)

2025-05-26 Thread Feng Zou via cfe-commits

https://github.com/fzou1 edited https://github.com/llvm/llvm-project/pull/141486
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-25 Thread Feng Zou via cfe-commits

fzou1 wrote:

@KanRobert , any further comments?

https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Fix LIT test failure on Solaris OS + AMD64 CPU. (PR #141486)

2025-05-26 Thread Feng Zou via cfe-commits

https://github.com/fzou1 created 
https://github.com/llvm/llvm-project/pull/141486

The LIT test is introduced by https://github.com/llvm/llvm-project/pull/140874.

>From 5f839ebc0d21f393b30821b1df44d066e4b11d2b Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Mon, 26 May 2025 20:16:34 +0800
Subject: [PATCH] [Driver][X86] Fix LIT test failure on Solaris OS + AMD64 CPU.

The LIT test is introduced by https://github.com/llvm/llvm-project/pull/140874.
---
 clang/test/Driver/cl-x86-flags.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 0210b5f3cd3f1..fc9a3befe7164 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -137,10 +137,10 @@ void f(void) {
 }
 
 
-// RUN: not %clang_cl -### --target=i386 -mapx-features=ndd %s 2>&1 | 
FileCheck --check-prefix=NON-APX %s
-// RUN: not %clang_cl -### --target=i386 -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
-// RUN: %clang_cl -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
-// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386'
+// RUN: not %clang_cl -### --target=i386-pc-windows -mapx-features=ndd %s 2>&1 
| FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386-pc-windows -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386-pc-windows -mno-apxf %s 2>&1 > /dev/null
+// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386-pc-windows{{.*}}'
 // NON-APX-NOT:  error: {{.*}} -mapx-features=
 
 // RUN: %clang_cl --target=x86_64-pc-windows -mapxf %s -### 2>&1 | FileCheck 
-check-prefix=APXF %s

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-26 Thread Feng Zou via cfe-commits

fzou1 wrote:

> This patch broke the [Solaris/amd64 
> buildbot](https://lab.llvm.org/staging/#/builders/120/builds/8977). I suspect 
> the test should use `--target=i386-pc-windows` instead of just `i386`?

Sorry about that. I've uploaded a PR (#141486) to fix it, but I haven't verified
the fix since I don't have a machine running Solaris on an AMD CPU.

https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-26 Thread Feng Zou via cfe-commits

fzou1 wrote:

> > This patch broke the [Solaris/amd64 
> > buildbot](https://lab.llvm.org/staging/#/builders/120/builds/8977). I 
> > suspect the test should use `--target=i386-pc-windows` instead of just 
> > `i386`?
> 
> Sorry for that. I've uploaded a PR (#141486) to fix that. But I didn't verify 
> that since I don't have a machine with solaris OS and AMD CPU.

I tried the commands below but failed to reproduce this issue.
`
`

https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-22 Thread Feng Zou via cfe-commits


@@ -135,3 +135,53 @@
 
 void f(void) {
 }
+
+
+// RUN: not %clang_cl -### --target=i386 -mapx-features=ndd %s 2>&1 | 
FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386 -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
+// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386'
+// NON-APX-NOT:  error: {{.*}} -mapx-features=
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=NO-APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf -mapxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf -mno-apxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NO-APXF %s
+//
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" 
"-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" 
"-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" 
"-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" 
"-target-feature" "-nf" "-target-feature" "-cf" "-target-feature" "-zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=EGPR %s

fzou1 wrote:

Okay. Removed some RUN lines.

https://github.com/llvm/llvm-project/pull/140874
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-22 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/140874

>From d683302d4768dfb5c618aa1b09553f3b22142aed Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 20 May 2025 22:10:07 +0800
Subject: [PATCH 1/3] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group

This is to expose these options to clang-cl on Windows.
And add help text for these options.
---
 clang/include/clang/Driver/Options.td |  9 +++--
 clang/test/Driver/cl-x86-flags.c  | 50 +++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 9a4253113488d..ead729342173b 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6679,8 +6679,13 @@ def mapx_features_EQ : CommaJoined<["-"], 
"mapx-features=">, Group, 
Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">,  Visibility<[ClangOption, 
CLOption, FlangOption]>;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, 
Group,
 HelpText<"Disable features of APX">, 
Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">, Visibility<[ClangOption, 
CLOption, FlangOption]>;
-def mapxf : Flag<["-"], "mapxf">, Alias, 
AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
-def mno_apxf : Flag<["-"], "mno-apxf">, Alias, 
AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
+def mapxf : Flag<["-"], "mapxf">, Alias,
+AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>,
+Group, HelpText<"Enable all features of 
APX">;
+def mno_apxf : Flag<["-"], "mno-apxf">, Alias,
+   
AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>,
+   Group,
+   HelpText<"Disable all features of APX">;
 def mapx_inline_asm_use_gpr32 : Flag<["-"], "mapx-inline-asm-use-gpr32">, 
Group,
 HelpText<"Enable use of GPR32 in inline 
assembly for APX">;
 } // let Flags = [TargetSpecific]
diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 51b16f0ce3546..23fe96d604604 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -135,3 +135,53 @@
 
 void f(void) {
 }
+
+
+// RUN: not %clang_cl -### --target=i386 -mapx-features=ndd %s 2>&1 | 
FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386 -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
+// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386'
+// NON-APX-NOT:  error: {{.*}} -mapx-features=
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=NO-APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf -mapxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf -mno-apxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NO-APXF %s
+//
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" 
"-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" 
"-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" 
"-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" 
"-target-feature" "-nf" "-target-feature" "-cf" "-target-feature" "-zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=EGPR %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=push2pop2 %s -### 
-o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ppx %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=PPX %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ndd %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NDD %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ccmp %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=CCMP %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=nf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=cf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=CF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=zu %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=ZU %s
+// EGPR: "-target-feature" "+egpr"
+// PUSH2POP2: "-target-feature" "+push2pop2"
+// PPX: "-target-feature" "+ppx"
+// NDD: "-target-feature" "+ndd"
+// CCMP: "-target-feature" "+ccmp"
+// NF: "-target-feature" "+nf"
+// CF: "-target-feature" "+cf"
+// ZU: "-target-feature" "+zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr,ndd %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s
+// RUN: %clang_cl -target x86_64-pc-wi

[clang] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group (PR #140874)

2025-05-22 Thread Feng Zou via cfe-commits

https://github.com/fzou1 updated 
https://github.com/llvm/llvm-project/pull/140874

>From d683302d4768dfb5c618aa1b09553f3b22142aed Mon Sep 17 00:00:00 2001
From: Feng Zou 
Date: Tue, 20 May 2025 22:10:07 +0800
Subject: [PATCH 1/4] [Driver][X86] Add -m[no-]apxf to m_x86_Features_Group

This is to expose these options to clang-cl on Windows.
And add help text for these options.
---
 clang/include/clang/Driver/Options.td |  9 +++--
 clang/test/Driver/cl-x86-flags.c  | 50 +++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 9a4253113488d..ead729342173b 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6679,8 +6679,13 @@ def mapx_features_EQ : CommaJoined<["-"], 
"mapx-features=">, Group, 
Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">,  Visibility<[ClangOption, 
CLOption, FlangOption]>;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, 
Group,
 HelpText<"Disable features of APX">, 
Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">, Visibility<[ClangOption, 
CLOption, FlangOption]>;
-def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
-def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>;
+def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>,
+AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>,
+Group<m_x86_Features_Group>, HelpText<"Enable all features of APX">;
+def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>,
+   AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf","cf","zu"]>,
+   Group<m_x86_Features_Group>,
+   HelpText<"Disable all features of APX">;
 def mapx_inline_asm_use_gpr32 : Flag<["-"], "mapx-inline-asm-use-gpr32">, 
Group,
 HelpText<"Enable use of GPR32 in inline 
assembly for APX">;
 } // let Flags = [TargetSpecific]
diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 51b16f0ce3546..23fe96d604604 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -135,3 +135,53 @@
 
 void f(void) {
 }
+
+
+// RUN: not %clang_cl -### --target=i386 -mapx-features=ndd %s 2>&1 | 
FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang_cl -### --target=i386 -mapxf %s 2>&1 | FileCheck 
--check-prefix=NON-APX %s
+// RUN: %clang_cl -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
+// NON-APX:  error: unsupported option '-mapx-features=|-mapxf' for target 
'i386'
+// NON-APX-NOT:  error: {{.*}} -mapx-features=
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf %s -### -o %t.o 2>&1 | 
FileCheck -check-prefix=NO-APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mno-apxf -mapxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=APXF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapxf -mno-apxf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NO-APXF %s
+//
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" 
"-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" 
"-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" 
"-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" 
"-target-feature" "-nf" "-target-feature" "-cf" "-target-feature" "-zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=EGPR %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=push2pop2 %s -### 
-o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ppx %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=PPX %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ndd %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NDD %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=ccmp %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=CCMP %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=nf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=NF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=cf %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=CF %s
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=zu %s -### -o %t.o 
2>&1 | FileCheck -check-prefix=ZU %s
+// EGPR: "-target-feature" "+egpr"
+// PUSH2POP2: "-target-feature" "+push2pop2"
+// PPX: "-target-feature" "+ppx"
+// NDD: "-target-feature" "+ndd"
+// CCMP: "-target-feature" "+ccmp"
+// NF: "-target-feature" "+nf"
+// CF: "-target-feature" "+cf"
+// ZU: "-target-feature" "+zu"
+
+// RUN: %clang_cl -target x86_64-pc-windows -mapx-features=egpr,ndd %s -### -o 
%t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s
+// RUN: %clang_cl -target x86_64-pc-wi

  1   2   >