codemzs updated this revision to Diff 525920.
codemzs marked 3 inline comments as done.
codemzs added a comment.
Addresses feedback on extended floating type documentation from @rjmccall and
@pengfei
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D150913/new/
https://reviews.llvm.org/D150913
Files:
clang/docs/LanguageExtensions.rst
clang/include/clang/Basic/DiagnosticSemaKinds.td
clang/include/clang/Basic/FPOptions.def
clang/include/clang/Basic/LangOptions.def
clang/include/clang/Basic/TargetInfo.h
clang/include/clang/Driver/Options.td
clang/lib/AST/Type.cpp
clang/lib/Basic/TargetInfo.cpp
clang/lib/Basic/Targets/AMDGPU.h
clang/lib/Basic/Targets/ARM.cpp
clang/lib/Basic/Targets/NVPTX.h
clang/lib/Basic/Targets/X86.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/CodeGen/CGExprScalar.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Sema/SemaCast.cpp
clang/lib/Sema/SemaExpr.cpp
clang/lib/Sema/SemaOverload.cpp
clang/test/CodeGen/X86/avx512bf16-error.c
clang/test/CodeGen/X86/bfloat-mangle.cpp
clang/test/CodeGen/X86/bfloat16.cpp
clang/test/CodeGen/X86/fexcess-precision-bfloat16.c
clang/test/CodeGenCUDA/amdgpu-bf16.cu
clang/test/CodeGenCUDA/bf16.cu
clang/test/Driver/fexcess-precision.c
clang/test/Sema/arm-bf16-forbidden-ops.c
clang/test/Sema/arm-bf16-forbidden-ops.cpp
clang/test/Sema/arm-bfloat.cpp
clang/test/SemaCUDA/amdgpu-bf16.cu
clang/test/SemaCUDA/bf16.cu
Index: clang/test/SemaCUDA/bf16.cu
===================================================================
--- clang/test/SemaCUDA/bf16.cu
+++ clang/test/SemaCUDA/bf16.cu
@@ -2,32 +2,32 @@
// REQUIRES: x86-registered-target
// RUN: %clang_cc1 "-triple" "x86_64-unknown-linux-gnu" "-aux-triple" "nvptx64-nvidia-cuda" \
-// RUN: "-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
+// RUN: "-target-cpu" "x86-64" -fsyntax-only -verify=scalar -Wno-unused %s
// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "nvptx64-nvidia-cuda" \
-// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
+// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=scalar -Wno-unused %s
#include "Inputs/cuda.h"
__device__ void test(bool b, __bf16 *out, __bf16 in) {
__bf16 bf16 = in; // No error on using the type itself.
- bf16 + bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 - bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 * bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 / bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+ bf16 + bf16;
+ bf16 - bf16;
+ bf16 * bf16;
+ bf16 / bf16;
__fp16 fp16;
- bf16 + fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 + bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 - fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 - bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 * fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 * bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 / fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 / bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+ bf16 + fp16;
+ fp16 + bf16;
+ bf16 - fp16;
+ fp16 - bf16;
+ bf16 * fp16;
+ fp16 * bf16;
+ bf16 / fp16;
+ fp16 / bf16;
bf16 = fp16; // scalar-error {{assigning to '__bf16' from incompatible type '__fp16'}}
fp16 = bf16; // scalar-error {{assigning to '__fp16' from incompatible type '__bf16'}}
- bf16 + (b ? fp16 : bf16); // scalar-error {{incompatible operand types ('__fp16' and '__bf16')}}
+ bf16 + (b ? fp16 : bf16);
*out = bf16;
}
Index: clang/test/SemaCUDA/amdgpu-bf16.cu
===================================================================
--- clang/test/SemaCUDA/amdgpu-bf16.cu
+++ clang/test/SemaCUDA/amdgpu-bf16.cu
@@ -1,13 +1,8 @@
// REQUIRES: amdgpu-registered-target
// REQUIRES: x86-registered-target
-// RUN: %clang_cc1 "-triple" "x86_64-unknown-linux-gnu" "-aux-triple" "amdgcn-amd-amdhsa"\
-// RUN: "-target-cpu" "x86-64" -fsyntax-only -verify=amdgcn %s
-// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "amdgcn-amd-amdhsa"\
-// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=amdgcn %s
-
// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "r600-unknown-unknown"\
-// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=amdgcn,r600 %s
+// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=r600 %s
// AMDGCN has storage-only support for bf16. R600 does not support it should error out when
// it's the main target.
@@ -29,45 +24,8 @@
// r600-error@+1 2 {{__bf16 is not supported on this target}}
__device__ void test(bool b, __bf16 *out, __bf16 in) {
__bf16 bf16 = in; // r600-error {{__bf16 is not supported on this target}}
-
- bf16 + bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 - bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 * bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 / bf16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
-
- __fp16 fp16;
-
- bf16 + fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 + bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 - fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 - bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 * fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 * bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 / fp16; // amdgcn-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 / bf16; // amdgcn-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 = fp16; // amdgcn-error {{assigning to '__bf16' from incompatible type '__fp16'}}
- fp16 = bf16; // amdgcn-error {{assigning to '__fp16' from incompatible type '__bf16'}}
- bf16 + (b ? fp16 : bf16); // amdgcn-error {{incompatible operand types ('__fp16' and '__bf16')}}
*out = bf16;
- // amdgcn-error@+1 {{static_cast from '__bf16' to 'unsigned short' is not allowed}}
- unsigned short u16bf16 = static_cast<unsigned short>(bf16);
- // amdgcn-error@+2 {{C-style cast from 'unsigned short' to '__bf16' is not allowed}}
- // r600-error@+1 {{__bf16 is not supported on this target}}
- bf16 = (__bf16)u16bf16;
-
- // amdgcn-error@+1 {{static_cast from '__bf16' to 'float' is not allowed}}
- float f32bf16 = static_cast<float>(bf16);
- // amdgcn-error@+2 {{C-style cast from 'float' to '__bf16' is not allowed}}
- // r600-error@+1 {{__bf16 is not supported on this target}}
- bf16 = (__bf16)f32bf16;
-
- // amdgcn-error@+1 {{static_cast from '__bf16' to 'double' is not allowed}}
- double f64bf16 = static_cast<double>(bf16);
- // amdgcn-error@+2 {{C-style cast from 'double' to '__bf16' is not allowed}}
- // r600-error@+1 {{__bf16 is not supported on this target}}
- bf16 = (__bf16)f64bf16;
-
// r600-error@+1 {{__bf16 is not supported on this target}}
typedef __attribute__((ext_vector_type(2))) __bf16 bf16_x2;
bf16_x2 vec2_a, vec2_b;
Index: clang/test/Sema/arm-bfloat.cpp
===================================================================
--- clang/test/Sema/arm-bfloat.cpp
+++ clang/test/Sema/arm-bfloat.cpp
@@ -1,38 +1,38 @@
// RUN: %clang_cc1 -fsyntax-only -verify=scalar,neon -std=c++11 \
// RUN: -triple aarch64-arm-none-eabi -target-cpu cortex-a75 \
-// RUN: -target-feature +bf16 -target-feature +neon %s
+// RUN: -target-feature +bf16 -target-feature +neon -Wno-unused %s
// RUN: %clang_cc1 -fsyntax-only -verify=scalar,neon -std=c++11 \
// RUN: -triple arm-arm-none-eabi -target-cpu cortex-a53 \
-// RUN: -target-feature +bf16 -target-feature +neon %s
+// RUN: -target-feature +bf16 -target-feature +neon -Wno-unused %s
// The types should be available under AArch64 even without the bf16 feature
// RUN: %clang_cc1 -fsyntax-only -verify=scalar -DNONEON -std=c++11 \
// RUN: -triple aarch64-arm-none-eabi -target-cpu cortex-a75 \
-// RUN: -target-feature -bf16 -target-feature +neon %s
+// RUN: -target-feature -bf16 -target-feature +neon -Wno-unused %s
// REQUIRES: aarch64-registered-target || arm-registered-target
void test(bool b) {
__bf16 bf16;
- bf16 + bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 - bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 * bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
- bf16 / bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+ bf16 + bf16;
+ bf16 - bf16;
+ bf16 * bf16;
+ bf16 / bf16;
__fp16 fp16;
- bf16 + fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 + bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 - fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 - bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 * fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 * bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
- bf16 / fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
- fp16 / bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+ bf16 + fp16;
+ fp16 + bf16;
+ bf16 - fp16;
+ fp16 - bf16;
+ bf16 * fp16;
+ fp16 * bf16;
+ bf16 / fp16;
+ fp16 / bf16;
bf16 = fp16; // scalar-error {{assigning to '__bf16' from incompatible type '__fp16'}}
fp16 = bf16; // scalar-error {{assigning to '__fp16' from incompatible type '__bf16'}}
- bf16 + (b ? fp16 : bf16); // scalar-error {{incompatible operand types ('__fp16' and '__bf16')}}
+ bf16 + (b ? fp16 : bf16);
}
#ifndef NONEON
@@ -40,18 +40,18 @@
#include <arm_neon.h>
void test_vector(bfloat16x4_t a, bfloat16x4_t b, float16x4_t c) {
- a + b; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'bfloat16x4_t')}}
- a - b; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'bfloat16x4_t')}}
- a * b; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'bfloat16x4_t')}}
- a / b; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'bfloat16x4_t')}}
-
- a + c; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'float16x4_t' (vector of 4 'float16_t' values))}}
- a - c; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'float16x4_t' (vector of 4 'float16_t' values))}}
- a * c; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'float16x4_t' (vector of 4 'float16_t' values))}}
- a / c; // neon-error {{invalid operands to binary expression ('bfloat16x4_t' (vector of 4 'bfloat16_t' values) and 'float16x4_t' (vector of 4 'float16_t' values))}}
- c + b; // neon-error {{invalid operands to binary expression ('float16x4_t' (vector of 4 'float16_t' values) and 'bfloat16x4_t' (vector of 4 'bfloat16_t' values))}}
- c - b; // neon-error {{invalid operands to binary expression ('float16x4_t' (vector of 4 'float16_t' values) and 'bfloat16x4_t' (vector of 4 'bfloat16_t' values))}}
- c * b; // neon-error {{invalid operands to binary expression ('float16x4_t' (vector of 4 'float16_t' values) and 'bfloat16x4_t' (vector of 4 'bfloat16_t' values))}}
- c / b; // neon-error {{invalid operands to binary expression ('float16x4_t' (vector of 4 'float16_t' values) and 'bfloat16x4_t' (vector of 4 'bfloat16_t' values))}}
+ a + b;
+ a - b;
+ a * b;
+ a / b;
+
+ a + c;
+ a - c;
+ a * c;
+ a / c;
+ c + b;
+ c - b;
+ c * b;
+ c / b;
}
#endif
\ No newline at end of file
Index: clang/test/Sema/arm-bf16-forbidden-ops.cpp
===================================================================
--- clang/test/Sema/arm-bf16-forbidden-ops.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64 -target-feature +bf16 %s
-// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64 -target-feature -bf16 %s
-
-__bf16 test_static_cast_from_float(float in) {
- return static_cast<__bf16>(in); // expected-error {{static_cast from 'float' to '__bf16' is not allowed}}
-}
-
-__bf16 test_static_cast_from_float_literal(void) {
- return static_cast<__bf16>(1.0f); // expected-error {{static_cast from 'float' to '__bf16' is not allowed}}
-}
-
-__bf16 test_static_cast_from_int(int in) {
- return static_cast<__bf16>(in); // expected-error {{static_cast from 'int' to '__bf16' is not allowed}}
-}
-
-__bf16 test_static_cast_from_int_literal(void) {
- return static_cast<__bf16>(1); // expected-error {{static_cast from 'int' to '__bf16' is not allowed}}
-}
-
-__bf16 test_static_cast_bfloat(__bf16 in) {
- return static_cast<__bf16>(in); // this one should work
-}
-
-float test_static_cast_to_float(__bf16 in) {
- return static_cast<float>(in); // expected-error {{static_cast from '__bf16' to 'float' is not allowed}}
-}
-
-int test_static_cast_to_int(__bf16 in) {
- return static_cast<int>(in); // expected-error {{static_cast from '__bf16' to 'int' is not allowed}}
-}
-
-__bf16 test_implicit_from_float(float in) {
- return in; // expected-error {{cannot initialize return object of type '__bf16' with an lvalue of type 'float'}}
-}
-
-__bf16 test_implicit_from_float_literal() {
- return 1.0f; // expected-error {{cannot initialize return object of type '__bf16' with an rvalue of type 'float'}}
-}
-
-__bf16 test_implicit_from_int(int in) {
- return in; // expected-error {{cannot initialize return object of type '__bf16' with an lvalue of type 'int'}}
-}
-
-__bf16 test_implicit_from_int_literal() {
- return 1; // expected-error {{cannot initialize return object of type '__bf16' with an rvalue of type 'int'}}
-}
-
-__bf16 test_implicit_bfloat(__bf16 in) {
- return in; // this one should work
-}
-
-float test_implicit_to_float(__bf16 in) {
- return in; // expected-error {{cannot initialize return object of type 'float' with an lvalue of type '__bf16'}}
-}
-
-int test_implicit_to_int(__bf16 in) {
- return in; // expected-error {{cannot initialize return object of type 'int' with an lvalue of type '__bf16'}}
-}
-
-__bf16 test_cond(__bf16 a, __bf16 b, bool which) {
- // Conditional operator _should_ be supported, without nonsense
- // complaints like 'types __bf16 and __bf16 are not compatible'
- return which ? a : b;
-}
-
-__bf16 test_cond_float(__bf16 a, __bf16 b, bool which) {
- return which ? a : 1.0f; // expected-error {{incompatible operand types ('__bf16' and 'float')}}
-}
-
-__bf16 test_cond_int(__bf16 a, __bf16 b, bool which) {
- return which ? a : 1; // expected-error {{incompatible operand types ('__bf16' and 'int')}}
-}
Index: clang/test/Sema/arm-bf16-forbidden-ops.c
===================================================================
--- clang/test/Sema/arm-bf16-forbidden-ops.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64 -target-feature +bf16 %s
-// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64 -target-feature -bf16 %s
-
-__bf16 test_cast_from_float(float in) {
- return (__bf16)in; // expected-error {{cannot type-cast to __bf16}}
-}
-
-__bf16 test_cast_from_float_literal(void) {
- return (__bf16)1.0f; // expected-error {{cannot type-cast to __bf16}}
-}
-
-__bf16 test_cast_from_int(int in) {
- return (__bf16)in; // expected-error {{cannot type-cast to __bf16}}
-}
-
-__bf16 test_cast_from_int_literal(void) {
- return (__bf16)1; // expected-error {{cannot type-cast to __bf16}}
-}
-
-__bf16 test_cast_bfloat(__bf16 in) {
- return (__bf16)in; // this one should work
-}
-
-float test_cast_to_float(__bf16 in) {
- return (float)in; // expected-error {{cannot type-cast from __bf16}}
-}
-
-int test_cast_to_int(__bf16 in) {
- return (int)in; // expected-error {{cannot type-cast from __bf16}}
-}
-
-__bf16 test_implicit_from_float(float in) {
- return in; // expected-error {{returning 'float' from a function with incompatible result type '__bf16'}}
-}
-
-__bf16 test_implicit_from_float_literal(void) {
- return 1.0f; // expected-error {{returning 'float' from a function with incompatible result type '__bf16'}}
-}
-
-__bf16 test_implicit_from_int(int in) {
- return in; // expected-error {{returning 'int' from a function with incompatible result type '__bf16'}}
-}
-
-__bf16 test_implicit_from_int_literal(void) {
- return 1; // expected-error {{returning 'int' from a function with incompatible result type '__bf16'}}
-}
-
-__bf16 test_implicit_bfloat(__bf16 in) {
- return in; // this one should work
-}
-
-float test_implicit_to_float(__bf16 in) {
- return in; // expected-error {{returning '__bf16' from a function with incompatible result type 'float'}}
-}
-
-int test_implicit_to_int(__bf16 in) {
- return in; // expected-error {{returning '__bf16' from a function with incompatible result type 'int'}}
-}
-
-__bf16 test_cond(__bf16 a, __bf16 b, _Bool which) {
- // Conditional operator _should_ be supported, without nonsense
- // complaints like 'types __bf16 and __bf16 are not compatible'
- return which ? a : b;
-}
-
-__bf16 test_cond_float(__bf16 a, __bf16 b, _Bool which) {
- return which ? a : 1.0f; // expected-error {{incompatible operand types ('__bf16' and 'float')}}
-}
-
-__bf16 test_cond_int(__bf16 a, __bf16 b, _Bool which) {
- return which ? a : 1; // expected-error {{incompatible operand types ('__bf16' and 'int')}}
-}
Index: clang/test/Driver/fexcess-precision.c
===================================================================
--- clang/test/Driver/fexcess-precision.c
+++ clang/test/Driver/fexcess-precision.c
@@ -62,9 +62,13 @@
// RUN: | FileCheck --check-prefix=CHECK-ERR-NONE %s
// CHECK-FAST: "-ffloat16-excess-precision=fast"
+// CHECK-FAST: "-fbfloat16-excess-precision=fast"
// CHECK-STD: "-ffloat16-excess-precision=standard"
+// CHECK-STD: "-fbfloat16-excess-precision=standard"
// CHECK-NONE: "-ffloat16-excess-precision=none"
+// CHECK-NONE: "-fbfloat16-excess-precision=none"
// CHECK-ERR-NONE: unsupported argument 'none' to option '-fexcess-precision='
// CHECK: "-cc1"
// CHECK-NOT: "-ffloat16-excess-precision=fast"
+// CHECK-NOT: "-fbfloat16-excess-precision=fast"
// CHECK-ERR-16: unsupported argument '16' to option '-fexcess-precision='
Index: clang/test/CodeGenCUDA/bf16.cu
===================================================================
--- clang/test/CodeGenCUDA/bf16.cu
+++ clang/test/CodeGenCUDA/bf16.cu
@@ -6,12 +6,12 @@
#include "Inputs/cuda.h"
-// CHECK-LABEL: .visible .func _Z8test_argPu6__bf16u6__bf16(
-// CHECK: .param .b64 _Z8test_argPu6__bf16u6__bf16_param_0,
-// CHECK: .param .b16 _Z8test_argPu6__bf16u6__bf16_param_1
+// CHECK-LABEL: .visible .func _Z8test_argPDF16bDF16b(
+// CHECK: .param .b64 _Z8test_argPDF16bDF16b_param_0,
+// CHECK: .param .b16 _Z8test_argPDF16bDF16b_param_1
//
__device__ void test_arg(__bf16 *out, __bf16 in) {
-// CHECK: ld.param.b16 %{{h.*}}, [_Z8test_argPu6__bf16u6__bf16_param_1];
+// CHECK: ld.param.b16 %{{h.*}}, [_Z8test_argPDF16bDF16b_param_1];
__bf16 bf16 = in;
*out = bf16;
// CHECK: st.b16
@@ -19,23 +19,23 @@
}
-// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z8test_retu6__bf16(
-// CHECK: .param .b16 _Z8test_retu6__bf16_param_0
+// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z8test_retDF16b(
+// CHECK: .param .b16 _Z8test_retDF16b_param_0
__device__ __bf16 test_ret( __bf16 in) {
-// CHECK: ld.param.b16 %h{{.*}}, [_Z8test_retu6__bf16_param_0];
+// CHECK: ld.param.b16 %h{{.*}}, [_Z8test_retDF16b_param_0];
return in;
// CHECK: st.param.b16 [func_retval0+0], %h
// CHECK: ret;
}
-// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z9test_callu6__bf16(
-// CHECK: .param .b16 _Z9test_callu6__bf16_param_0
+// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z9test_callDF16b(
+// CHECK: .param .b16 _Z9test_callDF16b_param_0
__device__ __bf16 test_call( __bf16 in) {
-// CHECK: ld.param.b16 %h{{.*}}, [_Z9test_callu6__bf16_param_0];
+// CHECK: ld.param.b16 %h{{.*}}, [_Z9test_callDF16b_param_0];
// CHECK: st.param.b16 [param0+0], %h2;
// CHECK: .param .b32 retval0;
// CHECK: call.uni (retval0),
-// CHECK-NEXT: _Z8test_retu6__bf16,
+// CHECK-NEXT: _Z8test_retDF16b,
// CHECK-NEXT: (
// CHECK-NEXT: param0
// CHECK-NEXT );
Index: clang/test/CodeGenCUDA/amdgpu-bf16.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-bf16.cu
+++ clang/test/CodeGenCUDA/amdgpu-bf16.cu
@@ -7,7 +7,7 @@
#include "Inputs/cuda.h"
-// CHECK-LABEL: @_Z8test_argPu6__bf16u6__bf16(
+// CHECK-LABEL: @_Z8test_argPDF16bDF16b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
@@ -29,7 +29,7 @@
*out = bf16;
}
-// CHECK-LABEL: @_Z9test_loadPu6__bf16S_(
+// CHECK-LABEL: @_Z9test_loadPDF16bS_(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
@@ -52,7 +52,7 @@
*out = bf16;
}
-// CHECK-LABEL: @_Z8test_retu6__bf16(
+// CHECK-LABEL: @_Z8test_retDF16b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
@@ -66,7 +66,7 @@
return in;
}
-// CHECK-LABEL: @_Z9test_callu6__bf16(
+// CHECK-LABEL: @_Z9test_callDF16b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
@@ -74,7 +74,7 @@
// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2
-// CHECK-NEXT: [[CALL:%.*]] = call contract noundef bfloat @_Z8test_retu6__bf16(bfloat noundef [[TMP0]]) #[[ATTR1:[0-9]+]]
+// CHECK-NEXT: [[CALL:%.*]] = call contract noundef bfloat @_Z8test_retDF16b(bfloat noundef [[TMP0]]) #[[ATTR1:[0-9]+]]
// CHECK-NEXT: ret bfloat [[CALL]]
//
__device__ __bf16 test_call( __bf16 in) {
Index: clang/test/CodeGen/X86/fexcess-precision-bfloat16.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/X86/fexcess-precision-bfloat16.c
@@ -0,0 +1,360 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast -target-feature +fullbf16 \
+// RUN: -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard -target-feature +fullbf16 \
+// RUN: -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast \
+// RUN: -emit-llvm -ffp-eval-method=source -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=source -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard \
+// RUN: -emit-llvm -ffp-eval-method=source -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=source -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -emit-llvm -ffp-eval-method=source -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=source -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast \
+// RUN: -emit-llvm -ffp-eval-method=double -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=double -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard \
+// RUN: -emit-llvm -ffp-eval-method=double -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=double -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -emit-llvm -ffp-eval-method=double -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=double -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast \
+// RUN: -emit-llvm -ffp-eval-method=extended -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-FP80 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=fast -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=extended -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-FP80 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard \
+// RUN: -emit-llvm -ffp-eval-method=extended -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-FP80 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=standard -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=extended -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-FP80 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -emit-llvm -ffp-eval-method=extended -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-FP80 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -emit-llvm -ffp-eval-method=extended -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-EXT-FP80 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -ffp-contract=on -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -ffp-contract=on -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -fmath-errno -ffp-contract=on -fno-rounding-math \
+// RUN: -ffp-eval-method=source -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -fmath-errno -ffp-contract=on -fno-rounding-math \
+// RUN: -ffp-eval-method=source -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -fmath-errno -ffp-contract=on -fno-rounding-math \
+// RUN: -ffp-eval-method=double -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -fmath-errno -ffp-contract=on -fno-rounding-math \
+// RUN: -ffp-eval-method=double -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT-DBL %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -fmath-errno -ffp-contract=on -fno-rounding-math \
+// RUN: -ffp-eval-method=extended -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -fmath-errno -ffp-contract=on -fno-rounding-math \
+// RUN: -ffp-eval-method=extended -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-CONTRACT-EXT %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none \
+// RUN: -fapprox-func -fmath-errno -fno-signed-zeros -mreassociate \
+// RUN: -freciprocal-math -ffp-contract=on -fno-rounding-math \
+// RUN: -funsafe-math-optimizations -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-UNSAFE %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown \
+// RUN: -fbfloat16-excess-precision=none -target-feature +fullbf16 \
+// RUN: -fapprox-func -fmath-errno -fno-signed-zeros -mreassociate \
+// RUN: -freciprocal-math -ffp-contract=on -fno-rounding-math \
+// RUN: -funsafe-math-optimizations -emit-llvm -o - %s \
+// RUN: | FileCheck -check-prefixes=CHECK-UNSAFE %s
+
+// CHECK-EXT-LABEL: define dso_local bfloat @f
+// CHECK-EXT-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-EXT-NEXT: entry:
+// CHECK-EXT-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-EXT-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-EXT-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-EXT-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-EXT-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-EXT-NEXT: [[EXT:%.*]] = fpext bfloat [[TMP0]] to float
+// CHECK-EXT-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-EXT-NEXT: [[EXT1:%.*]] = fpext bfloat [[TMP1]] to float
+// CHECK-EXT-NEXT: [[MUL:%.*]] = fmul float [[EXT]], [[EXT1]]
+// CHECK-EXT-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-EXT-NEXT: [[EXT2:%.*]] = fpext bfloat [[TMP2]] to float
+// CHECK-EXT-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-EXT-NEXT: [[EXT3:%.*]] = fpext bfloat [[TMP3]] to float
+// CHECK-EXT-NEXT: [[MUL4:%.*]] = fmul float [[EXT2]], [[EXT3]]
+// CHECK-EXT-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[MUL4]]
+// CHECK-EXT-NEXT: [[UNPROMOTION:%.*]] = fptrunc float [[ADD]] to bfloat
+// CHECK-EXT-NEXT: ret bfloat [[UNPROMOTION]]
+//
+// CHECK-NO-EXT-LABEL: define dso_local bfloat @f
+// CHECK-NO-EXT-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NO-EXT-NEXT: entry:
+// CHECK-NO-EXT-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NO-EXT-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NO-EXT-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NO-EXT-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NO-EXT-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: [[MUL:%.*]] = fmul bfloat [[TMP0]], [[TMP1]]
+// CHECK-NO-EXT-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-NO-EXT-NEXT: [[MUL1:%.*]] = fmul bfloat [[TMP2]], [[TMP3]]
+// CHECK-NO-EXT-NEXT: [[ADD:%.*]] = fadd bfloat [[MUL]], [[MUL1]]
+// CHECK-NO-EXT-NEXT: ret bfloat [[ADD]]
+//
+// CHECK-EXT-DBL-LABEL: define dso_local bfloat @f
+// CHECK-EXT-DBL-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-EXT-DBL-NEXT: entry:
+// CHECK-EXT-DBL-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-DBL-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-DBL-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-DBL-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-DBL-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: [[CONV:%.*]] = fpext bfloat [[TMP0]] to double
+// CHECK-EXT-DBL-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: [[CONV1:%.*]] = fpext bfloat [[TMP1]] to double
+// CHECK-EXT-DBL-NEXT: [[MUL:%.*]] = fmul double [[CONV]], [[CONV1]]
+// CHECK-EXT-DBL-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: [[CONV2:%.*]] = fpext bfloat [[TMP2]] to double
+// CHECK-EXT-DBL-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-EXT-DBL-NEXT: [[CONV3:%.*]] = fpext bfloat [[TMP3]] to double
+// CHECK-EXT-DBL-NEXT: [[MUL4:%.*]] = fmul double [[CONV2]], [[CONV3]]
+// CHECK-EXT-DBL-NEXT: [[ADD:%.*]] = fadd double [[MUL]], [[MUL4]]
+// CHECK-EXT-DBL-NEXT: [[CONV5:%.*]] = fptrunc double [[ADD]] to bfloat
+// CHECK-EXT-DBL-NEXT: ret bfloat [[CONV5]]
+//
+// CHECK-EXT-FP80-LABEL: define dso_local bfloat @f
+// CHECK-EXT-FP80-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-EXT-FP80-NEXT: entry:
+// CHECK-EXT-FP80-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-FP80-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-FP80-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-FP80-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-EXT-FP80-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: [[CONV:%.*]] = fpext bfloat [[TMP0]] to x86_fp80
+// CHECK-EXT-FP80-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: [[CONV1:%.*]] = fpext bfloat [[TMP1]] to x86_fp80
+// CHECK-EXT-FP80-NEXT: [[MUL:%.*]] = fmul x86_fp80 [[CONV]], [[CONV1]]
+// CHECK-EXT-FP80-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: [[CONV2:%.*]] = fpext bfloat [[TMP2]] to x86_fp80
+// CHECK-EXT-FP80-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-EXT-FP80-NEXT: [[CONV3:%.*]] = fpext bfloat [[TMP3]] to x86_fp80
+// CHECK-EXT-FP80-NEXT: [[MUL4:%.*]] = fmul x86_fp80 [[CONV2]], [[CONV3]]
+// CHECK-EXT-FP80-NEXT: [[ADD:%.*]] = fadd x86_fp80 [[MUL]], [[MUL4]]
+// CHECK-EXT-FP80-NEXT: [[CONV5:%.*]] = fptrunc x86_fp80 [[ADD]] to bfloat
+// CHECK-EXT-FP80-NEXT: ret bfloat [[CONV5]]
+//
+// CHECK-CONTRACT-LABEL: define dso_local bfloat @f
+// CHECK-CONTRACT-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CONTRACT-NEXT: entry:
+// CHECK-CONTRACT-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-CONTRACT-NEXT: [[MUL1:%.*]] = fmul bfloat [[TMP2]], [[TMP3]]
+// CHECK-CONTRACT-NEXT: [[TMP4:%.*]] = call bfloat @llvm.fmuladd.bf16(bfloat [[TMP0]], bfloat [[TMP1]], bfloat [[MUL1]])
+// CHECK-CONTRACT-NEXT: ret bfloat [[TMP4]]
+//
+// CHECK-CONTRACT-DBL-LABEL: define dso_local bfloat @f
+// CHECK-CONTRACT-DBL-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CONTRACT-DBL-NEXT: entry:
+// CHECK-CONTRACT-DBL-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-DBL-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-DBL-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-DBL-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-DBL-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: [[CONV:%.*]] = fpext bfloat [[TMP0]] to double
+// CHECK-CONTRACT-DBL-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: [[CONV1:%.*]] = fpext bfloat [[TMP1]] to double
+// CHECK-CONTRACT-DBL-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: [[CONV2:%.*]] = fpext bfloat [[TMP2]] to double
+// CHECK-CONTRACT-DBL-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-CONTRACT-DBL-NEXT: [[CONV3:%.*]] = fpext bfloat [[TMP3]] to double
+// CHECK-CONTRACT-DBL-NEXT: [[MUL4:%.*]] = fmul double [[CONV2]], [[CONV3]]
+// CHECK-CONTRACT-DBL-NEXT: [[TMP4:%.*]] = call double @llvm.fmuladd.f64(double [[CONV]], double [[CONV1]], double [[MUL4]])
+// CHECK-CONTRACT-DBL-NEXT: [[CONV5:%.*]] = fptrunc double [[TMP4]] to bfloat
+// CHECK-CONTRACT-DBL-NEXT: ret bfloat [[CONV5]]
+//
+// CHECK-CONTRACT-EXT-LABEL: define dso_local bfloat @f
+// CHECK-CONTRACT-EXT-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CONTRACT-EXT-NEXT: entry:
+// CHECK-CONTRACT-EXT-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-EXT-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-EXT-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-EXT-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-CONTRACT-EXT-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: [[CONV:%.*]] = fpext bfloat [[TMP0]] to x86_fp80
+// CHECK-CONTRACT-EXT-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: [[CONV1:%.*]] = fpext bfloat [[TMP1]] to x86_fp80
+// CHECK-CONTRACT-EXT-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: [[CONV2:%.*]] = fpext bfloat [[TMP2]] to x86_fp80
+// CHECK-CONTRACT-EXT-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-CONTRACT-EXT-NEXT: [[CONV3:%.*]] = fpext bfloat [[TMP3]] to x86_fp80
+// CHECK-CONTRACT-EXT-NEXT: [[MUL4:%.*]] = fmul x86_fp80 [[CONV2]], [[CONV3]]
+// CHECK-CONTRACT-EXT-NEXT: [[TMP4:%.*]] = call x86_fp80 @llvm.fmuladd.f80(x86_fp80 [[CONV]], x86_fp80 [[CONV1]], x86_fp80 [[MUL4]])
+// CHECK-CONTRACT-EXT-NEXT: [[CONV5:%.*]] = fptrunc x86_fp80 [[TMP4]] to bfloat
+// CHECK-CONTRACT-EXT-NEXT: ret bfloat [[CONV5]]
+//
+// CHECK-UNSAFE-LABEL: define dso_local bfloat @f
+// CHECK-UNSAFE-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-UNSAFE-NEXT: entry:
+// CHECK-UNSAFE-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-UNSAFE-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-UNSAFE-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-UNSAFE-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-UNSAFE-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-UNSAFE-NEXT: [[MUL1:%.*]] = fmul reassoc nsz arcp afn bfloat [[TMP2]], [[TMP3]]
+// CHECK-UNSAFE-NEXT: [[TMP4:%.*]] = call reassoc nsz arcp afn bfloat @llvm.fmuladd.bf16(bfloat [[TMP0]], bfloat [[TMP1]], bfloat [[MUL1]])
+// CHECK-UNSAFE-NEXT: ret bfloat [[TMP4]]
+//
+__bf16 f(__bf16 a, __bf16 b, __bf16 c, __bf16 d) {
+ return a * b + c * d;
+}
Index: clang/test/CodeGen/X86/bfloat16.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGen/X86/bfloat16.cpp
@@ -0,0 +1,145 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-feature +fullbf16 -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-NBF16 %s
+
+// CHECK-LABEL: define dso_local void @_Z11test_scalarDF16bDF16b
+// CHECK-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[C:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[ADD:%.*]] = fadd bfloat [[TMP0]], [[TMP1]]
+// CHECK-NEXT: store bfloat [[ADD]], ptr [[C]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[SUB:%.*]] = fsub bfloat [[TMP2]], [[TMP3]]
+// CHECK-NEXT: store bfloat [[SUB]], ptr [[C]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[MUL:%.*]] = fmul bfloat [[TMP4]], [[TMP5]]
+// CHECK-NEXT: store bfloat [[MUL]], ptr [[C]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP7:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[DIV:%.*]] = fdiv bfloat [[TMP6]], [[TMP7]]
+// CHECK-NEXT: store bfloat [[DIV]], ptr [[C]], align 2
+// CHECK-NEXT: ret void
+//
+// CHECK-NBF16-LABEL: define dso_local void @_Z11test_scalarDF16bDF16b
+// CHECK-NBF16-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NBF16: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NBF16-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NBF16-NEXT: [[C:%.*]] = alloca bfloat, align 2
+// CHECK-NBF16-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NBF16-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT:%.*]] = fpext bfloat [[TMP0]] to float
+// CHECK-NBF16-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT1:%.*]] = fpext bfloat [[TMP1]] to float
+// CHECK-NBF16-NEXT: [[ADD:%.*]] = fadd float [[EXT]], [[EXT1]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION:%.*]] = fptrunc float [[ADD]] to bfloat
+// CHECK-NBF16-NEXT: store bfloat [[UNPROMOTION]], ptr [[C]], align 2
+// CHECK-NBF16-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT2:%.*]] = fpext bfloat [[TMP2]] to float
+// CHECK-NBF16-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT3:%.*]] = fpext bfloat [[TMP3]] to float
+// CHECK-NBF16-NEXT: [[SUB:%.*]] = fsub float [[EXT2]], [[EXT3]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION4:%.*]] = fptrunc float [[SUB]] to bfloat
+// CHECK-NBF16-NEXT: store bfloat [[UNPROMOTION4]], ptr [[C]], align 2
+// CHECK-NBF16-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT5:%.*]] = fpext bfloat [[TMP4]] to float
+// CHECK-NBF16-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT6:%.*]] = fpext bfloat [[TMP5]] to float
+// CHECK-NBF16-NEXT: [[MUL:%.*]] = fmul float [[EXT5]], [[EXT6]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION7:%.*]] = fptrunc float [[MUL]] to bfloat
+// CHECK-NBF16-NEXT: store bfloat [[UNPROMOTION7]], ptr [[C]], align 2
+// CHECK-NBF16-NEXT: [[TMP6:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT8:%.*]] = fpext bfloat [[TMP6]] to float
+// CHECK-NBF16-NEXT: [[TMP7:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NBF16-NEXT: [[EXT9:%.*]] = fpext bfloat [[TMP7]] to float
+// CHECK-NBF16-NEXT: [[DIV:%.*]] = fdiv float [[EXT8]], [[EXT9]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION10:%.*]] = fptrunc float [[DIV]] to bfloat
+// CHECK-NBF16-NEXT: store bfloat [[UNPROMOTION10]], ptr [[C]], align 2
+// CHECK-NBF16-NEXT: ret void
+//
+void test_scalar(__bf16 a, __bf16 b) {
+ __bf16 c;
+ c = a + b;
+ c = a - b;
+ c = a * b;
+ c = a / b;
+}
+
+typedef __bf16 v8bfloat16 __attribute__((__vector_size__(16)));
+
+// CHECK-LABEL: define dso_local void @_Z11test_vectorDv8_DF16bS_
+// CHECK-SAME: (<8 x bfloat> noundef [[A:%.*]], <8 x bfloat> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT: [[C:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[ADD:%.*]] = fadd <8 x bfloat> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: store <8 x bfloat> [[ADD]], ptr [[C]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[SUB:%.*]] = fsub <8 x bfloat> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: store <8 x bfloat> [[SUB]], ptr [[C]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x bfloat> [[TMP4]], [[TMP5]]
+// CHECK-NEXT: store <8 x bfloat> [[MUL]], ptr [[C]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[DIV:%.*]] = fdiv <8 x bfloat> [[TMP6]], [[TMP7]]
+// CHECK-NEXT: store <8 x bfloat> [[DIV]], ptr [[C]], align 16
+// CHECK-NEXT: ret void
+//
+// CHECK-NBF16-LABEL: define dso_local void @_Z11test_vectorDv8_DF16bS_
+// CHECK-NBF16-SAME: (<8 x bfloat> noundef [[A:%.*]], <8 x bfloat> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NBF16: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NBF16-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NBF16-NEXT: [[C:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NBF16-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NBF16-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT:%.*]] = fpext <8 x bfloat> [[TMP0]] to <8 x float>
+// CHECK-NBF16-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT1:%.*]] = fpext <8 x bfloat> [[TMP1]] to <8 x float>
+// CHECK-NBF16-NEXT: [[ADD:%.*]] = fadd <8 x float> [[EXT]], [[EXT1]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION:%.*]] = fptrunc <8 x float> [[ADD]] to <8 x bfloat>
+// CHECK-NBF16-NEXT: store <8 x bfloat> [[UNPROMOTION]], ptr [[C]], align 16
+// CHECK-NBF16-NEXT: [[TMP2:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT2:%.*]] = fpext <8 x bfloat> [[TMP2]] to <8 x float>
+// CHECK-NBF16-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT3:%.*]] = fpext <8 x bfloat> [[TMP3]] to <8 x float>
+// CHECK-NBF16-NEXT: [[SUB:%.*]] = fsub <8 x float> [[EXT2]], [[EXT3]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION4:%.*]] = fptrunc <8 x float> [[SUB]] to <8 x bfloat>
+// CHECK-NBF16-NEXT: store <8 x bfloat> [[UNPROMOTION4]], ptr [[C]], align 16
+// CHECK-NBF16-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT5:%.*]] = fpext <8 x bfloat> [[TMP4]] to <8 x float>
+// CHECK-NBF16-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT6:%.*]] = fpext <8 x bfloat> [[TMP5]] to <8 x float>
+// CHECK-NBF16-NEXT: [[MUL:%.*]] = fmul <8 x float> [[EXT5]], [[EXT6]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION7:%.*]] = fptrunc <8 x float> [[MUL]] to <8 x bfloat>
+// CHECK-NBF16-NEXT: store <8 x bfloat> [[UNPROMOTION7]], ptr [[C]], align 16
+// CHECK-NBF16-NEXT: [[TMP6:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT8:%.*]] = fpext <8 x bfloat> [[TMP6]] to <8 x float>
+// CHECK-NBF16-NEXT: [[TMP7:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16
+// CHECK-NBF16-NEXT: [[EXT9:%.*]] = fpext <8 x bfloat> [[TMP7]] to <8 x float>
+// CHECK-NBF16-NEXT: [[DIV:%.*]] = fdiv <8 x float> [[EXT8]], [[EXT9]]
+// CHECK-NBF16-NEXT: [[UNPROMOTION10:%.*]] = fptrunc <8 x float> [[DIV]] to <8 x bfloat>
+// CHECK-NBF16-NEXT: store <8 x bfloat> [[UNPROMOTION10]], ptr [[C]], align 16
+// CHECK-NBF16-NEXT: ret void
+//
+void test_vector(v8bfloat16 a, v8bfloat16 b) {
+ v8bfloat16 c;
+ c = a + b;
+ c = a - b;
+ c = a * b;
+ c = a / b;
+}
Index: clang/test/CodeGen/X86/bfloat-mangle.cpp
===================================================================
--- clang/test/CodeGen/X86/bfloat-mangle.cpp
+++ clang/test/CodeGen/X86/bfloat-mangle.cpp
@@ -3,6 +3,6 @@
// RUN: %clang_cc1 -triple i386-windows-msvc -target-feature +sse2 -emit-llvm -o - %s | FileCheck %s --check-prefixes=WINDOWS
// RUN: %clang_cc1 -triple x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - %s | FileCheck %s --check-prefixes=WINDOWS
-// LINUX: define {{.*}}void @_Z3foou6__bf16(bfloat noundef %b)
+// LINUX: define {{.*}}void @_Z3fooDF16b(bfloat noundef %b)
// WINDOWS: define {{.*}}void @"?foo@@YAXU__bf16@__clang@@@Z"(bfloat noundef %b)
void foo(__bf16 b) {}
Index: clang/test/CodeGen/X86/avx512bf16-error.c
===================================================================
--- clang/test/CodeGen/X86/avx512bf16-error.c
+++ clang/test/CodeGen/X86/avx512bf16-error.c
@@ -7,7 +7,6 @@
#include <immintrin.h>
-// expected-error@+4 {{invalid operands to binary expression ('__bfloat16' (aka '__bf16') and '__bfloat16')}}
// expected-warning@+2 3 {{'__bfloat16' is deprecated: use __bf16 instead}}
// expected-note@* 3 {{'__bfloat16' has been explicitly marked deprecated here}}
__bfloat16 bar(__bfloat16 a, __bfloat16 b) {
Index: clang/lib/Sema/SemaOverload.cpp
===================================================================
--- clang/lib/Sema/SemaOverload.cpp
+++ clang/lib/Sema/SemaOverload.cpp
@@ -1995,8 +1995,11 @@
// if their representation is different until there is back end support
// We of course allow this conversion if long double is really double.
- // Conversions between bfloat and other floats are not permitted.
- if (FromType == S.Context.BFloat16Ty || ToType == S.Context.BFloat16Ty)
+ // Conversions between bfloat16 and float16 are currently not supported.
+ if ((FromType->isBFloat16Type() &&
+ (ToType->isFloat16Type() || ToType->isHalfType())) ||
+ (ToType->isBFloat16Type() &&
+ (FromType->isFloat16Type() || FromType->isHalfType())))
return false;
// Conversions between IEEE-quad and IBM-extended semantics are not
@@ -2017,9 +2020,6 @@
ToType->isIntegralType(S.Context)) ||
(FromType->isIntegralOrUnscopedEnumerationType() &&
ToType->isRealFloatingType())) {
- // Conversions between bfloat and int are not permitted.
- if (FromType->isBFloat16Type() || ToType->isBFloat16Type())
- return false;
// Floating-integral conversions (C++ 4.9).
SCS.Second = ICK_Floating_Integral;
Index: clang/lib/Sema/SemaExpr.cpp
===================================================================
--- clang/lib/Sema/SemaExpr.cpp
+++ clang/lib/Sema/SemaExpr.cpp
@@ -10810,10 +10810,6 @@
const VectorType *RHSVecType = RHSType->getAs<VectorType>();
assert(LHSVecType || RHSVecType);
- if ((LHSVecType && LHSVecType->getElementType()->isBFloat16Type()) ||
- (RHSVecType && RHSVecType->getElementType()->isBFloat16Type()))
- return ReportInvalid ? InvalidOperands(Loc, LHS, RHS) : QualType();
-
// AltiVec-style "vector bool op vector bool" combinations are allowed
// for some operators but not others.
if (!AllowBothBool &&
Index: clang/lib/Sema/SemaCast.cpp
===================================================================
--- clang/lib/Sema/SemaCast.cpp
+++ clang/lib/Sema/SemaCast.cpp
@@ -3092,20 +3092,6 @@
return;
}
- // Can't cast to or from bfloat
- if (DestType->isBFloat16Type() && !SrcType->isBFloat16Type()) {
- Self.Diag(SrcExpr.get()->getExprLoc(), diag::err_cast_to_bfloat16)
- << SrcExpr.get()->getSourceRange();
- SrcExpr = ExprError();
- return;
- }
- if (SrcType->isBFloat16Type() && !DestType->isBFloat16Type()) {
- Self.Diag(SrcExpr.get()->getExprLoc(), diag::err_cast_from_bfloat16)
- << SrcExpr.get()->getSourceRange();
- SrcExpr = ExprError();
- return;
- }
-
// If either type is a pointer, the other type has to be either an
// integer or a pointer.
if (!DestType->isArithmeticType()) {
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -2774,6 +2774,7 @@
FPContract = "on";
bool StrictFPModel = false;
StringRef Float16ExcessPrecision = "";
+ StringRef BFloat16ExcessPrecision = "";
if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
CmdArgs.push_back("-mlimit-float-precision");
@@ -2989,6 +2990,7 @@
D.Diag(diag::err_drv_unsupported_option_argument)
<< A->getSpelling() << Val;
}
+ BFloat16ExcessPrecision = Float16ExcessPrecision;
break;
}
case options::OPT_ffinite_math_only:
@@ -3164,6 +3166,9 @@
if (!Float16ExcessPrecision.empty())
CmdArgs.push_back(Args.MakeArgString("-ffloat16-excess-precision=" +
Float16ExcessPrecision));
+ if (!BFloat16ExcessPrecision.empty())
+ CmdArgs.push_back(Args.MakeArgString("-fbfloat16-excess-precision=" +
+ BFloat16ExcessPrecision));
ParseMRecip(D, Args, CmdArgs);
Index: clang/lib/CodeGen/CGExprScalar.cpp
===================================================================
--- clang/lib/CodeGen/CGExprScalar.cpp
+++ clang/lib/CodeGen/CGExprScalar.cpp
@@ -814,13 +814,21 @@
Value *(ScalarExprEmitter::*F)(const BinOpInfo &));
QualType getPromotionType(QualType Ty) {
+ const auto &Ctx = CGF.getContext();
if (auto *CT = Ty->getAs<ComplexType>()) {
QualType ElementType = CT->getElementType();
- if (ElementType.UseExcessPrecision(CGF.getContext()))
- return CGF.getContext().getComplexType(CGF.getContext().FloatTy);
+ if (ElementType.UseExcessPrecision(Ctx))
+ return Ctx.getComplexType(Ctx.FloatTy);
}
- if (Ty.UseExcessPrecision(CGF.getContext()))
- return CGF.getContext().FloatTy;
+
+ if (Ty.UseExcessPrecision(Ctx)) {
+ if (auto *VT = Ty->getAs<VectorType>()) {
+ unsigned NumElements = VT->getNumElements();
+ return Ctx.getVectorType(Ctx.FloatTy, NumElements, VT->getVectorKind());
+ }
+ return Ctx.FloatTy;
+ }
+
return QualType();
}
Index: clang/lib/Basic/Targets/X86.h
===================================================================
--- clang/lib/Basic/Targets/X86.h
+++ clang/lib/Basic/Targets/X86.h
@@ -417,7 +417,6 @@
return getPointerWidthV(AddrSpace);
}
- const char *getBFloat16Mangling() const override { return "u6__bf16"; };
};
// X86-32 generic target
Index: clang/lib/Basic/Targets/X86.cpp
===================================================================
--- clang/lib/Basic/Targets/X86.cpp
+++ clang/lib/Basic/Targets/X86.cpp
@@ -359,6 +359,8 @@
HasCRC32 = true;
} else if (Feature == "+x87") {
HasX87 = true;
+ } else if (Feature == "+fullbf16") {
+ HasFullBFloat16 = true;
}
X86SSEEnum Level = llvm::StringSwitch<X86SSEEnum>(Feature)
@@ -376,6 +378,15 @@
HasFloat16 = SSELevel >= SSE2;
+ // X86 target has bfloat16 emulation support in the backend, where
+ // bfloat16 is treated as a 32-bit float, arithmetic operations are
+ // performed in 32-bit, and the result is converted back to bfloat16.
+ // Truncation and extension between bfloat16 and 32-bit float are supported
+ // by the compiler-rt library. However, native bfloat16 support is currently
+ // not available in the X86 target. Hence, HasFullBFloat16 will be false
+ // until native bfloat16 support is available. HasFullBFloat16 is used to
+ // determine whether to automatically use excess floating point precision
+ // for bfloat16 arithmetic operations in the front-end.
HasBFloat16 = SSELevel >= SSE2;
MMX3DNowEnum ThreeDNowLevel = llvm::StringSwitch<MMX3DNowEnum>(Feature)
@@ -1117,6 +1128,7 @@
.Case("xsavec", HasXSAVEC)
.Case("xsaves", HasXSAVES)
.Case("xsaveopt", HasXSAVEOPT)
+ .Case("fullbf16", HasFullBFloat16)
.Default(false);
}
Index: clang/lib/Basic/Targets/NVPTX.h
===================================================================
--- clang/lib/Basic/Targets/NVPTX.h
+++ clang/lib/Basic/Targets/NVPTX.h
@@ -181,7 +181,6 @@
bool hasBitIntType() const override { return true; }
bool hasBFloat16Type() const override { return true; }
- const char *getBFloat16Mangling() const override { return "u6__bf16"; };
};
} // namespace targets
} // namespace clang
Index: clang/lib/Basic/Targets/ARM.cpp
===================================================================
--- clang/lib/Basic/Targets/ARM.cpp
+++ clang/lib/Basic/Targets/ARM.cpp
@@ -514,6 +514,7 @@
HasFloat16 = true;
ARMCDECoprocMask = 0;
HasBFloat16 = false;
+ HasFullBFloat16 = false;
FPRegsDisabled = false;
// This does not diagnose illegal cases like having both
@@ -596,6 +597,8 @@
} else if (Feature == "+pacbti") {
HasPAC = 1;
HasBTI = 1;
+ } else if (Feature == "+fullbf16") {
+ HasFullBFloat16 = true;
}
}
Index: clang/lib/Basic/Targets/AMDGPU.h
===================================================================
--- clang/lib/Basic/Targets/AMDGPU.h
+++ clang/lib/Basic/Targets/AMDGPU.h
@@ -118,7 +118,6 @@
}
bool hasBFloat16Type() const override { return isAMDGCN(getTriple()); }
- const char *getBFloat16Mangling() const override { return "u6__bf16"; };
std::string_view getClobbers() const override { return ""; }
Index: clang/lib/Basic/TargetInfo.cpp
===================================================================
--- clang/lib/Basic/TargetInfo.cpp
+++ clang/lib/Basic/TargetInfo.cpp
@@ -64,6 +64,7 @@
HasIbm128 = false;
HasFloat16 = false;
HasBFloat16 = false;
+ HasFullBFloat16 = false;
HasLongDouble = true;
HasFPReturn = true;
HasStrictFP = false;
Index: clang/lib/AST/Type.cpp
===================================================================
--- clang/lib/AST/Type.cpp
+++ clang/lib/AST/Type.cpp
@@ -1487,7 +1487,13 @@
bool QualType::UseExcessPrecision(const ASTContext &Ctx) {
const BuiltinType *BT = getTypePtr()->getAs<BuiltinType>();
- if (BT) {
+ if (!BT) {
+ const VectorType *VT = getTypePtr()->getAs<VectorType>();
+ if (VT) {
+ QualType ElementType = VT->getElementType();
+ return ElementType.UseExcessPrecision(Ctx);
+ }
+ } else {
switch (BT->getKind()) {
case BuiltinType::Kind::Float16: {
const TargetInfo &TI = Ctx.getTargetInfo();
@@ -1496,7 +1502,15 @@
Ctx.getLangOpts().ExcessPrecisionKind::FPP_None)
return true;
return false;
- }
+ } break;
+ case BuiltinType::Kind::BFloat16: {
+ const TargetInfo &TI = Ctx.getTargetInfo();
+ if (TI.hasBFloat16Type() && !TI.hasFullBFloat16Type() &&
+ Ctx.getLangOpts().getBFloat16ExcessPrecision() !=
+ Ctx.getLangOpts().ExcessPrecisionKind::FPP_None)
+ return true;
+ return false;
+ } break;
default:
return false;
}
@@ -2183,8 +2197,7 @@
bool Type::isArithmeticType() const {
if (const auto *BT = dyn_cast<BuiltinType>(CanonicalType))
return BT->getKind() >= BuiltinType::Bool &&
- BT->getKind() <= BuiltinType::Ibm128 &&
- BT->getKind() != BuiltinType::BFloat16;
+ BT->getKind() <= BuiltinType::Ibm128;
if (const auto *ET = dyn_cast<EnumType>(CanonicalType))
// GCC allows forward declaration of enum types (forbid by C99 6.7.2.3p2).
// If a body isn't seen by the time we get here, return false.
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1642,6 +1642,15 @@
Values<"standard,fast,none">, NormalizedValuesScope<"LangOptions">,
NormalizedValues<["FPP_Standard", "FPP_Fast", "FPP_None"]>,
MarshallingInfoEnum<LangOpts<"Float16ExcessPrecision">, "FPP_Standard">;
+def fbfloat16_excess_precision_EQ : Joined<["-"], "fbfloat16-excess-precision=">,
+ Group<f_Group>, Flags<[CC1Option, NoDriverOption]>,
+ HelpText<"Allows control over excess precision on targets where native "
+ "support for BFloat16 precision types is not available. By default, excess "
+ "precision is used to calculate intermediate results following the "
+ "rules specified in ISO C99.">,
+ Values<"standard,fast,none">, NormalizedValuesScope<"LangOptions">,
+ NormalizedValues<["FPP_Standard", "FPP_Fast", "FPP_None"]>,
+ MarshallingInfoEnum<LangOpts<"BFloat16ExcessPrecision">, "FPP_Standard">;
def : Flag<["-"], "fexpensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
def : Flag<["-"], "fno-expensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
def fextdirs_EQ : Joined<["-"], "fextdirs=">, Group<f_Group>;
Index: clang/include/clang/Basic/TargetInfo.h
===================================================================
--- clang/include/clang/Basic/TargetInfo.h
+++ clang/include/clang/Basic/TargetInfo.h
@@ -219,6 +219,9 @@
bool HasFloat128;
bool HasFloat16;
bool HasBFloat16;
+ bool HasFullBFloat16; // True if the backend supports native bfloat16
+ // arithmetic. Used to determine excess precision
+ // support in the frontend.
bool HasIbm128;
bool HasLongDouble;
bool HasFPReturn;
@@ -648,7 +651,13 @@
virtual bool hasFloat16Type() const { return HasFloat16; }
/// Determine whether the _BFloat16 type is supported on this target.
- virtual bool hasBFloat16Type() const { return HasBFloat16; }
+ virtual bool hasBFloat16Type() const {
+ return HasBFloat16 || HasFullBFloat16;
+ }
+
+ /// Determine whether the BFloat type is fully supported on this target, i.e
+ /// arithemtic operations.
+ virtual bool hasFullBFloat16Type() const { return HasFullBFloat16; }
/// Determine whether the __ibm128 type is supported on this target.
virtual bool hasIbm128Type() const { return HasIbm128; }
@@ -756,9 +765,7 @@
}
/// Return the mangled code of bfloat.
- virtual const char *getBFloat16Mangling() const {
- llvm_unreachable("bfloat not implemented on this target");
- }
+ virtual const char *getBFloat16Mangling() const { return "DF16b"; }
/// Return the value for the C99 FLT_EVAL_METHOD macro.
virtual LangOptions::FPEvalMethodKind getFPEvalMethod() const {
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -317,7 +317,8 @@
BENIGN_LANGOPT(RoundingMath, 1, false, "Do not assume default floating-point rounding behavior")
BENIGN_ENUM_LANGOPT(FPExceptionMode, FPExceptionModeKind, 2, FPE_Default, "FP Exception Behavior Mode type")
BENIGN_ENUM_LANGOPT(FPEvalMethod, FPEvalMethodKind, 2, FEM_UnsetOnCommandLine, "FP type used for floating point arithmetic")
-ENUM_LANGOPT(Float16ExcessPrecision, ExcessPrecisionKind, 2, FPP_Standard, "Intermediate truncation behavior for floating point arithmetic")
+ENUM_LANGOPT(Float16ExcessPrecision, ExcessPrecisionKind, 2, FPP_Standard, "Intermediate truncation behavior for Float16 arithmetic")
+ENUM_LANGOPT(BFloat16ExcessPrecision, ExcessPrecisionKind, 2, FPP_Standard, "Intermediate truncation behavior for BFloat16 arithmetic")
LANGOPT(NoBitFieldTypeAlign , 1, 0, "bit-field type alignment")
LANGOPT(HexagonQdsp6Compat , 1, 0, "hexagon-qdsp6 backward compatibility")
LANGOPT(ObjCAutoRefCount , 1, 0, "Objective-C automated reference counting")
Index: clang/include/clang/Basic/FPOptions.def
===================================================================
--- clang/include/clang/Basic/FPOptions.def
+++ clang/include/clang/Basic/FPOptions.def
@@ -26,4 +26,5 @@
OPTION(AllowApproxFunc, bool, 1, AllowReciprocal)
OPTION(FPEvalMethod, LangOptions::FPEvalMethodKind, 2, AllowApproxFunc)
OPTION(Float16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, FPEvalMethod)
+OPTION(BFloat16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, FPEvalMethod)
#undef OPTION
Index: clang/include/clang/Basic/DiagnosticSemaKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -8747,8 +8747,6 @@
def err_nullptr_cast : Error<
"cannot cast an object of type %select{'nullptr_t' to %1|%1 to 'nullptr_t'}0"
>;
-def err_cast_to_bfloat16 : Error<"cannot type-cast to __bf16">;
-def err_cast_from_bfloat16 : Error<"cannot type-cast from __bf16">;
def err_typecheck_expect_scalar_operand : Error<
"operand of type %0 where arithmetic or pointer type is required">;
def err_typecheck_cond_incompatible_operands : Error<
Index: clang/docs/LanguageExtensions.rst
===================================================================
--- clang/docs/LanguageExtensions.rst
+++ clang/docs/LanguageExtensions.rst
@@ -774,61 +774,94 @@
Half-Precision Floating Point
=============================
-Clang supports three half-precision (16-bit) floating point types: ``__fp16``,
-``_Float16`` and ``__bf16``. These types are supported in all language modes.
-
-``__fp16`` is supported on every target, as it is purely a storage format; see below.
-``_Float16`` is currently only supported on the following targets, with further
-targets pending ABI standardization:
-
-* 32-bit ARM
-* 64-bit ARM (AArch64)
-* AMDGPU
-* SPIR
-* X86 (see below)
-
-On X86 targets, ``_Float16`` is supported as long as SSE2 is available, which
-includes all 64-bit and all recent 32-bit processors. When the target supports
-AVX512-FP16, ``_Float16`` arithmetic is performed using that native support.
-Otherwise, ``_Float16`` arithmetic is performed by promoting to ``float``,
-performing the operation, and then truncating to ``_Float16``. When doing this
-emulation, Clang defaults to following the C standard's rules for excess
-precision arithmetic, which avoids intermediate truncations within statements
-and may generate different results from a strict operation-by-operation
-emulation.
-
-``_Float16`` will be supported on more targets as they define ABIs for it.
-
-``__bf16`` is purely a storage format; it is currently only supported on the following targets:
-
-* 32-bit ARM
-* 64-bit ARM (AArch64)
-* X86 (see below)
-
-On X86 targets, ``__bf16`` is supported as long as SSE2 is available, which
-includes all 64-bit and all recent 32-bit processors.
-
-``__fp16`` is a storage and interchange format only. This means that values of
-``__fp16`` are immediately promoted to (at least) ``float`` when used in arithmetic
-operations, so that e.g. the result of adding two ``__fp16`` values has type ``float``.
-The behavior of ``__fp16`` is specified by the Arm C Language Extensions (`ACLE <https://github.com/ARM-software/acle/releases>`_).
-Clang uses the ``binary16`` format from IEEE 754-2008 for ``__fp16``, not the ARM
-alternative format.
-
-``_Float16`` is an interchange floating-point type. This means that, just like arithmetic on
-``float`` or ``double``, arithmetic on ``_Float16`` operands is formally performed in the
-``_Float16`` type, so that e.g. the result of adding two ``_Float16`` values has type
-``_Float16``. The behavior of ``_Float16`` is specified by ISO/IEC TS 18661-3:2015
-("Floating-point extensions for C"). As with ``__fp16``, Clang uses the ``binary16``
-format from IEEE 754-2008 for ``_Float16``.
-
-``_Float16`` arithmetic will be performed using native half-precision support
-when available on the target (e.g. on ARMv8.2a); otherwise it will be performed
-at a higher precision (currently always ``float``) and then truncated down to
-``_Float16``. Note that C and C++ allow intermediate floating-point operands
-of an expression to be computed with greater precision than is expressible in
-their type, so Clang may avoid intermediate truncations in certain cases; this may
-lead to results that are inconsistent with native arithmetic.
+Clang supports three half-precision (16-bit) floating point types:
+``__fp16``, ``_Float16`` and ``__bf16``. These types are supported
+in all language modes, but their support differs between targets.
+A target is said to have "native support" for a type if the target
+processor offers instructions for directly performing basic arithmetic
+on that type. In the absence of native support, a type can still be
+supported if the compiler can emulate arithmetic on the type by promoting
+to ``float``; see below for more information on this emulation.
+
+* ``__fp16`` is supported on all targets. The special semantics of this
+type mean that no arithmetic is ever performed directly on ``__fp16`` values;
+see below.
+
+* ``_Float16`` is supported on the following targets:
+ * 32-bit ARM (natively on some architecture versions)
+ * 64-bit ARM (AArch64) (natively on ARMv8.2a and above)
+ * AMDGPU (natively)
+ * SPIR (natively)
+ * X86 (if SSE2 is available; natively if AVX512-FP16 is also available)
+
+* ``__bf16`` is supported on the following targets (currently never natively):
+ * 32-bit ARM
+ * 64-bit ARM (AArch64)
+ * X86 (when SSE2 is available)
+
+(For X86, SSE2 is available on 64-bit and all recent 32-bit processors.)
+
+``__fp16`` and ``_Float16`` both use the binary16 format from IEEE
+754-2008, which provides a 5-bit exponent and an 11-bit significand
+(counting the implicit leading 1). ``__bf16`` uses the `bfloat16
+<https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ format,
+which provides an 8-bit exponent and an 8-bit significand; this is the same
+exponent range as `float`, just with greatly reduced precision.
+
+``_Float16`` and ``__bf16`` follow the usual rules for arithmetic
+floating-point types. Most importantly, this means that arithmetic operations
+on operands of these types are formally performed in the type and produce
+values of the type. ``__fp16`` does not follow those rules: most operations
+immediately promote operands of type ``__fp16`` to ``float``, and so
+arithmetic operations are defined to be performed in ``float`` and so result in
+a value of type ``float`` (unless further promoted because of other operands).
+See below for more information on the exact specifications of these types.
+
+Only some of the supported processors for ``_Float16`` and ``__bf16`` offer
+native hardware support for arithmetic in their corresponding formats.
+Arithmetic on ``_Float16`` and ``__bf16`` is enabled on some targets that don't
+provide native architectural support for arithmetic on these formats. These
+targets are noted in the lists of supported targets above.
+
+When compiling arithmetic on ``_Float16`` and ``__bf16`` for a target without
+native support, Clang will perform the arithmetic in ``float``, inserting
+extensions and truncations as necessary. This can be done in a way that
+exactly matches the operation-by-operation behavior of native support,
+but that can require many extra truncations and extensions. By default,
+when emulating ``_Float16`` and ``__bf16`` arithmetic using ``float``, Clang
+does not truncate intermediate operands back to their true type unless the
+operand is the result of an explicit cast or assignment. This is generally
+much faster but can generate different results from strict operation-by-operation
+emulation. Usually the results are more precise. This is permitted by the
+C and C++ standards under the rules for excess precision in intermediate operands;
+see the discussion of evaluation formats in the C standard and [expr.pre] in
+the C++ standard.
+
+The use of excess precision can be independently controlled for these two
+types with the ``-ffloat16-excess-precision=`` and
+``-fbfloat16-excess-precision=`` options. Valid values include:
+- ``none`` (meaning to perform strict operation-by-operation emulation)
+- ``standard`` (meaning that excess precision is permitted under the rules
+ described in the standard, i.e. never across explicit casts or statements)
+- ``fast`` (meaning that excess precision is permitted whenever the
+ optimizer sees an opportunity to avoid truncations; currently this has no
+ effect beyond ``standard``)
+
+The ``_Float16`` type is an interchange floating type specified in
+ ISO/IEC TS 18661-3:2015 ("Floating-point extensions for C"). It will
+be supported on more targets as they define ABIs for it.
+
+The ``__bf16`` type is a non-standard extension, but it generally follows
+the rules for arithmetic interchange floating types from ISO/IEC TS
+18661-3:2015. In previous versions of Clang, it was a storage-only type
+that forbade arithmetic operations. It will be supported on more targets
+as they define ABIs for it.
+
+The ``__fp16`` type was originally an ARM extension and is specified
+by the `ARM C Language Extensions <https://github.com/ARM-software/acle/releases>`_.
+Clang uses the ``binary16`` format from IEEE 754-2008 for ``__fp16``,
+not the ARM alternative format. Operators that expect arithmetic operands
+immediately promote ``__fp16`` operands to ``float``.
It is recommended that portable code use ``_Float16`` instead of ``__fp16``,
as it has been defined by the C standards committee and has behavior that is
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits