vikramRH updated this revision to Diff 527462.
vikramRH added a comment.
Herald added a subscriber: jdoerfert.
Few additional changes,
1. reflect printf-kind in module flags metadata
2. Test cases for the change
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D150427/new/
https://reviews.llvm.org/D150427
Files:
clang/include/clang/Basic/TargetOptions.h
clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGGPUBuiltin.cpp
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/test/CodeGenHIP/default-attributes.hip
clang/test/CodeGenHIP/printf-kind-module-flag.hip
clang/test/CodeGenHIP/printf_nonhostcall.cpp
clang/test/CodeGenHIP/sanitize-undefined-null.hip
clang/test/Driver/hip-options.hip
llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h
llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
Index: llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
===================================================================
--- llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -17,6 +17,9 @@
#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
@@ -179,11 +182,7 @@
// Scan the format string to locate all specifiers, and mark the ones that
// specify a string, i.e, the "%s" specifier with optional '*' characters.
-static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
- StringRef Str;
- if (!getConstantStringInfo(Fmt, Str) || Str.empty())
- return;
-
+static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) {
static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
size_t SpecPos = 0;
// Skip the first argument, the format string.
@@ -207,14 +206,305 @@
}
}
-Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
- ArrayRef<Value *> Args) {
+// helper struct to package the string related data
+struct StringData {
+ std::string Str;
+ Value *RealSize = nullptr;
+ Value *AlignedSize = nullptr;
+ bool isConst = true;
+
+ StringData(std::string str, Value *RS, Value *AS, bool IC)
+ : Str(str), RealSize(RS), AlignedSize(AS), isConst(IC) {}
+};
+
+// Calculates frame size required for current printf expansion and allocates
+// space on printf buffer. Printf frame includes following contents
+// [ ControlDWord , format string/Hash , Arguments (each aligned to 8 byte) ]
+static Value *callBufferedPrintfStart(
+ IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *Fmt,
+ bool isConstFmtStr, SparseBitVector<8> &SpecIsCString,
+ SmallVectorImpl<StringData> &StringContents, Value *&ArgSize) {
+ Module *M = Builder.GetInsertBlock()->getModule();
+ Value *NonConstStrLen = nullptr;
+ Value *LenWithNull = nullptr;
+ Value *LenWithNullAligned = nullptr;
+ Value *TempAdd = nullptr;
+
+ // First 4 bytes to be reserved for control dword
+ size_t BufSize = 4;
+ if (isConstFmtStr)
+ // First 8 bytes of MD5 hash
+ BufSize += 8;
+ else {
+ LenWithNull = getStrlenWithNull(Builder, Fmt);
+
+ // Align the computed length to next 8 byte boundary
+ TempAdd = Builder.CreateAdd(LenWithNull,
+ ConstantInt::get(LenWithNull->getType(), 7U));
+ NonConstStrLen = Builder.CreateAnd(
+ TempAdd, ConstantInt::get(LenWithNull->getType(), ~7U));
+
+ StringContents.push_back(
+ StringData("", LenWithNull, NonConstStrLen, false));
+ }
+
+ for (size_t i = 1; i < Args.size(); i++) {
+ if (SpecIsCString.test(i)) {
+ StringRef ArgStr;
+ if (getConstantStringInfo(Args[i], ArgStr)) {
+ auto alignedLen = alignTo(ArgStr.size() + 1, 8);
+ StringContents.push_back(StringData(
+ (ArgStr.str() + '\0'),
+ /*RealSize*/ nullptr, /*AlignedSize*/ nullptr, /*isConst*/ true));
+ BufSize += alignedLen;
+ } else {
+ LenWithNull = getStrlenWithNull(Builder, Args[i]);
+
+ // Align the computed length to next 8 byte boundary
+ TempAdd = Builder.CreateAdd(
+ LenWithNull, ConstantInt::get(LenWithNull->getType(), 7U));
+ LenWithNullAligned = Builder.CreateAnd(
+ TempAdd, ConstantInt::get(LenWithNull->getType(), ~7U));
+
+ if (NonConstStrLen) {
+ auto Val = Builder.CreateAdd(LenWithNullAligned, NonConstStrLen,
+ "cumulativeAdd");
+ NonConstStrLen = Val;
+ } else
+ NonConstStrLen = LenWithNullAligned;
+
+ StringContents.push_back(
+ StringData("", LenWithNull, LenWithNullAligned, false));
+ }
+ } else
+ // We end up expanding non string arguments to 8 bytes
+ BufSize += 8;
+ }
+
+ // calculate final size value to be passed to printf_alloc
+ Value *SizeToReserve = ConstantInt::get(Builder.getInt64Ty(), BufSize, false);
+ SmallVector<Value *, 1> Alloc_args;
+ if (NonConstStrLen)
+ SizeToReserve = Builder.CreateAdd(NonConstStrLen, SizeToReserve);
+
+ ArgSize = Builder.CreateTrunc(SizeToReserve, Builder.getInt32Ty());
+ Alloc_args.push_back(ArgSize);
+
+ // call the printf_alloc function
+ AttributeList Attr = AttributeList::get(
+ Builder.getContext(), AttributeList::FunctionIndex, Attribute::NoUnwind);
+
+ Type *Tys_alloc[1] = {Builder.getInt32Ty()};
+ Type *I8Ptr =
+ Builder.getInt8PtrTy(M->getDataLayout().getDefaultGlobalsAddressSpace());
+ FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false);
+ auto PrintfAllocFn =
+ M->getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr);
+
+ return Builder.CreateCall(PrintfAllocFn, Alloc_args, "printf_alloc_fn");
+}
+
+// Prepare constant string argument to push onto the buffer
+static void processConstantStringArg(StringData *SD, IRBuilder<> &Builder,
+ SmallVectorImpl<Value *> &WhatToStore) {
+ StringRef Str = SD->Str;
+
+ DataExtractor Extractor(Str, /*IsLittleEndian=*/true, 8);
+ DataExtractor::Cursor Offset(0);
+ while (Offset && Offset.tell() < Str.size()) {
+ const uint64_t ReadSize = 4;
+ uint64_t ReadNow = std::min(ReadSize, Str.size() - Offset.tell());
+ uint64_t ReadBytes = 0;
+ switch (ReadNow) {
+ default:
+ llvm_unreachable("min(4, X) > 4?");
+ case 1:
+ ReadBytes = Extractor.getU8(Offset);
+ break;
+ case 2:
+ ReadBytes = Extractor.getU16(Offset);
+ break;
+ case 3:
+ ReadBytes = Extractor.getU24(Offset);
+ break;
+ case 4:
+ ReadBytes = Extractor.getU32(Offset);
+ break;
+ }
+ cantFail(Offset.takeError(), "failed to read bytes from constant array");
+
+ APInt IntVal(8 * ReadSize, ReadBytes);
+
+ // TODO: Should not bother aligning up.
+ if (ReadNow < ReadSize)
+ IntVal = IntVal.zext(8 * ReadSize);
+
+ Type *IntTy = Type::getIntNTy(Builder.getContext(), IntVal.getBitWidth());
+ WhatToStore.push_back(ConstantInt::get(IntTy, IntVal));
+ }
+ // Additional padding for 8 byte alignment
+ int Rem = (Str.size() % 8);
+ if (Rem > 0 && Rem <= 4)
+ WhatToStore.push_back(ConstantInt::get(Builder.getInt32Ty(), 0));
+}
+
+static void
+callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args,
+ Value *PtrToStore, SparseBitVector<8> &SpecIsCString,
+ SmallVectorImpl<StringData> &StringContents,
+ bool IsConstFmtStr) {
+ Module *M = Builder.GetInsertBlock()->getModule();
+ auto StrIt = StringContents.begin();
+ size_t i = IsConstFmtStr ? 1 : 0;
+ for (; i < Args.size(); i++) {
+ SmallVector<Value *, 32> WhatToStore;
+ if ((i == 0) || SpecIsCString.test(i)) {
+ if (StrIt->isConst) {
+ processConstantStringArg(StrIt, Builder, WhatToStore);
+ StrIt++;
+ } else {
+ // This copies the contents of the string, however the next offset
+ // is at aligned length, the extra space that might be created due
+ // to alignment padding is not populated with any specific value
+ // here. This would be safe as long as runtime is sync with
+ // the offsets.
+ uint64_t DstAlign = (i == 0) ? 4 : 8;
+ Builder.CreateMemCpy(PtrToStore, /*DstAlign*/ Align(DstAlign), Args[i],
+ /*SrcAlign*/ Align(1), StrIt->RealSize);
+
+ PtrToStore =
+ Builder.CreateInBoundsGEP(Builder.getInt8Ty(), PtrToStore,
+ {StrIt->AlignedSize}, "PrintBuffNextPtr");
+ LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:"
+ << *PtrToStore << '\n');
+
+ // done with current argument, move to next
+ StrIt++;
+ continue;
+ }
+ } else {
+ auto IntTy = dyn_cast<IntegerType>(Args[i]->getType());
+ if (IntTy && IntTy->getBitWidth() == 32)
+ WhatToStore.push_back(
+ Builder.CreateZExt(Args[i], Builder.getInt64Ty()));
+ else
+ WhatToStore.push_back(Args[i]);
+ }
+
+ for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
+ Value *toStore = WhatToStore[I];
+
+ StoreInst *StBuff = Builder.CreateStore(toStore, PtrToStore);
+ LLVM_DEBUG(dbgs() << "inserting store to printf buffer:" << *StBuff
+ << '\n');
+ PtrToStore = Builder.CreateConstInBoundsGEP1_32(
+ Builder.getInt8Ty(), PtrToStore,
+ M->getDataLayout().getTypeAllocSize(toStore->getType()),
+ "PrintBuffNextPtr");
+ LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:" << *PtrToStore
+ << '\n');
+ }
+ }
+}
+
+Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
+ bool IsBuffered) {
auto NumOps = Args.size();
assert(NumOps >= 1);
auto Fmt = Args[0];
SparseBitVector<8> SpecIsCString;
- locateCStrings(SpecIsCString, Fmt);
+ StringRef FmtStr;
+
+ if (getConstantStringInfo(Fmt, FmtStr))
+ locateCStrings(SpecIsCString, FmtStr);
+
+ if (IsBuffered) {
+ SmallVector<StringData, 8> StringContents;
+ Module *M = Builder.GetInsertBlock()->getModule();
+ LLVMContext &Ctx = Builder.getContext();
+ auto Int1Ty = Builder.getInt1Ty();
+ auto Int8Ty = Builder.getInt8Ty();
+ auto Int32Ty = Builder.getInt32Ty();
+ bool IsConstFmtStr = !FmtStr.empty();
+
+ Value *ArgSize = nullptr;
+ Value *Ptr =
+ callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr,
+ SpecIsCString, StringContents, ArgSize);
+
+ // The buffered version still follows OpenCL printf standards for
+ // printf return value, i.e 0 on success, -1 on failure.
+ ConstantPointerNull *zeroIntPtr =
+ ConstantPointerNull::get(cast<PointerType>(Ptr->getType()));
+
+ auto *Cmp = cast<ICmpInst>(Builder.CreateICmpNE(Ptr, zeroIntPtr, ""));
+
+ BasicBlock *End = BasicBlock::Create(Ctx, "end.block",
+ Builder.GetInsertBlock()->getParent());
+ BasicBlock *ArgPush = BasicBlock::Create(
+ Ctx, "argpush.block", Builder.GetInsertBlock()->getParent());
+
+ BranchInst::Create(ArgPush, End, Cmp, Builder.GetInsertBlock());
+ Builder.SetInsertPoint(ArgPush);
+
+ // Create controlDWord and store as the first entry, format as follows
+ // Bit 0 (LSB) -> stream (1 if stderr, 0 if stdout)
+ // Bit 1 -> constant format string (1 if constant)
+ // Bits 2-31 -> size of printf data frame
+ auto CreateControlDWord = M->getOrInsertFunction(
+ StringRef("__printf_control_dword"), Builder.getInt32Ty(),
+ Builder.getInt32Ty(), Int1Ty, Int1Ty);
+ auto valueToStore = Builder.CreateCall(
+ CreateControlDWord,
+ {ArgSize, ConstantInt::get(Int1Ty, IsConstFmtStr ? 1 : 0, false),
+ Builder.getFalse()});
+ Builder.CreateStore(valueToStore, Ptr);
+
+ Ptr = Builder.CreateConstInBoundsGEP1_32(Int8Ty, Ptr, 4);
+
+ // Create MD5 hash for costant format string, push low 64 bits of the
+ // same onto buffer and metadata.
+ NamedMDNode *metaD = M->getOrInsertNamedMetadata("llvm.printf.fmts");
+ if (IsConstFmtStr) {
+ MD5 Hasher;
+ MD5::MD5Result Hash;
+ Hasher.update(FmtStr);
+ Hasher.final(Hash);
+
+ // Try sticking to llvm.printf.fmts format, although we are not going to
+ // use the ID and argument size fields while printing,
+ std::string MetadataStr =
+ "0:0:" + llvm::utohexstr(Hash.low(), /*LowerCase=*/true) + "," +
+ FmtStr.str();
+ MDString *fmtStrArray = MDString::get(Ctx, MetadataStr);
+ MDNode *myMD = MDNode::get(Ctx, fmtStrArray);
+ metaD->addOperand(myMD);
+
+ Builder.CreateStore(ConstantInt::get(Builder.getInt64Ty(), Hash.low()),
+ Ptr);
+ Ptr = Builder.CreateConstInBoundsGEP1_32(Int8Ty, Ptr, 8);
+ } else {
+ // Include a dummy metadata instance in case of only non constant
+ // format string usage, This might be an absurd usecase but needs to
+ // be done for completeness
+ if (metaD->getNumOperands() == 0) {
+ MDString *fmtStrArray =
+ MDString::get(Ctx, "0:0:ffffffff,\"Non const format string\"");
+ MDNode *myMD = MDNode::get(Ctx, fmtStrArray);
+ metaD->addOperand(myMD);
+ }
+ }
+
+ // Push The printf arguments onto buffer
+ callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, StringContents,
+ IsConstFmtStr);
+
+ // End block, returns -1 on failure
+ BranchInst::Create(End, ArgPush);
+ Builder.SetInsertPoint(End);
+ return Builder.CreateSExt(Builder.CreateNot(Cmp), Int32Ty, "printf_result");
+ }
auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
Index: llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h
+++ llvm/include/llvm/Transforms/Utils/AMDGPUEmitPrintf.h
@@ -18,7 +18,8 @@
namespace llvm {
-Value *emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args);
+Value *emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
+ bool isBuffered);
} // end namespace llvm
Index: clang/test/Driver/hip-options.hip
===================================================================
--- clang/test/Driver/hip-options.hip
+++ clang/test/Driver/hip-options.hip
@@ -21,6 +21,22 @@
// PTH: "-cc1"{{.*}} "-E" {{.*}}"-fgpu-default-stream=per-thread"
// PTH: "-cc1"{{.*}} "-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
+// Check -mprintf-kind=hostcall
+// RUN: %clang -### -mprintf-kind=hostcall %s -save-temps 2>&1 | FileCheck -check-prefix=HOSTC %s
+// HOSTC: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-mprintf-kind=hostcall" "-E" {{.*}}
+// HOSTC: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=hostcall" {{.*}}"-x" "hip-cpp-output"
+// HOSTC: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=hostcall" {{.*}}"-x" "ir"
+// HOSTC: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-E" {{.*}}
+// HOSTC: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-x" "ir"
+
+// Check -mprintf-kind=buffered
+// RUN: %clang -### -mprintf-kind=buffered %s -save-temps 2>&1 | FileCheck -check-prefix=BUFF %s
+// BUFF: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-mprintf-kind=buffered" "-E" {{.*}}
+// BUFF: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=buffered" {{.*}}"-x" "hip-cpp-output"
+// BUFF: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}} "-mprintf-kind=buffered" {{.*}}"-x" "ir"
+// BUFF: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-E" {{.*}}
+// BUFF: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-x" "ir"
+
// RUN: %clang -### -x hip --target=x86_64-pc-windows-msvc -fms-extensions \
// RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \
// RUN: FileCheck -check-prefix=MLLVM %s
Index: clang/test/CodeGenHIP/sanitize-undefined-null.hip
===================================================================
--- clang/test/CodeGenHIP/sanitize-undefined-null.hip
+++ clang/test/CodeGenHIP/sanitize-undefined-null.hip
@@ -20,12 +20,12 @@
// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
// CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize !3
-// CHECK-NEXT: br i1 [[TMP1]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF4:![0-9]+]], !nosanitize !3
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[TMP0]], null, !nosanitize !4
+// CHECK-NEXT: br i1 [[TMP1]], label [[CONT:%.*]], label [[HANDLER_TYPE_MISMATCH:%.*]], !prof [[PROF4:![0-9]+]], !nosanitize !4
// CHECK: handler.type_mismatch:
-// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize !3
-// CHECK-NEXT: call void @__ubsan_handle_type_mismatch_v1_abort(ptr addrspace(1) @[[GLOB1:[0-9]+]], i64 [[TMP2]]) #[[ATTR2:[0-9]+]], !nosanitize !3
-// CHECK-NEXT: unreachable, !nosanitize !3
+// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64, !nosanitize !4
+// CHECK-NEXT: call void @__ubsan_handle_type_mismatch_v1_abort(ptr addrspace(1) @[[GLOB1:[0-9]+]], i64 [[TMP2]]) #[[ATTR2:[0-9]+]], !nosanitize !4
+// CHECK-NEXT: unreachable, !nosanitize !4
// CHECK: cont:
// CHECK-NEXT: store i8 0, ptr [[TMP0]], align 1
// CHECK-NEXT: ret i32 3
Index: clang/test/CodeGenHIP/printf_nonhostcall.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGenHIP/printf_nonhostcall.cpp
@@ -0,0 +1,234 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -mprintf-kind=buffered -fcuda-is-device \
+// RUN: -o - %s | FileCheck --enable-var-scope %s
+
+#define __device__ __attribute__((device))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+extern "C" __device__ int printf(const char *format, ...);
+
+// CHECK-LABEL: define dso_local noundef i32 @_Z4foo1v
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr
+// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr [[S_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+// CHECK-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK: strlen.while:
+// CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ]
+// CHECK-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1
+// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
+// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0
+// CHECK-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK: strlen.while.done:
+// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP0]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]]
+// CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1
+// CHECK-NEXT: br label [[STRLEN_JOIN]]
+// CHECK: strlen.join:
+// CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7
+// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288
+// CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 52
+// CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]])
+// CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK: end.block:
+// CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true
+// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32
+// CHECK-NEXT: ret i32 [[PRINTF_RESULT]]
+// CHECK: argpush.block:
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @__printf_control_dword(i32 [[TMP15]], i1 true, i1 false)
+// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK-NEXT: store i64 1107004088646384690, ptr addrspace(1) [[TMP19]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 8
+// CHECK-NEXT: store i64 8, ptr addrspace(1) [[TMP20]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 8
+// CHECK-NEXT: store double 3.141590e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+// CHECK-NEXT: store i64 8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8
+// CHECK-NEXT: store i64 4, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8
+// CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 8 [[PRINTBUFFNEXTPTR3]], ptr align 1 [[TMP0]], i64 [[TMP11]], i1 false)
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i64 [[TMP13]]
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 8
+// CHECK-NEXT: br label [[END_BLOCK]]
+//
+__device__ int foo1() {
+ const char *s = "hello world";
+ return printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 4, s, s);
+}
+
+__device__ char *dstr;
+__device__ const
+// CHECK-LABEL: define dso_local noundef i32 @_Z4foo2v
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[LCVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[LCVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LCVAL]] to ptr
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @dstr to ptr), align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @dstr to ptr), align 8
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+// CHECK-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK: strlen.while:
+// CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ]
+// CHECK-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1
+// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
+// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0
+// CHECK-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK: strlen.while.done:
+// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP0]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]]
+// CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1
+// CHECK-NEXT: br label [[STRLEN_JOIN]]
+// CHECK: strlen.join:
+// CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7
+// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288
+// CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 36
+// CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]])
+// CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK: end.block:
+// CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true
+// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32
+// CHECK-NEXT: ret i32 [[PRINTF_RESULT]]
+// CHECK: argpush.block:
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @__printf_control_dword(i32 [[TMP15]], i1 true, i1 false)
+// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK-NEXT: store i64 7257695813269076350, ptr addrspace(1) [[TMP19]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 8
+// CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 8 [[TMP20]], ptr align 1 [[TMP0]], i64 [[TMP11]], i1 false)
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 [[TMP13]]
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @_ZZ4foo2vE5shval to ptr), ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8
+// CHECK-NEXT: store ptr [[LCVAL_ASCAST]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8
+// CHECK-NEXT: br label [[END_BLOCK]]
+//
+__device__ int foo2() {
+ __shared__ int shval;
+ int lcval;
+ return printf("%s %p %p %p\n", dstr, dstr, &shval, &lcval);
+}
+
+__device__ unsigned short g = 30;
+__device__ unsigned long n = 30;
+
+__device__ float f1 = 3.14f;
+__device__ double f2 = 2.71828;
+
+// CHECK-LABEL: define dso_local noundef i32 @_Z4foo3v
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: store i32 25, ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspacecast (ptr addrspace(1) @g to ptr), align 2
+// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @n to ptr), align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @f1 to ptr), align 4
+// CHECK-NEXT: [[CONV1:%.*]] = fpext float [[TMP3]] to double
+// CHECK-NEXT: [[TMP4:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @f2 to ptr), align 8
+// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 60)
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK-NEXT: br i1 [[TMP5]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK: end.block:
+// CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP5]], true
+// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP6]] to i32
+// CHECK-NEXT: ret i32 [[PRINTF_RESULT]]
+// CHECK: argpush.block:
+// CHECK-NEXT: [[TMP7:%.*]] = call i32 @__printf_control_dword(i32 60, i1 true, i1 false)
+// CHECK-NEXT: store i32 [[TMP7]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK-NEXT: store i64 2197983583858494848, ptr addrspace(1) [[TMP8]], align 8
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP8]], i32 8
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK-NEXT: store i64 [[TMP10]], ptr addrspace(1) [[TMP9]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i32 8
+// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+// CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[CONV]] to i64
+// CHECK-NEXT: store i64 [[TMP11]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8
+// CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8
+// CHECK-NEXT: store double [[CONV1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 8
+// CHECK-NEXT: store double [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8
+// CHECK-NEXT: br label [[END_BLOCK]]
+//
+__device__ int foo3() {
+ __shared__ int s;
+ s = 25;
+ return printf("Random values: %d,%p,%hd,%ld,%f,%f\n",s, &s, g, n, f1, f2);
+}
+
+//A non trivial case,
+// CHECK-LABEL: define dso_local noundef i32 @_Z4foo4v
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr
+// CHECK-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str.4 to ptr), ptr [[S_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null
+// CHECK-NEXT: br i1 [[TMP1]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK: strlen.while:
+// CHECK-NEXT: [[TMP2:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[STRLEN_WHILE]] ]
+// CHECK-NEXT: [[TMP3]] = getelementptr i8, ptr [[TMP2]], i64 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
+// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+// CHECK-NEXT: br i1 [[TMP5]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK: strlen.while.done:
+// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP0]] to i64
+// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP2]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]]
+// CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1
+// CHECK-NEXT: br label [[STRLEN_JOIN]]
+// CHECK: strlen.join:
+// CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 7
+// CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], 4294967288
+// CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 12
+// CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
+// CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP14]])
+// CHECK-NEXT: [[TMP15:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK-NEXT: br i1 [[TMP15]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK: end.block:
+// CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true
+// CHECK-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP16]] to i32
+// CHECK-NEXT: ret i32 [[PRINTF_RESULT]]
+// CHECK: argpush.block:
+// CHECK-NEXT: [[TMP17:%.*]] = call i32 @__printf_control_dword(i32 [[TMP14]], i1 false, i1 false)
+// CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP18]], ptr align 1 [[TMP0]], i64 [[TMP10]], i1 false)
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP18]], i64 [[TMP12]]
+// CHECK-NEXT: store i64 10, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
+// CHECK-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+// CHECK-NEXT: br label [[END_BLOCK]]
+//
+__device__ int foo4() {
+ const char* s = "format str%d";
+ return printf(s, 10);
+}
Index: clang/test/CodeGenHIP/printf-kind-module-flag.hip
===================================================================
--- /dev/null
+++ clang/test/CodeGenHIP/printf-kind-module-flag.hip
@@ -0,0 +1,17 @@
+// Create module flag for printf kind.
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN: -o - %s | FileCheck %s -check-prefix=HOSTCALL
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN: -mprintf-kind=hostcall -o - %s | FileCheck %s -check-prefix=HOSTCALL
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN: -mprintf-kind=buffered -o - %s | FileCheck -check-prefix=BUFFERED %s
+
+// RUN: not %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN: -mprintf-kind=none -o - %s 2>&1| FileCheck %s -check-prefix=INV
+
+// HOSTCALL: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// BUFFERED: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"buffered"}
+// INV: error: invalid value 'none' in '-mprintf-kind=none'
\ No newline at end of file
Index: clang/test/CodeGenHIP/default-attributes.hip
===================================================================
--- clang/test/CodeGenHIP/default-attributes.hip
+++ clang/test/CodeGenHIP/default-attributes.hip
@@ -47,8 +47,10 @@
// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
//.
// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// OPTNONE: !1 = !{i32 1, !"wchar_size", i32 4}
+// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4}
//.
// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// OPT: !1 = !{i32 1, !"wchar_size", i32 4}
+// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// OPT: !2 = !{i32 1, !"wchar_size", i32 4}
//.
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -4691,8 +4691,22 @@
}
CmdArgs.push_back("-aux-triple");
CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
+
+ if (JA.isDeviceOffloading(Action::OFK_HIP) &&
+ getToolChain().getTriple().isAMDGPU()) {
+ // Device side compilation printf
+ if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) {
+ CmdArgs.push_back(Args.MakeArgString(
+ "-mprintf-kind=" +
+ Args.getLastArgValue(options::OPT_mprintf_kind_EQ)));
+ }
+ }
}
+ // Unconditionally claim the printf option now to avoid unused diagnostic.
+ if (const Arg *PF = Args.getLastArg(options::OPT_mprintf_kind_EQ))
+ PF->claim();
+
if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false)) {
CmdArgs.push_back("-fsycl-is-device");
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -611,6 +611,17 @@
"amdgpu_code_object_version",
getTarget().getTargetOpts().CodeObjectVersion);
}
+
+ // Currently, "-mprintf-kind" option is only supported for HIP
+ if (LangOpts.HIP) {
+ auto *MDStr = llvm::MDString::get(
+ getLLVMContext(), (getTarget().getTargetOpts().AMDGPUPrintfKindVal ==
+ TargetOptions::AMDGPUPrintfKind::Hostcall)
+ ? "hostcall"
+ : "buffered");
+ getModule().addModuleFlag(llvm::Module::Error, "amdgpu_printf_kind",
+ MDStr);
+ }
}
// Emit a global array containing all external kernels or device variables
Index: clang/lib/CodeGen/CGGPUBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -202,7 +202,10 @@
llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint());
IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation());
- auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args);
+
+ bool isBuffered = (CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal ==
+ clang::TargetOptions::AMDGPUPrintfKind::Buffered);
+ auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args, isBuffered);
Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
return RValue::get(Printf);
}
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1033,6 +1033,17 @@
TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
NegFlag<SetFalse>>;
+def mprintf_kind_EQ : Joined<["-"], "mprintf-kind=">, Group<m_Group>,
+ HelpText<"Specify the printf lowering scheme (AMDGPU only), allowed values are "
+ "\"hostcall\"(printing happens during kernel execution, this scheme "
+ "relies on hostcalls which require system to support pcie atomics) "
+ "and \"buffered\"(printing happens after all kernel threads exit"
+ "this uses a printf buffer and does not rely on pcie atomic support)">,
+ Flags<[CC1Option]>,
+ Values<"hostcall,buffered">,
+ NormalizedValuesScope<"TargetOptions::AMDGPUPrintfKind">,
+ NormalizedValues<["Hostcall", "Buffered"]>,
+ MarshallingInfoEnum<TargetOpts<"AMDGPUPrintfKindVal">, "Hostcall">;
def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
HelpText<"Specify default stream. The default value is 'legacy'. (HIP only)">,
Flags<[CC1Option]>,
Index: clang/include/clang/Basic/TargetOptions.h
===================================================================
--- clang/include/clang/Basic/TargetOptions.h
+++ clang/include/clang/Basic/TargetOptions.h
@@ -90,6 +90,19 @@
/// \brief Code object version for AMDGPU.
CodeObjectVersionKind CodeObjectVersion = CodeObjectVersionKind::COV_None;
+ /// \brief Enumeration values for AMDGPU printf lowering scheme
+ enum class AMDGPUPrintfKind {
+ /// printf lowering scheme involving hostcalls, currently used by HIP
+ /// programs by default
+ Hostcall = 0,
+
+ /// printf lowering scheme involving implicit printf buffers,
+ Buffered = 1,
+ };
+
+ /// \brief AMDGPU Printf lowering scheme
+ AMDGPUPrintfKind AMDGPUPrintfKindVal = AMDGPUPrintfKind::Hostcall;
+
// The code model to be used as specified by the user. Corresponds to
// CodeModel::Model enum defined in include/llvm/Support/CodeGen.h, plus
// "default" for the case when the user has not explicitly specified a
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits