https://github.com/jroelofs created https://github.com/llvm/llvm-project/pull/80093
[clang][FMV] Direct-call multi-versioned callees from multi-versioned callers when there is a callee with a matching feature set, and no other higher priority callee. This optimization helps the inliner see past the ifunc+resolver to the callee that we know it will always land on. This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714 From ed52ee4424459ebc046a625341ad8dbbd38bcbe3 Mon Sep 17 00:00:00 2001 From: Jon Roelofs <jonathan_roel...@apple.com> Date: Tue, 30 Jan 2024 19:13:42 -0800 Subject: [PATCH] [clang][FMV] Direct-call multi-versioned callees from multi-versioned callers ... when there is a callee with a matching feature set, and no other higher priority callee. This optimization helps the inliner see past the ifunc+resolver to the callee that we know it will always land on. This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714 --- clang/lib/CodeGen/CGCall.cpp | 72 +++++ clang/lib/CodeGen/CodeGenModule.cpp | 2 +- .../test/CodeGen/attr-target-mv-direct-call.c | 245 ++++++++++++++++++ 3 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/attr-target-mv-direct-call.c diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 28c211aa631e4..84a04e3ccddd8 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4966,6 +4966,11 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) { return MaxVectorWidth; } +// FIXME: put this somewhere nicer to share +unsigned +TargetMVPriority(const TargetInfo &TI, + const CodeGenFunction::MultiVersionResolverOption &RO); + RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, @@ -5437,6 +5442,73 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this); llvm::Value *CalleePtr = ConcreteCallee.getFunctionPointer(); + // If a multi-versioned caller calls a multi-versioned callee, skip the + // 
resolver when there is a precise match on the feature sets, and no + // possibility of a better match at runtime. + if (const auto *CallerFD = dyn_cast_or_null<FunctionDecl>(CurGD.getDecl())) + if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>()) + if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl)) + // FIXME: do the same where either the caller or callee are + // target_clones. + if (FD->isTargetMultiVersion()) { + llvm::SmallVector<StringRef, 8> CallerFeats; + CallerTVA->getFeatures(CallerFeats); + MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats); + + bool HasHigherPriorityCallee = false; + llvm::Constant *FoundMatchingCallee = nullptr; + getContext().forEachMultiversionedFunctionVersion( + FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee, + &FoundMatchingCallee](const FunctionDecl *CurFD) { + const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>(); + + GlobalDecl CurGD{ + (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)}; + StringRef MangledName = CGM.getMangledName(CurFD); + + llvm::SmallVector<StringRef, 8> CalleeFeats; + CalleeTVA->getFeatures(CalleeFeats); + MultiVersionResolverOption CalleeMVRO(nullptr, "", CalleeFeats); + + const TargetInfo &TI = getTarget(); + + // If there is a higher priority callee, we can't do the + // optimization at all, as it would be a valid choice at + // runtime. + if (TargetMVPriority(TI, CalleeMVRO) > + TargetMVPriority(TI, CallerMVRO)) { + HasHigherPriorityCallee = true; + return; + } + + // FIXME: we could allow a lower-priority match when the + // features are a proper subset. But for now, to keep things + // simpler, we only care about a precise match. 
+ if (TargetMVPriority(TI, CalleeMVRO) < + TargetMVPriority(TI, CallerMVRO)) + return; + + if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) { + FoundMatchingCallee = Func; + return; + } + + if (CurFD->isDefined()) { + // FIXME: not sure how to get the address + } else { + const CGFunctionInfo &FI = + getTypes().arrangeGlobalDeclaration(FD); + llvm::FunctionType *Ty = getTypes().GetFunctionType(FI); + FoundMatchingCallee = + CGM.GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false, + /*DontDefer=*/false, ForDefinition); + } + }); + + if (FoundMatchingCallee && !HasHigherPriorityCallee) + CalleePtr = FoundMatchingCallee; + } + // If we're using inalloca, set up that argument. if (ArgMemory.isValid()) { llvm::Value *Arg = ArgMemory.getPointer(); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 6ec54cc01c923..c334e4a3a40f3 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4092,7 +4092,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) { static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old, llvm::Function *NewFn); -static unsigned +unsigned TargetMVPriority(const TargetInfo &TI, const CodeGenFunction::MultiVersionResolverOption &RO) { unsigned Priority = 0; diff --git a/clang/test/CodeGen/attr-target-mv-direct-call.c b/clang/test/CodeGen/attr-target-mv-direct-call.c new file mode 100644 index 0000000000000..687fdd1ca3c24 --- /dev/null +++ b/clang/test/CodeGen/attr-target-mv-direct-call.c @@ -0,0 +1,245 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s + +// Check that we make a direct call from direct_caller._Msimd to +// direct_callee._Msimd when there is no better option. 
+__attribute__((target_version("simd"))) void direct_callee(void) {} +__attribute__((target_version("default"))) void direct_callee(void) {} +__attribute__((target_version("simd"))) void direct_caller(void) { direct_callee(); } +__attribute__((target_version("default"))) void direct_caller(void) { direct_callee(); } + +// ... and that we go through the ifunc+resolver when there is a better option +// that might be chosen at runtime. +__attribute__((target_version("simd"))) void resolved_callee1(void) {} +__attribute__((target_version("fcma"))) void resolved_callee1(void) {} +__attribute__((target_version("default"))) void resolved_callee1(void) {} +__attribute__((target_version("simd"))) void resolved_caller1(void) { resolved_callee1(); } +__attribute__((target_version("default"))) void resolved_caller1(void) { resolved_callee1(); } + +// FIXME: we could direct call in cases like this: +__attribute__((target_version("fp"))) void resolved_callee2(void) {} +__attribute__((target_version("default"))) void resolved_callee2(void) {} +__attribute__((target_version("simd+fp"))) void resolved_caller2(void) { resolved_callee2(); } +__attribute__((target_version("default"))) void resolved_caller2(void) { resolved_callee2(); } + +void source() { + direct_caller(); + resolved_caller1(); + resolved_caller2(); +} + +//. +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +// CHECK: @direct_callee.ifunc = weak_odr ifunc void (), ptr @direct_callee.resolver +// CHECK: @direct_caller.ifunc = weak_odr ifunc void (), ptr @direct_caller.resolver +// CHECK: @resolved_callee1.ifunc = weak_odr ifunc void (), ptr @resolved_callee1.resolver +// CHECK: @resolved_caller1.ifunc = weak_odr ifunc void (), ptr @resolved_caller1.resolver +// CHECK: @resolved_callee2.ifunc = weak_odr ifunc void (), ptr @resolved_callee2.resolver +// CHECK: @resolved_caller2.ifunc = weak_odr ifunc void (), ptr @resolved_caller2.resolver +//. 
+// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@direct_callee._Msimd +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@direct_callee.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @direct_callee._Msimd +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @direct_callee.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@direct_caller._Msimd +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @direct_callee._Msimd() +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@direct_caller.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @direct_caller._Msimd +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @direct_caller.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Msimd +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.resolver() 
comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2097152 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2097152 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @resolved_callee1._Mfcma +// CHECK: resolver_else: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 512 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 512 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: resolver_return1: +// CHECK-NEXT: ret ptr @resolved_callee1._Msimd +// CHECK: resolver_else2: +// CHECK-NEXT: ret ptr @resolved_callee1.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_caller1._Msimd +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @resolved_callee1.ifunc() +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @resolved_caller1._Msimd +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @resolved_caller1.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone 
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2._Mfp +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 256 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 256 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @resolved_callee2._Mfp +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @resolved_callee2.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_caller2._MfpMsimd +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @resolved_callee2.ifunc() +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 768 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 768 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @resolved_caller2._MfpMsimd +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @resolved_caller2.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@source +// CHECK-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @direct_caller.ifunc() +// CHECK-NEXT: call void @resolved_caller1.ifunc() +// CHECK-NEXT: call void 
@resolved_caller2.ifunc() +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@direct_callee.default +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@direct_caller.default +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @direct_callee.ifunc() +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Mfcma +// CHECK-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.default +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.default +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @resolved_callee1.ifunc() +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.default +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.default +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @resolved_callee2.ifunc() +// CHECK-NEXT: ret void +// +//. 
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+neon" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits