[mlir] [lldb] [llvm] [clang] [compiler-rt] Enable Custom Lowering for fabs.v8f16 on AVX (PR #71730)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/71730 >From 6032b965f85482b39e841bd95842f4e17c92fefd Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 7 Nov 2023 23:29:44 -0800 Subject: [PATCH 1/6] Enable Custom Lowering for fabs.v8f16 on AVX --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++ llvm/test/CodeGen/X86/vec_fabs.ll | 41 + 2 files changed, 44 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 22fba5601ccfd38..8d9519b9f8c6b10 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2238,6 +2238,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } + if (Subtarget.hasAVX()) +setOperationAction(ISD::FABS, MVT::v8f16, Custom); + if (!Subtarget.useSoftFloat() && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll index 982062d8907542a..08364449ab1a378 100644 --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX512VL ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX512VLDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 
--check-prefix=X64-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VLDQ @@ -111,6 +113,45 @@ define <4 x double> @fabs_v4f64(<4 x double> %p) { } declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p) +define <8 x half> @fabs_v8f16(ptr %p) { +; X86-AVX-LABEL: fabs_v8f16: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT:movl 4(%esp), [[ADDRREG:%.*]] +; X86-AVX-NEXT:vmovaps ([[ADDRREG]]), %xmm0 +; X86-AVX-NEXT:vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT:retl + +; X86-AVX2-LABEL: fabs_v8f16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT:movl 4(%esp), [[REG:%.*]] +; X86-AVX2-NEXT:vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-AVX2-NEXT:vpand ([[REG]]), %xmm0, %xmm0 +; X86-AVX2-NEXT:retl + +; X64-AVX512VL-LABEL: fabs_v8f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT:vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-AVX512VL-NEXT:vpand (%rdi), %xmm0, %xmm0 +; X64-AVX512VL-NEXT:retq + +; X64-AVX-LABEL: fabs_v8f16: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT:vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT:vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT:retq + +; X64-AVX2-LABEL: fabs_v8f16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT:vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-AVX2-NEXT:vpand (%rdi), %xmm0, %xmm0 +; X64-AVX2-NEXT:retq + + %v = load <8 x half>, ptr %p, align 16 + %nnv = call <8 x half> @llvm.fabs.v8f16(<8 x half> %v) + ret <8 x half> %nnv +} +declare <8 x half> @llvm.fabs.v8f16(<8 x half> %p) + define <8 x float> @fabs_v8f32(<8 x float> %p) { ; X86-AVX-LABEL: fabs_v8f32: ; X86-AVX: # %bb.0: >From f2f3136667805cc7202ccba45e01393afe34ccc5 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 7 Nov 2023 23:29:44 -0800 Subject: [PATCH 2/6] Enable Custom Lowering for fabs.v8f16 on AVX --- 
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++ llvm/test/CodeGen/X86/vec_fabs.ll | 41 + 2 files changed, 44 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 22fba5601ccfd38..8d9519b9f8c6b10 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2238,6 +2238,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } + if (Subtarget.hasAVX()) +setOperationAction(ISD::FABS, MVT::v8f16, Custom); + if (!Subtarget.useSoftFloat() && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll index 98
[clang-tools-extra] [clang] [compiler-rt] [llvm] [PGO][GlobalValue][LTO]In GlobalValues::getGlobalIdentifier, use semicolon as delimiter for local-linkage varibles. (PR #74008)
david-xl wrote: > > David says the itanium remapper file was only used once during gcc to llvm > > transition, so not relevant here. > > I believe it was actually for the libstdc++ to libc++ transition (see > https://reviews.llvm.org/D51247 and https://reviews.llvm.org/D51240). > > If it is broken we'll at least want to add a FIXME there. Yes, I meant libstdc++ to libc++ transition. Which source line is this comment addressing? I will take a look at the changes/comments there. https://github.com/llvm/llvm-project/pull/74008 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [compiler-rt] [PGO][GlobalValue][LTO]In GlobalValues::getGlobalIdentifier, use semicolon as delimiter for local-linkage varibles. (PR #74008)
david-xl wrote: > > > > David says the itanium remapper file was only used once during gcc to > > > > llvm transition, so not relevant here. > > > > > > > > > I believe it was actually for the libstdc++ to libc++ transition (see > > > https://reviews.llvm.org/D51247 and https://reviews.llvm.org/D51240). > > > If it is broken we'll at least want to add a FIXME there. > > > > > > Yes, I meant libstdc++ to libc++ transition. Why source line is this > > comment addressing? I take take a look the changes/comments there. > > Sorry for the misinformation, and thanks for the Phab links. > > I think the itanium remapper needs a `:` -> `;` update (going to update this > PR and related tests), since (for local-linkage functions) the function name > used to look up profiles should use `;` delimiter. > > > > David says the itanium remapper file was only used once during gcc to > > > > llvm transition, so not relevant here. > > > > > > > > > I believe it was actually for the libstdc++ to libc++ transition (see > > > https://reviews.llvm.org/D51247 and https://reviews.llvm.org/D51240). > > > If it is broken we'll at least want to add a FIXME there. > > > > > > Yes, I meant libstdc++ to libc++ transition. Why source line is this > > comment addressing? I take take a look the changes/comments there. > > Sorry for the misinformation, and thanks for the Phab links. > > I think the itanium remapper needs a `:` -> `;` update (going to update this > PR and related tests), since (for local-linkage functions) the function name > used to look up profiles should use `;` delimiter. The remapper is not aware of any internal symbol mangling scheme, so those entries won't be tracked by it. In other words, there is no need to change anything there, I think. https://github.com/llvm/llvm-project/pull/74008 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[compiler-rt] [llvm] [clang-tools-extra] [clang] [PGO][GlobalValue][LTO]In GlobalValues::getGlobalIdentifier, use semicolon as delimiter for local-linkage varibles. (PR #74008)
david-xl wrote: > > > > > > David says the itanium remapper file was only used once during gcc > > > > > > to llvm transition, so not relevant here. > > > > > > > > > > > > > > > I believe it was actually for the libstdc++ to libc++ transition (see > > > > > https://reviews.llvm.org/D51247 and https://reviews.llvm.org/D51240). > > > > > If it is broken we'll at least want to add a FIXME there. > > > > > > > > > > > > Yes, I meant libstdc++ to libc++ transition. Why source line is this > > > > comment addressing? I take take a look the changes/comments there. > > > > > > > > > Sorry for the misinformation, and thanks for the Phab links. > > > I think the itanium remapper needs a `:` -> `;` update (going to update > > > this PR and related tests), since (for local-linkage functions) the > > > function name used to look up profiles should use `;` delimiter. > > > > > > > > > > David says the itanium remapper file was only used once during gcc > > > > > > to llvm transition, so not relevant here. > > > > > > > > > > > > > > > I believe it was actually for the libstdc++ to libc++ transition (see > > > > > https://reviews.llvm.org/D51247 and https://reviews.llvm.org/D51240). > > > > > If it is broken we'll at least want to add a FIXME there. > > > > > > > > > > > > Yes, I meant libstdc++ to libc++ transition. Why source line is this > > > > comment addressing? I take take a look the changes/comments there. > > > > > > > > > Sorry for the misinformation, and thanks for the Phab links. > > > I think the itanium remapper needs a `:` -> `;` update (going to update > > > this PR and related tests), since (for local-linkage functions) the > > > function name used to look up profiles should use `;` delimiter. > > > > > > The remapper is not aware of any internal symbol mangling scheme, so those > > entires won't be tracked by it. In other words, there is no need to change > > anything there, I think. 
> > Not updating Itanium remapper should work for PGO counter matching until the > next transition (details below); for consistency I just updated Itanium > remapper's `extractName` to use semicolon. > > The details > > * For PGO counter matching, instr prof reader > [asks](https://github.com/llvm/llvm-project/blob/32ec5fbfed32f37aa070ee38e9b038bd84ca6479/llvm/lib/ProfileData/InstrProfReader.cpp#L1339) > read remapper for a record. > * Only with a remapping file provided (specified by > `-fprofile-remapping-file`), a itanium remap reader is constructed. And when > remapping file is not specified, a no-op remap reader is constructed. [source > code](https://github.com/llvm/llvm-project/blob/32ec5fbfed32f37aa070ee38e9b038bd84ca6479/llvm/lib/ProfileData/InstrProfReader.cpp#L1306-L1314) > * When remapping file is specified, remap reader tries to extract the mangled > name (removing `filename` prefix`) by finding a `:`(no longer used as > delimiter for newer profiles) and remaps the mangled name. If`:`is not > updated to`;`, the name is remapped to itself (irpgo func format) and > profiles could still be found. However, not updating means remapping becomes > no-op for local-linkage functions, which is fine after the transition > complete but doesn't work for new transitions (if it happens..) Sounds good. I checked the history of remapping for instrProfile. For IR PGO, there is basically no need to do so as the instrumentation and profile-use should be in-sync. For front-end instrumentation, there seem to be some use cases to use out of sync profile: https://reviews.llvm.org/D51240. The remapping is also best effort -- i.e. not guarantee to keep all the old profile e.g. icall targets are not supported. https://github.com/llvm/llvm-project/pull/74008 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [compiler-rt] [clang] [PGO][GlobalValue][LTO]In GlobalValues::getGlobalIdentifier, use semicolon as delimiter for local-linkage varibles. (PR #74008)
david-xl wrote: > > > . For IR PGO, there is basically no need to do so as the instrumentation > > > and profile-use should be in-sync. For front-end instrumentation, there > > > seem to be some use cases to use out of sync profile: > > > https://reviews.llvm.org/D51240. > > > > > > Thanks for double checking. I noticed the ICP and stale profile tolerance > > discussions when read the Phab history; it's good Phab review history are > > still available nowadays. > > IRPGO profiles could be used along with supplementary sample-pgo profiles. > > I'll probably read relevant code in llvm-profdata to understand how these > > interact in theory mostly for my own curiosity (hopefully no rough edges as > > long as `llvm-profdata` uses the same pgo name format used by latest > > compiler) > > For irpgo with supplementary profiles, this line to build a map ( > > https://github.com/llvm/llvm-project/blob/44dc1e0baae7c4b8a02ba06dcf396d3d452aa873/llvm/tools/llvm-profdata/llvm-profdata.cpp#L982 > > ) needs an update. Will do it together with the test > [update](https://github.com/llvm/llvm-project/pull/74008#discussion_r1421018997) > in this pull request. A follow up patch to fix the tool is fine too (or may be better). https://github.com/llvm/llvm-project/pull/74008 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [lldb] [llvm] [clang-tools-extra] [compiler-rt] [mlir] [clang] [Profile] Add binary profile correlation for code coverage. (PR #69493)
david-xl wrote: > @david-xl , Zequan posted an > [RFC](https://discourse.llvm.org/t/rfc-add-binary-profile-correlation-to-not-load-profile-metadata-sections-into-memory-at-runtime/74565/8) > for this. Is there a PGO tag, or something we can use to increase visibility > for PGO reviewers? I think most of the complexity for this new mode is in the > PGO code, so I think the best person to review it would be someone with an > interest in PGO. > > I'll tag some folks, but let me know if there is a better way: @kparzysz > @snehasish I am aware of this patch and it mostly looks good to me. Since there is a long list of reviewers attached to this PR, it is better to give them more time to chime in. I am quite busy in the early part of this week, but will try to give it another round by the end of the week if no one else beats me to it. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [coro][pgo] Do not insert counters in the `suspend` block (PR #71262)
https://github.com/david-xl approved this pull request. Please also update the comment about tail call and symmetric transfer. https://github.com/llvm/llvm-project/pull/71262 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] Bfi precision (PR #66285)
david-xl wrote: > And digging even deeper: > > * FWIW I noticed that I only used `clang -c` as benchmark previously, should > have used `clang -c -O3` resulting in this: > > ``` > Old-BFI: insn: 37,821,687,947 (baseline) > New-BFI: insn: 38,133,312,923 +0.82% > Old-BFI, no-cold: insn: 37,423,365,184 -1.05% > New-BFI, no-cold: insn: 37,438,736,713 -1.01% > ``` > > * The problematic part of the code that is inlined/not-inlined is actually > marked as `LLVM_UNLIKELY` here: > https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/ADT/DenseMap.h#L607 > and intuitively one would think that it is probably best to not inline > `grow` (as will happen with the new BFI version) > * In practice not-inlining `grow` mostly ends up being worse because of > knock-on effects as far as I can tell. These are the inlining decisions I > noticed for the most prominent situation: > > ``` > - `grow` inlined (Old-BFI): > - Instruction::getMetadataImpl > -> Value::getMetadata not inlined > -> DenseMap::operator[] inlined > -> DenseMap::FindAndConstruct inlined > -> DenseMap::InsertIntoBucket not inlined, size >likely too big with `grow` inlined here > -> DenseMap::grow inlined > - `grow` inlined (New-BFI): > - Instruction::getMadataImpl > -> Value::getMetadata inlined > -> DenseMap::operator[]inlined > -> DenseMap::FindAndConstruct not inlined, size > -> DenseMap::InsertIntoBucket inlined > -> DenseMap::grow not inlined > ``` > > Though if you look into this then I would state that the code got better for > the wrong reasons! Not inlining `grow` is a sensible decision in this context > and the `LLVM_UNLIKELY` annotation makes sense (I actually added some > counters and see the unlikely branch taken somewhere between 1% and 10% of > the cases depending on inputs, so seems sensible). > > Unfortunately the particular code here `getMetadataImpl` never inserts new > things into the map, but unfortunately `operator[]` gives you that behavior > by default so nobody noticed. 
So not inlining `InsertIntoBucket` happened to > be a great decision previously that the compiler did by accident without > having good data to support this. Now with better but still insufficient data > (as this is PGO) we happen to end up inlining `InsertIntoBucket` wasting code > size leading to worse inlining decisions further up the stack... > > Long story short: This ended up another of the many stories where the > compiler cannot make the right inlining decisions without having actual > runtime data. It tends to make a better decision based on the data that ends > up being wrong anyway here. I'm gonna leave things as is and rather put up > some improvements to LLVM code instead! Good analysis. Your plan sounds good but lowering the cold threshold is probably a good thing to do with the new BFI. The FindAndConstruct method in DenseMap can probably be annotated with builtin_expect to indicate the insertion is an unlikely path. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Bfi precision (PR #66285)
david-xl wrote: > And digging even deeper: > > * FWIW I noticed that I only used `clang -c` as benchmark previously, should > have used `clang -c -O3` resulting in this: > > ``` > Old-BFI: insn: 37,821,687,947 (baseline) > New-BFI: insn: 38,133,312,923 +0.82% > Old-BFI, no-cold: insn: 37,423,365,184 -1.05% > New-BFI, no-cold: insn: 37,438,736,713 -1.01% > ``` > > * The problematic part of the code that is inlined/not-inlined is actually > marked as `LLVM_UNLIKELY` here: > https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/ADT/DenseMap.h#L607 > and intuitively one would think that it is probably best to not inline > `grow` (as will happen with the new BFI version) > * In practice not-inlining `grow` mostly ends up being worse because of > knock-on effects as far as I can tell. These are the inlining decisions I > noticed for the most prominent situation: > > ``` > - `grow` inlined (Old-BFI): > - Instruction::getMetadataImpl > -> Value::getMetadata not inlined > -> DenseMap::operator[] inlined > -> DenseMap::FindAndConstruct inlined > -> DenseMap::InsertIntoBucket not inlined, size >likely too big with `grow` inlined here > -> DenseMap::grow inlined > - `grow` inlined (New-BFI): > - Instruction::getMadataImpl > -> Value::getMetadata inlined > -> DenseMap::operator[]inlined > -> DenseMap::FindAndConstruct not inlined, size > -> DenseMap::InsertIntoBucket inlined > -> DenseMap::grow not inlined > ``` > > Though if you look into this then I would state that the code got better for > the wrong reasons! Not inlining `grow` is a sensible decision in this context > and the `LLVM_UNLIKELY` annotation makes sense (I actually added some > counters and see the unlikely branch taken somewhere between 1% and 10% of > the cases depending on inputs, so seems sensible). > > Unfortunately the particular code here `getMetadataImpl` never inserts new > things into the map, but unfortunately `operator[]` gives you that behavior > by default so nobody noticed. 
So not inlining `InsertIntoBucket` happened to > be a great decision previously that the compiler did by accident without > having good data to support this. Now with better but still insufficient data > (as this is PGO) we happen to end up inlining `InsertIntoBucket` wasting code > size leading to worse inlining decisions further up the stack... > > Long story short: This ended up another of the many stories where the > compiler cannot make the right inlining decisions without having actual > runtime data. It tends to make a better decision based on the data that ends > up being wrong anyway here. I'm gonna leave things as is and rather put up > some improvements to LLVM code instead! Good analysis. Your plan sounds good but lowering the cold threshold is probably a good thing to do with the new BFI. The FindAndConstruct method in DenseMap can probably be annotated with builtin_expect to indicate the insertion is an unlikely path. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [llvm-profdata] Emit warning when counter value is greater than 2^56. (PR #69513)
https://github.com/david-xl approved this pull request. https://github.com/llvm/llvm-project/pull/69513 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [llvm-profdata] Emit warning when counter value is greater than 2^56. (PR #69513)
https://github.com/david-xl edited https://github.com/llvm/llvm-project/pull/69513 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [llvm-profdata] Emit warning when counter value is greater than 2^56. (PR #69513)
@@ -314,7 +314,22 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, } auto FS = vfs::getRealFileSystem(); - auto ReaderOrErr = InstrProfReader::create(Input.Filename, *FS, Correlator); + // TODO: This only saves the first non-fatal error from InstrProfReader, and + // then added to WriterContext::Errors. However, this is not extensiable, if david-xl wrote: extensiable --> extensible https://github.com/llvm/llvm-project/pull/69513 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[compiler-rt] [llvm] [clang] [clang-tools-extra] Bfi precision (PR #66285)
david-xl wrote: @MatzeB, our internal release testing has seen lots of very large regressions for tests without PGO or with XFDO only. While I agree this patch is the right way to go and the good performance (without PGO) with the old BFI is somewhat by chance, it is still better not to regress them. Can you help with a follow-up patch to tune down the relative coldness threshold? https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [compiler-rt] [clang-tools-extra] [llvm] Bfi precision (PR #66285)
david-xl wrote: I asked for those data points and will share them when ready. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lldb] [llvm] [clang-tools-extra] [compiler-rt] [mlir] [clang] [flang] [Profile] Add binary profile correlation for code coverage. (PR #69493)
@@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -mllvm -profile-correlate=binary -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -o - %s | FileCheck %s --check-prefix=BIN-CORRELATE + +// CHECK: @__llvm_profile_raw_version = {{.*}} i64 9 david-xl wrote: why is it 9? https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [flang] [mlir] [compiler-rt] [lldb] [clang-tools-extra] [Profile] Add binary profile correlation for code coverage. (PR #69493)
https://github.com/david-xl approved this pull request. LGTM. Thanks for the new feature. (Wait a few days in case other reviewers have additional comments). https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [clang] [lldb] [mlir] [compiler-rt] [clang-tools-extra] [llvm] [Profile] Add binary profile correlation for code coverage. (PR #69493)
@@ -1829,6 +1833,22 @@ void CoverageMappingModuleGen::emit() { llvm::GlobalValue::InternalLinkage, NamesArrVal, llvm::getCoverageUnusedNamesVarName()); } + const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); + llvm::Type *IntTy64 = llvm::Type::getInt64Ty(Ctx); + uint64_t ProfileVersion = INSTR_PROF_RAW_VERSION; + if (llvm::ProfileCorrelate == llvm::InstrProfCorrelator::BINARY) +ProfileVersion |= VARIANT_MASK_BIN_CORRELATE; david-xl wrote: This might overwrite other modifier bits set elsewhere. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [llvm] [compiler-rt] [clang] [lldb] [mlir] [clang-tools-extra] [Profile] Add binary profile correlation for code coverage. (PR #69493)
https://github.com/david-xl edited https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[mlir] [flang] [clang-tools-extra] [lldb] [compiler-rt] [clang] [llvm] [Profile] Add binary profile correlation for code coverage. (PR #69493)
@@ -1829,6 +1833,22 @@ void CoverageMappingModuleGen::emit() { llvm::GlobalValue::InternalLinkage, NamesArrVal, llvm::getCoverageUnusedNamesVarName()); } + const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); david-xl wrote: Add some comments for this segment. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [lldb] [compiler-rt] [libunwind] [mlir] [libcxx] [clang] [libc] [flang] [clang-tools-extra] [lld] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table a
https://github.com/david-xl approved this pull request. After this patch, follow up with a patch documenting the raw and indexed formats. This has long been requested by many in the community. https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +;RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i64 @sub8(i64 noundef %res, ptr %byte) { david-xl wrote: done https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +;RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i64 @sub8(i64 noundef %res, ptr %byte) { +; X64-LABEL: sub8: +; X64: # %bb.0: # %entry +; X64-NEXT:movq %rdi, %rax +; X64-NEXT:movb (%rsi), %al +; X64-NEXT:retq +entry: + %and = and i64 %res, -256 + %d = load i8, ptr %byte, align 1 + %conv2 = zext i8 %d to i64 + %or = or i64 %and, %conv2 + ret i64 %or +} + + +define i64 @sub16(i64 noundef %res, ptr %byte) { david-xl wrote: fixed. https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -561,6 +561,16 @@ def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), [(set GR64:$dst, (load addr:$src))]>; } +def : Pat<(or (and GR64:$dst, -256), david-xl wrote: 32bit move has implicit zero extension, so won't be applicable. Moved the change to X86InstrCompiler.td https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -561,6 +561,16 @@ def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), [(set GR64:$dst, (load addr:$src))]>; } +def : Pat<(or (and GR64:$dst, -256), david-xl wrote: Sorry I misunderstood. Fixed now. Also updated the test. Note that the sub16_32 case does not yet produce the optimized code for i386 because the pattern changes (due to arg passing). https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +;RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 david-xl wrote: Done. x64 is used elsewhere too. Anyway change it to x86_64 and I386 for clarity. https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +;RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i64 @sub8(i64 noundef %res, ptr %byte) { +; X64-LABEL: sub8: +; X64: # %bb.0: # %entry +; X64-NEXT:movq %rdi, %rax +; X64-NEXT:movb (%rsi), %al +; X64-NEXT:retq +entry: + %and = and i64 %res, -256 + %d = load i8, ptr %byte, align 1 + %conv2 = zext i8 %d to i64 + %or = or i64 %and, %conv2 + ret i64 %or +} + +define i64 @sub16(i64 noundef %res, ptr %byte) { +; X64-LABEL: sub16: +; X64: # %bb.0: # %entry +; X64-NEXT:movq %rdi, %rax +; X64-NEXT:movw (%rsi), %ax +; X64-NEXT:retq +entry: + %and = and i64 %res, -65536 + %d = load i16, ptr %byte, align 1 + %conv2 = zext i16 %d to i64 + %or = or i64 %and, %conv2 + ret i64 %or +} + + + + david-xl wrote: done https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -1515,6 +1515,15 @@ def : Pat<(X86add_flag_nocf GR32:$src1, 128), def : Pat<(X86add_flag_nocf GR64:$src1, 128), (SUB64ri32 GR64:$src1, -128)>; +// Depositing value to 8/16 bit subreg: +def : Pat<(or (and GR64:$dst, -256), + (i64 (zextloadi8 addr:$src))), + (INSERT_SUBREG (i64 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>; david-xl wrote: done https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lld] [clang] [libcxx] [llvm] [libc] [compiler-rt] [flang] Fix ISel crash when lowering BUILD_VECTOR (PR #73186)
@@ -7254,6 +7255,10 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); +// 512 bit vpbroadcastw is only available with AVX512BW +if (ScalarSize == 16 && IsGT256 && !Subtarget.hasBWI()) + return SDValue(); david-xl wrote: Sounds good. Will add support there. https://github.com/llvm/llvm-project/pull/73186 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [libcxx] [lld] [clang] [llvm] [compiler-rt] [libc] Fix ISel crash when lowering BUILD_VECTOR (PR #73186)
@@ -7254,6 +7255,10 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); +// 512 bit vpbroadcastw is only available with AVX512BW +if (ScalarSize == 16 && IsGT256 && !Subtarget.hasBWI()) + return SDValue(); david-xl wrote: Done with some refactoring. PTAL. https://github.com/llvm/llvm-project/pull/73186 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libc] [libcxx] [lld] [clang] [llvm] [compiler-rt] [flang] Fix ISel crash when lowering BUILD_VECTOR (PR #73186)
@@ -986,15 +1003,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case X86ISD::VBROADCAST: { MVT VT = N->getSimpleValueType(0); // Emulate v32i16/v64i8 broadcast without BWI. - if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { -MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; + if (!Subtarget->hasBWI() && needBWI(VT)) { +MVT NarrowVT = getNarrowType(VT); SDLoc dl(N); SDValue NarrowBCast = CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); SDValue Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); -unsigned Index = VT == MVT::v32i16 ? 16 : 32; +unsigned Index = getInsertIndex(VT); david-xl wrote: Comments addressed. PTAL https://github.com/llvm/llvm-project/pull/73186 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [libc] [llvm] [libcxx] [lld] [clang] [compiler-rt] Fix ISel crash when lowering BUILD_VECTOR (PR #73186)
david-xl wrote: Gentle ping. https://github.com/llvm/llvm-project/pull/73186 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl created https://github.com/llvm/llvm-project/pull/73845 Update the user manual to provide guidance on the usage for different flavors of instrumentations. >From b2c9081a0c3d5a982c2a23857bf986ec80c83cb5 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 27 Nov 2023 13:49:25 -0800 Subject: [PATCH 1/2] Fix stale comment --- llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 601903c29f799a2..73a7116f74e1180 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -6,7 +6,7 @@ // //===--===// // -// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// This pass lowers instrprof_* intrinsics emitted by an instrumentor. // It also builds the data structures and initialization code needed for // updating execution counts and emitting the profile at runtime. // >From e13f42f2e0fb666dac13c28579d2edba9ae8b8e9 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH 2/2] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 37 ++--- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..1d2165157b8be8a 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2607,11 +2607,24 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +There are two types of instrumentation available in Clang: frontend based and +IR based. 
The frontend based instrumentation can be turned on with option +``-fprofile-instr-generate`` and the IR based instrumentation can be turned +on with ``-fprofile-generate`` option. For best performance with PGO, the IR +based instrumentation should be used. It has the benefits of lower instrumentation +overhead, smaller raw profile size, and better runtime performance. Frontend +based instrumentation, on the other hand, has better source correlation so should +be used with source line based coverage testing. + +Flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. It does a post-inline late +instrumentation and can produce context sensitive profile. + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2687,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2698,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. -They are only meant to implement GCC's semantics with respect to -profile creation and use. 
Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,6 +4411,9 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon + -fprofile-generate[=] + Generate instrumented code to collect execution counts into a raw profile f
[clang] [llvm] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From b2c9081a0c3d5a982c2a23857bf986ec80c83cb5 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 27 Nov 2023 13:49:25 -0800 Subject: [PATCH 1/2] Fix stale comment --- llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 601903c29f799a2..73a7116f74e1180 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -6,7 +6,7 @@ // //===--===// // -// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// This pass lowers instrprof_* intrinsics emitted by an instrumentor. // It also builds the data structures and initialization code needed for // updating execution counts and emitting the profile at runtime. // >From dfd0ae0197b8e7425af1fd02636dda876b54d5bc Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH 2/2] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 45 ++ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..51b9cc246ea8b55 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. 
Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2607,11 +2608,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. +Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2689,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. 
code-block:: console @@ -2685,13 +2700,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. -They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Bo
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From 627d664f0281d6db778f5d624710a7066e6c362f Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 45 ++ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..51b9cc246ea8b55 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2607,11 +2608,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. +Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. 
Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2689,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2700,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. -They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. 
+Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,6 +4413,9 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon + -fprofile-generate[=] + Generate instrumented code to collect execution counts into a raw profile file in + (overridden by LLVM_PROFILE_FILE env var) -fprofile-instr-generate= Generate instrumented code to collect execution counts into (overridden by LLVM_PROFILE_FILE env var) @@ -44
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From ce4c93c2b250e2f4b6703f6321397dd774333f35 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 50 ++ 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..5d49ec70540a7e4 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2598,6 +2599,8 @@ Of those, 31,977 were spent inside the body of ``bar``. The last line of the profile (``2: 0``) corresponds to line 2 inside ``main``. No samples were collected there. +.. _prof_instr: + Profiling with Instrumentation ^^ @@ -2607,11 +2610,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. 
+Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2691,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2702,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. 
-They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,13 +4415,19 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= + -fprofile-generate[=] + Generate instrumented code to collect execution
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
@@ -4401,13 +4413,18 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon + -fprofile-generate[=] + Generate instrumented code to collect execution counts into a raw profile file in + (overridden by LLVM_PROFILE_FILE env var) -fprofile-instr-generate= david-xl wrote: Done https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
@@ -4401,13 +4413,18 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon + -fprofile-generate[=] + Generate instrumented code to collect execution counts into a raw profile file in + (overridden by LLVM_PROFILE_FILE env var) -fprofile-instr-generate= Generate instrumented code to collect execution counts into (overridden by LLVM_PROFILE_FILE env var) -fprofile-instr-generate Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var) -fprofile-instr-use= + Use instrumentation data for coverage testing or profile-guided optimization + -fprofile--use= david-xl wrote: fixed. https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
@@ -4401,13 +4413,18 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon + -fprofile-generate[=] david-xl wrote: Updated description. It is actually a dirname, not pattern. https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From 7f64cc07b2883b9fde672510a3b7c1b71cfb4dfd Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 49 ++ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..aac2b9691284921 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2598,6 +2599,8 @@ Of those, 31,977 were spent inside the body of ``bar``. The last line of the profile (``2: 0``) corresponds to line 2 inside ``main``. No samples were collected there. +.. _prof_instr: + Profiling with Instrumentation ^^ @@ -2607,11 +2610,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. 
+Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2691,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2702,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. 
-They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,13 +4415,18 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= + -fprofile-generate[=] + Generate instrumented code to collect execution
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From 15dd2029a68a6fdbbc4eee5764ce966e0533880f Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 51 ++ 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..af10a73285ce6cf 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2598,6 +2599,8 @@ Of those, 31,977 were spent inside the body of ``bar``. The last line of the profile (``2: 0``) corresponds to line 2 inside ``main``. No samples were collected there. +.. _prof_instr: + Profiling with Instrumentation ^^ @@ -2607,11 +2610,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. 
+Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2691,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2702,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. 
-They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,13 +4415,18 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= - Generate instrumented code to collect execution counts into + -fprofi
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From 4cf62b1b780edef9902b5ec50b56d676810c3922 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 54 +++--- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..6e9a7cd4cc17064 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2598,6 +2599,8 @@ Of those, 31,977 were spent inside the body of ``bar``. The last line of the profile (``2: 0``) corresponds to line 2 inside ``main``. No samples were collected there. +.. _prof_instr: + Profiling with Instrumentation ^^ @@ -2607,11 +2610,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. 
+Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2691,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2702,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. 
-They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,13 +4415,21 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= - Generate instrumented code to collect execution counts into + -fprofi
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From 29b5b28f52c88ebd862163c4feb1573460c5c79e Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 54 +++--- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e310c..17e52a715e79e86 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2598,6 +2599,8 @@ Of those, 31,977 were spent inside the body of ``bar``. The last line of the profile (``2: 0``) corresponds to line 2 inside ``main``. No samples were collected there. +.. _prof_instr: + Profiling with Instrumentation ^^ @@ -2607,11 +2610,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. 
+Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2691,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2702,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. 
-They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,13 +4415,21 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= - Generate instrumented code to collect execution counts into + -fprofi
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
@@ -4401,13 +4415,21 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= - Generate instrumented code to collect execution counts into + -fprofile-generate= + Generate instrumented code to collect execution counts into a raw profile file in the directory specified by the argument. The filename uses the %m format. See :ref:`Profiling With Instrumentation <prof_instr>` section for details. + (overridden by LLVM_PROFILE_FILE env var) + -fprofile-generate + Generate instrumented code to collect execution counts into default_%m.profraw file + (overridden by '=' form of option or LLVM_PROFILE_FILE env var) + -fprofile-instr-generate= david-xl wrote: done (-fprofile-instr-generate was there before. I added -fprofile-generate). https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
@@ -4401,13 +4415,21 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= - Generate instrumented code to collect execution counts into + -fprofile-generate= + Generate instrumented code to collect execution counts into a raw profile file in the directory specified by the argument. The filename uses the %m format. See :ref:`Profiling With Instrumentation <prof_instr>` section for details. david-xl wrote: Fixed. https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
@@ -4401,13 +4413,18 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon + -fprofile-generate[=] david-xl wrote: -fprofile-generate= in GCC specifies a path which is a directory name (the reason is that GCC's profile data is dumped per module, so there is no single profile name). When we introduced this option in Clang/LLVM, we just matched that behavior. https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73845 >From 4c0f907dc778e8cfd0e41008b8b2970a016201b0 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 29 Nov 2023 11:56:31 -0800 Subject: [PATCH] Fix PGO documentation in user manual --- clang/docs/UsersManual.rst | 54 +++--- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2e658557b0e31..9d64195ee338e 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2348,9 +2348,10 @@ differences between the two: 1. Profile data generated with one cannot be used by the other, and there is no conversion tool that can convert one to the other. So, a profile generated - via ``-fprofile-instr-generate`` must be used with ``-fprofile-instr-use``. - Similarly, sampling profiles generated by external profilers must be - converted and used with ``-fprofile-sample-use``. + via ``-fprofile-generate`` or ``-fprofile-instr-generate`` must be used with + ``-fprofile-use`` or ``-fprofile-instr-use``. Similarly, sampling profiles + generated by external profilers must be converted and used with ``-fprofile-sample-use`` + or ``-fauto-profile``. 2. Instrumentation profile data can be used for code coverage analysis and optimization. @@ -2598,6 +2599,8 @@ Of those, 31,977 were spent inside the body of ``bar``. The last line of the profile (``2: 0``) corresponds to line 2 inside ``main``. No samples were collected there. +.. _prof_instr: + Profiling with Instrumentation ^^ @@ -2607,11 +2610,25 @@ overhead during the profiling, but it provides more detailed results than a sampling profiler. It also provides reproducible results, at least to the extent that the code behaves consistently across runs. +Clang supports two types of instrumentation: frontend-based and IR-based. 
+Frontend-based instrumentation can be enabled with the option ``-fprofile-instr-generate``, +and IR-based instrumentation can be enabled with the option ``-fprofile-generate``. +For best performance with PGO, IR-based instrumentation should be used. It has +the benefits of lower instrumentation overhead, smaller raw profile size, and +better runtime performance. Frontend-based instrumentation, on the other hand, +has better source correlation, so it should be used with source line-based +coverage testing. + +The flag ``-fcs-profile-generate`` also instruments programs using the same +instrumentation method as ``-fprofile-generate``. However, it performs a +post-inline late instrumentation and can produce context-sensitive profiles. + + Here are the steps for using profile guided optimization with instrumentation: 1. Build an instrumented version of the code by compiling and linking with the - ``-fprofile-instr-generate`` option. + ``-fprofile-generate`` or ``-fprofile-instr-generate`` option. .. code-block:: console @@ -2674,8 +2691,8 @@ instrumentation: Note that this step is necessary even when there is only one "raw" profile, since the merge operation also changes the file format. -4. Build the code again using the ``-fprofile-instr-use`` option to specify the - collected profile data. +4. Build the code again using the ``-fprofile-use`` or ``-fprofile-instr-use`` + option to specify the collected profile data. .. code-block:: console @@ -2685,13 +2702,10 @@ instrumentation: profile. As you make changes to your code, clang may no longer be able to use the profile data. It will warn you when this happens. -Profile generation using an alternative instrumentation method can be -controlled by the GCC-compatible flags ``-fprofile-generate`` and -``-fprofile-use``. Although these flags are semantically equivalent to -their GCC counterparts, they *do not* handle GCC-compatible profiles. 
-They are only meant to implement GCC's semantics with respect to -profile creation and use. Flag ``-fcs-profile-generate`` also instruments -programs using the same instrumentation method as ``-fprofile-generate``. +Note that ``-fprofile-use`` option is semantically equivalent to +its GCC counterpart, it *does not* handle profile formats produced by GCC. +Both ``-fprofile-use`` and ``-fprofile-instr-use`` accept profiles in the +indexed format, regardless of whether it is produced by frontend or the IR pass. .. option:: -fprofile-generate[=] @@ -4401,13 +4415,21 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names don't match all the regexes separated by a semi-colon -fprofile-filter-files= Instrument only functions from files where names match any regex separated by a semi-colon - -fprofile-instr-generate= - Generate instrumented code to collect execution counts into + -fprofile-g
[clang] Fix documentation on PGO/coverage related options. (PR #73845)
https://github.com/david-xl closed https://github.com/llvm/llvm-project/pull/73845 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [ISel] Add pattern matching for depositing subreg value (PR #75978)
@@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s --check-prefixes=I386 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X86_64 david-xl wrote: Fixed. https://github.com/llvm/llvm-project/pull/75978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [compiler-rt] [llvm] [llvm-profdata] Use semicolon as the delimiter for supplementary profiles. (PR #75080)
https://github.com/david-xl approved this pull request. https://github.com/llvm/llvm-project/pull/75080 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Add option to generate additional debug info for expression dereferencing pointer to pointers. (PR #81545)
https://github.com/david-xl commented: Why is a new user facing option needed? I suppose this can be done under -fdebug-info-for-profiling. An internal option can also be added to disable it. https://github.com/llvm/llvm-project/pull/81545 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Add option to generate additional debug info for expression dereferencing pointer to pointers. (PR #81545)
david-xl wrote: > So the additional debug info for pointer type should be generated when > -fdebug-info-for-profiling is enabled? yes, it is extra debug info for profiling (can be used for samplePGO). https://github.com/llvm/llvm-project/pull/81545 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
david-xl wrote: > > Good example. This pass should be run post-inline. @aeubanks, any reason we > > want to run it early in the pipeline? > > We want the main function simplification pipeline to see these function > attributes because some optimizations trigger or don't trigger depending on > the presence of the attributes. Modifying function attributes is typically > done in CGSCC/module passes since doing so can affect what callers of those > functions see (in effect changing other functions), which shouldn't happen in > function passes. I suppose it's possible to add this as a CGSCC pass that > runs after inlining and before the function simplification pipeline, but this > is more of a one time thing and CGSCC passes can revisit functions. So this > pass makes the most sense as a module pass, but we can't insert a module pass > between inlining and the function simplification pipeline. > > Can/does the inliner ignore these size attributes when it has call-site > profile information? Looking at the current change, this new pass is actually after the sample loader (including sample loader inlining) pass, so wenlei@'s concern should be addressed. https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
https://github.com/david-xl approved this pull request. https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Add option to generate additional debug info for expression dereferencing pointer to pointers. (PR #81545)
https://github.com/david-xl commented: Please add test cases. The clang user manual also needs an update. https://github.com/llvm/llvm-project/pull/81545 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
@@ -0,0 +1,73 @@ +//===--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +static bool shouldRunOnFunction(Function &F, ProfileSummaryInfo &PSI, +FunctionAnalysisManager &FAM) { + if (F.hasFnAttribute(Attribute::Cold)) +return true; + if (!PSI.hasProfileSummary()) +return false; + BlockFrequencyInfo &BFI = FAM.getResult(F); + return PSI.isFunctionColdInCallGraph(&F, BFI); +} + +PreservedAnalyses PGOForceFunctionAttrsPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (ColdType == PGOOptions::ColdFuncOpt::Default) +return PreservedAnalyses::all(); + ProfileSummaryInfo &PSI = AM.getResult(M); + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + bool MadeChange = false; + for (Function &F : M) { +if (F.isDeclaration()) + continue; +if (!shouldRunOnFunction(F, PSI, FAM)) + continue; +// Add optsize/minsize/optnone if requested. +switch (ColdType) { +case PGOOptions::ColdFuncOpt::Default: + llvm_unreachable("bailed out for default above"); + break; +case PGOOptions::ColdFuncOpt::OptSize: + if (!F.hasFnAttribute(Attribute::OptimizeNone) && + !F.hasFnAttribute(Attribute::OptimizeForSize) && + !F.hasFnAttribute(Attribute::MinSize)) { +F.addFnAttr(Attribute::OptimizeForSize); +MadeChange = true; + } + break; +case PGOOptions::ColdFuncOpt::MinSize: + // Change optsize to minsize. + if (!F.hasFnAttribute(Attribute::OptimizeNone) && + !F.hasFnAttribute(Attribute::MinSize)) { +F.removeFnAttr(Attribute::OptimizeForSize); david-xl wrote: or have an option to control this behavior? 
https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
@@ -0,0 +1,73 @@ +//===--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +static bool shouldRunOnFunction(Function &F, ProfileSummaryInfo &PSI, +FunctionAnalysisManager &FAM) { + if (F.hasFnAttribute(Attribute::Cold)) +return true; + if (!PSI.hasProfileSummary()) +return false; + BlockFrequencyInfo &BFI = FAM.getResult(F); + return PSI.isFunctionColdInCallGraph(&F, BFI); +} + +PreservedAnalyses PGOForceFunctionAttrsPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (ColdType == PGOOptions::ColdFuncOpt::Default) +return PreservedAnalyses::all(); + ProfileSummaryInfo &PSI = AM.getResult(M); + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + bool MadeChange = false; + for (Function &F : M) { +if (F.isDeclaration()) + continue; +if (!shouldRunOnFunction(F, PSI, FAM)) + continue; +// Add optsize/minsize/optnone if requested. +switch (ColdType) { +case PGOOptions::ColdFuncOpt::Default: + llvm_unreachable("bailed out for default above"); + break; +case PGOOptions::ColdFuncOpt::OptSize: + if (!F.hasFnAttribute(Attribute::OptimizeNone) && + !F.hasFnAttribute(Attribute::OptimizeForSize) && + !F.hasFnAttribute(Attribute::MinSize)) { +F.addFnAttr(Attribute::OptimizeForSize); +MadeChange = true; + } + break; +case PGOOptions::ColdFuncOpt::MinSize: + // Change optsize to minsize. + if (!F.hasFnAttribute(Attribute::OptimizeNone) && + !F.hasFnAttribute(Attribute::MinSize)) { +F.removeFnAttr(Attribute::OptimizeForSize); david-xl wrote: should it overwrite the existing attribute? 
https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
david-xl wrote: > How does this relate to the existing `shouldOptimizeForSize(Function&, ...)` > and `shouldOptimizeForSize(MachineFunction&, ...)` APIs which appear to > provide similar functionality at a first glance. If they are the same, then > we should have a plan in place to cleanup and only have one system > afterwards, if there are important differences, then I wouldn't mind some > comments explaining them. This patch allows more user control on the cold function action types. Another difference is that the existing API can be invoked in postInline passes which can see more cold functions after inlining. To replace those APIs, the new pass will need to run again post-inlining. https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] New calling convention preserve_none (PR #76868)
https://github.com/david-xl edited https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] New calling convention preserve_none (PR #76868)
https://github.com/david-xl commented: Can you increase the test coverage? 1) preserve_none caller TAIL-calls preserve_none callee -- the tail call is preserved 2) regular function TAIL-calls preserve_none callee -- the tail call should be disabled and all CSRs should be saved/restored around the call. 3) preserve_none caller calls preserve_none callee -- no registers are saved/restored around the call 4) preserve_none caller calls preserve_none callee with a long argument list -- CSRs used for arg passing. https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] New calling convention preserve_none (PR #76868)
@@ -0,0 +1,85 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck --check-prefixes=ALL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck --check-prefixes=ALL,AVX %s + +; Don't need to preserve registers before using them. +define preserve_nonecc double @preserve_nonecc1() nounwind { +entry: +;ALL-LABEL: preserve_nonecc1 +;ALL-NOT: movaps %xmm1 +;ALL-NOT: movaps %xmm0 +;AVX-NOT: vmovups %ymm1 +;AVX-NOT: vmovups %ymm0 +;ALL-NOT: movaps {{.*}} %xmm0 +;ALL-NOT: movaps {{.*}} %xmm1 +;AVX-NOT: vmovups {{.*}} %ymm0 +;AVX-NOT: vmovups {{.*}} %ymm1 + call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + ret double 0. +} + +; Save/restore live registers across preserve_none function call. +declare preserve_nonecc double @bar_double(i64, i64) +define void @preserve_nonecc2() nounwind { +entry: +;ALL-LABEL: preserve_nonecc2 david-xl wrote: Those xmm registers will be preserved/restored around the call even without the attribute. Perhaps check the GSRs? https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] New calling convention preserve_none (PR #76868)
@@ -416,6 +416,13 @@ added in the future: This calling convention, like the `PreserveMost` calling convention, will be used by a future version of the ObjectiveC runtime and should be considered experimental at this time. +"``preserve_nonecc``" - The `PreserveNone` calling convention +This calling convention doesn't preserve any general registers. So all +general registers are caller saved registers. It also uses all general +registers to pass arguments. This attribute doesn't impact non-general +purpose registers (e.g. floating point registers, on X86 XMMs/YMMs). david-xl wrote: Document that this is X86-only for now? https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] New calling convention preserve_none (PR #76868)
@@ -0,0 +1,85 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck --check-prefixes=ALL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck --check-prefixes=ALL,AVX %s + +; Don't need to preserve registers before using them. +define preserve_nonecc double @preserve_nonecc1() nounwind { +entry: +;ALL-LABEL: preserve_nonecc1 +;ALL-NOT: movaps %xmm1 david-xl wrote: check no savings of GSRs too? https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] New calling convention preserve_none (PR #76868)
@@ -0,0 +1,131 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s + +; This test checks various function call behaviors between preserve_none and +; normal calling conventions. + +declare preserve_nonecc void @callee(ptr) + +; Normal caller calls preserve_none callee. Will not generated tail call because +; of incompatible calling convention. Callee saved registers are saved/restored +; around the call. +define void @caller1(ptr %a) { +; CHECK-LABEL: caller1: +; CHECK: # %bb.0: +; CHECK-NEXT:pushq %r15 +; CHECK-NEXT:.cfi_def_cfa_offset 16 +; CHECK-NEXT:pushq %r14 +; CHECK-NEXT:.cfi_def_cfa_offset 24 +; CHECK-NEXT:pushq %r13 +; CHECK-NEXT:.cfi_def_cfa_offset 32 +; CHECK-NEXT:pushq %r12 +; CHECK-NEXT:.cfi_def_cfa_offset 40 +; CHECK-NEXT:pushq %rbx +; CHECK-NEXT:.cfi_def_cfa_offset 48 +; CHECK-NEXT:.cfi_offset %rbx, -48 +; CHECK-NEXT:.cfi_offset %r12, -40 +; CHECK-NEXT:.cfi_offset %r13, -32 +; CHECK-NEXT:.cfi_offset %r14, -24 +; CHECK-NEXT:.cfi_offset %r15, -16 +; CHECK-NEXT:callq callee@PLT +; CHECK-NEXT:popq %rbx +; CHECK-NEXT:.cfi_def_cfa_offset 40 +; CHECK-NEXT:popq %r12 +; CHECK-NEXT:.cfi_def_cfa_offset 32 +; CHECK-NEXT:popq %r13 +; CHECK-NEXT:.cfi_def_cfa_offset 24 +; CHECK-NEXT:popq %r14 +; CHECK-NEXT:.cfi_def_cfa_offset 16 +; CHECK-NEXT:popq %r15 +; CHECK-NEXT:.cfi_def_cfa_offset 8 +; CHECK-NEXT:retq + tail call preserve_nonecc void @callee(ptr %a) + ret void +} + +; Preserve_none caller calls preserve_none callee. Same function body. +; The tail call is preserved. No registers are saved/restored around the call. +; Actually a simple jmp instruction is generated. +define preserve_nonecc void @caller2(ptr %a) { +; CHECK-LABEL: caller2: +; CHECK: # %bb.0: +; CHECK-NEXT:jmp callee@PLT # TAILCALL + tail call preserve_nonecc void @callee(ptr %a) + ret void +} + +; Preserve_none function can use more registers to pass parameters. 
+define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, i64 %a9, i64 %a10, i64 %a11, i64 %a12) { david-xl wrote: To test whether the callee/caller register (parameters) sequence matches, it is probably better to avoid using additions, which are subject to re-association. How about doing the following: g1 = a1; g2 = a2; ... where g1, g2, etc. are global variables? https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] New calling convention preserve_none (PR #76868)
@@ -0,0 +1,131 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s + +; This test checks various function call behaviors between preserve_none and +; normal calling conventions. + +declare preserve_nonecc void @callee(ptr) + +; Normal caller calls preserve_none callee. Will not generated tail call because +; of incompatible calling convention. Callee saved registers are saved/restored +; around the call. +define void @caller1(ptr %a) { +; CHECK-LABEL: caller1: +; CHECK: # %bb.0: +; CHECK-NEXT:pushq %r15 +; CHECK-NEXT:.cfi_def_cfa_offset 16 +; CHECK-NEXT:pushq %r14 +; CHECK-NEXT:.cfi_def_cfa_offset 24 +; CHECK-NEXT:pushq %r13 +; CHECK-NEXT:.cfi_def_cfa_offset 32 +; CHECK-NEXT:pushq %r12 +; CHECK-NEXT:.cfi_def_cfa_offset 40 +; CHECK-NEXT:pushq %rbx +; CHECK-NEXT:.cfi_def_cfa_offset 48 +; CHECK-NEXT:.cfi_offset %rbx, -48 +; CHECK-NEXT:.cfi_offset %r12, -40 +; CHECK-NEXT:.cfi_offset %r13, -32 +; CHECK-NEXT:.cfi_offset %r14, -24 +; CHECK-NEXT:.cfi_offset %r15, -16 +; CHECK-NEXT:callq callee@PLT +; CHECK-NEXT:popq %rbx +; CHECK-NEXT:.cfi_def_cfa_offset 40 +; CHECK-NEXT:popq %r12 +; CHECK-NEXT:.cfi_def_cfa_offset 32 +; CHECK-NEXT:popq %r13 +; CHECK-NEXT:.cfi_def_cfa_offset 24 +; CHECK-NEXT:popq %r14 +; CHECK-NEXT:.cfi_def_cfa_offset 16 +; CHECK-NEXT:popq %r15 +; CHECK-NEXT:.cfi_def_cfa_offset 8 +; CHECK-NEXT:retq + tail call preserve_nonecc void @callee(ptr %a) + ret void +} + +; Preserve_none caller calls preserve_none callee. Same function body. +; The tail call is preserved. No registers are saved/restored around the call. +; Actually a simple jmp instruction is generated. +define preserve_nonecc void @caller2(ptr %a) { +; CHECK-LABEL: caller2: +; CHECK: # %bb.0: +; CHECK-NEXT:jmp callee@PLT # TAILCALL + tail call preserve_nonecc void @callee(ptr %a) + ret void +} + +; Preserve_none function can use more registers to pass parameters. 
+define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, i64 %a9, i64 %a10, i64 %a11, i64 %a12) { david-xl wrote: or better yet, pass the arguments in order to (tail)call to another preserve_none callee. https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] New calling convention preserve_none (PR #76868)
https://github.com/david-xl approved this pull request. https://github.com/llvm/llvm-project/pull/76868 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
david-xl wrote: > > I don't understand, if you're saying the profile is accurate, then those > > functions are actually cold, so we should be able to mark them as optsize? > > Accurate is not black or white. The current heuristic requires certain level > of accuracy to be effective. If you make the heuristics more aggressive (like > what this patch is doing), you're raising the requirement of what can be > considered accurate, and profile not meeting that new requirement could see > regression with new heuristic. > > Whether a function is cold or not also depends on what is the calling context > and how inlining is done. All that makes function level annotation inherently > inaccurate when done before inlining. Not that we shouldn't try it, but it's > not as clear cut as it appears to be, and we need to be careful. It will be more conservative (pre-inlining), so won't cause additional optimization suppression compared with the current PGSO. https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
david-xl wrote: > > > > I don't understand, if you're saying the profile is accurate, then > > > > those functions are actually cold, so we should be able to mark them as > > > > optsize? > > > > > > > > > Accurate is not black or white. The current heuristic requires certain > > > level of accuracy to be effective. If you make the heuristics more > > > aggressive (like what this patch is doing), you're raising the > > > requirement of what can be considered accurate, and profile not meeting > > > that new requirement could see regression with new heuristic. > > > Whether a function is cold or not also depends on what is the calling > > > context and how inlining is done. All that makes function level > > > annotation inherently inaccurate when done before inlining. Not that we > > > shouldn't try it, but it's not as clear cut as it appears to be, and we > > > need to be careful. > > > > > > It will be more conservative (pre-inlining), so won't cause additional > > optimization suppression compared with the current PGSO. > > Sample PGO profile has inline context, so in the profile, we may have `foo` > as cold and `bar->foo` as hot, but if later inliner rejects `bar->foo` > inlining, `foo` can be hot. So marking `foo` as cold pre-inline can still be > inaccurate (and not conservative). > > In this PR, you have `PGOColdFuncAttr` default to `ColdFuncOpt::Default`. As > long as you keep it that way, this PR is fine for sample pgo (it's no-op > unless `pgo-cold-func-opt` is used explicitly). Good example. This pass should be run post-inline. @aeubanks, any reason we want to run it early in the pipeline? https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
david-xl wrote: Can you send an RFC for this enhancement? The first two paragraphs of this PR can be expanded a little more to show the motivation (RAM savings, or raw profile size savings, and why) and the savings data. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -98,13 +99,16 @@ extern cl::opt PrintPipelinePasses; static cl::opt ClSanitizeOnOptimizerEarlyEP( "sanitizer-early-opt-ep", cl::Optional, cl::desc("Insert sanitizers on OptimizerEarlyEP."), cl::init(false)); -} + +extern cl::opt ProfileCorrelate; david-xl wrote: Can be extracted out as refactoring (without adding the new correlation kind). https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -374,6 +376,14 @@ class RawInstrProfReader : public InstrProfReader { return (Version & VARIANT_MASK_DBG_CORRELATE) != 0; } + bool useBinaryCorrelate() const override { +return (Version & VARIANT_MASK_BIN_CORRELATE) != 0; + } + + bool useCorrelate() const { david-xl wrote: candidate in refactoring patch. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -66,8 +66,9 @@ int __llvm_profile_check_compatibility(const char *ProfileData, Header->NumCounters != __llvm_profile_get_num_counters(__llvm_profile_begin_counters(), __llvm_profile_end_counters()) || - Header->NamesSize != (uint64_t)(__llvm_profile_end_names() - - __llvm_profile_begin_names()) || + Header->NamesSize != david-xl wrote: candidate in refactor patch https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -267,6 +267,9 @@ uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, const __llvm_profile_data *End); +/*! \brief Get the size of the profile name section in bytes. */ +uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End); david-xl wrote: Extract this out into a refactoring patch. If this interface is not intended to be public, there is no need to use the __llvm namespace for it; use the internal interface naming convention instead. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -61,9 +64,20 @@ uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { + if ((__llvm_profile_get_version() & VARIANT_MASK_DBG_CORRELATE) || david-xl wrote: same here. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -259,19 +259,19 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const char *CountersBegin, const char *CountersEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, const char *NamesEnd, int SkipNameDataWrite) { - int DebugInfoCorrelate = - (__llvm_profile_get_version() & VARIANT_MASK_DBG_CORRELATE) != 0ULL; + int ProfileCorrelation = + (__llvm_profile_get_version() & VARIANT_MASK_DBG_CORRELATE) || + (__llvm_profile_get_version() & VARIANT_MASK_BIN_CORRELATE); /* Calculate size of sections. */ const uint64_t DataSectionSize = - DebugInfoCorrelate ? 0 : __llvm_profile_get_data_size(DataBegin, DataEnd); - const uint64_t NumData = - DebugInfoCorrelate ? 0 : __llvm_profile_get_num_data(DataBegin, DataEnd); + __llvm_profile_get_data_size(DataBegin, DataEnd); + const uint64_t NumData = __llvm_profile_get_num_data(DataBegin, DataEnd); david-xl wrote: same here https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -61,9 +64,20 @@ uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { + if ((__llvm_profile_get_version() & VARIANT_MASK_DBG_CORRELATE) || + (__llvm_profile_get_version() & VARIANT_MASK_BIN_CORRELATE)) +return 0; return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data); } +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End) { + if ((__llvm_profile_get_version() & VARIANT_MASK_DBG_CORRELATE) || david-xl wrote: here https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -540,10 +540,10 @@ Error RawInstrProfReader::readHeader( "\nPLEASE update this tool to version in the raw profile, or " "regenerate raw profile with expected version.") .str()); - if (useDebugInfoCorrelate() && !Correlator) -return error(instrprof_error::missing_debug_info_for_correlation); - if (!useDebugInfoCorrelate() && Correlator) -return error(instrprof_error::unexpected_debug_info_for_correlation); + if (useCorrelate() && !Correlator) david-xl wrote: candidate refactoring change. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
@@ -98,13 +99,16 @@ extern cl::opt PrintPipelinePasses; static cl::opt ClSanitizeOnOptimizerEarlyEP( "sanitizer-early-opt-ep", cl::Optional, cl::desc("Insert sanitizers on OptimizerEarlyEP."), cl::init(false)); -} + +extern cl::opt ProfileCorrelate; david-xl wrote: yes - just use the debug-info kind and non-kind. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Refactor profile correlation. (PR #69656)
https://github.com/david-xl approved this pull request. https://github.com/llvm/llvm-project/pull/69656 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [PGO] Add ability to mark cold functions as optsize/minsize/optnone (PR #69030)
@@ -0,0 +1,65 @@ +//===--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#include "llvm/Transforms/Instrumentation/MarkColdFunctions.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/IR/PassManager.h" + +using namespace llvm; + +PreservedAnalyses MarkColdFunctionsPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (ColdType == PGOOptions::ColdFuncAttr::None) +return PreservedAnalyses::all(); + ProfileSummaryInfo &PSI = AM.getResult(M); + if (!PSI.hasProfileSummary()) +return PreservedAnalyses::all(); + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + bool MadeChange = false; + for (Function &F : M) { +if (F.isDeclaration()) + continue; +BlockFrequencyInfo &BFI = FAM.getResult(F); +if (!PSI.isFunctionColdInCallGraph(&F, BFI)) david-xl wrote: The same optimization strategy should also be applied to function already marked with cold attribute (either by user or previous passes). https://github.com/llvm/llvm-project/pull/69030 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Add binary profile correlation to offload profile metadata at runtime. (PR #69493)
david-xl wrote: > > This should works with PGO when value profiling is disabled > > Is this not compatible with value profiling? or just not implemented yet? Not compatible, but this feature is mainly for coverage testing. https://github.com/llvm/llvm-project/pull/69493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
@@ -12,27 +12,96 @@ #ifndef LLVM_ANALYSIS_INDIRECTCALLVISITOR_H #define LLVM_ANALYSIS_INDIRECTCALLVISITOR_H +#include "llvm/ADT/SetVector.h" #include "llvm/IR/InstVisitor.h" #include namespace llvm { -// Visitor class that finds all indirect call. +// Visitor class that finds indirect calls or instructions that gives vtable +// value, depending on Type. struct PGOIndirectCallVisitor : public InstVisitor { + enum class InstructionType { +kIndirectCall = 0, +kVTableVal = 1, + }; std::vector IndirectCalls; - PGOIndirectCallVisitor() = default; + SetVector> VTableAddrs; + PGOIndirectCallVisitor(InstructionType Type) : Type(Type) {} void visitCallBase(CallBase &Call) { -if (Call.isIndirectCall()) +const CallInst *CI = dyn_cast(&Call); david-xl wrote: I suggest restructuring the code a little more for better readability: ``` if (Call.isIndirectCall()) IndirectCalls.push_back(&Call); if (Type != kVTableVal) return; // Handle VTable type below ``` https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
@@ -12,27 +12,96 @@ #ifndef LLVM_ANALYSIS_INDIRECTCALLVISITOR_H #define LLVM_ANALYSIS_INDIRECTCALLVISITOR_H +#include "llvm/ADT/SetVector.h" #include "llvm/IR/InstVisitor.h" #include namespace llvm { -// Visitor class that finds all indirect call. +// Visitor class that finds indirect calls or instructions that gives vtable +// value, depending on Type. struct PGOIndirectCallVisitor : public InstVisitor { + enum class InstructionType { +kIndirectCall = 0, +kVTableVal = 1, + }; std::vector IndirectCalls; - PGOIndirectCallVisitor() = default; + SetVector> VTableAddrs; + PGOIndirectCallVisitor(InstructionType Type) : Type(Type) {} void visitCallBase(CallBase &Call) { -if (Call.isIndirectCall()) +const CallInst *CI = dyn_cast(&Call); +if (Type == InstructionType::kVTableVal && CI && CI->getCalledFunction()) { david-xl wrote: Add a description for this code block and describe what code pattern it is looking for. https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Refactor profile correlation. (PR #69656)
@@ -24,15 +24,38 @@ using namespace llvm; -/// Get the __llvm_prf_cnts section. -Expected getCountersSection(const object::ObjectFile &Obj) { +namespace llvm { +// Deprecated. Use -profile-correlate=debug-info. +cl::opt DebugInfoCorrelate( +"debug-info-correlate", +cl::desc("Use debug info to correlate profiles (Deprecated). Use " + "-profile-correlate=debug-info instead."), +cl::init(false)); + +cl::opt ProfileCorrelate( +"profile-correlate", +cl::desc("Use debug info or binary file to correlate profiles."), +cl::init(InstrProfCorrelator::NONE), +cl::values(clEnumValN(InstrProfCorrelator::NONE, "", + "No profile correlation"), + clEnumValN(InstrProfCorrelator::DEBUG_INFO, "debug-info", + "Use debug info to correlate"))); david-xl wrote: Use extern decl and add some comments. https://github.com/llvm/llvm-project/pull/69656 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] Bfi precision (PR #66285)
david-xl wrote: This is concerning. Can this be reverted for now and we can help with some internal performance testing. @xur-llvm https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Profile] Refactor profile correlation. (PR #69656)
@@ -24,15 +24,38 @@ using namespace llvm; -/// Get the __llvm_prf_cnts section. -Expected getCountersSection(const object::ObjectFile &Obj) { +namespace llvm { +// Deprecated. Use -profile-correlate=debug-info. +cl::opt DebugInfoCorrelate( +"debug-info-correlate", +cl::desc("Use debug info to correlate profiles (Deprecated). Use " + "-profile-correlate=debug-info instead."), +cl::init(false)); + +cl::opt ProfileCorrelate( +"profile-correlate", +cl::desc("Use debug info or binary file to correlate profiles."), +cl::init(InstrProfCorrelator::NONE), +cl::values(clEnumValN(InstrProfCorrelator::NONE, "", + "No profile correlation"), + clEnumValN(InstrProfCorrelator::DEBUG_INFO, "debug-info", + "Use debug info to correlate"))); david-xl wrote: so keeping the option in InstrProfCorrelator.cpp works fine without linker error? That works for me. https://github.com/llvm/llvm-project/pull/69656 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] Bfi precision (PR #66285)
david-xl wrote: Yes -- the revert can wait until more data is available. I agree that it should help performance in theory. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Bfi precision (PR #66285)
david-xl wrote: Yes -- the revert can wait until more data is available. I agree that it should help performance in theory. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] Bfi precision (PR #66285)
david-xl wrote: The old behavior of not marking them as cold also works just by accident, though. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Bfi precision (PR #66285)
david-xl wrote: Agree that deciding coldness based on local entry frequency (2%) is a bad idea though. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] Bfi precision (PR #66285)
david-xl wrote: Agree that deciding coldness based on local entry frequency (2%) is a bad idea though. https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Bfi precision (PR #66285)
david-xl wrote: Contradicting my past self (one of the reviewers of the patch), I think the coldness check should be based on a global threshold, which exists when synthetic entry count propagation is enabled (without PGO). https://github.com/llvm/llvm-project/pull/66285 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
@@ -276,6 +286,12 @@ uint64_t __llvm_profile_get_num_counters(const char *Begin, const char *End); /*! \brief Get the size of the profile counters section in bytes. */ uint64_t __llvm_profile_get_counters_size(const char *Begin, const char *End); +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, david-xl wrote: add brief documentation. https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
@@ -0,0 +1,139 @@ +; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefix=GEN +; RUN: opt < %s -passes=pgo-instr-gen,instrprof -S | FileCheck %s --check-prefix=LOWER + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +$_ZTV7Derived = comdat any + +@_ZTV7Derived = constant { [3 x ptr], [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI7Derived, ptr @_ZN5Base15func1Eii], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr @_ZTI7Derived, ptr @_ZN5Base25func2Eii] }, comdat, align 8, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5, !type !6, !type !7, !type !8 +@_ZTVN10__cxxabiv121__vmi_class_type_infoE = external global [0 x ptr] +@_ZTS7Derived = constant [9 x i8] c"7Derived\00", align 1 +@_ZTI5Base1 = external constant ptr +@_ZTI5Base2 = external constant ptr +@_ZTI7Derived = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64 } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), ptr @_ZTS7Derived, i32 0, i32 2, ptr @_ZTI5Base1, i64 2, ptr @_ZTI5Base2, i64 2050 }, align 8 +@_ZTV5Base1 = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI5Base1, ptr @_ZN5Base15func1Eii] }, align 8, !type !0, !type !1 +@_ZTV5Base2 = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI5Base2, ptr @_ZN5Base25func2Eii] }, align 8, !type !9, !type !4 +@llvm.compiler.used = appending global [2 x ptr] [ptr @_ZTV5Base1, ptr @_ZTV5Base2], section "llvm.metadata" + +declare ptr @_Z10createTypei(i32) +declare i32 @_ZN5Base15func1Eii(ptr, i32, i32) +declare i32 @_ZN5Base25func2Eii(ptr, i32, i32) + +; GEN: @__llvm_profile_raw_version = hidden constant i64 72057594037927945, comdat +; GEN: @__profn_test_vtable_value_profiling = private constant [27 x i8] c"test_vtable_value_profiling" + +; LOWER: $__profvt__ZTV7Derived = comdat any +; LOWER: $__profvt__ZTV5Base1 = comdat nodeduplicate +; LOWER: $__profvt__ZTV5Base2 = comdat nodeduplicate +; LOWER: 
@__llvm_profile_raw_version = hidden constant i64 72057594037927945, comdat +; LOWER: @__profc_test_vtable_value_profiling = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 +; LOWER: @__profvp_test_vtable_value_profiling = private global [4 x i64] zeroinitializer, section "__llvm_prf_vals", comdat($__profc_test_vtable_value_profiling), align 8 +; LOWER: @__profd_test_vtable_value_profiling = private global { i64, i64, i64, ptr, ptr, i32, [3 x i16] } { i64 1593873508557585901, i64 567090795815895039, i64 sub (i64 ptrtoint (ptr @__profc_test_vtable_value_profiling to i64), i64 ptrtoint (ptr @__profd_test_vtable_value_profiling to i64)), ptr @test_vtable_value_profiling.local, ptr @__profvp_test_vtable_value_profiling, i32 1, [3 x i16] [i16 2, i16 0, i16 2] }, section "__llvm_prf_data", comdat($__profc_test_vtable_value_profiling), align 8 +; LOWER: @__profvt__ZTV7Derived = global { i64, ptr, i32 } { i64 -4576307468236080025, ptr @_ZTV7Derived, i32 48 }, section "__llvm_prf_vtab", comdat, align 8 +; LOWER: @__profvt__ZTV5Base1 = global { i64, ptr, i32 } { i64 3215870116411581797, ptr @_ZTV5Base1, i32 24 }, section "__llvm_prf_vtab", comdat, align 8 +; LOWER: @__profvt__ZTV5Base2 = global { i64, ptr, i32 } { i64 8378219803387680050, ptr @_ZTV5Base2, i32 24 }, section "__llvm_prf_vtab", comdat, align 8 +; LOWER: @__llvm_prf_vnodes = private global [10 x { i64, i64, ptr }] zeroinitializer, section "__llvm_prf_vnds", align 8 +; LOWER: @__llvm_prf_nm = private constant [37 x i8] c"\1B#x\DA+I-.\89/+IL\CAI\8D/K\CC)M\8D/(\CAO\CB\CC\C9\CCK\07\00\9Ea\0BC", section "__llvm_prf_names", align 1 +; LOWER: @__llvm_prf_vnm = private constant [34 x i8] c"\22 x\DA\8B\8F\0A\093wI-\CA,KMa\8C\07rL\9D\12\8BS\0D\11L#\00\C3\A2\0A\E9", section "__llvm_prf_vnames", align 1 +; LOWER: @llvm.used = appending global [6 x ptr] [ptr @__profvt__ZTV7Derived, ptr @__profvt__ZTV5Base1, ptr @__profvt__ZTV5Base2, ptr @__llvm_prf_vnodes, ptr @__llvm_prf_nm, ptr 
@__llvm_prf_vnm], section "llvm.metadata" + +define i32 @test_vtable_value_profiling(i32 %a, i32 %b, i32 %c) { +; GEN-LABEL: define i32 @test_vtable_value_profiling( +; GEN-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { +; GEN-NEXT: entry: +; GEN-NEXT:call void @llvm.instrprof.increment(ptr @__profn_test_vtable_value_profiling, i64 567090795815895039, i32 1, i32 0) +; GEN-NEXT:[[CALL:%.*]] = tail call ptr @_Z10createTypei(i32 [[C]]) +; GEN-NEXT:[[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 8 +; GEN-NEXT:[[VTABLE:%.*]] = load ptr, ptr [[ADD_PTR]], align 8 +; GEN-NEXT:[[TMP0:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +; GEN-NEXT:call void @llvm.instrprof.value.profile(ptr @__profn_test_vtable_value_profiling, i64 567090795815895039, i64 [[TMP0]], i32 2, i32 0) +; GEN-NEXT:[[VFUNC:%.*]] = load ptr, ptr [[VTABLE]], align 8 +; GEN-NEXT:[[TMP1:%.*]] = ptrtoint ptr [[VFUNC]] to i64 +; GEN-NEXT:
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
@@ -276,6 +286,12 @@ uint64_t __llvm_profile_get_num_counters(const char *Begin, const char *End); /*! \brief Get the size of the profile counters section in bytes. */ uint64_t __llvm_profile_get_counters_size(const char *Begin, const char *End); +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End); + +uint64_t __llvm_profile_get_vtable_size(const VTableProfData *Begin, +const VTableProfData *End); + /* ! \brief Given the sizes of the data and counter information, return the * number of padding bytes before and after the counters, and after the names, david-xl wrote: Update the comment https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
@@ -276,6 +286,12 @@ uint64_t __llvm_profile_get_num_counters(const char *Begin, const char *End); /*! \brief Get the size of the profile counters section in bytes. */ uint64_t __llvm_profile_get_counters_size(const char *Begin, const char *End); +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End); + +uint64_t __llvm_profile_get_vtable_size(const VTableProfData *Begin, david-xl wrote: add brief documentation https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [IRPGO][ValueProfile] Instrument virtual table address that could be used to do virtual table address comparision for indirect-call-promotion. (PR #66825)
david-xl wrote: > The work sounds interesting. Can you provide a bit more context about it? > Will it be used to improve ICP when it's sufficient to just compare the > vtable address instead of the vfunc address? Yes -- it can not only eliminate the vtable load, but also enable target check combining. What is more important is that it can be combined with more aggressive interprocedural type propagation that enables full (unconditional) devirtualization. Example: base->foo(); base->bar(); ==> if (base->vptr == Derived) { Derived::foo(base); // base type is known so virtual calls in foo, bar can further be devirtualized. Derived::bar(base); } else { ... } https://github.com/llvm/llvm-project/pull/66825 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm-profdata] Do not create numerical strings for MD5 function names read from a Sample Profile. (PR #66164)
@@ -537,6 +538,9 @@ class SampleContext { assert(!Name.empty() && "Name is empty"); } + SampleContext(ProfileFuncRef Name) david-xl wrote: Name ->Func https://github.com/llvm/llvm-project/pull/66164 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm-profdata] Do not create numerical strings for MD5 function names read from a Sample Profile. (PR #66164)
@@ -476,12 +471,12 @@ enum ContextAttributeMask { // Represents a context frame with function name and line location struct SampleContextFrame { - StringRef FuncName; + ProfileFuncRef FuncName; david-xl wrote: Nit: FuncName--> Func https://github.com/llvm/llvm-project/pull/66164 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm-profdata] Do not create numerical strings for MD5 function names read from a Sample Profile. (PR #66164)
@@ -1135,12 +1141,12 @@ class FunctionSamples { /// translate \p Name in current FunctionSamples into its original name /// by looking up in the function map GUIDToFuncNameMap. /// If the original name doesn't exist in the map, return empty StringRef. - StringRef getFuncName(StringRef Name) const { + StringRef getFuncName(ProfileFuncRef Name) const { david-xl wrote: Name -> Func https://github.com/llvm/llvm-project/pull/66164 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm-profdata] Do not create numerical strings for MD5 function names read from a Sample Profile. (PR #66164)
@@ -715,7 +717,7 @@ class SampleContext { private: /// Mangled name of the function. - StringRef Name; + ProfileFuncRef Name; david-xl wrote: Name --> Func https://github.com/llvm/llvm-project/pull/66164 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits