Amir updated this revision to Diff 484727.
Amir added a comment.
Convert perf profile using perf2bolt (aggregate-only mode)
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D139496/new/
https://reviews.llvm.org/D139496
Files:
clang/CMakeLists.txt
clang/cmake/caches/BOLT.cmake
llvm/docs/AdvancedBuilds.rst
Index: llvm/docs/AdvancedBuilds.rst
===================================================================
--- llvm/docs/AdvancedBuilds.rst
+++ llvm/docs/AdvancedBuilds.rst
@@ -241,6 +241,62 @@
$ ninja stage2-clang-bolt
+BOLT profile
+------------
+BOLT uses a profile collected either by Linux `perf` or by BOLT's own
+instrumentation. Both modes are supported by CMake automation, with
+instrumentation being the default (`-DCLANG_BOLT=INSTRUMENT`).
+
+It's strongly recommended to use `perf` if the host system supports it, as it
+is a significantly faster and potentially more reliable method:
+
+.. code-block:: console
+
+ $ cmake <...> -DCLANG_BOLT=perf \
+ -C <path to source>/clang/cmake/caches/BOLT.cmake
+
+If the host system supports profiling branch stacks (e.g. AMD or Intel LBR
+(Last Branch Record), Armv9-A BRBE (Branch Record Buffer Extension)), it can be
+enabled with `-DCLANG_BOLT=LBR` to further improve the profile quality:
+
+.. code-block:: console
+
+ $ cmake <...> -DCLANG_BOLT=LBR \
+ -C <path to source>/clang/cmake/caches/BOLT.cmake
+
+The following matrix describes supported profiling methods. Note that Linux/ELF
+is the only supported platform.
+
+============ ===============================
+Architecture `-DCLANG_BOLT` value
+------------ ------------ ------ -----------
+ `Instrument` `perf` `LBR`
+============ ============ ====== ===========
+x86_64 Yes Yes Yes
+AArch64 No Yes No HW exist
+============ ============ ====== ===========
+
+Profiling variables
+-------------------
+The BOLT profile is collected by building one of the in-tree projects/targets
+with Clang as a workload. The following configuration options can be used to
+change the profiling build and profiling mechanism:
+
+**CLANG_BOLT**
+ Profiling mechanism to be used. Supported values: `Instrument` (default),
+ `perf` (requires OS support), `LBR` (requires hardware support).
+
+**CLANG_BOLT_PROJECTS**
+ Projects to enable in the profiling build. Defaults to `llvm`.
+
+**CLANG_BOLT_TARGETS**
+ Targets to build in the profiling build. Defaults to `count` for the
+ instrumentation build and `FileCheck` for the perf build.
+
+**CLANG_BOLT_EXTRA_CMAKE_FLAGS**
+ Extra CMake flags to pass to the profiling build at configuration time.
+
+
3-Stage Non-Determinism
=======================
Index: clang/cmake/caches/BOLT.cmake
===================================================================
--- clang/cmake/caches/BOLT.cmake
+++ clang/cmake/caches/BOLT.cmake
@@ -1,15 +1,17 @@
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
-set(CLANG_BOLT_INSTRUMENT_PROJECTS "llvm" CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT_TARGETS "count" CACHE STRING "")
+set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. \
+ May be specified as Instrument or Perf or LBR to use a particular profiling \
+ mechanism.")
+
+set(CLANG_BOLT_PROJECTS "llvm" CACHE STRING "")
+string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT)
+if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+ set(CLANG_BOLT_TARGETS "count" CACHE STRING "")
+else()
+ set(CLANG_BOLT_TARGETS "FileCheck" CACHE STRING "")
+endif()
set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")
+set(CLANG_BOLT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")
set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
-
-# Disable function splitting enabled by default in GCC8+
-if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-reorder-blocks-and-partition")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-reorder-blocks-and-partition")
-endif()
Index: clang/CMakeLists.txt
===================================================================
--- clang/CMakeLists.txt
+++ clang/CMakeLists.txt
@@ -869,67 +869,106 @@
endforeach()
endif()
-if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
+if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
set(CLANGXX_PATH ${CLANG_PATH}++)
- set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
- set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
set(CLANG_OPTIMIZED ${CLANG_PATH}-bolt)
set(CLANGXX_OPTIMIZED ${CLANGXX_PATH}-bolt)
- # Instrument clang with BOLT
- add_custom_target(clang-instrumented
- DEPENDS ${CLANG_INSTRUMENTED}
- )
- add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
- DEPENDS clang llvm-bolt
- COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
- -instrument --instrumentation-file-append-pid
- --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
- COMMENT "Instrumenting clang binary with BOLT"
- VERBATIM
- )
+ string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT)
+ if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+ set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
+ set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
- # Make a symlink from clang-bolt.inst to clang++-bolt.inst
- add_custom_target(clang++-instrumented
- DEPENDS ${CLANGXX_INSTRUMENTED}
- )
- add_custom_command(OUTPUT ${CLANGXX_INSTRUMENTED}
- DEPENDS clang-instrumented
- COMMAND ${CMAKE_COMMAND} -E create_symlink
- ${CLANG_INSTRUMENTED}
- ${CLANGXX_INSTRUMENTED}
- COMMENT "Creating symlink from BOLT instrumented clang to clang++"
- VERBATIM
- )
+ # Instrument clang with BOLT
+ add_custom_target(clang-instrumented
+ DEPENDS ${CLANG_INSTRUMENTED}
+ )
+ add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
+ DEPENDS clang llvm-bolt
+ COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
+ -instrument --instrumentation-file-append-pid
+ --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+ COMMAND ${CMAKE_COMMAND} -E create_symlink
+ ${CLANG_INSTRUMENTED}
+ ${CLANGXX_INSTRUMENTED}
+ COMMENT "Instrumenting clang binary with BOLT"
+ VERBATIM
+ )
+ endif()
+
+ # Set variables for profile collection step
+ if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+ set(CLANG_BOLT_CC ${CLANG_INSTRUMENTED})
+ set(CLANG_BOLT_CXX ${CLANGXX_INSTRUMENTED})
+ else()
+ set(CLANG_BOLT_CC ${CLANG_PATH})
+ set(CLANG_BOLT_CXX ${CLANGXX_PATH})
+
+ # Perf sampling:
+ # - use maximum frequency to reduce training time
+ # - use cycle events instead of branches - empirically found to produce
+ # better results
+ # - if available, enable taken branch stack/LBR sampling
+ # (-j/--branch-filter)
+ set(PERF_CMDLINE
+ perf record --event=cycles:u
+ --output=${CMAKE_CURRENT_BINARY_DIR}/prof.data
+ --freq=max
+ )
+ if (uppercase_CLANG_BOLT STREQUAL "LBR")
+ list(APPEND PERF_CMDLINE --branch-filter=any,u)
+ endif()
+ list(APPEND PERF_CMDLINE --)
+ endif()
+
+ # Build specified targets to collect the profile
+ add_custom_target(bolt-profile-deps)
+ set(CLANG_BOLT_PROFILE ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata)
+ if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+ add_dependencies(bolt-profile-deps clang-instrumented)
+ else()
+ add_dependencies(bolt-profile-deps clang)
+ endif()
+ set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-stamps/)
+ set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-bins/)
+ add_custom_target(bolt-clang-clear
+ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-cleared
+ )
+ add_custom_command(
+ OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-cleared
+ DEPENDS bolt-profile-deps
+ COMMAND ${CMAKE_COMMAND} -E remove_directory ${BINARY_DIR}
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${BINARY_DIR}
+ COMMAND ${CMAKE_COMMAND} -E remove_directory ${STAMP_DIR}
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${STAMP_DIR}
+ COMMENT "Clobberring bolt-clang build and stamp directories"
+ )
- # Build specified targets with instrumented Clang to collect the profile
- set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-stamps/)
- set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-bins/)
set(build_configuration "$<CONFIG>")
include(ExternalProject)
- ExternalProject_Add(bolt-instrumentation-profile
- DEPENDS clang++-instrumented
- PREFIX bolt-instrumentation-profile
+ ExternalProject_Add(bolt-profile
+ DEPENDS bolt-profile-deps
+ PREFIX bolt-profile
SOURCE_DIR ${CMAKE_SOURCE_DIR}
STAMP_DIR ${STAMP_DIR}
BINARY_DIR ${BINARY_DIR}
EXCLUDE_FROM_ALL 1
CMAKE_ARGS
- ${CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS}
+ ${CLANG_BOLT_EXTRA_CMAKE_FLAGS}
# We shouldn't need to set this here, but INSTALL_DIR doesn't
# seem to work, so instead I'm passing this through
-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
- -DCMAKE_C_COMPILER=${CLANG_INSTRUMENTED}
- -DCMAKE_CXX_COMPILER=${CLANGXX_INSTRUMENTED}
- -DCMAKE_ASM_COMPILER=${CLANG_INSTRUMENTED}
+ -DCMAKE_C_COMPILER=${CLANG_BOLT_CC}
+ -DCMAKE_CXX_COMPILER=${CLANG_BOLT_CXX}
+ -DCMAKE_ASM_COMPILER=${CLANG_BOLT_CC}
-DCMAKE_ASM_COMPILER_ID=Clang
- -DCMAKE_BUILD_TYPE=Release
- -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_INSTRUMENT_PROJECTS}
+ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_PROJECTS}
-DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}
- BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR}
+ BUILD_COMMAND ${PERF_CMDLINE} ${CMAKE_COMMAND} --build ${BINARY_DIR}
--config ${build_configuration}
- --target ${CLANG_BOLT_INSTRUMENT_TARGETS}
+ --target ${CLANG_BOLT_TARGETS}
INSTALL_COMMAND ""
STEP_TARGETS configure build
USES_TERMINAL_CONFIGURE 1
@@ -937,20 +976,37 @@
USES_TERMINAL_INSTALL 1
)
- # Merge profiles into one using merge-fdata
+ # Pass extra flag in no-LBR mode
+ if (uppercase_CLANG_BOLT STREQUAL "PERF")
+ set(BOLT_NO_LBR "-nl")
+ endif()
add_custom_target(clang-bolt-profile
- DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
- )
- add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
- DEPENDS merge-fdata bolt-instrumentation-profile-build
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
- COMMAND ${Python3_EXECUTABLE}
- ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
- $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
- ${CMAKE_CURRENT_BINARY_DIR}
- COMMENT "Preparing BOLT profile"
- VERBATIM
+ DEPENDS ${CLANG_BOLT_PROFILE}
)
+ if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+ # Merge profiles into one using merge-fdata
+ add_custom_command(OUTPUT ${CLANG_BOLT_PROFILE}
+ DEPENDS bolt-profile-build merge-fdata
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ COMMAND ${Python3_EXECUTABLE}
+ ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
+ $<TARGET_FILE:merge-fdata> prof.fdata .
+ COMMENT "Preparing BOLT profile"
+ VERBATIM
+ )
+ else() # perf with or without LBR
+ # perf profile is produced by running the build, use perf2bolt to convert it to fdata
+ add_custom_command(OUTPUT ${CLANG_BOLT_PROFILE}
+ DEPENDS bolt-profile-build
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ COMMAND llvm-bolt --aggregate-only ${CLANG_PATH}
+ -o ${CLANG_BOLT_PROFILE}
+ -p ${CMAKE_CURRENT_BINARY_DIR}/prof.data
+ ${BOLT_NO_LBR}
+ COMMENT "Converting perf profile to fdata"
+ VERBATIM
+ )
+ endif()
# Optimize original (pre-bolt) Clang using the collected profile
add_custom_target(clang-bolt
@@ -960,9 +1016,10 @@
DEPENDS clang-bolt-profile
COMMAND llvm-bolt ${CLANG_PATH}
-o ${CLANG_OPTIMIZED}
- -data ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+ -data ${CLANG_BOLT_PROFILE}
-reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
- -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
+ -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack -plt=hot
+ ${BOLT_NO_LBR}
COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} ${CLANG_PATH}-${CLANG_VERSION_MAJOR}
COMMENT "Optimizing Clang with BOLT"
VERBATIM
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits