ankurrj7 wrote: I did a small call-heavy benchmark to get a better feel for the cost of this mode compared with the existing source coverage mode and with gcc/icc coverage.
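The benchmark is a single C file with 128 noinline leaf functions. `run_all()`
calls all 128 leaves, and `main()` calls `run_all()` once per iteration:
1,200,000 iterations for the runtime numbers below (the iteration count is
taken from the first command-line argument; the built-in default is 120,000).
Every 1024th iteration additionally calls a small branchy helper,
`run_branchy()`. This is intentionally heavier on function calls than normal
application code, because this PR adds continuation counters after calls.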
```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * NOINLINE expands to the noinline function attribute when the compiler
 * defines __GNUC__ or __clang__, and to nothing on any other compiler.
 */
#if !defined(__GNUC__) && !defined(__clang__)
#define NOINLINE
#else
#define NOINLINE __attribute__((noinline))
#endif

/* Externally visible volatile sink; main() stores its final value here. */
volatile uint64_t global_sink;
/*
 * X-macro listing the 128 leaf indices 0..127 in ascending order.
 * LEAF_LIST(F) expands to F(0) F(1) ... F(127) with no separators between
 * the invocations, so F must expand to complete declarations or statements.
 *
 * Every line of the definition except the last must END in a backslash. If
 * the backslash sits on a line of its own, the #define has an empty body and
 * the F(n) invocations end up at file scope.
 */
#define LEAF_LIST(F) \
  F(0) F(1) F(2) F(3) F(4) F(5) F(6) F(7) \
  F(8) F(9) F(10) F(11) F(12) F(13) F(14) F(15) \
  F(16) F(17) F(18) F(19) F(20) F(21) F(22) F(23) \
  F(24) F(25) F(26) F(27) F(28) F(29) F(30) F(31) \
  F(32) F(33) F(34) F(35) F(36) F(37) F(38) F(39) \
  F(40) F(41) F(42) F(43) F(44) F(45) F(46) F(47) \
  F(48) F(49) F(50) F(51) F(52) F(53) F(54) F(55) \
  F(56) F(57) F(58) F(59) F(60) F(61) F(62) F(63) \
  F(64) F(65) F(66) F(67) F(68) F(69) F(70) F(71) \
  F(72) F(73) F(74) F(75) F(76) F(77) F(78) F(79) \
  F(80) F(81) F(82) F(83) F(84) F(85) F(86) F(87) \
  F(88) F(89) F(90) F(91) F(92) F(93) F(94) F(95) \
  F(96) F(97) F(98) F(99) F(100) F(101) F(102) F(103) \
  F(104) F(105) F(106) F(107) F(108) F(109) F(110) F(111) \
  F(112) F(113) F(114) F(115) F(116) F(117) F(118) F(119) \
  F(120) F(121) F(122) F(123) F(124) F(125) F(126) F(127)
/*
 * Rotate the 64-bit unsigned value X left by R bits.
 *
 * Both shift counts are reduced modulo 64, so R == 0 (or R == 64) yields X
 * unchanged instead of shifting by the full width of the type, which is
 * undefined behaviour in C. For R in 1..63 the result is the same as the
 * plain (X << R) | (X >> (64 - R)) form.
 *
 * X is evaluated twice: pass an expression without side effects.
 */
#define ROTL64(X, R) (((X) << ((R) & 63)) | ((X) >> ((64 - (R)) & 63)))
/*
 * DEFINE_LEAF(N) defines the leaf function leaf_N: a static NOINLINE function
 * taking and returning uint64_t. It XORs in an N-dependent constant, rotates
 * left by 7 + (N % 17) bits (7..23), applies a shift/XOR/add step and finally
 * XORs with (N * 1315423911u), computed in unsigned arithmetic.
 *
 * The definition spans several lines, so every line except the last must END
 * in a backslash; with the backslashes on lines of their own the macro body
 * is empty and the function text is left at file scope with an unexpanded
 * ##N.
 */
#define DEFINE_LEAF(N) \
  static NOINLINE uint64_t leaf_##N(uint64_t x) { \
    enum { Rot = ((N) % 17) + 7 }; \
    x ^= (uint64_t)((N) + 1) * UINT64_C(0x9e3779b97f4a7c15); \
    x = ROTL64(x, Rot); \
    x += (x >> 23) ^ (x << 11); \
    return x ^ (uint64_t)((N) * 1315423911u); \
  }
/* Apply DEFINE_LEAF to every index in LEAF_LIST, defining leaf_0 through leaf_127. */
LEAF_LIST(DEFINE_LEAF)
/*
 * CALL_LEAF(N) expands to two statements that update a variable named x in
 * the enclosing scope: x is XORed with leaf_N(x + N), then x rotated left by
 * 5 + (N % 13) bits (5..17) is added to it.
 *
 * The expansion already ends in a semicolon, so invocations are written
 * without one (as LEAF_LIST does). Every line of the definition except the
 * last must END in a backslash; with the backslashes on lines of their own
 * the macro body is empty.
 */
#define CALL_LEAF(N) \
  x ^= leaf_##N(x + (uint64_t)(N)); \
  x += ROTL64(x, ((N) % 13) + 5);
/*
 * Thread x through every leaf in LEAF_LIST order: CALL_LEAF(N) is expanded
 * once per index, and each expansion updates x with a call to leaf_N followed
 * by a rotate-and-add step. Returns the final value of x.
 *
 * The parameter has to be named x because CALL_LEAF refers to it by name.
 */
static NOINLINE uint64_t run_all(uint64_t x) {
LEAF_LIST(CALL_LEAF)
return x;
}
/*
 * Two data-dependent if/else decisions, each calling one of two leaves.
 *
 * x: running value; it is mixed with the result of the selected leaf.
 * i: iteration index, used only to pick the branch taken.
 *
 * Returns the updated value of x.
 */
static NOINLINE uint64_t run_branchy(uint64_t x, uint64_t i) {
  /* Bit 0 of (x ^ i) selects which leaf result is XORed into x. */
  if (((x ^ i) & 1) != 0) {
    x ^= leaf_3(x);
  } else {
    x ^= leaf_97(x);
  }

  /* Bit 3 of (x + i), computed from the updated x, selects the leaf added to x. */
  if (((x + i) & 8) != 0) {
    x += leaf_41(x);
  } else {
    x += leaf_113(x);
  }

  return x;
}
/*
 * Benchmark driver.
 *
 * Runs run_all() once per iteration and, on every iteration whose index is a
 * multiple of 1024, additionally mixes in run_branchy(). The iteration count
 * defaults to 120000 and is replaced by the first command-line argument when
 * one is given (parsed by strtoull with base 0, without validation).
 *
 * The final value is stored in global_sink and printed as a decimal number
 * followed by a newline. Always returns 0.
 */
int main(int argc, char **argv) {
  uint64_t rounds = 120000;
  if (argc > 1) {
    rounds = strtoull(argv[1], NULL, 0);
  }

  uint64_t state = UINT64_C(0x123456789abcdef0);
  for (uint64_t round = 0; round < rounds; ++round) {
    state = run_all(state + round);
    if ((round & 1023) == 0) {
      state ^= run_branchy(state, round);
    }
  }

  global_sink = state;
  printf("%" PRIu64 "\n", state);
  return 0;
}
```
Commands used:
```text
clang default:
clang -O2 -g -fprofile-instr-generate -fcoverage-mapping bench.c -ldl
clang continuation mode:
clang -O2 -g -fprofile-instr-generate -fcoverage-mapping \
-fcoverage-call-continuations bench.c -ldl
gcc:
gcc -O2 -g --coverage bench.c
icc:
icc -O2 -g -prof-gen=srcpos bench.c
```
This was run with the rebuilt PR compiler:
```text
clang version 23.0.0git (https://github.com/llvm/llvm-project.git 2be52f1b4c7dd7907b08f554c0160550727195cb)
gcc (GCC) 14.2.1 20250110 (Red Hat 14.2.1-7)
icc (ICC) 2021.10.1 20231115
```
I used 5 compile samples and 5 runtime samples, and the table reports medians.
Runtime was measured by running the benchmark with `1200000` iterations.
| Mode | Compile samples (s) | Compile median | Runtime samples (s) | Runtime median | Binary size | LLVM counter slots |
|---|---:|---:|---:|---:|---:|---:|
| Clang source coverage today | `0.60,0.60,0.60,0.60,0.59` | `0.60s` | `0.48,0.48,0.48,0.48,0.48` | `0.48s` | `171280` | `136` |
| Clang with `-fcoverage-call-continuations` | `0.99,0.99,0.99,0.99,0.99` | `0.99s` | `0.47,0.48,0.47,0.47,0.48` | `0.47s` | `178240` | `272` |
| GCC/gcov | `0.41,0.41,0.40,0.40,0.40` | `0.40s` | `0.47,0.47,0.47,0.47,0.47` | `0.47s` | `115592` | n/a |
| ICC `-prof-gen=srcpos` | `0.45,0.44,0.44,0.43,0.44` | `0.44s` | `0.68,0.68,0.67,0.68,0.67` | `0.68s` | `265056` | n/a |
For this call-heavy benchmark, the main cost I see is compile time and extra
counter slots, not runtime. The continuation mode doubles the LLVM counter
slots for this file, which is expected here because `run_all()` has 128 calls
and the new mode adds a continuation point after each call. In the generated
LLVM IR, `run_all` goes from one counter slot to 129 counter slots.
One extra compile-time note from initial exploration: the extra counters change
the IR shape enough that optimizer time increases on this benchmark. In one
`-ftime-report` run, `SLPVectorizerPass` went from about `0.210s` in default
source coverage to about `0.592s` with call continuations. With
`-fno-slp-vectorize`, the compile medians were much closer (`0.39s` default vs
`0.40s` with call continuations), so this looks like a follow-up optimization
question rather than a correctness issue in this PR.
https://github.com/llvm/llvm-project/pull/201079
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
