Dear GCC developers,
I would like to ask whether there might be room for improvement in memory
access optimization in GCC.
I've prepared a simple benchmark in both C++ (using -std=c++20 for digit
separators like 5'000'000) and Java. The benchmark allocates a large array of
random integers, performs a multiply-sum loop to prevent
dead-code elimination, and measures the time taken to sort the array.
C++ example:
```cpp
#include <algorithm>
#include <chrono>
#include <iostream>
#include <random>
#include <vector>
int main() {
std::mt19937 rng(1234);
const int nTrials = 20;
long sum = 0;
long long best = std::numeric_limits<long long>::max();
long long worst = std::numeric_limits<long long>::min();
long long totalTime = 0;
const int size = 5'000'000;
for (int n = 0; n<nTrials; ++n) {
std::cout << n+1 << "/" << nTrials << std::endl;
std::uniform_int_distribution<int> dist(std::numeric_limits<int>::min(),
std::numeric_limits<int>::max());
std::vector<int> numbers(size);
for (int& num : numbers) {
num = dist(rng);
}
for (int i = 0; i < size; ++i) {
sum += i * numbers[i];
}
auto start = std::chrono::high_resolution_clock::now();
std::sort(numbers.begin(), numbers.end());
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>
(end - start).count();
totalTime += elapsed;
if (elapsed < best) best = elapsed;
if (elapsed > worst) worst = elapsed;
}
double avgTime = (double)totalTime / nTrials;
double avgCTime = (double)(totalTime-worst)/(nTrials-1);;
std::cout << "(c++) vector sort time " << best / 1e6 << " ms\n";
std::cout << "nTrials = " << nTrials << "\n";
std::cout << "size = " << size << "\n";
std::cout << "worst/best = " << (double)worst/best << "\n";
std::cout << "avg/best = " << avgTime / best << "\n";
std::cout << "corrected avg/best = " << avgCTime / best << "\n";
std::cout << "\n\nresult for prevent eliminate dead code: " << sum << "\n";
return 0;
}
```
Java example:
```java
import java.util.Arrays;
import java.util.Random;
import static java.lang.System.out;
public class Main {
public static void main(String[] args) {
int nTrials = 50;
long sum = 0;
long best = Long.MAX_VALUE;
long worst = Long.MIN_VALUE;
long totalTime = 0;
int size = 5_000_000;
int nWarmupTrials = 6;
Random rand = new Random(1234);
for (int n = 0; n < nTrials; n++) {
if (n < nWarmupTrials)
System.out.print("warmup... ");
System.out.printf("%d/%d%n", n, nTrials);
int[] numbers = new int[size];
for (int i = 0; i < size; i++) {
numbers[i] = rand.nextInt();
}
for (int i = 0; i < size; i++) {
sum += (long) i *numbers[i];
}
long startTime = System.nanoTime();
Arrays.sort(numbers);
long endTime = System.nanoTime();
long elapsed = endTime - startTime;
//System.out.printf(" %f%n", (double) elapsed / 1e6);
if (n < nWarmupTrials)
continue;
totalTime += elapsed;
if (elapsed < best) best = elapsed;
if (elapsed > worst) worst = elapsed;
}
int nEffectiveTrials = nTrials - nWarmupTrials;
double avgTime = (double)totalTime / nEffectiveTrials;;
double avgCTime = (double)(totalTime-worst)/(nEffectiveTrials-1);
System.out.println("java.vm.version=" + System.getProperty
("java.vm.version"));
System.out.println("java.vm.name=" +
System.getProperty("java.vm.name"));
System.out.println("java.version=" +
System.getProperty("java.version"));
System.out.printf("(java) vector sort time %f ms%n", (double)
best / 1e6);
System.out.printf("nTrials=%d = %d (warmUp) + %d (effective)%n",
nTrials, nWarmupTrials, nEffectiveTrials);
System.out.printf("size=%d%n", size);
System.out.printf("worst/best=%f%n", (double) worst / best);
System.out.printf("avg/best=%f%n", avgTime / best);
System.out.printf("corrected avg/best=%f%n", avgCTime / best);
out.println("\n\n" + sum);
}
}
```
Results
C++ (Release mode, -O3):
```
(c++) vector sort time 410.469 ms
nTrials = 20
size = 5000000
worst/best = 1.0515
avg/best = 1.01533
corrected avg/best = 1.01342
```
Java:
```
java.vm.version=24.0.1+9-30
java.vm.name=OpenJDK 64-Bit Server VM
java.version=24.0.1
(java) vector sort time 69.322373 ms
nTrials=50 = 6 (warmUp) + 44 (effective)
size=5000000
worst/best=1.353855
avg/best=1.094769
corrected avg/best=1.088744
```
Notes:
The JVM was run with the following options to enable compilation logging
and assembly output:
-XX:+UnlockDiagnosticVMOptions -XX:+LogCompilation -XX:+PrintAssembly
This produces a file like hotspot_pidXXXXX.log, which can be inspected with
JITWatch.
Searching for DualPivotQuicksort.sort(...) in the JIT assembly reveals
very efficient memory access patterns.
Comparative observations:
```
Layer | Integer Comparison | Integer Move
-----------|-----------------------------|------------------------------------
Assembler | `cmp eax, edx`, `jl`, `jge` | `mov eax, [mem]`, `mov [mem], eax`
JIT | `cmp eax, edx`, `jle`, `jg` | `mov eax, [array+offset]`
```
One possible explanation is aliasing: the Java memory model can often prove
there is no aliasing between array accesses, while in C/C++ this is harder.
In C/C++, one can use the restrict qualifier to aid the compiler:
void foo(int* __restrict__ a, int* __restrict__ b);
Would using `restrict` help in this case?
Are there other techniques or flags I should explore to help GCC optimize
this memory access pattern more like the JVM does?
For transparency: I’m also sending a similar message to the LLVM developers
list to get a broader view across compilers.
Thank you in advance for your insights!
Best regards,
Andrzej Borucki