https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101262
--- Comment #2 from Hubert Kowalski ---
I've tried to produce a minimal reproducer in the form of the code below;
however, I am running GCC 10.3, and the behaviour depends on the optimization
level.
According to user reports, it is enough to compile darktable using GCC 11 with
the RelWithDebInfo target (which applies -O2). Builds with the Release target
(-O3) are apparently "fine".
(The code below theoretically reproduces the issue, but as far as I know it
might not reproduce the problem reliably.)
#include <math.h>   // fmaxf, fminf, expf, log2f
#include <stdio.h>  // NOTE(review): header name lost in extraction - presumed, confirm
#include <stdlib.h> // NOTE(review): header name lost in extraction - presumed, confirm
#if defined(__GNUC__)
// File-wide GCC optimization options for the functions defined below.
// A #pragma ends at the first newline that is not escaped, so every line of
// the option list except the last one has to end with a backslash.
#pragma GCC optimize ("unroll-loops", "tree-loop-if-convert", \
                      "tree-loop-distribution", "no-strict-aliasing", \
                      "loop-interchange", "loop-nest-optimize", "tree-loop-im", \
                      "unswitch-loops", "tree-loop-ivcanon", "ira-loop-pressure", \
                      "split-ivs-in-unroller", "variable-expansion-in-unroller", \
                      "split-loops", "ivopts", "predictive-commoning", \
                      "tree-loop-linear", "loop-block", "loop-strip-mine", \
                      "finite-math-only", "fp-contract=fast", "fast-math")
#endif
// Expands to an OpenMP firstprivate(...) clause listing the given variables.
#define dt_omp_firstprivate(...) firstprivate(__VA_ARGS__)
// Function attribute requesting one clone of the function per listed target.
// A macro definition ends at the first unescaped newline, so the wrapped
// attribute needs explicit backslash continuations.
#define __DT_CLONE_TARGETS__ \
  __attribute__((target_clones("default", "sse2", "sse3", "sse4.1", "sse4.2", \
                               "popcnt", "avx", "avx2", "avx512f", "fma4")))
// Requests 64-byte alignment for the declaration it is attached to.
#define DT_ALIGNED_ARRAY __attribute__((aligned(64)))
// Number of entries in the per-channel arrays (centers_ops, factors).
#define PIXEL_CHAN 8
// Number of entries in the GUI look-up table (gui_lut).
#define UI_SAMPLES 256
// Radial distances used for pixel ops: one center per channel, evenly spaced
// by 8/7 from -8 (index 0) up to 0 (index 7). pixel_correction() measures the
// distance of the clamped exposure to each of these centers.
static const float centers_ops[PIXEL_CHAN] DT_ALIGNED_ARRAY = {
  [0] = -56.0f / 7.0f, // = -8.0f
  [1] = -48.0f / 7.0f,
  [2] = -40.0f / 7.0f,
  [3] = -32.0f / 7.0f,
  [4] = -24.0f / 7.0f,
  [5] = -16.0f / 7.0f,
  [6] = -8.0f / 7.0f,
  [7] = 0.0f / 7.0f,
};
// State read and written by compute_lut_correction(): the per-channel factors
// and sigma are its inputs, gui_lut is its output.
typedef struct dt_iop_toneequalizer_gui_data_t dt_iop_toneequalizer_gui_data_t;

struct dt_iop_toneequalizer_gui_data_t
{
  // Arrays are 64-byte aligned (see DT_ALIGNED_ARRAY) and laid out back to back.
  float factors[PIXEL_CHAN] DT_ALIGNED_ARRAY; // per-channel weights of the Gaussians
  float gui_lut[UI_SAMPLES] DT_ALIGNED_ARRAY; // LUT for the UI graph

  // Gaussian width forwarded to pixel_correction().
  float sigma;
};
#pragma omp declare simd
__DT_CLONE_TARGETS__
static inline float fast_clamp(const float value, const float bottom, const float top)
{
  // Vectorizable clamp of `value` between `bottom` and `top`.
  // The upper bound is applied first, then the lower bound, so `bottom` wins
  // if the two bounds are ever passed in the wrong order.
  const float capped = fminf(value, top);
  return fmaxf(capped, bottom);
}
#pragma omp declare simd
__DT_CLONE_TARGETS__
static float gaussian_denom(const float sigma)
{
  // Gaussian function denominator such that y = exp(- radius^2 / denominator).
  // This is the constant factor of the exponential, so we don't need to
  // recompute it for every single pixel.
  //
  // sigma: width of the Gaussian.
  // Returns 2 * sigma^2.
  return 2.0f * sigma * sigma;
}
#pragma omp declare simd
__DT_CLONE_TARGETS__
static float gaussian_func(const float radius, const float denominator)
{
  // Un-normalized Gaussian: exp(-radius^2 / denominator).
  // Only the variable part of the exponential is evaluated here; callers are
  // expected to obtain `denominator` from gaussian_denom() once, before the
  // array loop, rather than per element.
  const float neg_radius_sq = - radius * radius;
  return expf(neg_radius_sq / denominator);
}
__DT_CLONE_TARGETS__
static inline float pixel_correction(const float exposure,
const float *const restrict factors,
const float sigma)
{
// build the correction for the current pixel
// as the sum of the contribution of each luminance channel
float result = 0.0f;
const float gauss_denom = gaussian_denom(sigma);
const float expo = fast_clamp(exposure, -8.0f, 0.0f);
#pragma omp simd aligned(centers_ops, factors:64) safelen(PIXEL_CHAN)
reduction(+:result)
for(int i = 0; i < PIXEL_CHAN; ++i)
result += gaussian_func(expo - centers_ops[i], gauss_denom) * factors[i];
return fast_clamp(result, 0.25f, 4.0f);
}
__DT_CLONE_TARGETS__
static inline void compute_lut_correction(struct dt_iop_toneequalizer_gui_data_t *g,
                                          const float offset,
                                          const float scaling)
{
  // Compute the LUT of the exposure corrections in EV,
  // offset and scale it for display in GUI widget graph.
  //
  // g:       reads g->factors and g->sigma, overwrites all UI_SAMPLES
  //          entries of g->gui_lut.
  // offset:  value the scaled log2 correction is subtracted from.
  // scaling: divisor applied to the log2 correction.
  float *const restrict LUT = g->gui_lut;
  const float *const restrict factors = g->factors;
  const float sigma = g->sigma;

#pragma omp parallel for simd schedule(static) default(none) \
  dt_omp_firstprivate(factors, sigma, offset, scaling, LUT) \
  aligned(LUT, factors:64)
  for(int k = 0; k < UI_SAMPLES; k++)
  {
    // Build the inset graph curve LUT.
    // x runs linearly from -8 (k = 0) to 0 (k = UI_SAMPLES - 1), which is
    // the range pixel_correction() clamps its exposure argument to.
    // NOTE(review): an earlier comment gave the range as [-14;+2] EV, which
    // this formula does not produce - confirm which one is intended.
    const float x = (8.0f * (((float)k) / ((float)(UI_SAMPLES - 1)))) - 8.0f;
    LUT[k] = offset - log2f(pixel_correction(x, factors, sigma)) / scaling;
  }
}
int main() {
dt_iop_tone