https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115534
Bug ID: 115534
Summary: intermediate stack use not eliminated
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tnfchris at gcc dot gnu.org
Target Milestone: ---
Consider the following example:
#include <stdint.h>
typedef struct _pixel_t
{
double red, green, blue, opacity;
} pixel_t;
typedef struct _PixelPacket
{
unsigned short blue, green, red, opacity;
} PixelPacket;
pixel_t f (unsigned height, unsigned width, unsigned virt_width,
uint8_t *restrict k, const PixelPacket *restrict k_pixels)
{
pixel_t result = {};
for (unsigned u=0; u < (width & -4); u++, k--) {
result.red += (*k)*k_pixels[u].red;
result.green += (*k)*k_pixels[u].green;
result.blue += (*k)*k_pixels[u].blue;
result.opacity += (*k)*k_pixels[u].opacity;
k_pixels += virt_width;
}
return result;
}
---
Compiled with -O3 this vectorizes well, but the epilogue code is very
inefficient:
fadd v29.2d, v29.2d, v30.2d
fadd v28.2d, v28.2d, v31.2d
cmp w5, w1
bhi .L3
mov v31.16b, v28.16b
ins v31.d[1], v29.d[1]
ins v29.d[1], v28.d[1]
stp q31, q29, [sp, 32]
ldp d0, d1, [sp, 32]
ldp d2, d3, [sp, 48]
add sp, sp, 64
ret
.L4:
movi v29.2d, 0
mov v31.16b, v29.16b
stp q31, q29, [sp, 32]
ldp d0, d1, [sp, 32]
ldp d2, d3, [sp, 48]
add sp, sp, 64
ret
That is, it goes through the stack to assemble the return registers. It looks
like at the gimple level we still have the store:
<bb 5> [local count: 105119324]:
_33 = VEC_PERM_EXPR <vect__10.16_41, vect__10.16_42, { 0, 3 }>;
_31 = VEC_PERM_EXPR <vect__10.16_42, vect__10.16_41, { 0, 3 }>;
<bb 6> [local count: 118111600]:
# vect_result_red_64.18_28 = PHI <_33(5), { 0.0, 0.0 }(2)>
# vect_result_red_64.18_105 = PHI <_31(5), { 0.0, 0.0 }(2)>
MEM <vector(2) double> [(double *)&D.4535] = vect_result_red_64.18_28;
MEM <vector(2) double> [(double *)&D.4535 + 16B] = vect_result_red_64.18_105;
return D.4535;
clang is able to generate much better code here:
fadd v0.2d, v0.2d, v1.2d
fadd v2.2d, v2.2d, v3.2d
b.ne .LBB0_2
.LBB0_3:
mov d1, v2.d[1]
mov d3, v0.d[1]
ret
The vectorized code gets reg-alloc'ed so that d0 and d2 are already in the right
registers at the end of the vector loop, and the epilogue only has to split the
registers up to get d1 and d3.
I think we would generate the same code if we were to elide the intermediate
stack store.
See https://godbolt.org/z/ocqchWWs5