https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87105
Bug ID: 87105
Summary: Autovectorization [X86, SSE2, AVX2, DoublePrecision]
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: kobalicek.petr at gmail dot com
Target Milestone: ---

GCC is unable to autovectorize the following code. It seems that it doesn't like min/max, but I'm not entirely sure. I stripped the code off my project so it's a bit longer, hope that's fine. I attached also a code compiled by clang, which is perfectly vectorized and what I would like to get from GCC.

The demonstration code
----------------------

#include <algorithm>
#include <cmath>
#include <stdint.h>

// Point structure [x, y]: two doubles.
struct Point {
  double x, y;

  // Defaulted default constructor; x and y have no member initializers.
  inline Point() noexcept = default;
  // Defaulted copy constructor.
  constexpr Point(const Point&) noexcept = default;
  // Builds a point from explicit coordinates.
  constexpr Point(double x, double y) noexcept : x(x), y(y) {}
};

// Box structure [x0, y0, x1, y1]: four doubles.
struct Box {
  double x0, y0, x1, y1;

  // Overwrites all four members with the given values.
  // The parameters shadow the members, hence the explicit `this->`.
  inline void reset(double x0, double y0, double x1, double y1) noexcept {
    this->x0 = x0;
    this->y0 = y0;
    this->x1 = x1;
    this->y1 = y1;
  }
};

// Overloads to make vector processing simpler.
// Unary negation: negates both coordinates.
static constexpr Point operator-(const Point& a) noexcept { return Point(-a.x, -a.y); }

// Point (op) scalar: the scalar `b` is the right-hand operand for both coordinates.
static constexpr Point operator+(const Point& a, double b) noexcept { return Point(a.x + b, a.y + b); }
static constexpr Point operator-(const Point& a, double b) noexcept { return Point(a.x - b, a.y - b); }
static constexpr Point operator*(const Point& a, double b) noexcept { return Point(a.x * b, a.y * b); }
static constexpr Point operator/(const Point& a, double b) noexcept { return Point(a.x / b, a.y / b); }

// Point (op) Point: x is combined with x, y with y.
static constexpr Point operator+(const Point& a, const Point& b) noexcept { return Point(a.x + b.x, a.y + b.y); }
static constexpr Point operator-(const Point& a, const Point& b) noexcept { return Point(a.x - b.x, a.y - b.y); }
static constexpr Point operator*(const Point& a, const Point& b) noexcept { return Point(a.x * b.x, a.y * b.y); }
static constexpr Point operator/(const Point& a, const Point& b) noexcept { return Point(a.x / b.x, a.y / b.y); }

// scalar (op) Point: the scalar `a` is the left-hand operand for both coordinates.
static constexpr Point operator+(double a, const Point& b) noexcept { return Point(a + b.x, a + b.y); }
static constexpr Point operator-(double a, const Point& b) noexcept { return Point(a - b.x, a - b.y); }
static constexpr Point operator*(double a, const Point& b) noexcept { return Point(a * b.x, a * b.y); }
static constexpr Point operator/(double a, const Point& b) noexcept { return Point(a / b.x, a / b.y); }

// Min/Max - different semantics compared to std.
// NOTE(review): the report does not say which difference is meant. What is
// visible here is that both helpers return by value and use only operator<.
//
// myMin returns `b` when `b < a` holds and `a` in every other case.
template<typename T>
constexpr T myMin(const T& a, const T& b) noexcept { return b < a ? b : a; }

// myMax returns `b` when `a < b` holds and `a` in every other case.
template<typename T>
constexpr T myMax(const T& a, const T& b) noexcept { return a < b ? b : a; }

// Linear interpolation, works with points as well.
// Evaluates a * (1.0 - t) + b * t. `t` may be a double or a Point; the
// `1.0 - t` and `*` forms needed for a Point are the overloads defined above.
// T defaults to double when it is not deduced or given explicitly.
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept { return (a * (1.0 - t)) + (b * t); }

// Merge a point into a box by possibly increasing its bounds.
inline void boxMergePoint(Box& box, const Point& p) noexcept {
  // x0/y0 take the smaller of the old bound and the point's coordinate,
  // x1/y1 the larger; see myMin/myMax for what happens when the
  // comparison is false.
  box.x0 = myMin(box.x0, p.x);
  box.y0 = myMin(box.y0, p.y);
  box.x1 = myMax(box.x1, p.x);
  box.y1 = myMax(box.y1, p.y);
}

// Writes into bBox a box covering bez[0], bez[2] and one further point
// obtained by interpolating between bez[0], bez[1] and bez[2].
// Presumably bez holds the control points of a quadratic Bezier curve (the
// nested lerp below has that form) - confirm against the originating project.
// Only bez[0], bez[1] and bez[2] are read; bBox is fully overwritten.
void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
  // Bounding box of start and end points.
  bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y),
             myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y));

  // Per-coordinate parameter t = (P0 - P1) / (P0 - 2*P1 + P2).
  // There is no guard against a zero denominator here.
  Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);

  // Clamp each coordinate of t: lower limit 0.0 first, then upper limit 1.0.
  t.x = myMax(t.x, 0.0);
  t.y = myMax(t.y, 0.0);
  t.x = myMin(t.x, 1.0);
  t.y = myMin(t.y, 1.0);

  // Interpolate twice with the per-coordinate t and merge the resulting
  // point into the box.
  boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t), lerp(bez[1], bez[2], t), t));
}

GCC Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
--------------------------------------------------
quadBoundingBoxA(Point const*, Box&): push rbp mov rbp, rsp and rsp, -32 vmovsd xmm1, QWORD PTR [rdi+8] vmovsd xmm0, QWORD PTR [rdi] vmovsd xmm5, QWORD PTR [rdi+40] vmovsd xmm6, QWORD PTR [rdi+32] vmaxsd xmm13, xmm5, xmm1 vmaxsd xmm12, xmm6, xmm0 vminsd xmm5, xmm5, xmm1 vminsd xmm6, xmm6, xmm0 vunpcklpd xmm0, xmm12, xmm13 vunpcklpd xmm1, xmm6, xmm5 vmovups XMMWORD PTR [rsi+16], xmm0 vmovups XMMWORD PTR [rsi], xmm1 vmovsd xmm2, QWORD PTR [rdi+24] vmovsd xmm10, QWORD PTR [rdi+8] vmovsd xmm1, QWORD PTR [rdi+40] vmovsd xmm7, QWORD PTR [rdi+16] vaddsd xmm4, xmm2, xmm2 vsubsd xmm9, xmm10, xmm2 vmovsd xmm3, QWORD PTR [rdi] vmovsd xmm0, QWORD PTR [rdi+32] vsubsd xmm8, xmm3, xmm7 vsubsd xmm4, xmm10, xmm4 vaddsd xmm4, xmm4, xmm1 vdivsd xmm9, xmm9, xmm4 vaddsd xmm4, xmm7, xmm7 vsubsd xmm4, xmm3, xmm4 vaddsd xmm4, xmm4, xmm0 vdivsd xmm8, xmm8, xmm4 vxorpd xmm4, xmm4, xmm4 vcomisd xmm4, xmm8 ja .L6 vcomisd xmm4, xmm9 jbe .L36 vmovsd xmm11, QWORD PTR .LC1[rip] vmulsd xmm14, xmm1, xmm4 vmulsd xmm9, xmm2, xmm4 vcomisd xmm8, xmm11 jbe .L37 vmovsd QWORD PTR [rsp-16], xmm2 vmovapd xmm1, xmm14 vmovapd xmm2, xmm9 vxorpd xmm14, xmm14, xmm14 vmovsd QWORD PTR [rsp-8], xmm7 vmulsd xmm3, xmm3, xmm4 vmovapd xmm15, xmm11 vmovapd xmm8, xmm11 vmulsd xmm7, xmm7, 
xmm4 vxorpd xmm9, xmm9, xmm9 jmp .L13 .L6: vmulsd xmm11, xmm7, xmm4 vcomisd xmm4, xmm9 vxorpd xmm8, xmm8, xmm8 vmulsd xmm0, xmm0, xmm4 vmovsd QWORD PTR [rsp-8], xmm11 vmovsd xmm11, QWORD PTR .LC1[rip] vmovapd xmm14, xmm11 jbe .L10 .L19: vmovsd QWORD PTR [rsp-16], xmm2 vmulsd xmm1, xmm1, xmm4 vmovapd xmm15, xmm11 vxorpd xmm9, xmm9, xmm9 vmulsd xmm2, xmm2, xmm4 jmp .L13 .L36: vmovsd xmm11, QWORD PTR .LC1[rip] vcomisd xmm8, xmm11 jbe .L29 vmovsd QWORD PTR [rsp-8], xmm7 vmulsd xmm3, xmm3, xmm4 vxorpd xmm14, xmm14, xmm14 vmovapd xmm8, xmm11 vmulsd xmm7, xmm7, xmm4 .L10: vcomisd xmm9, xmm11 jbe .L30 vmulsd xmm15, xmm2, xmm4 vmovapd xmm9, xmm11 vmulsd xmm10, xmm10, xmm4 vmovsd QWORD PTR [rsp-16], xmm15 vxorpd xmm15, xmm15, xmm15 .L13: vaddsd xmm1, xmm1, QWORD PTR [rsp-16] vaddsd xmm3, xmm3, QWORD PTR [rsp-8] vaddsd xmm2, xmm2, xmm10 vaddsd xmm0, xmm0, xmm7 vmulsd xmm9, xmm1, xmm9 vmulsd xmm15, xmm2, xmm15 vmulsd xmm8, xmm0, xmm8 vmulsd xmm14, xmm3, xmm14 vaddsd xmm9, xmm9, xmm15 vaddsd xmm14, xmm8, xmm14 vminsd xmm5, xmm9, xmm5 vmaxsd xmm9, xmm9, xmm13 vminsd xmm6, xmm14, xmm6 vmaxsd xmm14, xmm14, xmm12 vmovsd QWORD PTR [rsi+8], xmm5 vmovsd QWORD PTR [rsi+24], xmm9 vmovsd QWORD PTR [rsi], xmm6 vmovsd QWORD PTR [rsi+16], xmm14 leave ret .L29: vmulsd xmm15, xmm7, xmm8 vsubsd xmm14, xmm11, xmm8 vmulsd xmm0, xmm0, xmm8 vmulsd xmm3, xmm3, xmm14 vmulsd xmm7, xmm7, xmm14 vmovsd QWORD PTR [rsp-8], xmm15 jmp .L10 .L37: vmulsd xmm15, xmm7, xmm8 vsubsd xmm14, xmm11, xmm8 vmulsd xmm0, xmm0, xmm8 vmulsd xmm3, xmm3, xmm14 vmulsd xmm7, xmm7, xmm14 vmovsd QWORD PTR [rsp-8], xmm15 jmp .L19 .L30: vsubsd xmm15, xmm11, xmm9 vmulsd xmm1, xmm1, xmm9 vmulsd xmm4, xmm2, xmm15 vmulsd xmm10, xmm10, xmm15 vmulsd xmm2, xmm2, xmm9 vmovsd QWORD PTR [rsp-16], xmm4 jmp .L13 Clang Output [-std=c++17 -O3 -mavx2 -fno-math-errno] ---------------------------------------------------- .LCPI0_0: .quad 4607182418800017408 # double 1 .quad 4607182418800017408 # double 1 quadBoundingBoxA(Point const*, Box&): # 
@quadBoundingBoxA(Point const*, Box&) vmovupd xmm0, xmmword ptr [rdi] vmovupd xmm1, xmmword ptr [rdi + 16] vmovupd xmm2, xmmword ptr [rdi + 32] vminpd xmm3, xmm2, xmm0 vmaxpd xmm4, xmm2, xmm0 vsubpd xmm5, xmm0, xmm1 vaddpd xmm6, xmm1, xmm1 vsubpd xmm6, xmm0, xmm6 vaddpd xmm6, xmm2, xmm6 vdivpd xmm5, xmm5, xmm6 vxorpd xmm6, xmm6, xmm6 vmaxpd xmm5, xmm6, xmm5 vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 = [1.000000e+00,1.000000e+00] vminpd xmm5, xmm6, xmm5 vsubpd xmm6, xmm6, xmm5 vmulpd xmm0, xmm0, xmm6 vmulpd xmm7, xmm1, xmm5 vaddpd xmm0, xmm7, xmm0 vmulpd xmm1, xmm1, xmm6 vmulpd xmm2, xmm2, xmm5 vaddpd xmm1, xmm2, xmm1 vmulpd xmm0, xmm6, xmm0 vmulpd xmm1, xmm5, xmm1 vaddpd xmm0, xmm0, xmm1 vminpd xmm1, xmm0, xmm3 vmovupd xmmword ptr [rsi], xmm1 vmaxpd xmm0, xmm0, xmm4 vmovupd xmmword ptr [rsi + 16], xmm0 ret