https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85048

--- Comment #6 from Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> ---
Most of the conversions are optimized perfectly now. Only the following
conversions are still missing for AVX-512:
https://godbolt.org/z/9afWbYod6

#include <cstdint>

// V<T, N> names a GNU vector-extension type with element type T.
// The vector's byte size defaults to N * sizeof(T), i.e. N lanes of T;
// the third argument can be given explicitly to override that size.
template <typename T, int N, int Bytes = N * sizeof(T)>
using V [[gnu::vector_size(Bytes)]] = T;

// Converts a 4-lane vector of From into a 4-lane vector of To.
// Every lane goes through the ordinary scalar From -> To conversion.
// NOTE(review): the conversion is spelled out lane by lane, presumably on
// purpose so that the optimizer has to recognise the pattern - keep it so.
template <class From, class To>
V<To, 4> cvt4(V<From, 4> x)
{
    const V<To, 4> converted{
        static_cast<To>(x[0]), static_cast<To>(x[1]),
        static_cast<To>(x[2]), static_cast<To>(x[3])
    };
    return converted;
}
// Converts an 8-lane vector of From into an 8-lane vector of To.
// Every lane goes through the ordinary scalar From -> To conversion.
// NOTE(review): the conversion is spelled out lane by lane, presumably on
// purpose so that the optimizer has to recognise the pattern - keep it so.
template <class From, class To>
V<To, 8> cvt8(V<From, 8> x)
{
    const V<To, 8> converted{
        static_cast<To>(x[0]), static_cast<To>(x[1]),
        static_cast<To>(x[2]), static_cast<To>(x[3]),
        static_cast<To>(x[4]), static_cast<To>(x[5]),
        static_cast<To>(x[6]), static_cast<To>(x[7])
    };
    return converted;
}
// Converts a 16-lane vector of From into a 16-lane vector of To.
// Every lane goes through the ordinary scalar From -> To conversion.
// NOTE(review): the conversion is spelled out lane by lane, presumably on
// purpose so that the optimizer has to recognise the pattern - keep it so.
template <class From, class To>
V<To, 16> cvt16(V<From, 16> x)
{
    const V<To, 16> converted{
        static_cast<To>(x[0]),  static_cast<To>(x[1]),
        static_cast<To>(x[2]),  static_cast<To>(x[3]),
        static_cast<To>(x[4]),  static_cast<To>(x[5]),
        static_cast<To>(x[6]),  static_cast<To>(x[7]),
        static_cast<To>(x[8]),  static_cast<To>(x[9]),
        static_cast<To>(x[10]), static_cast<To>(x[11]),
        static_cast<To>(x[12]), static_cast<To>(x[13]),
        static_cast<To>(x[14]), static_cast<To>(x[15])
    };
    return converted;
}

// _(fn, src, dst, lanes) defines a non-template function `fn` that takes a
// V<src, lanes> and forwards it to cvt<lanes><src, dst>, so every conversion
// listed below is emitted as its own named function.
// NOTE(review): the function names look like the mnemonic of the instruction
// each conversion is expected to compile to - confirm against the bug report.
#define _(fn, src, dst, lanes) \
auto fn(V<src, lanes> x) { return cvt##lanes<src, dst>(x); }

// integer lanes -> double lanes
// (two overloads of the same name, distinguished by lane count)
_(vcvtudq2pd,  uint32_t, double,   4)
_(vcvtudq2pd,  uint32_t, double,   8)

// integer lanes -> float lanes
_(vcvtqq2ps,   int64_t,  float,    16)
_(vcvtuqq2ps,  uint64_t, float,    16)

// float lanes -> integer lanes
_(vcvttps2qq,  float,    int64_t,  16)

_(cvttps2udq,  float,    uint32_t, 4)
_(vcvttps2udq, float,    uint32_t, 8)
_(vcvttps2uqq, float,    uint64_t, 16)

// double lanes -> integer lanes
_(vcvttpd2udq, double,   uint32_t, 4)

Reply via email to