================ @@ -24,567 +24,3243 @@ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ __min_vector_width__(256))) +/// Convert two 128-bit vectors, \a __A and \a __B, containing packed +/// single-precision (32-bit) floating-point elements to a 128-bit vector +/// containing FP16 elements. +/// +/// \code{.operation} +/// FOR i := 0 to 7 +/// IF i < 4 +/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i]) +/// ELSE +/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4]) +/// FI +/// ENDFOR +/// \endcode ---------------- mikolaj-pirog wrote:
Recent intrinsics (amxfp8intrin.h) also follow this order, as the vast majority of existing intrinsics do. It shouldn't be a problem for the tooling -- if it is, I will fix it (the tooling). https://github.com/llvm/llvm-project/pull/120766 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits