================
@@ -0,0 +1,174 @@
+/*===-------- avx512bmmintrin.h - AVX512BMM intrinsics *------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===---------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMINTRIN_H
+#define _AVX512BMMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     
\
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512bmm"),      
\
+                 __min_vector_width__(512)))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product
+/// into a third 16x16 bit matrix (which is also the destination).
+///
+/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit)
+/// matrices in bits [255:0] and [511:256]. The operation performs:
+/// \code{.operation}
+///   for i in 0 to 15
+///     for j in 0 to 15
+///       reduction_bit = __C[16*i+j]
+///       for k in 0 to 15
+///         reduction_bit |= __A[16*i+k] & __B[16*k+j]
+///       end for k
+///       dest[16*i+j] = reduction_bit
+///     end for j
+///   end for i
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction.
+///
+/// \param __A
+///    A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+///    lane).
+/// \param __B
+///    A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+///    lane).
+/// \param __C
+///    A 512-bit accumulator vector containing the initial values to OR with.
+/// \returns A 512-bit vector containing the accumulated result for each lane.
+/// \note This instruction does not support masking.
+static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_bmacor16x16x16(__m512i __A, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_bmacor16x16x16_v32hi(
+      (__v32hi)__A, (__v32hi)__B, (__v32hi)__C);
+}
+
+/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product
+/// into a third 16x16 bit matrix (which is also the destination).
+///
+/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit)
+/// matrices in bits [255:0] and [511:256]. The operation performs:
+/// \code{.operation}
+///   for i in 0 to 15
+///     for j in 0 to 15
+///       reduction_bit = __C[16*i+j]
+///       for k in 0 to 15
+///         reduction_bit ^= __A[16*i+k] & __B[16*k+j]
+///       end for k
+///       dest[16*i+j] = reduction_bit
+///     end for j
+///   end for i
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction.
+///
+/// \param __A
+///    A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+///    lane).
+/// \param __B
+///    A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+///    lane).
+/// \param __C
+///    A 512-bit accumulator vector containing the initial values to XOR with.
+/// \returns A 512-bit vector containing the accumulated result for each lane.
+/// \note This instruction does not support masking.
+static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
----------------
RKSimon wrote:

not currently usable in a constant expression
```suggestion
static __inline __m512i __DEFAULT_FN_ATTRS
```

https://github.com/llvm/llvm-project/pull/182556
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to