================
@@ -4551,6 +4551,83 @@ static bool interp__builtin_ia32_vpdp(InterpState &S, CodePtr OpPC,
return true;
}
+// Bit Matrix Multiply and Accumulate (AVX512BMM). Each 256-bit lane holds a
+// 16x16 bit matrix as 16 x i16 elements; element i is row i and bit j of that
+// element is entry [i][j]. The accumulator (third argument, src1 in the AMD
+// ISA) provides the initial value of each result bit, into which the bit-matrix
+// product of the first two arguments (src2 * src3) is reduced with OR (vbmacor)
+// or XOR (vbmacxor):
+//   for i in 0..15, j in 0..15:
+//     bit = C[16*i+j]
+//     for k in 0..15: bit OP= A[16*i+k] & B[16*k+j]
+//     dest[16*i+j] = bit
+static bool interp__builtin_ia32_bmac(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call, bool IsXor) {
+ assert(Call->getNumArgs() == 3);
+
+ // Type checks BEFORE popping the stack.
+ QualType AType = Call->getArg(0)->getType();
+ QualType BType = Call->getArg(1)->getType();
+ QualType CType = Call->getArg(2)->getType();
+ if (!AType->isVectorType() || !BType->isVectorType() ||
+ !CType->isVectorType())
+ return false;
+
+ QualType AElemQT = AType->castAs<VectorType>()->getElementType();
+ QualType BElemQT = BType->castAs<VectorType>()->getElementType();
+ QualType CElemQT = CType->castAs<VectorType>()->getElementType();
+ OptPrimType ElemT = S.getContext().classify(AElemQT);
+ if (!ElemT || (*ElemT != PT_Sint16 && *ElemT != PT_Uint16))
+ return false;
+ if (S.getContext().classify(BElemQT) != ElemT ||
+ S.getContext().classify(CElemQT) != ElemT)
+ return false;
----------------
ganeshgit wrote:
Done!
https://github.com/llvm/llvm-project/pull/182556
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits