[llvm-branch-commits] [mlir] Add dpas and named barrier ops (PR #88439)

2024-04-12 Thread Adam Siemieniuk via llvm-branch-commits


@@ -662,4 +662,152 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
   }];
 }
 
+def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
+  let summary = "It performs mma computation";
+
+  let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
+size, B of `kxn` size, and accumulates on matrix C of `mxn` size into a result
+matrix of the same size, where `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for the fp16
+data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
+and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
+also requires A and B to be loaded with the required data layout. Specifically,
+the VNNI layout is required for the B operand. It is achieved by setting `vnni_axis = 0`
+on the corresponding `load_nd` operator. To keep both operands as 3D vectors,
+operand A is loaded by setting `vnni_axis = 1`, without changing the
+physical layout in registers. Due to the VNNI transformation, the A and B operands
+are represented as 3D vectors, with the last dimension representing the VNNI factor,
+which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>`
+is represented as `A: vector<4x8x2xf16>`, and `B:vector<16x16xf16>` is
+represented as `B: vector<8x16x2xf16>`.
+
+Note: on PVC, the hardware can perform load with VNN transformation when data

adam-smnk wrote:

```suggestion
Note: on PVC, the hardware can perform load with VNNI transformation when data
```

https://github.com/llvm/llvm-project/pull/88439
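
For illustration only (editorial sketch, not part of the PR): assuming the assembly format defined later in this patch and the fp16 shapes from the description above (with the 8x8x2 packing of A that a later comment on this thread suggests), a dpas call could look roughly like this. The SSA value names are hypothetical.

```mlir
// Hypothetical usage sketch; shapes assume f16 operands with VNNI factor 32/16 = 2.
// A (8x16xf16) is carried as vector<8x8x2xf16>, B (16x16xf16) as vector<8x16x2xf16>,
// and the accumulator/result as vector<8x16xf32>.
%d = xegpu.dpas %a, %b, %c
    : vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32>
```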


[llvm-branch-commits] [mlir] Add dpas and named barrier ops (PR #88439)

2024-04-12 Thread Adam Siemieniuk via llvm-branch-commits


@@ -662,4 +662,152 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
   }];
 }
 
+def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
+  let summary = "It performs mma computation";
+
+  let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
+size, B of `kxn` size, and accumulates on matrix C of `mxn` size into a result
+matrix of the same size, where `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for the fp16
+data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
+and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
+also requires A and B to be loaded with the required data layout. Specifically,
+the VNNI layout is required for the B operand. It is achieved by setting `vnni_axis = 0`
+on the corresponding `load_nd` operator. To keep both operands as 3D vectors,
+operand A is loaded by setting `vnni_axis = 1`, without changing the
+physical layout in registers. Due to the VNNI transformation, the A and B operands
+are represented as 3D vectors, with the last dimension representing the VNNI factor,
+which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>`
+is represented as `A: vector<4x8x2xf16>`, and `B:vector<16x16xf16>` is
+represented as `B: vector<8x16x2xf16>`.
+
+Note: on PVC, the hardware can perform load with VNN transformation when data
+  element type is 16-bit or lower precision, taking 2 or 4 elements from
+  the first dimension and inserting them into the newly added innermost dimension.
+  }];
+
+  let arguments = (ins
+    XeGPU_DpasOpType : $lhs,
+    XeGPU_DpasOpType : $rhs,
+    Optional<XeGPU_Vector2DType>: $acc);
+  let results = (outs XeGPU_Vector2DType: $result);
+
+  let extraClassDeclaration = [{
+    VectorType getLhsType() {
+      return getLhs().getType();
+    }
+
+    VectorType getRhsType() {
+      return getRhs().getType();
+    }
+
+    VectorType getAccType() {
+      if (getAcc())
+        return getAcc().getType();
+      return {};
+    }
+
+    VectorType getResultType() {
+      return getResult().getType();
+    }
+  }];
+
+  let assemblyFormat = [{
+    $lhs `,` $rhs (`,` $acc^)? attr-dict `:` type($lhs) `,` type($rhs) (`,` type($acc)^)? `->` type($result)
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
+  AllElementTypesMatch<["tensorDesc", "value", "result"]>,
+  AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
+  let summary = "A read-modify-write operation.";
+
+  let description = [{
+    `AtomicRMWOp` has the same semantics as `memref.atomic_rmw`, except that
+    it works on a `TensorDescType` object while `memref.atomic_rmw` works
+    on a `MemRefType` object. It also has a `mask` operand, which has the
+    same shape as `TensorDesc`, to enable or disable some data points of
+    the `TensorDesc`.
+  }];
+
+  let arguments = (ins
+    AtomicRMWKindAttr:$kind,
+    XeGPU_TensorDesc:$tensorDesc,
+    XeGPU_MaskType:$mask,
+    XeGPU_ValueType:$value);
+
+  let results = (outs XeGPU_ValueType:$result);
+
+  let assemblyFormat = [{
+    $kind $tensorDesc `,` $mask `,` $value attr-dict `:`
+    type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result)
+  }];
+}
+
+def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> {
+  let summary = "It allocates a set of named barriers.";
+  let description = [{AllocNbarrier creates a set of named barriers, as
+  specified by `nbarrier_num`. Named barriers are workgroup-level resources,
+and are shared by all threads in the workgroup. For example, there are
+up to 32 barriers (range 0-31) for each Xecore on PVC. A typical use case

adam-smnk wrote:

```suggestion
up to 32 barriers (range 0-31) for each XeCore on PVC. A typical use case
```

https://github.com/llvm/llvm-project/pull/88439
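
As an editorial aside (not from the PR), a minimal sketch of an atomic_rmw call matching the assembly format quoted above; the descriptor/mask/value shapes and the `addf` kind are assumptions chosen only to satisfy the AllElementTypesMatch/AllShapesMatch constraints, and the SSA names are hypothetical.

```mlir
// Hypothetical usage sketch; the 16-element f32 shapes and the addf kind are assumptions.
%r = xegpu.atomic_rmw addf %tdesc, %mask, %val
    : !xegpu.tensor_desc<16xf32>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
```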


[llvm-branch-commits] [mlir] Add dpas and named barrier ops (PR #88439)

2024-04-12 Thread Adam Siemieniuk via llvm-branch-commits


@@ -662,4 +662,152 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
   }];
 }
 
+def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
+  let summary = "It performs mma computation";
+
+  let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
+size, B of `kxn` size, and accumulates on matrix C of `mxn` size into a result
+matrix of the same size, where `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for the fp16
+data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
+and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
+also requires A and B to be loaded with the required data layout. Specifically,
+the VNNI layout is required for the B operand. It is achieved by setting `vnni_axis = 0`
+on the corresponding `load_nd` operator. To keep both operands as 3D vectors,
+operand A is loaded by setting `vnni_axis = 1`, without changing the
+physical layout in registers. Due to the VNNI transformation, the A and B operands
+are represented as 3D vectors, with the last dimension representing the VNNI factor,
+which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>`
+is represented as `A: vector<4x8x2xf16>`, and `B:vector<16x16xf16>` is

adam-smnk wrote:

```suggestion
is represented as `A: vector<8x8x2xf16>`, and `B: vector<16x16xf16>` is
```

https://github.com/llvm/llvm-project/pull/88439
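
A quick element-count check (editorial, not part of the review) backs the suggested packing: with a VNNI factor of 32/16 = 2 for f16, the 8x16 A tile keeps its 128 elements only as 8x8x2 (4x8x2 would hold just 64). In sketch form, again with hypothetical SSA names:

```mlir
// Editorial sketch: 8x16 f16 = 128 elements -> 8x8x2 = 128 (matches), 4x8x2 = 64 (does not).
// Accumulator-less form of the op, per the optional $acc in the assembly format.
%d = xegpu.dpas %a, %b : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
```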