Here is an example of intrinsic selection.

```
for (i, 0, 65535) {
   C[i] = (A[i] + B[i])
}
```
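
For reference, the same workload written with the te API (my own rough sketch, assuming float32 buffers and an extent of 65535) looks like this; it is only here to make the loop being lowered concrete:

```
# Rough te equivalent of the loop above (dtype and extent are assumptions).
import tvm
from tvm import te

n = 65535
A = te.placeholder((n,), name="A", dtype="float32")
B = te.placeholder((n,), name="B", dtype="float32")
C = te.compute((n,), lambda i: A[i] + B[i], name="C")

s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))
```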

```
Call Engine: veadd_mm
// normal === stmt cost : 2061.94 (smallest cost) shape : 1x65535
 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, 
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, 
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, 
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// normal and align === stmt cost : 2071.91 shape : 1x65472 
 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, 
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, 
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, 
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, 
(int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, 
(int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, 
(int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// reshape === stmt cost : 131080
 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, 
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, 
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, 
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", 
(int64)65535, "CSR_STRIDE_D", 0, "CSR_STRIDE_S", 0))
 ]

// === stmt cost : 786420 
 [ for (i, 0, (int64)65535) {
  tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), 
((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), A, 
int64(i), ((int64)65535 - int64(i)), 1), 
tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - 
int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
 ]

Call Engine: veadd_mv_dimh
// normal === stmt cost : 3085.91
 [ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, 
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, 
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, 
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// normal and align === stmt cost : 2069.94
 [ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, 
(int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, 
(int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, 
(int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, 
(int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, 
(int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, 
(int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// === stmt cost : 720885
 [ for (i, 0, (int64)65535) {
  tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), 
((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, 
int64(i), ((int64)65535 - int64(i)), 1), 
tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - 
int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", 
(int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
 ]
Call Engine: veadd_mf
// === stmt cost : 720885
 [ for (i, 0, (int64)65535) {
  tx.veadd_mf(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), 
((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, 
int64(i), ((int64)65535 - int64(i)), 1), A[i], tx.csrw("CSR_SHAPE_S1_COL", 
(int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, 
"CSR_STRIDE_S", (int64)0))
}
 ]
```
So we need a big module (a lot of design and code) to emit intrinsics; tensorization as the first step doesn't fit NPUs well.
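
To make this concrete, the selection pass has to do roughly what the sketch below shows: enumerate the legal lowerings per engine, attach an estimated cost to each, and keep the cheapest. This is my own illustration, not an existing TVM API; the names (Candidate, pick_intrinsic) are hypothetical.

```
# Hypothetical sketch of the selection logic behind the dump above.
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class Candidate:
    engine: str              # e.g. "veadd_mm", "veadd_mv_dimh", "veadd_mf"
    variant: str             # e.g. "normal", "normal and align", "reshape", "loop"
    cost: float              # estimated cost of this lowering, as printed in the dump
    emit: Callable[[], str]  # builds the intrinsic call sequence for this variant

def pick_intrinsic(candidates: List[Candidate]) -> Candidate:
    # The dump lists every candidate with its cost; the chosen lowering is
    # simply the cheapest one (here veadd_mm "normal" at 2061.94).
    return min(candidates, key=lambda c: c.cost)
```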




