Here is an example of how intrinsics are chosen.
``` for (i, 0, 65535) { C[i] = (A[i] + B[i]) } ``` ``` Call Engine: veadd_mm // normal ===stmt cost : 2061.94 (smallest cost) shape : 1x65535 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) ] // normal and align === stmt cost : 2071.91 shape : 1x65472 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, (int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, (int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, (int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) ] // reshape === stmt cost : 131080 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)65535, "CSR_STRIDE_D", 0, "CSR_STRIDE_S", 0)) ] // === stmt cost : 786420 [ for (i, 0, (int64)65535) { tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - int64(i)), 1), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), 
tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) } ] Call Engine: veadd_mv_dimh // normal === stmt cost : 3085.91 [ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) ] // normal and align === stmt cost : 2069.94 [ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, (int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, (int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, (int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) ] // === stmt cost : 720885 [ for (i, 0, (int64)65535) { tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) } ] Call Engine: veadd_mf // === stmt cost : 720885 [ for (i, 0, (int64)65535) { tx.veadd_mf(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, 
int64(i), ((int64)65535 - int64(i)), 1), A[i], tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0)) } ] ``` So we need a big module (lots of design and code) to emit intrinsics; tensorization doesn't fit NPUs well in the first place. --- [Visit Topic](https://discuss.tvm.apache.org/t/do-we-have-any-way-to-process-codegen-with-more-fine-grade-control/9908/9) to respond. You are receiving this because you enabled mailing list mode. To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/90426346a6e940da5c2d7f8a7e430372529e40098a9ed0f5c0c18b2365137cb1).