I'm working on tensorizing kernels whose input sizes are not 
divisible by the dimensions of the tensor intrinsic.  An example of this 
situation would be a 30x30x30 GEMM with a 16x16x16 tensor intrinsic.  This 
currently results in the following lowered IR before tensorization, and the 
following error message when trying to tensorize:

```
produce C {
  for (i.outer, 0, 2) {
    for (j.outer, 0, 2) {
      for (i.inner.init, 0, 16) {
        for (j.inner.init, 0, 16) {
          if (likely((((i.outer*16) + i.inner.init) < 30))) {
            if (likely((((j.outer*16) + j.inner.init) < 30))) {
              C[((((i.outer*480) + (i.inner.init*30)) + (j.outer*16)) + 
j.inner.init)] = (int8)0
            }
          }
        }
      }
      for (k.outer, 0, 2) {
        for (i.inner, 0, 16) {
          for (j.inner, 0, 16) {
            for (k.inner, 0, 16) {
              if (likely((((i.outer*16) + i.inner) < 30))) {
                if (likely((((j.outer*16) + j.inner) < 30))) {
                  if (likely((((k.outer*16) + k.inner) < 30))) {
                    if (likely((((i.outer*16) + i.inner) < 30))) {
                      if (likely((((j.outer*16) + j.inner) < 30))) {
                        if (likely((((k.outer*16) + k.inner) < 30))) {
                          C[((((i.outer*480) + (i.inner*30)) + (j.outer*16)) + 
j.inner)] = (C[((((i.outer*480) + (i.inner*30)) + (j.outer*16)) + j.inner)] + (
A[((((i.outer*480) + (i.inner*30)) + (k.outer*16)) + 
k.inner)]*B[((((k.outer*480) + (k.inner*30)) + (j.outer*16)) + j.inner)]))
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
```

```
Traceback (most recent call last):

  File "padding-spike.py", line 197, in <module>
    print(tvm.lower(*op, simple_mode=True))

  File "/home/jsteward/work/tvm/python/tvm/build_module.py", line 382, in lower
    stmt = form_body(sch)

  File "/home/jsteward/work/tvm/python/tvm/build_module.py", line 333, in 
form_body
    stmt = schedule.ScheduleOps(sch, bounds)

  File "/home/jsteward/work/tvm/python/tvm/_ffi/_ctypes/function.py", line 207, 
in __call__
    raise get_last_ffi_error()

tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (7) /home/jsteward/work/tvm/build/libtvm.so(TVMFuncCall+0x5f) 
[0x7f6418c18a7f]
  [bt] (6) /home/jsteward/work/tvm/build/libtvm.so(+0x40341b) [0x7f641840241b]
  [bt] (5) 
/home/jsteward/work/tvm/build/libtvm.so(tvm::schedule::ScheduleOps(tvm::Schedule,
 tvm::Map<tvm::IterVar, tvm::Range, void, void>, bool)+0x1fa1) [0x7f64187a5f41]
  [bt] (4) 
/home/jsteward/work/tvm/build/libtvm.so(tvm::schedule::MakePipeline(tvm::Stage 
const&, std::unordered_map<tvm::IterVar, tvm::Range, std::hash<tvm::IterVar>, 
std::equal_to<tvm::IterVar>, std::allocator<std::pair<tvm::IterVar const, 
tvm::Range> > > const&, tvm::Stmt, bool)+0x5a) [0x7f64187a367a]
  [bt] (3) 
/home/jsteward/work/tvm/build/libtvm.so(tvm::ComputeOpNode::BuildProvide(tvm::Stage
 const&, std::unordered_map<tvm::IterVar, tvm::Range, std::hash<tvm::IterVar>, 
std::equal_to<tvm::IterVar>, std::allocator<std::pair<tvm::IterVar const, 
tvm::Range> > > const&, bool) const+0x165) [0x7f64185cca05]
  [bt] (2) 
/home/jsteward/work/tvm/build/libtvm.so(tvm::MakeTensorize(tvm::ComputeOpNode 
const*, tvm::Stage const&, std::unordered_map<tvm::IterVar, tvm::Range, 
std::hash<tvm::IterVar>, std::equal_to<tvm::IterVar>, 
std::allocator<std::pair<tvm::IterVar const, tvm::Range> > > const&, 
bool)+0x263) [0x7f6418602d73]
  [bt] (1) 
/home/jsteward/work/tvm/build/libtvm.so(tvm::VerifyTensorizeLoopNest(tvm::ComputeOpNode
 const*, tvm::Stage const&, tvm::ComputeLoopNest const&, unsigned long)+0xbd6) 
[0x7f64185fefc6]
  [bt] (0) 
/home/jsteward/work/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x43)
 [0x7f64183b9473]
  File "../src/op/tensorize.cc", line 147
TVMError: Tensorize failed, split condition likely(((i.inner + (i.outer*16)) < 
30)) relies on var defined inside tensorize scope
```

Seems like `src/op/tensorize.cc` mandates that the init and main predicates 
cannot contain variables defined inside the tensorization scope (in this case 
`i.inner`):

```cpp
  for (const Expr& pred : n.main_predicates) {
    if (ir::ExprUseVar(pred, banned)) {
      LOG(FATAL) << "Tensorize failed, split condition "
                 << pred << " relies on var defined inside tensorize scope";
    }
  }
  for (const Expr& pred : n.init_predicates) {
    if (ir::ExprUseVar(pred, banned)) {
      LOG(FATAL) << "Tensorize failed, split condition "
                 << pred << " relies on var defined inside tensorize scope";
    }
  }
```

The problem is that my tensor intrinsic can handle the padding (needed in this 
case) in hardware, in a similar manner to how VTAs use their DMA engines to 
perform padding.  I think my tensor intrinsic would handle this case correctly 
if the `likely` clauses were simply removed, but there doesn't seem to be an 
apparent way to do that.  How can I achieve this?  Thanks in advance!





---
[Visit 
Topic](https://discuss.tvm.ai/t/tensorize-with-non-divisible-split-factor/6504/1)
 to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click 
here](https://discuss.tvm.ai/email/unsubscribe/13edb97ee2c640cc682842ba2c2bb302b717574bd6db0b957b586aed42c0abcd).

Reply via email to