Hi all,
I am trying to build an SpMM kernel as follows:
```python
import tvm
from tvm import te
import scipy
import scipy.sparse
# Problem sizes: a square 253 x 253 sparse adjacency matrix multiplied by a
# dense feature matrix with 128 columns.
feat_len = 128
num_rows = num_cols = 253
# CUDA launch configuration consumed by the schedule further below.
num_threads_per_block = 64
num_cuda_blocks = 127
# Dense input features (te.placeholder's default dtype, float32).
SrcFeat = te.placeholder((num_cols, feat_len))
# Random sparse matrix in CSR format, ~10% density, values cast to float32.
adj_scipy_csr = scipy.sparse.random(num_rows, num_cols, density=0.1,
format='csr').astype('float32')
# CSR components: row pointers (length num_rows + 1 = 254), column indices,
# and nonzero values.
adj_indptr = adj_scipy_csr.indptr
adj_indices = adj_scipy_csr.indices
adj_vals = adj_scipy_csr.data
# TE placeholders mirroring the shapes/dtypes of the concrete CSR arrays, so
# the kernel can later be invoked with the real data.
adj_indptr_placeholder = te.placeholder(shape=adj_indptr.shape, \
dtype=str(adj_indptr.dtype), name='adj_indptr_placeholder')
adj_indices_placeholder = te.placeholder(shape=adj_indices.shape, \
dtype=str(adj_indices.dtype), name='adj_indices_placeholder')
adj_vals_placeholder = te.placeholder(shape=adj_vals.shape, \
dtype=str(adj_vals.dtype), name='adj_vals_placeholder')
def msgfunc(row, ff):
    """Compute rule for one output element Out[row, ff].

    Reduces adj_vals[e] * SrcFeat[adj_indices[e], ff] over every nonzero
    position e of CSR row `row`, i.e. over the half-open index range
    [indptr[row], indptr[row + 1]).
    """
    start = adj_indptr_placeholder[row]
    end = adj_indptr_placeholder[row + 1]
    # Data-dependent reduction extent: the nonzero count of this row.
    k = te.reduce_axis((0, end - start), name="elem_idx")
    term = (adj_vals_placeholder[start + k]
            * SrcFeat[adj_indices_placeholder[start + k], ff])
    return te.sum(term, axis=k)
# Out[row, ff] = sum over the row's nonzeros of adj_val * SrcFeat[col, ff].
Out = te.compute((num_rows, feat_len), msgfunc, name='Out')
s = te.create_schedule([Out.op])
row_axis = Out.op.axis[0]
feat_axis = Out.op.axis[1]
# Split rows into num_cuda_blocks outer chunks (ceil(253 / 127) = 2 rows per
# chunk) and features into tiles of num_threads_per_block elements.
row_outer, row_inner = s[Out.op].split(row_axis, nparts=num_cuda_blocks)
feat_outer, feat_inner = s[Out.op].split(feat_axis,
factor=num_threads_per_block)
s[Out.op].reorder(feat_outer, row_outer, feat_inner, row_inner)
# Bind feature tiles to blockIdx.y, row chunks to blockIdx.x, and one thread
# per feature element within a tile; row_inner stays a serial loop per thread.
s[Out.op].bind(feat_outer, te.thread_axis("blockIdx.y"))
s[Out.op].bind(row_outer, te.thread_axis("blockIdx.x"))
s[Out.op].bind(feat_inner, te.thread_axis("threadIdx.x"))
# Output buffer placeholder (same dtype as the sparse values).
out_placeholder = te.placeholder((num_rows, feat_len),
dtype=str(adj_vals.dtype), name="out")
# Build for CUDA and dump the generated kernel source.
f = tvm.build(s, [adj_indptr_placeholder, adj_indices_placeholder,
adj_vals_placeholder, SrcFeat, out_placeholder], target='cuda')
print(f.imported_modules[0].get_source())
```
And here is the generated kernel:
```C++
// Auto-generated TVM CUDA kernel (quoted verbatim; line-wrapped by email).
// Each thread computes two output rows' worth of one feature column:
// blockIdx.x selects a pair of rows, blockIdx.y/threadIdx.x select the
// feature element.
extern "C" __global__ void default_function_kernel0(float* __restrict__ Out,
void* __restrict__ adj_indptr_placeholder, void* __restrict__
adj_vals_placeholder, void* __restrict__ placeholder, void* __restrict__
adj_indices_placeholder) {
  for (int row_inner = 0; row_inner < 2; ++row_inner) {
    // Row guard: only rows < 253 initialize/accumulate into Out.
    if (((((int)blockIdx.x) * 2) + row_inner) < 253) {
      Out[(((((((int)blockIdx.x) * 256) + (row_inner * 128)) +
(((int)blockIdx.y) * 64)) + ((int)threadIdx.x)))] = 0.000000e+00f;
    }
    // NOTE(review): the loop bound below reads
    // adj_indptr[blockIdx.x * 2 + row_inner + 1] BEFORE the `< 253` row
    // guard; with blockIdx.x == 126 and row_inner == 1 that indexes element
    // 254 of the 254-entry indptr array -- one past the end.
    for (int elem_idx = 0; elem_idx <
(((int*)adj_indptr_placeholder)[((((((int)blockIdx.x) * 2) + row_inner) + 1))]
- ((int*)adj_indptr_placeholder)[(((((int)blockIdx.x) * 2) + row_inner))]);
++elem_idx) {
      // Accumulate adj_val * feature into the guarded output element.
      if (((((int)blockIdx.x) * 2) + row_inner) < 253) {
        Out[(((((((int)blockIdx.x) * 256) + (row_inner * 128)) +
(((int)blockIdx.y) * 64)) + ((int)threadIdx.x)))] = (Out[(((((((int)blockIdx.x)
* 256) + (row_inner * 128)) + (((int)blockIdx.y) * 64)) + ((int)threadIdx.x)))]
+
(((float*)adj_vals_placeholder)[((((int*)adj_indptr_placeholder)[(((((int)blockIdx.x)
* 2) + row_inner))] + elem_idx))] *
((float*)placeholder)[((((((int*)adj_indices_placeholder)[((((int*)adj_indptr_placeholder)[(((((int)blockIdx.x)
* 2) + row_inner))] + elem_idx))] * 128) + (((int)blockIdx.y) * 64)) +
((int)threadIdx.x)))]));
      }
    }
  }
}
```
TVM succeeds in preventing illegal accesses to the `Out` array, but fails to do
so for `adj_indptr`. `Out` has 253 rows, while `adj_indptr` has 254 entries
(valid indices 0 through 253). The last block has `blockIdx.x = 126`, so when
`row_inner = 1` the condition of the elem loop reads `adj_indptr[254]`, which
is out of bounds.
It seems that TVM does not know the length of `adj_indptr` — why is that, and
how should I fix it?
Best Regards
---
[Visit Topic](https://discuss.tvm.ai/t/tvm-access-beyond-array-boundary/6998/1)
to respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click
here](https://discuss.tvm.ai/email/unsubscribe/026cb50c2ce413e0760760f90a61762d0dc0c4634bca3735b33ab0b01f28fd65).