Hello, I'm trying to fuse two reduce stages with schedule functions. Here is the current TIR:
```
inputs=[Tensor(shape=[10240, 512], op.name=placeholder)]
outputs=[Tensor(shape=[1, 512], op.name=T_divide)]
function:
#[version = "0.0.5"]
primfn(placeholder_1: handle, T_divide_1: handle) -> ()
  attr = {"global_symbol": "fused_add_mean_add_mean", "tir.noalias": True}
  buffers = {T_divide: Buffer(T_divide_2: Pointer(float32), float32, [1, 512], []),
             placeholder: Buffer(placeholder_2: Pointer(float32), float32, [10240, 512], [])}
  buffer_map = {placeholder_1: placeholder, T_divide_1: T_divide} {
  attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
  attr [T_add_red.rf: Pointer(float32)] "storage_scope" = "local";
  allocate(T_add_red.rf, float32, [1]);
  attr [reduce_temp0: handle] "storage_scope" = "local";
  allocate(reduce_temp0, float32, [1]);
  attr [T_add_red.rf_1: Pointer(float32)] "storage_scope" = "local";
  allocate(T_add_red.rf_1, float32, [1]);
  attr [reduce_temp0_1: handle] "storage_scope" = "local";
  allocate(reduce_temp0_1, float32, [1]);
  attr [IterVar(threadIdx.y: int32, [0:32], "ThreadIndex", "threadIdx.y")] "thread_extent" = 32 {
    attr [IterVar(threadIdx.x: int32, [0:32], "ThreadIndex", "threadIdx.x")] "thread_extent" = 32 {
      T_add_red.rf[0] = 0f32
      for (k0.outer: int32, 0, 320) {
        T_add_red.rf[0] = ((float32*)T_add_red.rf[0] + ((float32*)placeholder_2[((((k0.outer*16384) + (threadIdx.x*512)) + (blockIdx.x*32)) + threadIdx.y)] + 1f32))
      }
      attr [meta[tir.CommReducer][0]] "reduce_scope" = @tir.reinterpret(0u64, dtype=handle);
      @tir.tvm_thread_allreduce(1u32, (float32*)T_add_red.rf[0], True, reduce_temp0, threadIdx.x, dtype=handle)
    }
    reduce_temp0[0] = ((float32*)reduce_temp0[0]*9.76563e-05f32)
    attr [IterVar(threadIdx.x, [0:32], "ThreadIndex", "threadIdx.x")] "thread_extent" = 32 {
      T_add_red.rf_1[0] = 0f32
      for (k0.outer_1: int32, 0, 320) {
        T_add_red.rf_1[0] = ((float32*)T_add_red.rf_1[0] + (((float32*)placeholder_2[((((k0.outer_1*16384) + (threadIdx.x*512)) + (blockIdx.x*32)) + threadIdx.y)] + 1f32) + (float32*)reduce_temp0[0]))
      }
      attr [meta[tir.CommReducer][1]] "reduce_scope" = @tir.reinterpret(0u64, dtype=handle);
      @tir.tvm_thread_allreduce(1u32, (float32*)T_add_red.rf_1[0], True, reduce_temp0_1, threadIdx.x, dtype=handle)
    }
    T_divide_2[((blockIdx.x*32) + threadIdx.y)] = ((float32*)reduce_temp0_1[0]*9.76563e-05f32)
  }
}
```
Here the two tvm_thread_allreduce calls bind to threadIdx.x over the same range, but independently. Can they share the same threadIdx.x binding so that only one kernel is created? Thank you very much!
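
For reference, below is a simplified sketch of the kind of compute and schedule that can produce this add → mean → add → mean pattern. It is only an approximation for illustration, not my exact code: the stage names, the inlining, the split factors, and the attach points are assumptions.

```
import tvm
from tvm import te, topi

# Compute graph: add -> mean -> add -> mean along axis 0 (illustrative only).
A  = te.placeholder((10240, 512), name="placeholder", dtype="float32")
B  = topi.add(A, 1.0)                        # first add
S1 = topi.sum(B, axis=0, keepdims=True)      # first reduce, [1, 512]
M1 = topi.divide(S1, 10240.0)                # first mean
C  = topi.add(B, M1)                         # second add (broadcast)
S2 = topi.sum(C, axis=0, keepdims=True)      # second reduce, [1, 512]
D  = topi.divide(S2, 10240.0)                # second mean -> T_divide

s  = te.create_schedule(D.op)
bx = te.thread_axis("blockIdx.x")
ty = te.thread_axis("threadIdx.y")
tx = te.thread_axis("threadIdx.x")

# Inline the elementwise stages.
s[B].compute_inline()
s[M1].compute_inline()
s[C].compute_inline()

# Output: 512 columns -> 16 blocks x 32 threadIdx.y.
co, ci = s[D].split(D.op.axis[1], factor=32)
s[D].bind(co, bx)
s[D].bind(ci, ty)

# Each reduce stage: attach it into the output loop and bind the inner
# chunk of its reduction axis to threadIdx.x, which is what emits one
# tvm_thread_allreduce per stage.
for red in (S1, S2):
    s[red].compute_at(s[D], ci)
    ko, ki = s[red].split(red.op.reduce_axis[0], factor=32)
    s[red].bind(ki, tx)

print(tvm.lower(s, [A, D], simple_mode=True))
```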




