https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119325

--- Comment #20 from Andrew Stubbs <ams at gcc dot gnu.org> ---
I tried the memcpy solution with the following testcase:

v2sf                                    
smaller (v64sf in)                      
{                                       
  v2sf out = RESIZE_VECTOR (v2sf, in);  
  return out;                           
}                                       

v64sf                                   
bigger (v2sf in)                        
{                                       
  v64sf out = RESIZE_VECTOR (v64sf, in);
  return out;                           
}                                       
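
For reference, the memcpy-based RESIZE_VECTOR under test is roughly along these
lines (a sketch only, not necessarily the exact macro):

/* Sketch of a memcpy-based RESIZE_VECTOR: copy the smaller of the two
   sizes into a zero-initialized temporary of the destination type.  */
#define RESIZE_VECTOR(to_t, from) \
({ \
  to_t __to = {0}; \
  __builtin_memcpy (&__to, &(from), \
                    sizeof (to_t) < sizeof (from) \
                    ? sizeof (to_t) : sizeof (from)); \
  __to; \
})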

It doesn't look great when compiled with -O3 -fomit-frame-pointer, as it routes
the conversion through expensive stack memory:

smaller:                                             
        s_addc_u32      s17, s17, 0                  
        s_mov_b64       exec, -1                     
        v_lshlrev_b32   v4, 2, v1                    
        s_mov_b32       s22, scc                     
        s_add_u32       s12, s16, -256               
        s_addc_u32      s13, s17, -1                 
        s_cmpk_lg_u32   s22, 0                       
        v_add_co_u32    v4, s[22:23], s12, v4        
        v_mov_b32       v5, s13                      
        v_addc_co_u32   v5, s[22:23], 0, v5, s[22:23]
        flat_store_dword        v[4:5], v8 offset:0      <---- Store big vector
        s_mov_b64       exec, 3                      
        v_lshlrev_b32   v8, 2, v1                    
        v_add_co_u32    v8, s[22:23], s12, v8        
        v_mov_b32       v9, s13                      
        v_addc_co_u32   v9, s[22:23], 0, v9, s[22:23]
        flat_load_dword v8, v[8:9] offset:0              <---- Load smaller vector
        s_waitcnt       0                            
        s_sub_u32       s16, s16, 256                
        s_subb_u32      s17, s17, 0                  
        s_setpc_b64     s[18:19]                     

bigger:                                              
        s_addc_u32      s17, s17, 0                  
        s_mov_b64       exec, -1                     
        v_mov_b32       v0, 0                        
        s_mov_b32       s22, scc                     
        s_add_u32       s12, s16, -256               
        s_addc_u32      s13, s17, -1                 
        s_cmpk_lg_u32   s22, 0                       
        v_lshlrev_b32   v4, 2, v1                    
        v_mov_b32       v5, s13                      
        v_add_co_u32    v4, s[22:23], s12, v4        
        v_addc_co_u32   v5, s[22:23], 0, v5, s[22:23]
        flat_store_dword        v[4:5], v0 offset:0    <---- Initialize zeroed big vector
        s_mov_b64       exec, 3                      
        v_lshlrev_b32   v4, 2, v1                    
        v_mov_b32       v5, s13                      
        v_add_co_u32    v4, s[22:23], s12, v4        
        v_addc_co_u32   v5, s[22:23], 0, v5, s[22:23]
        flat_store_dword        v[4:5], v8 offset:0    <---- Store small vector over zeros
        s_mov_b64       exec, -1                     
        v_lshlrev_b32   v4, 2, v1                    
        v_mov_b32       v5, s13                      
        v_add_co_u32    v4, s[22:23], s12, v4        
        v_addc_co_u32   v5, s[22:23], 0, v5, s[22:23]
        flat_load_dword v8, v[4:5] offset:0            <---- Load combined big vector
        s_waitcnt       0                            
        s_sub_u32       s16, s16, 256                
        s_subb_u32      s17, s17, 0                  
        s_setpc_b64     s[18:19]                     

Here's my alternative in-register solution:

#define RESIZE_VECTOR(to_t, from) \
({ \
  to_t __to; \
  if (VECTOR_WIDTH (to_t) < VECTOR_WIDTH (__typeof (from))) \
    /* Shrinking: the data is already in the low lanes, so this is a no-op.  */ \
    asm ("; no-op cast %0" : "=v"(__to) : "0"(from)); \
  else \
    { \
      /* Growing: zero only the new upper lanes, using an EXEC mask that \
         excludes the lanes occupied by the source vector.  */ \
      unsigned long __mask = -1L; \
      int lanes = VECTOR_WIDTH (__typeof (from)); \
      __mask <<= lanes; \
      __builtin_choose_expr ( \
        V_SF_SI_P (to_t), \
        ({asm ("v_mov_b32 %0, 0" : "=v"(__to) : "0"(from), "e"(__mask));}), \
        ({asm ("v_mov_b32 %H0, 0\n\t" \
               "v_mov_b32 %L0, 0" : "=v"(__to) : "0"(from), "e"(__mask));})); \
    } \
  __to; \
})
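
Here VECTOR_WIDTH gives the number of lanes and V_SF_SI_P is true for 32-bit
SF/SI element types; rough sketches of those helpers, for illustration only
(not the real definitions), might be:

/* Illustrative sketches only; not the real definitions.  */
#define VECTOR_WIDTH(T) (sizeof (T) / sizeof (((T){0})[0]))
#define V_SF_SI_P(T)    (sizeof (((T){0})[0]) == 4)  /* 32-bit elements */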

Compiled, the new macro looks like this:

smaller:                        
        s_mov_b64       exec, -1
; 6 "resize.c" 1             
        ; no-op cast v8             <---- Making vectors smaller is trivial
; 0 "" 2                        
        s_setpc_b64     s[18:19]

bigger:                         
        s_mov_b64       exec, -4
; 13 "resize.c" 1            
        v_mov_b32 v8, 0             <---- The additional lanes are expected to be zeroed
; 0 "" 2                        
        s_setpc_b64     s[18:19]

Much better. I will submit this to Newlib when I've finished testing it.
