> Hi Honza,
>
> > +  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > +     throughput 12.  Approx 9 uops do not depend on vector size and every load
> > +     is 7 uops.  */
> > +  18, 8,                  /* Gather load static, per_elt.  */
> > +  18, 10,                 /* Gather store static, per_elt.  */
>
> Can you please explain how you arrived at 18 for the load/store static cost
> (based on throughput)?
> Is per_elt 8, i.e. (latency of load) 4 * 2 (reg-reg move)?

From the number of uops it seemed that gather is roughly 9 + 7*n, where n is
the number of entries.  The reg-reg move cost is 2, so 18 is 9*2.

For the per-element cost I think we need to account for the fact that the CPU
really does n independent load operations (so gather saves nothing compared to
scalar code), and a bit more.  The load cost is set to 6 (perhaps it should be
8 for integer and more for FP?), so I went for 8 to make it a bit more
expensive.

I plan to experiment with the values incrementally, so any suggestions are
welcome.

Honza
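
For reference, a tiny standalone sketch of that arithmetic (illustration only,
not GCC code; the variable names are made up): the measured VGATHERDPD uop
counts are modelled as 9 + 7*n, the static part is scaled by the reg-reg move
cost of 2, and the per-element cost is the load cost of 6 bumped up slightly:

/* Illustration only, not GCC source: sanity-check the znver1 numbers
   quoted above.  Gather is modelled as roughly 9 + 7*n uops, the
   reg-reg move cost is 2 and the plain load cost is 6.  */
#include <stdio.h>

int
main (void)
{
  const int static_uops = 9;       /* uops independent of vector size */
  const int uops_per_elt = 7;      /* uops per gathered element */
  const int regreg_move_cost = 2;  /* one reg-reg move in the cost tables */
  const int load_cost = 6;

  /* 2- and 4-element VGATHERDPD: 9 + 7*2 = 23, 9 + 7*4 = 37 (measured ~35). */
  for (int n = 2; n <= 4; n += 2)
    printf ("%d elements: ~%d uops\n", n, static_uops + uops_per_elt * n);

  /* Resulting table entries: 9 * 2 = 18 static, and per_elt = load cost
     rounded up from 6 to 8 to make gather a bit more expensive.  */
  printf ("gather static = %d, per_elt = %d\n",
          static_uops * regreg_move_cost, load_cost + 2);
  return 0;
}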

> >    32,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block.  */
> > @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},   /* cost of unaligned stores.  */
> >    14, 14,                 /* SSE->integer and integer->SSE moves */
> > +  10, 10,                 /* Gather load static, per_elt.  */
> > +  10, 10,                 /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},   /* cost of unaligned stores.  */
> >    14, 14,                 /* SSE->integer and integer->SSE moves */
> > +  10, 10,                 /* Gather load static, per_elt.  */
> > +  10, 10,                 /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    2048,                   /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {32, 32, 32, 64, 128},  /* cost of unaligned stores.  */
> >    20, 12,                 /* SSE->integer and integer->SSE moves */
> > +  16, 16,                 /* Gather load static, per_elt.  */
> > +  16, 16,                 /* Gather store static, per_elt.  */
> >    8,                      /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {24, 24, 24, 48, 96},   /* cost of unaligned stores.  */
> >    20, 12,                 /* SSE->integer and integer->SSE moves */
> > +  12, 12,                 /* Gather load static, per_elt.  */
> > +  12, 12,                 /* Gather store static, per_elt.  */
> >    8,                      /* size of l1 cache.  */
> >    1024,                   /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},   /* cost of unaligned stores.  */
> >    8, 6,                   /* SSE->integer and integer->SSE moves */
> > +  8, 8,                   /* Gather load static, per_elt.  */
> > +  8, 8,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},   /* cost of unaligned stores.  */
> >    8, 6,                   /* SSE->integer and integer->SSE moves */
> > +  8, 8,                   /* Gather load static, per_elt.  */
> > +  8, 8,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 10, 10},   /* cost of unaligned loads.  */
> >    4, 4,                   /* SSE->integer and integer->SSE moves */
> > +  6, 6,                   /* Gather load static, per_elt.  */
> > +  6, 6,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 15, 20},   /* cost of unaligned stores.  */
> >    20, 20,                 /* SSE->integer and integer->SSE moves */
> > +  6, 6,                   /* Gather load static, per_elt.  */
> > +  6, 6,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -2239,6 +2290,8 @@ struct processor_costs core_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {6, 6, 6, 6, 12},       /* cost of unaligned stores.  */
> >    2, 2,                   /* SSE->integer and integer->SSE moves */
> > +  /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPD is 9 uops,
> > +     rec. throughput 6.
> > +     So 5 uops statically and one uop per load.  */
> > +  10, 6,                  /* Gather load static, per_elt.  */
> > +  10, 6,                  /* Gather store static, per_elt.  */
> >    64,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
>
> Regards,
> Venkat.
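
As a footnote, a minimal sketch (not the actual i386 cost hook; the function
name is made up) of how the two new table entries are meant to combine into a
per-gather cost, using the znver1 and core numbers quoted above:

#include <stdio.h>

/* Illustration only: a gather is costed as a fixed part plus a
   per-element part.  */
static int
gather_cost (int gather_static, int gather_per_elt, int nelts)
{
  return gather_static + gather_per_elt * nelts;
}

int
main (void)
{
  /* znver1 4-element gather load: 18 + 8*4 = 50.  */
  printf ("znver1: %d\n", gather_cost (18, 8, 4));
  /* core 4-element gather load: 10 + 6*4 = 34.  */
  printf ("core:   %d\n", gather_cost (10, 6, 4));
  return 0;
}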