> Hi Honza,
>
> > +  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > +     throughput 12.  Approx 9 uops do not depend on vector size and every load
> > +     is 7 uops.  */
> > +  18, 8,                  /* Gather load static, per_elt.  */
> > +  18, 10,                 /* Gather store static, per_elt.  */
>
> Can you please explain how you arrived at 18 for the load/store static cost
> (based on throughput)?
> Is per_elt 8, i.e. (latency of load) 4 * 2 (reg-reg move)?

From the number of uops it seemed that gather is roughly 9 + 7*n, where n is
the number of entries.  The reg-reg move cost is 2, so 18 is 9*2.

For the per-element cost I think we need to account for the fact that the CPU
really does n independent load operations (so gather saves nothing compared to
scalar code), and a bit more.  The load cost is set to 6 (perhaps it should be
8 for integer and more for FP?), so I went for 8 to make it a bit more
expensive.

I plan to experiment with the values incrementally, so any suggestions are
welcome.

Honza
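
For reference, a tiny standalone sketch of that arithmetic (illustration only,
not GCC code; the variable names are made up): the measured VGATHERDPD uop
counts are modelled as 9 + 7*n, the static part is scaled by the reg-reg move
cost of 2, and the per-element cost is the load cost of 6 bumped up slightly:

/* Illustration only, not GCC source: sanity-check the znver1 numbers
   quoted above.  Gather is modelled as roughly 9 + 7*n uops, the
   reg-reg move cost is 2 and the plain load cost is 6.  */
#include <stdio.h>

int
main (void)
{
  const int static_uops = 9;       /* uops independent of vector size */
  const int uops_per_elt = 7;      /* uops per gathered element */
  const int regreg_move_cost = 2;  /* one reg-reg move in the cost tables */
  const int load_cost = 6;

  /* 2- and 4-element VGATHERDPD: 9 + 7*2 = 23, 9 + 7*4 = 37 (measured ~35). */
  for (int n = 2; n <= 4; n += 2)
    printf ("%d elements: ~%d uops\n", n, static_uops + uops_per_elt * n);

  /* Resulting table entries: 9 * 2 = 18 static, and per_elt = load cost
     rounded up from 6 to 8 to make gather a bit more expensive.  */
  printf ("gather static = %d, per_elt = %d\n",
          static_uops * regreg_move_cost, load_cost + 2);
  return 0;
}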

> >    32,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block.  */
> > @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},   /* cost of unaligned stores.  */
> >    14, 14,                 /* SSE->integer and integer->SSE moves */
> > +  10, 10,                 /* Gather load static, per_elt.  */
> > +  10, 10,                 /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},   /* cost of unaligned stores.  */
> >    14, 14,                 /* SSE->integer and integer->SSE moves */
> > +  10, 10,                 /* Gather load static, per_elt.  */
> > +  10, 10,                 /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    2048,                   /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {32, 32, 32, 64, 128},  /* cost of unaligned stores.  */
> >    20, 12,                 /* SSE->integer and integer->SSE moves */
> > +  16, 16,                 /* Gather load static, per_elt.  */
> > +  16, 16,                 /* Gather store static, per_elt.  */
> >    8,                      /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {24, 24, 24, 48, 96},   /* cost of unaligned stores.  */
> >    20, 12,                 /* SSE->integer and integer->SSE moves */
> > +  12, 12,                 /* Gather load static, per_elt.  */
> > +  12, 12,                 /* Gather store static, per_elt.  */
> >    8,                      /* size of l1 cache.  */
> >    1024,                   /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},   /* cost of unaligned stores.  */
> >    8, 6,                   /* SSE->integer and integer->SSE moves */
> > +  8, 8,                   /* Gather load static, per_elt.  */
> > +  8, 8,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},   /* cost of unaligned stores.  */
> >    8, 6,                   /* SSE->integer and integer->SSE moves */
> > +  8, 8,                   /* Gather load static, per_elt.  */
> > +  8, 8,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 10, 10},   /* cost of unaligned loads.  */
> >    4, 4,                   /* SSE->integer and integer->SSE moves */
> > +  6, 6,                   /* Gather load static, per_elt.  */
> > +  6, 6,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    256,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 15, 20},   /* cost of unaligned stores.  */
> >    20, 20,                 /* SSE->integer and integer->SSE moves */
> > +  6, 6,                   /* Gather load static, per_elt.  */
> > +  6, 6,                   /* Gather store static, per_elt.  */
> >    32,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
> > @@ -2239,6 +2290,8 @@ struct processor_costs core_cost = {
> >                               in 32,64,128,256 and 512-bit */
> >    {6, 6, 6, 6, 12},       /* cost of unaligned stores.  */
> >    2, 2,                   /* SSE->integer and integer->SSE moves */
> > +  /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPD is 9 uops,
> > +     rec. throughput 6.
> > +     So 5 uops statically and one uop per load.  */
> > +  10, 6,                  /* Gather load static, per_elt.  */
> > +  10, 6,                  /* Gather store static, per_elt.  */
> >    64,                     /* size of l1 cache.  */
> >    512,                    /* size of l2 cache.  */
> >    64,                     /* size of prefetch block */
>
> Regards,
> Venkat.
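
As a footnote, a minimal sketch (not the actual i386 cost hook; the function
name is made up) of how the two new table entries are meant to combine into a
per-gather cost, using the znver1 and core numbers quoted above:

#include <stdio.h>

/* Illustration only: a gather is costed as a fixed part plus a
   per-element part.  */
static int
gather_cost (int gather_static, int gather_per_elt, int nelts)
{
  return gather_static + gather_per_elt * nelts;
}

int
main (void)
{
  /* znver1 4-element gather load: 18 + 8*4 = 50.  */
  printf ("znver1: %d\n", gather_cost (18, 8, 4));
  /* core 4-element gather load: 10 + 6*4 = 34.  */
  printf ("core:   %d\n", gather_cost (10, 6, 4));
  return 0;
}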