------- Comment #36 from jakub at gcc dot gnu dot org 2009-07-27 11:02 -------
Here is the loop in C, together with a hand-vectorized version:

#include <emmintrin.h>
/* Input data scanned by foo () and bar ().  */
float arr[1024];

/* Scalar reference implementation.

   Scans arr[0 .. end-1] and returns the 1-based position of the first
   element that is strictly smaller than every element before it and
   than __FLT_MAX__, i.e. the first occurrence of the minimum.  Returns 1
   when no element is below __FLT_MAX__ (including end == 0).  Elements
   that compare false against the running minimum (e.g. NaN) are skipped.

   END must not exceed the number of elements in arr.  */
unsigned int
foo (unsigned int end)
{
  unsigned int pos = 1;
  unsigned int i;
  float limit = __FLT_MAX__;

  for (i = 0; i < end; i++)
    if (arr[i] < limit)
      {
        limit = arr[i];
        pos = i + 1;
      }

  return pos;
}

/* Per-lane select: lanes whose MASK bits are all ones come from A,
   the remaining lanes come from B.  */
static inline __m128
blend_ps (__m128 mask, __m128 a, __m128 b)
{
  return _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
}

/* Integer counterpart of blend_ps ().  */
static inline __m128i
blend_epi32 (__m128i mask, __m128i a, __m128i b)
{
  return _mm_or_si128 (_mm_and_si128 (mask, a), _mm_andnot_si128 (mask, b));
}

/* Merge, lane by lane, the candidate (CAND_LIMIT, CAND_POS) into the
   running (*LIMIT, *POS).  The candidate wins when its value is strictly
   smaller, or when the values are equal and its position is smaller, so
   that the first occurrence of the minimum is kept just like in foo ().
   Positions are compared as signed 32-bit integers.  */
static inline void
merge_min (__m128 *limit, __m128i *pos, __m128 cand_limit, __m128i cand_pos)
{
  __m128 less = _mm_cmplt_ps (cand_limit, *limit);
  __m128 equal = _mm_cmpeq_ps (cand_limit, *limit);
  __m128 earlier = _mm_castsi128_ps (_mm_cmplt_epi32 (cand_pos, *pos));
  __m128 take = _mm_or_ps (less, _mm_and_ps (equal, earlier));

  *limit = blend_ps (take, cand_limit, *limit);
  *pos = blend_epi32 (_mm_castps_si128 (take), cand_pos, *pos);
}

/* Hand-vectorized (SSE2) version of foo (); returns the same value.

   Each of the four lanes tracks the minimum and its 1-based position for
   the elements it sees (lane L sees indices L, L + 4, L + 8, ...).  The
   lanes are then reduced into lane 0 and a scalar loop handles the
   remaining elements.

   END must not exceed the number of elements in arr.  */
unsigned int
bar (unsigned int end)
{
  __m128i pos = _mm_set1_epi32 (1);
  __m128 limit = _mm_set1_ps (__FLT_MAX__);
  __m128i curi = _mm_set_epi32 (4, 3, 2, 1);	/* Lane 0 holds position 1.  */
  const __m128i inc = _mm_set1_epi32 (4);
  unsigned int i = 0;
  float limit_;
  unsigned int pos_;

  if (end > 4)
    {
      for (; i < end - 4; i += 4)
        {
          __m128 val = _mm_loadu_ps (arr + i);
          __m128 mask = _mm_cmplt_ps (val, limit);

          /* Select with the compare mask instead of using _mm_min_ps:
             min would replace limit with val whenever either operand is
             a NaN, while the position stays untouched, so limit and pos
             would go out of sync.  */
          limit = blend_ps (mask, val, limit);
          pos = blend_epi32 (_mm_castps_si128 (mask), curi, pos);
          curi = _mm_add_epi32 (curi, inc);
        }

      /* Reduction.  First fold lanes 2 and 3 onto lanes 0 and 1 ...  */
      merge_min (&limit, &pos,
                 _mm_movehl_ps (limit, limit),
                 _mm_unpackhi_epi64 (pos, pos));
      /* ... then fold lane 1 onto lane 0.  */
      merge_min (&limit, &pos,
                 _mm_shuffle_ps (limit, limit, _MM_SHUFFLE (1, 1, 1, 1)),
                 _mm_shuffle_epi32 (pos, _MM_SHUFFLE (1, 1, 1, 1)));
    }

  limit_ = _mm_cvtss_f32 (limit);
  pos_ = (unsigned int) _mm_cvtsi128_si32 (pos);

  /* Scalar epilogue for the elements not covered by the vector loop.  */
  for (; i < end; i++)
    if (arr[i] < limit_)
      {
        limit_ = arr[i];
        pos_ = i + 1;
      }

  return pos_;
}

/* Self-test: aborts if the vectorized version disagrees with the scalar
   reference, returns 0 otherwise.  */
int
main (void)
{
  unsigned int k;

  arr[0] = -1;
  arr[2] = -3;
  arr[8] = -5;
  arr[9] = -6;
  if (foo (32) != bar (32))
    __builtin_abort ();

  for (k = 10; k < 32; k++)
    {
      /* Negate after converting: -k on an unsigned k wraps around to a
         huge positive value and would never become the new minimum.  */
      arr[k] = -(float) k;
      if (foo (32) != bar (32))
        __builtin_abort ();
    }

  /* The same minimum in two different vector lanes: the first
     occurrence (index 2, position 3) has to win.  */
  arr[2] = -100;
  arr[4] = -100;
  if (foo (32) != 3 || bar (32) != 3)
    __builtin_abort ();

  return 0;
}

/* Don't know how hard it would be to vectorize this in the vectorizer,
   but clearly icc manages to handle that.  */
The loop, before vectorization, is:

<bb 4>:
  # pos_22 = PHI <pos_1(7), 1(3)>
  # i_23 = PHI <i_15(7), 0(3)>
  # limit_24 = PHI <limit_4(7), 3.4028234663852885981170418348451692544e+38(3)>
  limit_11 = arr[i_23];
  D.2700_12 = limit_11 < limit_24;
  pos_1 = [cond_expr] D.2700_12 ? i_23 : pos_22;
  limit_4 = [cond_expr] D.2700_12 ? limit_11 : limit_24;
  i_15 = i_23 + 1;
  D.2703_9 = (long unsigned int) i_15;
  if (D.2703_9 < end_10(D))
    goto <bb 7>;
  else
    goto <bb 8>;

<bb 7>:
  goto <bb 4>;

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31067