https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117888
--- Comment #1 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
This is the test case in which the innermost loop is not recognized correctly.

typedef unsigned short ggml_fp16_t;
static float table_f32_f16[1 << 16];

inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    unsigned short s;
    __builtin_memcpy(&s, &f, sizeof(unsigned short));
    return table_f32_f16[s];
}

typedef struct {
    ggml_fp16_t d;
    ggml_fp16_t m;
    unsigned char qh[4];
    unsigned char qs[32 / 2];
} block_q5_1;

typedef struct {
    float d;
    float s;
    char qs[32];
} block_q8_1;

void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int qk = 32;
    const int nb = n / qk;

    const block_q5_1 * restrict x = vx;
    const block_q8_1 * restrict y = vy;

    float sumf = 0.0;

    for (int i = 0; i < nb; i++) {
        unsigned qh;
        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));

        int sumi = 0;

        if (qh) {
            for (int j = 0; j < qk/2; ++j) {
                const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
                const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;

                const int x0 = (x[i].qs[j] & 0xF) | xh_0;
                const int x1 = (x[i].qs[j] >> 4) | xh_1;

                sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
            }
        }
        else {
            for (int j = 0; j < qk/2; ++j) {
                const int x0 = (x[i].qs[j] & 0xF);
                const int x1 = (x[i].qs[j] >> 4);

                sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
            }
        }

        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
    }

    *s = sumf;
}

<source>:59:66: optimized: Inlining ggml_lookup_fp16_to_fp32/1 into ggml_vec_dot_q5_1_q8_1/2.
<source>:59:18: optimized: Inlining ggml_lookup_fp16_to_fp32/1 into ggml_vec_dot_q5_1_q8_1/2.
<source>:50:29: optimized: loop with 16 iterations completely unrolled (header execution count 63136012)
<source>:39:27: optimized: loop with 16 iterations completely unrolled (header execution count 63136012)