https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117888

--- Comment #1 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
This is the case that fails to recognize the innermost loop correctly.

/* 16-bit unsigned type used for the d and m fields of block_q5_1; its values
   are only ever used as indices into table_f32_f16 in this test case. */
typedef unsigned short ggml_fp16_t;
/* Lookup table with one float for each of the 1 << 16 possible index values.
   Nothing in the visible snippet writes to it, so as a static object it
   keeps its zero initialisation here. */
static float table_f32_f16[1 << 16];

/* Returns the table_f32_f16 entry selected by the 16-bit value f. */
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    unsigned short s;
    /* Copy the bits of f into a plain unsigned short that serves as the
       table index (ggml_fp16_t is itself a typedef of unsigned short). */
    __builtin_memcpy(&s, &f, sizeof(unsigned short));
    return table_f32_f16[s];
}

/* Block read through the vx argument of ggml_vec_dot_q5_1_q8_1.
   The field notes describe how that function uses each member. */
typedef struct {
    /* Table index; the looked-up float is multiplied by block_q8_1.d and
       by the block's integer sum. */
    ggml_fp16_t d;
    /* Table index; the looked-up float is multiplied by block_q8_1.s. */
    ggml_fp16_t m;
    /* Four bytes copied as a whole into an `unsigned`; each bit supplies
       bit 4 (0x10) of one unpacked value. */
    unsigned char qh[4];
    /* 16 bytes, each holding two 4-bit values (low nibble and high nibble). */
    unsigned char qs[32 / 2];
} block_q5_1;

/* Block read through the vy argument of ggml_vec_dot_q5_1_q8_1.
   The field notes describe how that function uses each member. */
typedef struct {
    /* Multiplied by the table value for block_q5_1.d and the integer sum. */
    float d;
    /* Multiplied by the table value for block_q5_1.m. */
    float s;
    /* 32 values: entries 0..15 pair with the low nibbles of block_q5_1.qs,
       entries 16..31 with the high nibbles. Declared as plain char, so
       whether they are signed is implementation-defined. */
    char qs[32];
} block_q8_1;

/*
 * Reduced test case: accumulates a dot product over n / 32 block pairs taken
 * from vx (block_q5_1) and vy (block_q8_1) and stores the result in *s.
 *
 *   n   element count; only n / 32 whole blocks are processed, so any
 *       remainder is ignored and n < 32 yields *s == 0.
 *   s   receives the accumulated float sum.
 *   vx  array of at least n / 32 block_q5_1.
 *   vy  array of at least n / 32 block_q8_1.
 */
void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
restrict vx, const void * restrict vy) {
    /* Number of values per block, and number of whole blocks in n. */
    const int qk = 32;
    const int nb = n / qk;

    const block_q5_1 * restrict x = vx;
    const block_q8_1 * restrict y = vy;

    float sumf = 0.0;

    for (int i = 0; i < nb; i++) {
        /* Load the four qh bytes as one word. NOTE(review): this reads
           sizeof(unsigned) bytes from a 4-byte array, so it assumes
           sizeof(unsigned) == 4. */
        unsigned qh;
        __builtin_memcpy(&qh, x[i].qh, sizeof(qh));

        int sumi = 0;

        /* When qh == 0 both xh_0 and xh_1 below would be zero, so the else
           branch is the same computation with the high-bit terms dropped. */
        if (qh) {
        /* 16 iterations (qk/2). The quoted compiler output below reports
           "<source>:39:27 ... completely unrolled"; the column matches this
           loop's condition, so it appears to refer to this loop. */
        for (int j = 0; j < qk/2; ++j) {
            /* Bit j of qh moved to bit position 4. */
            const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            /* Bit j + 16 of qh moved to bit position 4. */
            const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;

            /* 5-bit values: a nibble of qs[j] plus the matching qh bit. */
            const int x0 = (x[i].qs[j] & 0xF) | xh_0;
            const int x1 = (x[i].qs[j] >> 4) | xh_1;

            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
        }
        }
        else {
          /* Same loop without the qh bits. The quoted compiler output
             below reports "<source>:50:29 ... completely unrolled"; the
             column matches this loop's condition. */
          for (int j = 0; j < qk/2; ++j) {
                const int x0 = (x[i].qs[j] & 0xF);
                const int x1 = (x[i].qs[j] >>  4);

            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
            }

        }

        /* Scale the integer sum by table[x.d] * y.d and add table[x.m] * y.s.
           (The statement is split across two lines by e-mail wrapping.) */
        sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi +
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
    }

    *s = sumf;
}


<source>:59:66: optimized:  Inlining ggml_lookup_fp16_to_fp32/1 into
ggml_vec_dot_q5_1_q8_1/2.
<source>:59:18: optimized:  Inlining ggml_lookup_fp16_to_fp32/1 into
ggml_vec_dot_q5_1_q8_1/2.
<source>:50:29: optimized: loop with 16 iterations completely unrolled (header
execution count 63136012)
<source>:39:27: optimized: loop with 16 iterations completely unrolled (header
execution count 63136012)

Reply via email to