On Wed, 20 May 2026 11:16:51 +0200,
Kirill A. Korinsky <[email protected]> wrote:
> 
> ports@,
> 
> here a fix for devel/libggml's compiler issue.
> 
> I can't check it due to no access to Sapphire Rapids CPU, but it should be
> safe. Can someone test it?
> 
> Thanks.
>

I'd like to withdraw this diff.

It is an llvm-22 bug, and a backport of the fix will be sent to ports@ shortly.

> Index: Makefile
> ===================================================================
> RCS file: /home/cvs/ports/devel/libggml/Makefile,v
> diff -u -p -r1.15 Makefile
> --- Makefile  17 May 2026 20:30:51 -0000      1.15
> +++ Makefile  20 May 2026 09:10:03 -0000
> @@ -3,6 +3,7 @@ COMMENT=              tensor library for machine lea
>  GH_ACCOUNT=          ggml-org
>  GH_PROJECT=          ggml
>  GH_TAGNAME =         v0.12.0
> +REVISION =           0
>  PKGNAME=             lib${DISTNAME}
>  
>  SHARED_LIBS +=               ggml 3.8
> Index: patches/patch-src_ggml-cpu_amx_mmq_cpp
> ===================================================================
> RCS file: patches/patch-src_ggml-cpu_amx_mmq_cpp
> diff -N patches/patch-src_ggml-cpu_amx_mmq_cpp
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ patches/patch-src_ggml-cpu_amx_mmq_cpp    20 May 2026 00:22:06 -0000
> @@ -0,0 +1,61 @@
> +https://github.com/ggml-org/ggml/issues/1499
> +
> +Index: src/ggml-cpu/amx/mmq.cpp
> +--- src/ggml-cpu/amx/mmq.cpp.orig
> ++++ src/ggml-cpu/amx/mmq.cpp
> +@@ -1510,18 +1510,15 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, 
> fl
> +         const char * RESTRICT B = static_cast<const char *>(_B);
> + 
> +         __m512i va[8];
> +-        __m512i vb[8];
> +         __m512 vc[COLS];
> +         __m512 vd1;
> + 
> +         // Notes: s8s8 igemm compensation in avx512-vnni
> +         // change s8s8 to u8s8 with compensate
> +-        //   a * b = (a + 128) * b - 128 * b
> ++        //   a * b = (b + 128) * a - 128 * a
> +         //   s   s       u       s    u    s
> +-        //
> +-        // (128 * b is pre-computed when packing B to vnni formats)
> +-        //
> +         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
> ++        __m512i vcomp;
> + 
> +         auto loadc = [&](auto col) {
> +             vc[col] = _mm512_setzero_ps();
> +@@ -1529,29 +1526,25 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, 
> fl
> +         Unroll<COLS>{}(loadc);
> + 
> +         auto compute = [&](auto col, auto i) {
> +-            // load a and add offset 128
> ++            // load a and compute compensation
> +             if constexpr (col == 0) {
> +                 const int32_t * a_ptr = reinterpret_cast<const int32_t 
> *>(A[0 * KB + i].qs);
> ++                vcomp = _mm512_setzero_si512();
> +                 for (int k = 0; k < 8; ++k) {
> +                     va[k] = _mm512_set1_epi32(a_ptr[k]);
> +-                    va[k] = _mm512_add_epi8(va[k], off);
> ++                    vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
> +                 }
> +                 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + 
> i].d));
> +             }
> + 
> +-            // load b
> +             const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
> +-            for (int k = 0; k < 8; ++k) {
> +-                vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 
> 64));
> +-            }
> +             const int offset = TILE_N * TILE_K;
> +             const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const 
> __m256i *)(b_ptr + offset)));
> +-            const int offset2 = TILE_N * TILE_K + TILE_N * 
> sizeof(ggml_half);
> +-            const __m512i vcomp = _mm512_loadu_si512((const __m512i 
> *)(b_ptr + offset2));
> + 
> +             __m512i vsum = _mm512_setzero_si512();
> +             for (int k = 0; k < 8; ++k) {
> +-                vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
> ++                const __m512i vb = 
> _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)), off);
> ++                vsum = _mm512_dpbusd_epi32(vsum, vb, va[k]);
> +             }
> +             vsum = _mm512_sub_epi32(vsum, vcomp);
> + 
> 
> 
> -- 
> wbr, Kirill
> 

-- 
wbr, Kirill

Reply via email to