ports@,

Here is a fix for the compiler issue in devel/libggml.

I can't test it myself because I don't have access to a Sapphire Rapids CPU,
but it should be safe. Could someone test it?

Thanks.

Index: Makefile
===================================================================
RCS file: /home/cvs/ports/devel/libggml/Makefile,v
diff -u -p -r1.15 Makefile
--- Makefile    17 May 2026 20:30:51 -0000      1.15
+++ Makefile    20 May 2026 09:10:03 -0000
@@ -3,6 +3,7 @@ COMMENT=                tensor library for machine lea
 GH_ACCOUNT=            ggml-org
 GH_PROJECT=            ggml
 GH_TAGNAME =           v0.12.0
+REVISION =             0
 PKGNAME=               lib${DISTNAME}
 
 SHARED_LIBS +=         ggml 3.8
Index: patches/patch-src_ggml-cpu_amx_mmq_cpp
===================================================================
RCS file: patches/patch-src_ggml-cpu_amx_mmq_cpp
diff -N patches/patch-src_ggml-cpu_amx_mmq_cpp
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_ggml-cpu_amx_mmq_cpp      20 May 2026 00:22:06 -0000
@@ -0,0 +1,61 @@
+https://github.com/ggml-org/ggml/issues/1499
+
+Index: src/ggml-cpu/amx/mmq.cpp
+--- src/ggml-cpu/amx/mmq.cpp.orig
++++ src/ggml-cpu/amx/mmq.cpp
+@@ -1510,18 +1510,15 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, fl
+         const char * RESTRICT B = static_cast<const char *>(_B);
+ 
+         __m512i va[8];
+-        __m512i vb[8];
+         __m512 vc[COLS];
+         __m512 vd1;
+ 
+         // Notes: s8s8 igemm compensation in avx512-vnni
+         // change s8s8 to u8s8 with compensate
+-        //   a * b = (a + 128) * b - 128 * b
++        //   a * b = (b + 128) * a - 128 * a
+         //   s   s       u       s    u    s
+-        //
+-        // (128 * b is pre-computed when packing B to vnni formats)
+-        //
+         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
++        __m512i vcomp;
+ 
+         auto loadc = [&](auto col) {
+             vc[col] = _mm512_setzero_ps();
+@@ -1529,29 +1526,25 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, fl
+         Unroll<COLS>{}(loadc);
+ 
+         auto compute = [&](auto col, auto i) {
+-            // load a and add offset 128
++            // load a and compute compensation
+             if constexpr (col == 0) {
+                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
++                vcomp = _mm512_setzero_si512();
+                 for (int k = 0; k < 8; ++k) {
+                     va[k] = _mm512_set1_epi32(a_ptr[k]);
+-                    va[k] = _mm512_add_epi8(va[k], off);
++                    vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
+                 }
+                 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+             }
+ 
+-            // load b
+             const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+-            for (int k = 0; k < 8; ++k) {
+-                vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64));
+-            }
+             const int offset = TILE_N * TILE_K;
+             const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+-            const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
+-            const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2));
+ 
+             __m512i vsum = _mm512_setzero_si512();
+             for (int k = 0; k < 8; ++k) {
+-                vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
++                const __m512i vb = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)), off);
++                vsum = _mm512_dpbusd_epi32(vsum, vb, va[k]);
+             }
+             vsum = _mm512_sub_epi32(vsum, vcomp);
+ 


-- 
wbr, Kirill

Reply via email to