ports@,
Here is a fix for devel/libggml's compiler issue.
I can't test it myself because I don't have access to a Sapphire Rapids CPU,
but it should be safe. Can someone test it?
Thanks.
Index: Makefile
===================================================================
RCS file: /home/cvs/ports/devel/libggml/Makefile,v
diff -u -p -r1.15 Makefile
--- Makefile 17 May 2026 20:30:51 -0000 1.15
+++ Makefile 20 May 2026 09:10:03 -0000
@@ -3,6 +3,7 @@ COMMENT= tensor library for machine lea
GH_ACCOUNT= ggml-org
GH_PROJECT= ggml
GH_TAGNAME = v0.12.0
+REVISION = 0
PKGNAME= lib${DISTNAME}
SHARED_LIBS += ggml 3.8
Index: patches/patch-src_ggml-cpu_amx_mmq_cpp
===================================================================
RCS file: patches/patch-src_ggml-cpu_amx_mmq_cpp
diff -N patches/patch-src_ggml-cpu_amx_mmq_cpp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_ggml-cpu_amx_mmq_cpp 20 May 2026 00:22:06 -0000
@@ -0,0 +1,61 @@
+https://github.com/ggml-org/ggml/issues/1499
+
+Index: src/ggml-cpu/amx/mmq.cpp
+--- src/ggml-cpu/amx/mmq.cpp.orig
++++ src/ggml-cpu/amx/mmq.cpp
+@@ -1510,18 +1510,15 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, fl
+ const char * RESTRICT B = static_cast<const char *>(_B);
+
+ __m512i va[8];
+- __m512i vb[8];
+ __m512 vc[COLS];
+ __m512 vd1;
+
+ // Notes: s8s8 igemm compensation in avx512-vnni
+ // change s8s8 to u8s8 with compensate
+- // a * b = (a + 128) * b - 128 * b
++ // a * b = (b + 128) * a - 128 * a
+ // s s u s u s
+- //
+- // (128 * b is pre-computed when packing B to vnni formats)
+- //
+ const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
++ __m512i vcomp;
+
+ auto loadc = [&](auto col) {
+ vc[col] = _mm512_setzero_ps();
+@@ -1529,29 +1526,25 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, fl
+ Unroll<COLS>{}(loadc);
+
+ auto compute = [&](auto col, auto i) {
+- // load a and add offset 128
++ // load a and compute compensation
+ if constexpr (col == 0) {
+ const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
++ vcomp = _mm512_setzero_si512();
+ for (int k = 0; k < 8; ++k) {
+ va[k] = _mm512_set1_epi32(a_ptr[k]);
+- va[k] = _mm512_add_epi8(va[k], off);
++ vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
+ }
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+ }
+
+- // load b
+ const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+- for (int k = 0; k < 8; ++k) {
+- vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64));
+- }
+ const int offset = TILE_N * TILE_K;
+ const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+- const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
+- const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2));
+
+ __m512i vsum = _mm512_setzero_si512();
+ for (int k = 0; k < 8; ++k) {
+- vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
++ const __m512i vb = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)), off);
++ vsum = _mm512_dpbusd_epi32(vsum, vb, va[k]);
+ }
+ vsum = _mm512_sub_epi32(vsum, vcomp);
+
--
wbr, Kirill