On Wed, 20 May 2026 11:16:51 +0200,
Kirill A. Korinsky <[email protected]> wrote:
>
> ports@,
>
> here a fix for devel/libggml's compiler issue.
>
> I can't check it due to no access to Sapphire Rapids CPU, but it should be
> safe. Can someone test it?
>
> Thanks.
>
I'd like to withdraw this diff.
It is an llvm-22 bug, and a backport of the fix will be sent to ports@ shortly.
> Index: Makefile
> ===================================================================
> RCS file: /home/cvs/ports/devel/libggml/Makefile,v
> diff -u -p -r1.15 Makefile
> --- Makefile 17 May 2026 20:30:51 -0000 1.15
> +++ Makefile 20 May 2026 09:10:03 -0000
> @@ -3,6 +3,7 @@ COMMENT= tensor library for machine lea
> GH_ACCOUNT= ggml-org
> GH_PROJECT= ggml
> GH_TAGNAME = v0.12.0
> +REVISION = 0
> PKGNAME= lib${DISTNAME}
>
> SHARED_LIBS += ggml 3.8
> Index: patches/patch-src_ggml-cpu_amx_mmq_cpp
> ===================================================================
> RCS file: patches/patch-src_ggml-cpu_amx_mmq_cpp
> diff -N patches/patch-src_ggml-cpu_amx_mmq_cpp
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ patches/patch-src_ggml-cpu_amx_mmq_cpp 20 May 2026 00:22:06 -0000
> @@ -0,0 +1,61 @@
> +https://github.com/ggml-org/ggml/issues/1499
> +
> +Index: src/ggml-cpu/amx/mmq.cpp
> +--- src/ggml-cpu/amx/mmq.cpp.orig
> ++++ src/ggml-cpu/amx/mmq.cpp
> +@@ -1510,18 +1510,15 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0,
> fl
> + const char * RESTRICT B = static_cast<const char *>(_B);
> +
> + __m512i va[8];
> +- __m512i vb[8];
> + __m512 vc[COLS];
> + __m512 vd1;
> +
> + // Notes: s8s8 igemm compensation in avx512-vnni
> + // change s8s8 to u8s8 with compensate
> +- // a * b = (a + 128) * b - 128 * b
> ++ // a * b = (b + 128) * a - 128 * a
> + // s s u s u s
> +- //
> +- // (128 * b is pre-computed when packing B to vnni formats)
> +- //
> + const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
> ++ __m512i vcomp;
> +
> + auto loadc = [&](auto col) {
> + vc[col] = _mm512_setzero_ps();
> +@@ -1529,29 +1526,25 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0,
> fl
> + Unroll<COLS>{}(loadc);
> +
> + auto compute = [&](auto col, auto i) {
> +- // load a and add offset 128
> ++ // load a and compute compensation
> + if constexpr (col == 0) {
> + const int32_t * a_ptr = reinterpret_cast<const int32_t
> *>(A[0 * KB + i].qs);
> ++ vcomp = _mm512_setzero_si512();
> + for (int k = 0; k < 8; ++k) {
> + va[k] = _mm512_set1_epi32(a_ptr[k]);
> +- va[k] = _mm512_add_epi8(va[k], off);
> ++ vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
> + }
> + vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB +
> i].d));
> + }
> +
> +- // load b
> + const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
> +- for (int k = 0; k < 8; ++k) {
> +- vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k *
> 64));
> +- }
> + const int offset = TILE_N * TILE_K;
> + const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const
> __m256i *)(b_ptr + offset)));
> +- const int offset2 = TILE_N * TILE_K + TILE_N *
> sizeof(ggml_half);
> +- const __m512i vcomp = _mm512_loadu_si512((const __m512i
> *)(b_ptr + offset2));
> +
> + __m512i vsum = _mm512_setzero_si512();
> + for (int k = 0; k < 8; ++k) {
> +- vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
> ++ const __m512i vb =
> _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)), off);
> ++ vsum = _mm512_dpbusd_epi32(vsum, vb, va[k]);
> + }
> + vsum = _mm512_sub_epi32(vsum, vcomp);
> +
>
>
> --
> wbr, Kirill
>
--
wbr, Kirill