PR #21009 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21009
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21009.patch

All changes are made during development/testing of SME for ffmpeg (vvc). Tested 
on Apple M4.

Additionally, fix a bug in the Makefile vars: add the SVE/SVE2/SME object
lists to SUBDIR_VARS; otherwise SVE/SVE2/SME-specific object files are
included in each ffmpeg target library.


>From 652083378193c2a812fab9fa01d1ae0593768858 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <[email protected]>
Date: Mon, 24 Nov 2025 21:01:03 +0000
Subject: [PATCH] configure: add detection of assembler support for SME

All changes are made during development/testing of SME for ffmpeg (vvc). Tested 
on Apple M4.

Additionally, fix a bug in the Makefile vars: add the SVE/SVE2/SME object
lists to SUBDIR_VARS; otherwise SVE/SVE2/SME-specific object files are
included in each ffmpeg target library.
---
 Makefile                    |  3 ++-
 configure                   | 11 +++++++++--
 ffbuild/arch.mak            |  1 +
 libavutil/aarch64/Makefile  |  2 ++
 libavutil/aarch64/asm.S     |  9 +++++++++
 libavutil/aarch64/cpu.c     | 16 ++++++++++++++++
 libavutil/aarch64/cpu.h     |  5 +++++
 libavutil/aarch64/cpu_sme.S | 31 +++++++++++++++++++++++++++++++
 libavutil/cpu.c             |  1 +
 libavutil/cpu.h             |  1 +
 libavutil/tests/cpu.c       |  5 +++++
 tests/checkasm/checkasm.c   |  1 +
 12 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 libavutil/aarch64/cpu_sme.S

diff --git a/Makefile b/Makefile
index 2f78db02a5..7a972b8502 100644
--- a/Makefile
+++ b/Makefile
@@ -110,7 +110,8 @@ SUBDIR_VARS := CLEANFILES FFLIBS HOSTPROGS TESTPROGS TOOLS  
             \
                ALTIVEC-OBJS VSX-OBJS MMX-OBJS X86ASM-OBJS                \
                MIPSFPU-OBJS MIPSDSPR2-OBJS MIPSDSP-OBJS MSA-OBJS         \
                MMI-OBJS LSX-OBJS LASX-OBJS RV-OBJS RVV-OBJS RVVB-OBJS    \
-               OBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS SIMD128-OBJS
+               OBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS SIMD128-OBJS   \
+               SVE-OBJS SVE2-OBJS SME-OBJS
 
 define RESET
 $(1) :=
diff --git a/configure b/configure
index 7d6061b55c..508268303f 100755
--- a/configure
+++ b/configure
@@ -478,6 +478,7 @@ Optimization options (experts only):
   --disable-i8mm           disable I8MM optimizations
   --disable-sve            disable SVE optimizations
   --disable-sve2           disable SVE2 optimizations
+  --disable-sme            disable SME optimizations
   --disable-inline-asm     disable use of inline assembly
   --disable-x86asm         disable use of standalone x86 assembly
   --disable-mipsdsp        disable MIPS DSP ASE R1 optimizations
@@ -2224,6 +2225,7 @@ ARCH_EXT_LIST_ARM="
     setend
     sve
     sve2
+    sme
 "
 
 ARCH_EXT_LIST_MIPS="
@@ -2491,6 +2493,7 @@ TOOLCHAIN_FEATURES="
     as_archext_i8mm_directive
     as_archext_sve_directive
     as_archext_sve2_directive
+    as_archext_sme_directive
     as_dn_directive
     as_fpu_directive
     as_func
@@ -2823,6 +2826,7 @@ dotprod_deps="aarch64 neon"
 i8mm_deps="aarch64 neon"
 sve_deps="aarch64 neon"
 sve2_deps="aarch64 neon sve"
+sme_deps="aarch64 neon sve sve2"
 
 map 'eval ${v}_inline_deps=inline_asm' $ARCH_EXT_LIST_ARM
 
@@ -5526,6 +5530,7 @@ elif enabled arm; then
         elif check_arm_arch 7M  7_M;  then echo armv7-m
         elif check_arm_arch 7EM 7E_M; then echo armv7-m
         elif check_arm_arch 8A  8_A;  then echo armv8-a
+        elif check_arm_arch 9A  9_A;  then echo armv9-a
         fi
     }
 
@@ -6423,7 +6428,7 @@ if enabled aarch64; then
         # Clang before version 17 (Xcode versions up to and including 15.0)
         # didn't support controlling the dotprod/i8mm extensions via
         # .arch_extension; thus try to enable them via the .arch level as well.
-        for level in armv8.2-a armv8.4-a armv8.6-a; do
+        for level in armv8.2-a armv8.4-a armv8.6-a armv9-a; do
             check_arch_level $level
         done
         # Clang before version 17 (Xcode versions up to and including 15.0)
@@ -6440,11 +6445,12 @@ if enabled aarch64; then
     # internal assembler in clang 3.3 does not support this instruction
     enabled neon && check_insn neon 'ext   v0.8B, v0.8B, v1.8B, #1'
 
-    archext_list="dotprod i8mm sve sve2"
+    archext_list="dotprod i8mm sve sve2 sme"
     enabled dotprod && check_archext_insn dotprod 'udot v0.4s, v0.16b, v0.16b'
     enabled i8mm    && check_archext_insn i8mm    'usdot v0.4s, v0.16b, v0.16b'
     enabled sve     && check_archext_insn sve     'whilelt p0.s, x0, x1'
     enabled sve2    && check_archext_insn sve2    'sqrdmulh z0.s, z0.s, z0.s'
+    enabled sme     && check_archext_insn sme     'smstop'
 
     # Disable the main feature (e.g. HAVE_NEON) if neither inline nor external
     # assembly support the feature out of the box. Skip this for the features
@@ -8202,6 +8208,7 @@ if enabled aarch64; then
     echo "I8MM enabled              ${i8mm-no}"
     echo "SVE enabled               ${sve-no}"
     echo "SVE2 enabled              ${sve2-no}"
+    echo "SME enabled               ${sme-no}"
 fi
 if enabled arm; then
     echo "ARMv5TE enabled           ${armv5te-no}"
diff --git a/ffbuild/arch.mak b/ffbuild/arch.mak
index 197e30bb89..7dec692b15 100644
--- a/ffbuild/arch.mak
+++ b/ffbuild/arch.mak
@@ -5,6 +5,7 @@ OBJS-$(HAVE_VFP)     += $(VFP-OBJS)     $(VFP-OBJS-yes)
 OBJS-$(HAVE_NEON)    += $(NEON-OBJS)    $(NEON-OBJS-yes)
 OBJS-$(HAVE_SVE)     += $(SVE-OBJS)     $(SVE-OBJS-yes)
 OBJS-$(HAVE_SVE2)    += $(SVE2-OBJS)    $(SVE2-OBJS-yes)
+OBJS-$(HAVE_SME)     += $(SME-OBJS)     $(SME-OBJS-yes)
 
 OBJS-$(HAVE_MIPSFPU)   += $(MIPSFPU-OBJS)    $(MIPSFPU-OBJS-yes)
 OBJS-$(HAVE_MIPSDSP)   += $(MIPSDSP-OBJS)    $(MIPSDSP-OBJS-yes)
diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
index 992e95e4df..744c2c53d7 100644
--- a/libavutil/aarch64/Makefile
+++ b/libavutil/aarch64/Makefile
@@ -6,3 +6,5 @@ NEON-OBJS += aarch64/float_dsp_neon.o                           
      \
              aarch64/tx_float_neon.o                                  \
 
 SVE-OBJS += aarch64/cpu_sve.o                                         \
+
+SME-OBJS += aarch64/cpu_sme.o                                         \
diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
index 2e4e451ec2..77cea57cfc 100644
--- a/libavutil/aarch64/asm.S
+++ b/libavutil/aarch64/asm.S
@@ -72,10 +72,19 @@
 #define DISABLE_SVE2
 #endif
 
+#if HAVE_AS_ARCHEXT_SME_DIRECTIVE
+#define ENABLE_SME   .arch_extension sme
+#define DISABLE_SME  .arch_extension nosme
+#else
+#define ENABLE_SME
+#define DISABLE_SME
+#endif
+
 DISABLE_DOTPROD
 DISABLE_I8MM
 DISABLE_SVE
 DISABLE_SVE2
+DISABLE_SME
 
 
 /* Support macros for
diff --git a/libavutil/aarch64/cpu.c b/libavutil/aarch64/cpu.c
index e82c0f19ab..1776da3bfa 100644
--- a/libavutil/aarch64/cpu.c
+++ b/libavutil/aarch64/cpu.c
@@ -28,6 +28,7 @@
 #define HWCAP_AARCH64_SVE     (1 << 22)
 #define HWCAP2_AARCH64_SVE2   (1 << 1)
 #define HWCAP2_AARCH64_I8MM   (1 << 13)
+#define HWCAP2_AARCH64_SME    (1 << 23)
 
 static int detect_flags(void)
 {
@@ -44,6 +45,8 @@ static int detect_flags(void)
         flags |= AV_CPU_FLAG_SVE2;
     if (hwcap2 & HWCAP2_AARCH64_I8MM)
         flags |= AV_CPU_FLAG_I8MM;
+    if (hwcap2 & HWCAP2_AARCH64_SME)
+        flags |= AV_CPU_FLAG_SME;
 
     return flags;
 }
@@ -67,6 +70,12 @@ static int detect_flags(void)
         flags |= AV_CPU_FLAG_DOTPROD;
     if (have_feature("hw.optional.arm.FEAT_I8MM"))
         flags |= AV_CPU_FLAG_I8MM;
+    if (have_feature("hw.optional.arm.FEAT_SVE"))
+        flags |= AV_CPU_FLAG_SVE;
+    if (have_feature("hw.optional.arm.FEAT_SVE2"))
+        flags |= AV_CPU_FLAG_SVE2;
+    if (have_feature("hw.optional.arm.FEAT_SME"))
+        flags |= AV_CPU_FLAG_SME;
 
     return flags;
 }
@@ -133,6 +142,10 @@ static int detect_flags(void)
 #ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
     if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE))
         flags |= AV_CPU_FLAG_SVE2;
+#endif
+#ifdef PF_ARM_SME_INSTRUCTIONS_AVAILABLE
+    if (IsProcessorFeaturePresent(PF_ARM_SME_INSTRUCTIONS_AVAILABLE))
+        flags |= AV_CPU_FLAG_SME;
 #endif
     return flags;
 }
@@ -162,6 +175,9 @@ int ff_get_cpu_flags_aarch64(void)
 #ifdef __ARM_FEATURE_SVE2
     flags |= AV_CPU_FLAG_SVE2;
 #endif
+#ifdef __ARM_FEATURE_SME
+    flags |= AV_CPU_FLAG_SME;
+#endif
 
     flags |= detect_flags();
 
diff --git a/libavutil/aarch64/cpu.h b/libavutil/aarch64/cpu.h
index a41b729659..62d5eb768f 100644
--- a/libavutil/aarch64/cpu.h
+++ b/libavutil/aarch64/cpu.h
@@ -29,9 +29,14 @@
 #define have_i8mm(flags)    CPUEXT(flags, I8MM)
 #define have_sve(flags)     CPUEXT(flags, SVE)
 #define have_sve2(flags)    CPUEXT(flags, SVE2)
+#define have_sme(flags)     CPUEXT(flags, SME)
 
 #if HAVE_SVE
 int ff_aarch64_sve_length(void);
 #endif
 
+#if HAVE_SME
+int ff_aarch64_sme_length(void);
+#endif
+
 #endif /* AVUTIL_AARCH64_CPU_H */
diff --git a/libavutil/aarch64/cpu_sme.S b/libavutil/aarch64/cpu_sme.S
new file mode 100644
index 0000000000..ba79d483a1
--- /dev/null
+++ b/libavutil/aarch64/cpu_sme.S
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2025 Georgii Zagoruiko
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ENABLE_SME
+
+function ff_aarch64_sme_length, export=1
+        // RDSVL reads the streaming vector length in bytes without entering
+        // streaming mode; SMSTART/SMSTOP would zero the callee-saved d8-d15.
+        rdsvl           x0, #1
+        ret
+endfunc
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 8f9b785ebc..5aed2f39dc 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -186,6 +186,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
         { "i8mm",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_I8MM    
 },    .unit = "flags" },
         { "sve",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SVE     
 },    .unit = "flags" },
         { "sve2",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SVE2    
 },    .unit = "flags" },
+        { "sme",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SME     
 },    .unit = "flags" },
 #elif ARCH_MIPS
         { "mmi",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMI     
 },    .unit = "flags" },
         { "msa",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MSA     
 },    .unit = "flags" },
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index a06fc08e56..87cecd0424 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -76,6 +76,7 @@
 #define AV_CPU_FLAG_I8MM         (1 << 9)
 #define AV_CPU_FLAG_SVE          (1 <<10)
 #define AV_CPU_FLAG_SVE2         (1 <<11)
+#define AV_CPU_FLAG_SME          (1 <<12)
 #define AV_CPU_FLAG_SETEND       (1 <<16)
 
 #define AV_CPU_FLAG_MMI          (1 << 0)
diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c
index fd2e32901d..f59db0915c 100644
--- a/libavutil/tests/cpu.c
+++ b/libavutil/tests/cpu.c
@@ -48,6 +48,7 @@ static const struct {
     { AV_CPU_FLAG_I8MM,      "i8mm"       },
     { AV_CPU_FLAG_SVE,       "sve"        },
     { AV_CPU_FLAG_SVE2,      "sve2"       },
+    { AV_CPU_FLAG_SME,       "sme"        },
 #elif ARCH_ARM
     { AV_CPU_FLAG_ARMV5TE,   "armv5te"    },
     { AV_CPU_FLAG_ARMV6,     "armv6"      },
@@ -174,6 +175,10 @@ int main(int argc, char **argv)
 #if ARCH_AARCH64 && HAVE_SVE
     if (cpu_flags_raw & AV_CPU_FLAG_SVE)
         printf("sve_vector_length = %d\n", 8 * ff_aarch64_sve_length());
+#endif
+#if ARCH_AARCH64 && HAVE_SME
+    if (cpu_flags_raw & AV_CPU_FLAG_SME)
+        printf("sme_vector_length = %d\n", 8 * ff_aarch64_sme_length());
 #elif ARCH_RISCV && HAVE_RVV
     if (cpu_flags_raw & AV_CPU_FLAG_RVV_I32) {
         size_t bytes = ff_get_rv_vlenb();
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 20d8f19757..ae3756d5d0 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -351,6 +351,7 @@ static const struct {
     { "I8MM",     "i8mm",     AV_CPU_FLAG_I8MM },
     { "SVE",      "sve",      AV_CPU_FLAG_SVE },
     { "SVE2",     "sve2",     AV_CPU_FLAG_SVE2 },
+    { "SME",      "sme",      AV_CPU_FLAG_SME },
 #elif ARCH_ARM
     { "ARMV5TE",  "armv5te",  AV_CPU_FLAG_ARMV5TE },
     { "ARMV6",    "armv6",    AV_CPU_FLAG_ARMV6 },
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to