Signed-off-by: James Almer <[email protected]>
---
GCC apparently can't generate a bzhi instruction on its own from the C version,
so here's a custom implementation.
Before:
gcc -O3
<av_zhb_c>:
0: 89 f1 mov ecx,esi
2: ba 01 00 00 00 mov edx,0x1
7: d3 e2 shl edx,cl
9: 83 ea 01 sub edx,0x1
c: 89 d0 mov eax,edx
e: 21 f8 and eax,edi
10: c3 ret
gcc -mbmi2 -O3
<av_zhb_c>:
0: ba 01 00 00 00 mov edx,0x1
5: c4 e2 49 f7 d2 shlx edx,edx,esi
a: 8d 42 ff lea eax,[rdx-0x1]
d: 21 f8 and eax,edi
f: c3 ret
After:
gcc -mbmi2 -O3
<av_zhb_bmi2>:
0: c4 e2 48 f5 c7 bzhi eax,edi,esi
5: c3 ret
The non-bmi2 example is a bit bloated with movs to have values in ecx (needed
for shl) and eax (ret value) since, unlike the actual function, it was not
inlined. Still, the best-case scenario is mov + shl + sub/dec/lea + and versus
a single bzhi when p is not a constant.
libavutil/x86/intmath.h | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/libavutil/x86/intmath.h b/libavutil/x86/intmath.h
index 7aa6bc4..f19ef64 100644
--- a/libavutil/x86/intmath.h
+++ b/libavutil/x86/intmath.h
@@ -24,15 +24,36 @@
#include <stdint.h>
#include "config.h"
+#if defined(__GNUC__)
+
/* Our generic version of av_popcount is faster than GCC's built-in on
* CPUs that don't support the popcnt instruction.
*/
-#if defined(__GNUC__) && defined(__POPCNT__)
+#if defined(__POPCNT__)
+
#define av_popcount __builtin_popcount
#if ARCH_X86_64
#define av_popcount64 __builtin_popcountll
#endif
-#endif /* defined(__GNUC__) && defined(__POPCNT__) */
+#endif /* __POPCNT__ */
+
+#if defined(__BMI2__)
+/* av_zhb(a, p): return a with only its low p bits kept (bits >= p zeroed). */
+#define av_zhb av_zhb_bmi2
+static av_always_inline av_const unsigned av_zhb_bmi2(unsigned a, unsigned p)
+{
+    if (av_builtin_constant_p(p)) /* constant p: let the compiler fold the mask */
+        return a & ((1U << p) - 1); /* 1U: p == 31 must not overflow a signed int */
+    else {
+        unsigned x;
+        __asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p));
+        return x;
+    }
+}
+
+#endif /* __BMI2__ */
+
+#endif /* __GNUC__ */
#endif /* AVUTIL_X86_INTMATH_H */
--
2.3.2
_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel