Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated by master:
80: 49 89 f0 mov r8,rsi
83: 49 89 f1 mov r9,rsi
86: 49 c1 e0 03 shl r8,0x3
8a: 74 6c je f8 <stride_left+0x78>
8c: 49 81 c0 00 00 00 00 add r8,0x0
93: ba 00 00 00 00 mov edx,0x0
98: b8 01 00 00 00 mov eax,0x1
9d: 0f 1f 00 nop DWORD PTR [rax]
a0: 48 8b 0a mov rcx,QWORD PTR [rdx]
a3: 48 89 c6 mov rsi,rax
a6: 48 0f af f1 imul rsi,rcx
aa: 48 83 f9 ff cmp rcx,0xffffffffffffffff
ae: 48 0f 45 c6 cmovne rax,rsi
b2: 48 83 c2 08 add rdx,0x8
b6: 49 39 d0 cmp r8,rdx
b9: 75 e5 jne a0 <stride_left+0x20>
bb: 31 d2 xor edx,edx
bd: 48 85 c0 test rax,rax
c0: 74 30 je f2 <stride_left+0x72>
c2: 4a 8b 0c cd 00 00 00 mov rcx,QWORD PTR [r9*8+0x0]
c9: 00
ca: 48 c1 e1 02 shl rcx,0x2
ce: 74 20 je f0 <stride_left+0x70>
d0: 48 01 f9 add rcx,rdi
d3: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
da: 00 00 00 00
de: 66 90 xchg ax,ax
e0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
e3: 48 83 c7 04 add rdi,0x4
e7: 48 0f af c2 imul rax,rdx
eb: 48 39 f9 cmp rcx,rdi
ee: 75 f0 jne e0 <stride_left+0x60>
f0: 89 c2 mov edx,eax
f2: 89 d0 mov eax,edx
f4: c3 ret
f5: 0f 1f 00 nop DWORD PTR [rax]
f8: b8 01 00 00 00 mov eax,0x1
fd: eb c3 jmp c2 <stride_left+0x42>
is reduced to:
50: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
57: 00
58: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
5f: 00
60: 48 c1 e1 02 shl rcx,0x2
64: 74 1a je 80 <stride_left+0x30>
66: 48 01 f9 add rcx,rdi
69: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
70: 48 63 17 movsxd rdx,DWORD PTR [rdi]
73: 48 83 c7 04 add rdi,0x4
77: 48 0f af c2 imul rax,rdx
7b: 48 39 f9 cmp rcx,rdi
7e: 75 f0 jne 70 <stride_left+0x20>
80: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_right.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on master the generated code is
10: b8 00 00 00 00 mov eax,0x0
15: ba 01 00 00 00 mov edx,0x1
1a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
20: 48 8b 08 mov rcx,QWORD PTR [rax]
23: 48 89 d6 mov rsi,rdx
26: 48 0f af f1 imul rsi,rcx
2a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
2e: 48 0f 45 d6 cmovne rdx,rsi
32: 48 83 c0 08 add rax,0x8
36: 48 3d 00 00 00 00 cmp rax,0x0
3c: 75 e2 jne 20 <size2+0x10>
3e: 31 c0 xor eax,eax
40: 48 85 d2 test rdx,rdx
43: 74 12 je 57 <size2+0x47>
45: 48 63 07 movsxd rax,DWORD PTR [rdi]
48: 48 63 4f 04 movsxd rcx,DWORD PTR [rdi+0x4]
4c: 48 0f af c1 imul rax,rcx
50: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
54: 0f af c2 imul eax,edx
57: c3 ret
the optimized version is:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of pre-computed static static and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Signed-off-by: Luc Grosheintz <[email protected]>
---
libstdc++-v3/include/std/mdspan | 81 +++++++++++++++++++++++----------
1 file changed, 56 insertions(+), 25 deletions(-)
diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index 5e79d4bfb59..06ccf3e3827 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -184,6 +184,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__valid_static_extent = _Extent == dynamic_extent
|| _Extent <= numeric_limits<_IndexType>::max();
+ template<typename _Extents>
+ consteval size_t
+ __static_prod(size_t __begin, size_t __end)
+ {
+ size_t __prod = 1;
+ for(size_t __i = __begin; __i < __end; ++__i)
+ {
+ auto __ext = _Extents::static_extent(__i);
+ __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
+ }
+ return __prod;
+ }
+
+ // Pre-compute: \prod_{i = 0}^r _Extents[i]
+ template<typename _Extents>
+ struct _FwdProd
+ {
+ constexpr static std::array<size_t, _Extents::rank() + 1> _S_value =
+ [] consteval
+ {
+ constexpr size_t __rank = _Extents::rank();
+ std::array<size_t, __rank + 1> __ret;
+ for(size_t __r = 0; __r < __rank + 1; ++__r)
+ __ret[__r] = __static_prod<_Extents>(0, __r);
+ return __ret;
+ }();
+ };
+
+ // Pre-compute: \prod_{i = r+1}^n _Extents[i]
+ template<typename _Extents>
+ struct _RevProd
+ {
+ constexpr static std::array<size_t, _Extents::rank()> _S_value =
+ [] consteval
+ {
+ constexpr size_t __rank = _Extents::rank();
+ std::array<size_t, __rank> __ret;
+ for(size_t __r = 0; __r < __rank; ++__r)
+ __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
+ return __ret;
+ }();
+ };
+
template<typename _Extents>
constexpr span<const size_t>
__static_extents(size_t __begin = 0, size_t __end = _Extents::rank())
@@ -352,46 +395,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
- constexpr size_t
- __static_extents_prod(const auto& __sta_exts) noexcept
- {
- size_t __ret = 1;
- for (auto __factor : __sta_exts)
- if (__factor != dynamic_extent)
- __ret *= __factor;
- return __ret;
- }
-
template<typename _Extents>
constexpr typename _Extents::index_type
- __exts_prod(const _Extents& __exts, size_t __begin, size_t __end)
noexcept
+ __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin,
+ size_t __end)
{
- using _IndexType = typename _Extents::index_type;
-
- size_t __ret = 1;
- if constexpr (_Extents::rank_dynamic() != _Extents::rank())
- {
- auto __sta_exts = __static_extents<_Extents>(__begin, __end);
- __ret = __static_extents_prod(__sta_exts);
- if (__ret == 0)
- return 0;
- }
-
+ size_t __ret = __sta_prod;
if constexpr (_Extents::rank_dynamic() > 0)
for (auto __factor : __dynamic_extents(__exts, __begin, __end))
__ret *= size_t(__factor);
- return _IndexType(__ret);
+ return static_cast<typename _Extents::index_type>(__ret);
}
template<typename _Extents>
constexpr typename _Extents::index_type
__fwd_prod(const _Extents& __exts, size_t __r) noexcept
- { return __exts_prod(__exts, 0, __r); }
+ {
+ size_t __sta_prod = _FwdProd<_Extents>::_S_value[__r];
+ return __extents_prod(__exts, __sta_prod, 0, __r);
+ }
template<typename _Extents>
constexpr typename _Extents::index_type
__rev_prod(const _Extents& __exts, size_t __r) noexcept
- { return __exts_prod(__exts, __r + 1, __exts.rank()); }
+ {
+ constexpr size_t __rank = _Extents::rank();
+ size_t __sta_prod = _RevProd<_Extents>::_S_value[__r];
+ return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+ }
template<typename _Extents>
constexpr typename _Extents::index_type
--
2.50.0