gsmiller commented on a change in pull request #69:
URL: https://github.com/apache/lucene/pull/69#discussion_r609073999



##########
File path: lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java
##########
@@ -121,4 +167,146 @@ void skip(DataInput in) throws IOException {
       in.skipBytes(forUtil.numBytes(bitsPerValue) + (numExceptions << 1));
     }
   }
+
+  /**
+   * Fill {@code longs} with the final values for the case of all deltas being 
1. Note this assumes
+   * there are no exceptions to apply.
+   */
+  private static void prefixSumOfOnes(long[] longs, long base) {
+    System.arraycopy(IDENTITY_PLUS_ONE, 0, longs, 0, ForUtil.BLOCK_SIZE);
+    // This loop gets auto-vectorized
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
+      longs[i] += base;
+    }
+  }
+
+  /**
+   * Fill {@code longs} with the final values for the case of all deltas being 
{@code val}. Note
+   * this assumes there are no exceptions to apply.
+   */
+  private static void prefixSumOf(long[] longs, long base, long val) {
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; i++) {
+      longs[i] = (i + 1) * val + base;

Review comment:
       Well, I've made less progress than I would have liked toward answering 
whether or not this loop is getting auto-vectorized. To be honest, I've been 
stumbling along just trying to get to the assembly. I _think_ I've got the 
assembly for both `prefixSumOf` and `prefixSumOfOnes` (for comparison), but 
this is from my 2019 MBP, not a "proper" Linux box (which seems to have bitten 
the dust overnight... so I'm sorting that out). But now I'm not really sure 
what I should be looking for in the output. I'm going to keep trying to sort 
this out on my own, but since it's a little slow-going, I'm pasting the results 
here in case someone more well-versed in this wants to have a look in the 
meantime.
   
   I grabbed the following using `java -XX:+UnlockDiagnosticVMOptions 
-XX:CompileCommand="print jpountz.PForDeltaDecoder::prefixSum*" -jar 
target/benchmarks.jar` on a 
[microbenchmark](https://github.com/gsmiller/decode-128-ints-benchmark/tree/pfor-delta) 
that I set up by forking earlier work by @jpountz, so I think the code should 
be "hot" here. (For whatever reason, I've been having trouble getting `-prof 
perfasm` to work on the bench run, which is another thing I'm trying to sort 
through.)
   
   prefixSumOf (the method in question):
   ```
   ----------------------------------------------------------------------
   jpountz/PForDeltaDecoder.prefixSumOf(J[JJ)V  [0x0000000129eaed20, 
0x0000000129eaee38]  280 bytes
   [Entry Point]
   [Verified Entry Point]
   [Constants]
     # {method} {0x0000000135b8b150} 'prefixSumOf' '(J[JJ)V' in 
'jpountz/PForDeltaDecoder'
     # parm0:    rsi:rsi   = long
     # parm1:    rdx:rdx   = '[J'
     # parm2:    rcx:rcx   = long
     #           [sp+0x30]  (sp of caller)
     0x0000000129eaed20: mov    %eax,-0x14000(%rsp)
     0x0000000129eaed27: push   %rbp
     0x0000000129eaed28: sub    $0x20,%rsp         ;*synchronization entry
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@-1 (line 28)
   
     0x0000000129eaed2c: mov    0xc(%rdx),%r10d    ; implicit exception: 
dispatches to 0x0000000129eaedf6
     0x0000000129eaed30: test   %r10d,%r10d
     0x0000000129eaed33: jbe    0x0000000129eaedf6
     0x0000000129eaed39: cmp    $0x7f,%r10d
     0x0000000129eaed3d: jbe    0x0000000129eaedf6
     0x0000000129eaed43: mov    %rsi,%r10
     0x0000000129eaed46: add    %rcx,%r10
     0x0000000129eaed49: mov    %r10,0x10(%rdx)    ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@23 (line 29)
   
     0x0000000129eaed4d: mov    $0x1,%r11d
     0x0000000129eaed53: nopw   0x0(%rax,%rax,1)
     0x0000000129eaed5c: data32 data32 xchg %ax,%ax  ;*aload_2 {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@11 (line 29)
   
     0x0000000129eaed60: movslq %r11d,%r10
     0x0000000129eaed63: mov    %r10,%r8
     0x0000000129eaed66: add    $0x1,%r8
     0x0000000129eaed6a: imul   %rsi,%r8
     0x0000000129eaed6e: add    %rcx,%r8
     0x0000000129eaed71: mov    %r8,0x10(%rdx,%r10,8)
     0x0000000129eaed76: mov    %r10,%r8
     0x0000000129eaed79: add    $0x4,%r8
     0x0000000129eaed7d: imul   %rsi,%r8
     0x0000000129eaed81: add    %rcx,%r8
     0x0000000129eaed84: mov    %r10,%r9
     0x0000000129eaed87: add    $0x3,%r9
     0x0000000129eaed8b: imul   %rsi,%r9
     0x0000000129eaed8f: add    %rcx,%r9
     0x0000000129eaed92: mov    %r10,%rbx
     0x0000000129eaed95: add    $0x2,%rbx
     0x0000000129eaed99: imul   %rsi,%rbx
     0x0000000129eaed9d: add    %rcx,%rbx
     0x0000000129eaeda0: mov    %rbx,0x18(%rdx,%r10,8)
     0x0000000129eaeda5: mov    %r9,0x20(%rdx,%r10,8)
     0x0000000129eaedaa: mov    %r8,0x28(%rdx,%r10,8)  ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@23 (line 29)
   
     0x0000000129eaedaf: add    $0x4,%r11d         ;*iadd {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@17 (line 29)
   
     0x0000000129eaedb3: cmp    $0x7d,%r11d
     0x0000000129eaedb7: jl     0x0000000129eaed60  ;*if_icmpge {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@8 (line 28)
   
     0x0000000129eaedb9: cmp    $0x80,%r11d
     0x0000000129eaedc0: jge    0x0000000129eaede6
     0x0000000129eaedc2: xchg   %ax,%ax            ;*aload_2 {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@11 (line 29)
   
     0x0000000129eaedc4: movslq %r11d,%r10
     0x0000000129eaedc7: mov    %r10,%r8
     0x0000000129eaedca: add    $0x1,%r8
     0x0000000129eaedce: imul   %rsi,%r8
     0x0000000129eaedd2: add    %rcx,%r8
     0x0000000129eaedd5: mov    %r8,0x10(%rdx,%r10,8)  ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@23 (line 29)
   
     0x0000000129eaedda: inc    %r11d              ;*iadd {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@17 (line 29)
   
     0x0000000129eaeddd: cmp    $0x80,%r11d
     0x0000000129eaede4: jl     0x0000000129eaedc4
     0x0000000129eaede6: add    $0x20,%rsp
     0x0000000129eaedea: pop    %rbp
     0x0000000129eaedeb: mov    0x108(%r15),%r10
     0x0000000129eaedf2: test   %eax,(%r10)        ;   {poll_return}
     0x0000000129eaedf5: retq                      ;*if_icmpge {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@8 (line 28)
   
     0x0000000129eaedf6: mov    %rsi,%rbp
     0x0000000129eaedf9: mov    %rdx,(%rsp)
     0x0000000129eaedfd: mov    %rcx,0x8(%rsp)
     0x0000000129eaee02: mov    $0xffffff7e,%esi
     0x0000000129eaee07: callq  0x00000001223e1b00  ; ImmutableOopMap{[0]=Oop }
                                                   ;*if_icmpge {reexecute=1 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOf@8 (line 28)
                                                   ;   {runtime_call 
UncommonTrapBlob}
     0x0000000129eaee0c: hlt    
     0x0000000129eaee0d: hlt    
     0x0000000129eaee0e: hlt    
     0x0000000129eaee0f: hlt    
     0x0000000129eaee10: hlt    
     0x0000000129eaee11: hlt    
     0x0000000129eaee12: hlt    
     0x0000000129eaee13: hlt    
     0x0000000129eaee14: hlt    
     0x0000000129eaee15: hlt    
     0x0000000129eaee16: hlt    
     0x0000000129eaee17: hlt    
     0x0000000129eaee18: hlt    
     0x0000000129eaee19: hlt    
     0x0000000129eaee1a: hlt    
     0x0000000129eaee1b: hlt    
     0x0000000129eaee1c: hlt    
     0x0000000129eaee1d: hlt    
     0x0000000129eaee1e: hlt    
     0x0000000129eaee1f: hlt    
   [Exception Handler]
   [Stub Code]
     0x0000000129eaee20: jmpq   0x0000000122417200  ;   {no_reloc}
   [Deopt Handler Code]
     0x0000000129eaee25: callq  0x0000000129eaee2a
     0x0000000129eaee2a: subq   $0x5,(%rsp)
     0x0000000129eaee2f: jmpq   0x00000001223e1720  ;   {runtime_call 
DeoptimizationBlob}
     0x0000000129eaee34: hlt    
     0x0000000129eaee35: hlt    
     0x0000000129eaee36: hlt    
     0x0000000129eaee37: hlt  
   ```
   
   prefixSumOfOnes (for comparison):
   ```
   ----------------------------------------------------------------------
   jpountz/PForDeltaDecoder.prefixSumOfOnes([JJ)V  [0x00000001151e2320, 
0x00000001151e24f8]  472 bytes
   [Entry Point]
   [Verified Entry Point]
   [Constants]
     # {method} {0x0000000126ebb048} 'prefixSumOfOnes' '([JJ)V' in 
'jpountz/PForDeltaDecoder'
     # parm0:    rsi:rsi   = '[J'
     # parm1:    rdx:rdx   = long
     #           [sp+0x30]  (sp of caller)
     0x00000001151e2320: mov    %eax,-0x14000(%rsp)
     0x00000001151e2327: push   %rbp
     0x00000001151e2328: sub    $0x20,%rsp         ;*synchronization entry
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@-1 (line 20)
   
     0x00000001151e232c: mov    %rdx,%rbx
     0x00000001151e232f: mov    %rsi,%r13
     0x00000001151e2332: mov    0xc(%rsi),%ebp     ; implicit exception: 
dispatches to 0x00000001151e24cc
     0x00000001151e2335: cmp    $0x80,%ebp
     0x00000001151e233b: jb     0x00000001151e24a4
     0x00000001151e2341: add    $0x10,%rsi
     0x00000001151e2345: mov    $0x80,%edx
     0x00000001151e234a: movabs $0x70f829c00,%rdi  ;   
{oop([J{0x000000070f829c00})}
     0x00000001151e2354: add    $0x10,%rdi
     0x00000001151e2358: vzeroupper 
     0x00000001151e235b: movabs $0x10d721460,%r10
     0x00000001151e2365: callq  *%r10              ;*invokestatic arraycopy 
{reexecute=0 rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@9 (line 20)
   
     0x00000001151e2368: test   %ebp,%ebp
     0x00000001151e236a: jbe    0x00000001151e24b8
     0x00000001151e2370: cmp    $0x7f,%ebp
     0x00000001151e2373: jbe    0x00000001151e24b8
     0x00000001151e2379: mov    %r13d,%r11d
     0x00000001151e237c: shr    $0x3,%r11d
     0x00000001151e2380: and    $0x3,%r11d
     0x00000001151e2384: xor    %r10d,%r10d
     0x00000001151e2387: mov    $0x1,%r8d
     0x00000001151e238d: sub    %r11d,%r8d
     0x00000001151e2390: and    $0x3,%r8d
     0x00000001151e2394: inc    %r8d
     0x00000001151e2397: mov    $0x80,%r11d
     0x00000001151e239d: cmp    %r11d,%r8d
     0x00000001151e23a0: cmovg  %r11d,%r8d         ;*aload_0 {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23)
   
     0x00000001151e23a4: add    %rbx,0x10(%r13,%r10,8)  ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23)
   
     0x00000001151e23a9: inc    %r10d              ;*iinc {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22)
   
     0x00000001151e23ac: cmp    %r8d,%r10d
     0x00000001151e23af: jl     0x00000001151e23a4  ;*if_icmpge {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22)
   
     0x00000001151e23b1: vmovq  %rbx,%xmm0
     0x00000001151e23b6: vpunpcklqdq %xmm0,%xmm0,%xmm0
     0x00000001151e23ba: (bad)  
     0x00000001151e23be: rolb   $0xc4,(%rcx)       ;*aload_0 {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23)
   
     0x00000001151e23c1: cmpl   $0xc410d54c,-0x2c(%rbp)
     0x00000001151e23c8: cmpl   $0xc410d54c,0x7f(%rsi)
     0x00000001151e23cf: cmpl   $0xc430d54c,-0x2c(%rbp)
     0x00000001151e23d6: cmpl   $0xc430d54c,0x7f(%rsi)
     0x00000001151e23dd: cmpl   $0xc450d54c,-0x2c(%rbp)
     0x00000001151e23e4: cmpl   $0xc450d54c,0x7f(%rsi)
     0x00000001151e23eb: cmpl   $0xc470d54c,-0x2c(%rbp)
     0x00000001151e23f2: cmpl   $0xc470d54c,0x7f(%rsi)
     0x00000001151e23f9: cmpl   $0x90d58c,-0x2c(%rbp)
     0x00000001151e2400: add    %al,(%rax)
     0x00000001151e2402: vmovdqu %ymm1,0x90(%r13,%r10,8)
     0x00000001151e240c: (bad)  
     0x00000001151e2410: mov    %ss,%ebp
     0x00000001151e2412: mov    $0x0,%al
     0x00000001151e2414: add    %al,(%rax)
     0x00000001151e2416: vmovdqu %ymm1,0xb0(%r13,%r10,8)
     0x00000001151e2420: (bad)  
     0x00000001151e2424: mov    %ss,%ebp
     0x00000001151e2426: rolb   (%rax)
     0x00000001151e2428: add    %al,(%rax)
     0x00000001151e242a: vmovdqu %ymm1,0xd0(%r13,%r10,8)
     0x00000001151e2434: (bad)  
     0x00000001151e2438: mov    %ss,%ebp
     0x00000001151e243a: lock add %al,(%rax)
     0x00000001151e243d: add    %al,%ah
     0x00000001151e243f: cmpl   $0xf0d58c,0x7f(%rsi)
     0x00000001151e2446: add    %al,(%rax)         ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23)
   
     0x00000001151e2448: add    $0x20,%r10d        ;*iinc {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22)
   
     0x00000001151e244c: cmp    $0x61,%r10d
     0x00000001151e2450: jl     0x00000001151e23c0  ;*if_icmpge {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22)
   
     0x00000001151e2456: cmp    $0x7d,%r10d
     0x00000001151e245a: jge    0x00000001151e2474  ;*aload_0 {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23)
   
     0x00000001151e245c: (bad)  
     0x00000001151e2460: rex.WR (bad) 
     0x00000001151e2462: adc    %al,%ah
     0x00000001151e2464: cmpl   $0x4110d54c,0x7f(%rsi)  ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23)
   
     0x00000001151e246b: add    $0x4,%edx          ;*iinc {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22)
   
     0x00000001151e246e: cmp    $0x7d,%r10d
     0x00000001151e2472: jl     0x00000001151e245c  ;*if_icmpge {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22)
   
     0x00000001151e2474: cmp    $0x80,%r10d
     0x00000001151e247b: jge    0x00000001151e2491
     0x00000001151e247d: data32 xchg %ax,%ax       ;*aload_0 {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23)
   
     0x00000001151e2480: add    %rbx,0x10(%r13,%r10,8)  ;*lastore {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23)
   
     0x00000001151e2485: inc    %r10d              ;*iinc {reexecute=0 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22)
   
     0x00000001151e2488: cmp    $0x80,%r10d
     0x00000001151e248f: jl     0x00000001151e2480  ;*synchronization entry
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@-1 (line 20)
   
     0x00000001151e2491: vzeroupper 
     0x00000001151e2494: add    $0x20,%rsp
     0x00000001151e2498: pop    %rbp
     0x00000001151e2499: mov    0x108(%r15),%r10
     0x00000001151e24a0: test   %eax,(%r10)        ;   {poll_return}
     0x00000001151e24a3: retq   
     0x00000001151e24a4: mov    $0xffffffcc,%esi
     0x00000001151e24a9: mov    %r13,%rbp
     0x00000001151e24ac: mov    %rdx,(%rsp)
     0x00000001151e24b0: vzeroupper 
     0x00000001151e24b3: callq  0x000000010d713b00  ; ImmutableOopMap{rbp=Oop }
                                                   ;*invokestatic arraycopy 
{reexecute=0 rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@9 (line 20)
                                                   ;   {runtime_call 
UncommonTrapBlob}
     0x00000001151e24b8: mov    $0xffffff7e,%esi
     0x00000001151e24bd: mov    %r13,%rbp
     0x00000001151e24c0: mov    %rbx,(%rsp)
     0x00000001151e24c4: vzeroupper 
     0x00000001151e24c7: callq  0x000000010d713b00  ; ImmutableOopMap{rbp=Oop }
                                                   ;*if_icmpge {reexecute=1 
rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22)
                                                   ;   {runtime_call 
UncommonTrapBlob}
     0x00000001151e24cc: mov    $0xfffffff6,%esi
     0x00000001151e24d1: data32 xchg %ax,%ax
     0x00000001151e24d4: vzeroupper 
     0x00000001151e24d7: callq  0x000000010d713b00  ; ImmutableOopMap{}
                                                   ;*invokestatic arraycopy 
{reexecute=0 rethrow=0 return_oop=0}
                                                   ; - 
jpountz.PForDeltaDecoder::prefixSumOfOnes@9 (line 20)
                                                   ;   {runtime_call 
UncommonTrapBlob}
     0x00000001151e24dc: hlt    
     0x00000001151e24dd: hlt    
     0x00000001151e24de: hlt    
     0x00000001151e24df: hlt    
   [Exception Handler]
   [Stub Code]
     0x00000001151e24e0: jmpq   0x000000010d749200  ;   {no_reloc}
   [Deopt Handler Code]
     0x00000001151e24e5: callq  0x00000001151e24ea
     0x00000001151e24ea: subq   $0x5,(%rsp)
     0x00000001151e24ef: jmpq   0x000000010d713720  ;   {runtime_call 
DeoptimizationBlob}
     0x00000001151e24f4: hlt    
     0x00000001151e24f5: hlt    
     0x00000001151e24f6: hlt    
     0x00000001151e24f7: hlt  
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to