gsmiller commented on a change in pull request #69: URL: https://github.com/apache/lucene/pull/69#discussion_r609073999
########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java ########## @@ -121,4 +167,146 @@ void skip(DataInput in) throws IOException { in.skipBytes(forUtil.numBytes(bitsPerValue) + (numExceptions << 1)); } } + + /** + * Fill {@code longs} with the final values for the case of all deltas being 1. Note this assumes + * there are no exceptions to apply. + */ + private static void prefixSumOfOnes(long[] longs, long base) { + System.arraycopy(IDENTITY_PLUS_ONE, 0, longs, 0, ForUtil.BLOCK_SIZE); + // This loop gets auto-vectorized + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + longs[i] += base; + } + } + + /** + * Fill {@code longs} with the final values for the case of all deltas being {@code val}. Note + * this assumes there are no exceptions to apply. + */ + private static void prefixSumOf(long[] longs, long base, long val) { + for (int i = 0; i < ForUtil.BLOCK_SIZE; i++) { + longs[i] = (i + 1) * val + base; Review comment: Well, I've made less progress than I would have liked in being able to answer whether or not this loop is getting auto-vectorized. To be honest, I've been stumbling along just trying to get to the assembly. I _think_ I've got the assembly for both the `prefixSumOf` and `prefixSumOfOnes` (as a comparison), but this is from my 2019 MBP, not a "proper" Linux box (which seems to have bitten the dust overnight, so I'm sorting that out). But now I'm not really sure what I'm looking for. I'm going to keep trying to sort this out on my own, but since it's a little slow-going, I'm pasting the results here in case someone more well-versed in this wants to have a look in the meantime. I grabbed the following using `java -XX:+UnlockDiagnosticVMOptions -XX:CompileCommand="print jpountz.PForDeltaDecoder::prefixSum*" -jar target/benchmarks.jar` on a [microbenchmark](https://github.com/gsmiller/decode-128-ints-benchmark/tree/pfor-delta) I set up by forking earlier work by @jpountz. 
So I think the code should be "hot" here (and for whatever reason, I've been having trouble getting `-prof perfasm` to work on the bench run, which is another thing I'm trying to sort through): prefixSumOf (the method in question): ``` ---------------------------------------------------------------------- jpountz/PForDeltaDecoder.prefixSumOf(J[JJ)V [0x0000000129eaed20, 0x0000000129eaee38] 280 bytes [Entry Point] [Verified Entry Point] [Constants] # {method} {0x0000000135b8b150} 'prefixSumOf' '(J[JJ)V' in 'jpountz/PForDeltaDecoder' # parm0: rsi:rsi = long # parm1: rdx:rdx = '[J' # parm2: rcx:rcx = long # [sp+0x30] (sp of caller) 0x0000000129eaed20: mov %eax,-0x14000(%rsp) 0x0000000129eaed27: push %rbp 0x0000000129eaed28: sub $0x20,%rsp ;*synchronization entry ; - jpountz.PForDeltaDecoder::prefixSumOf@-1 (line 28) 0x0000000129eaed2c: mov 0xc(%rdx),%r10d ; implicit exception: dispatches to 0x0000000129eaedf6 0x0000000129eaed30: test %r10d,%r10d 0x0000000129eaed33: jbe 0x0000000129eaedf6 0x0000000129eaed39: cmp $0x7f,%r10d 0x0000000129eaed3d: jbe 0x0000000129eaedf6 0x0000000129eaed43: mov %rsi,%r10 0x0000000129eaed46: add %rcx,%r10 0x0000000129eaed49: mov %r10,0x10(%rdx) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@23 (line 29) 0x0000000129eaed4d: mov $0x1,%r11d 0x0000000129eaed53: nopw 0x0(%rax,%rax,1) 0x0000000129eaed5c: data32 data32 xchg %ax,%ax ;*aload_2 {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@11 (line 29) 0x0000000129eaed60: movslq %r11d,%r10 0x0000000129eaed63: mov %r10,%r8 0x0000000129eaed66: add $0x1,%r8 0x0000000129eaed6a: imul %rsi,%r8 0x0000000129eaed6e: add %rcx,%r8 0x0000000129eaed71: mov %r8,0x10(%rdx,%r10,8) 0x0000000129eaed76: mov %r10,%r8 0x0000000129eaed79: add $0x4,%r8 0x0000000129eaed7d: imul %rsi,%r8 0x0000000129eaed81: add %rcx,%r8 0x0000000129eaed84: mov %r10,%r9 0x0000000129eaed87: add $0x3,%r9 0x0000000129eaed8b: imul %rsi,%r9 0x0000000129eaed8f: add %rcx,%r9 
0x0000000129eaed92: mov %r10,%rbx 0x0000000129eaed95: add $0x2,%rbx 0x0000000129eaed99: imul %rsi,%rbx 0x0000000129eaed9d: add %rcx,%rbx 0x0000000129eaeda0: mov %rbx,0x18(%rdx,%r10,8) 0x0000000129eaeda5: mov %r9,0x20(%rdx,%r10,8) 0x0000000129eaedaa: mov %r8,0x28(%rdx,%r10,8) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@23 (line 29) 0x0000000129eaedaf: add $0x4,%r11d ;*iadd {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@17 (line 29) 0x0000000129eaedb3: cmp $0x7d,%r11d 0x0000000129eaedb7: jl 0x0000000129eaed60 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@8 (line 28) 0x0000000129eaedb9: cmp $0x80,%r11d 0x0000000129eaedc0: jge 0x0000000129eaede6 0x0000000129eaedc2: xchg %ax,%ax ;*aload_2 {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@11 (line 29) 0x0000000129eaedc4: movslq %r11d,%r10 0x0000000129eaedc7: mov %r10,%r8 0x0000000129eaedca: add $0x1,%r8 0x0000000129eaedce: imul %rsi,%r8 0x0000000129eaedd2: add %rcx,%r8 0x0000000129eaedd5: mov %r8,0x10(%rdx,%r10,8) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@23 (line 29) 0x0000000129eaedda: inc %r11d ;*iadd {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@17 (line 29) 0x0000000129eaeddd: cmp $0x80,%r11d 0x0000000129eaede4: jl 0x0000000129eaedc4 0x0000000129eaede6: add $0x20,%rsp 0x0000000129eaedea: pop %rbp 0x0000000129eaedeb: mov 0x108(%r15),%r10 0x0000000129eaedf2: test %eax,(%r10) ; {poll_return} 0x0000000129eaedf5: retq ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOf@8 (line 28) 0x0000000129eaedf6: mov %rsi,%rbp 0x0000000129eaedf9: mov %rdx,(%rsp) 0x0000000129eaedfd: mov %rcx,0x8(%rsp) 0x0000000129eaee02: mov $0xffffff7e,%esi 0x0000000129eaee07: callq 0x00000001223e1b00 ; ImmutableOopMap{[0]=Oop } ;*if_icmpge {reexecute=1 rethrow=0 return_oop=0} ; - 
jpountz.PForDeltaDecoder::prefixSumOf@8 (line 28) ; {runtime_call UncommonTrapBlob} 0x0000000129eaee0c: hlt 0x0000000129eaee0d: hlt 0x0000000129eaee0e: hlt 0x0000000129eaee0f: hlt 0x0000000129eaee10: hlt 0x0000000129eaee11: hlt 0x0000000129eaee12: hlt 0x0000000129eaee13: hlt 0x0000000129eaee14: hlt 0x0000000129eaee15: hlt 0x0000000129eaee16: hlt 0x0000000129eaee17: hlt 0x0000000129eaee18: hlt 0x0000000129eaee19: hlt 0x0000000129eaee1a: hlt 0x0000000129eaee1b: hlt 0x0000000129eaee1c: hlt 0x0000000129eaee1d: hlt 0x0000000129eaee1e: hlt 0x0000000129eaee1f: hlt [Exception Handler] [Stub Code] 0x0000000129eaee20: jmpq 0x0000000122417200 ; {no_reloc} [Deopt Handler Code] 0x0000000129eaee25: callq 0x0000000129eaee2a 0x0000000129eaee2a: subq $0x5,(%rsp) 0x0000000129eaee2f: jmpq 0x00000001223e1720 ; {runtime_call DeoptimizationBlob} 0x0000000129eaee34: hlt 0x0000000129eaee35: hlt 0x0000000129eaee36: hlt 0x0000000129eaee37: hlt ``` prefixSumOfOnes (for comparison): ``` ---------------------------------------------------------------------- jpountz/PForDeltaDecoder.prefixSumOfOnes([JJ)V [0x00000001151e2320, 0x00000001151e24f8] 472 bytes [Entry Point] [Verified Entry Point] [Constants] # {method} {0x0000000126ebb048} 'prefixSumOfOnes' '([JJ)V' in 'jpountz/PForDeltaDecoder' # parm0: rsi:rsi = '[J' # parm1: rdx:rdx = long # [sp+0x30] (sp of caller) 0x00000001151e2320: mov %eax,-0x14000(%rsp) 0x00000001151e2327: push %rbp 0x00000001151e2328: sub $0x20,%rsp ;*synchronization entry ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@-1 (line 20) 0x00000001151e232c: mov %rdx,%rbx 0x00000001151e232f: mov %rsi,%r13 0x00000001151e2332: mov 0xc(%rsi),%ebp ; implicit exception: dispatches to 0x00000001151e24cc 0x00000001151e2335: cmp $0x80,%ebp 0x00000001151e233b: jb 0x00000001151e24a4 0x00000001151e2341: add $0x10,%rsi 0x00000001151e2345: mov $0x80,%edx 0x00000001151e234a: movabs $0x70f829c00,%rdi ; {oop([J{0x000000070f829c00})} 0x00000001151e2354: add $0x10,%rdi 0x00000001151e2358: vzeroupper 
0x00000001151e235b: movabs $0x10d721460,%r10 0x00000001151e2365: callq *%r10 ;*invokestatic arraycopy {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@9 (line 20) 0x00000001151e2368: test %ebp,%ebp 0x00000001151e236a: jbe 0x00000001151e24b8 0x00000001151e2370: cmp $0x7f,%ebp 0x00000001151e2373: jbe 0x00000001151e24b8 0x00000001151e2379: mov %r13d,%r11d 0x00000001151e237c: shr $0x3,%r11d 0x00000001151e2380: and $0x3,%r11d 0x00000001151e2384: xor %r10d,%r10d 0x00000001151e2387: mov $0x1,%r8d 0x00000001151e238d: sub %r11d,%r8d 0x00000001151e2390: and $0x3,%r8d 0x00000001151e2394: inc %r8d 0x00000001151e2397: mov $0x80,%r11d 0x00000001151e239d: cmp %r11d,%r8d 0x00000001151e23a0: cmovg %r11d,%r8d ;*aload_0 {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23) 0x00000001151e23a4: add %rbx,0x10(%r13,%r10,8) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23) 0x00000001151e23a9: inc %r10d ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22) 0x00000001151e23ac: cmp %r8d,%r10d 0x00000001151e23af: jl 0x00000001151e23a4 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22) 0x00000001151e23b1: vmovq %rbx,%xmm0 0x00000001151e23b6: vpunpcklqdq %xmm0,%xmm0,%xmm0 0x00000001151e23ba: (bad) 0x00000001151e23be: rolb $0xc4,(%rcx) ;*aload_0 {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23) 0x00000001151e23c1: cmpl $0xc410d54c,-0x2c(%rbp) 0x00000001151e23c8: cmpl $0xc410d54c,0x7f(%rsi) 0x00000001151e23cf: cmpl $0xc430d54c,-0x2c(%rbp) 0x00000001151e23d6: cmpl $0xc430d54c,0x7f(%rsi) 0x00000001151e23dd: cmpl $0xc450d54c,-0x2c(%rbp) 0x00000001151e23e4: cmpl $0xc450d54c,0x7f(%rsi) 0x00000001151e23eb: cmpl $0xc470d54c,-0x2c(%rbp) 0x00000001151e23f2: cmpl $0xc470d54c,0x7f(%rsi) 0x00000001151e23f9: cmpl 
$0x90d58c,-0x2c(%rbp) 0x00000001151e2400: add %al,(%rax) 0x00000001151e2402: vmovdqu %ymm1,0x90(%r13,%r10,8) 0x00000001151e240c: (bad) 0x00000001151e2410: mov %ss,%ebp 0x00000001151e2412: mov $0x0,%al 0x00000001151e2414: add %al,(%rax) 0x00000001151e2416: vmovdqu %ymm1,0xb0(%r13,%r10,8) 0x00000001151e2420: (bad) 0x00000001151e2424: mov %ss,%ebp 0x00000001151e2426: rolb (%rax) 0x00000001151e2428: add %al,(%rax) 0x00000001151e242a: vmovdqu %ymm1,0xd0(%r13,%r10,8) 0x00000001151e2434: (bad) 0x00000001151e2438: mov %ss,%ebp 0x00000001151e243a: lock add %al,(%rax) 0x00000001151e243d: add %al,%ah 0x00000001151e243f: cmpl $0xf0d58c,0x7f(%rsi) 0x00000001151e2446: add %al,(%rax) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23) 0x00000001151e2448: add $0x20,%r10d ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22) 0x00000001151e244c: cmp $0x61,%r10d 0x00000001151e2450: jl 0x00000001151e23c0 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22) 0x00000001151e2456: cmp $0x7d,%r10d 0x00000001151e245a: jge 0x00000001151e2474 ;*aload_0 {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23) 0x00000001151e245c: (bad) 0x00000001151e2460: rex.WR (bad) 0x00000001151e2462: adc %al,%ah 0x00000001151e2464: cmpl $0x4110d54c,0x7f(%rsi) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23) 0x00000001151e246b: add $0x4,%edx ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22) 0x00000001151e246e: cmp $0x7d,%r10d 0x00000001151e2472: jl 0x00000001151e245c ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22) 0x00000001151e2474: cmp $0x80,%r10d 0x00000001151e247b: jge 0x00000001151e2491 0x00000001151e247d: data32 xchg %ax,%ax ;*aload_0 
{reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@21 (line 23) 0x00000001151e2480: add %rbx,0x10(%r13,%r10,8) ;*lastore {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@27 (line 23) 0x00000001151e2485: inc %r10d ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@28 (line 22) 0x00000001151e2488: cmp $0x80,%r10d 0x00000001151e248f: jl 0x00000001151e2480 ;*synchronization entry ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@-1 (line 20) 0x00000001151e2491: vzeroupper 0x00000001151e2494: add $0x20,%rsp 0x00000001151e2498: pop %rbp 0x00000001151e2499: mov 0x108(%r15),%r10 0x00000001151e24a0: test %eax,(%r10) ; {poll_return} 0x00000001151e24a3: retq 0x00000001151e24a4: mov $0xffffffcc,%esi 0x00000001151e24a9: mov %r13,%rbp 0x00000001151e24ac: mov %rdx,(%rsp) 0x00000001151e24b0: vzeroupper 0x00000001151e24b3: callq 0x000000010d713b00 ; ImmutableOopMap{rbp=Oop } ;*invokestatic arraycopy {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@9 (line 20) ; {runtime_call UncommonTrapBlob} 0x00000001151e24b8: mov $0xffffff7e,%esi 0x00000001151e24bd: mov %r13,%rbp 0x00000001151e24c0: mov %rbx,(%rsp) 0x00000001151e24c4: vzeroupper 0x00000001151e24c7: callq 0x000000010d713b00 ; ImmutableOopMap{rbp=Oop } ;*if_icmpge {reexecute=1 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@18 (line 22) ; {runtime_call UncommonTrapBlob} 0x00000001151e24cc: mov $0xfffffff6,%esi 0x00000001151e24d1: data32 xchg %ax,%ax 0x00000001151e24d4: vzeroupper 0x00000001151e24d7: callq 0x000000010d713b00 ; ImmutableOopMap{} ;*invokestatic arraycopy {reexecute=0 rethrow=0 return_oop=0} ; - jpountz.PForDeltaDecoder::prefixSumOfOnes@9 (line 20) ; {runtime_call UncommonTrapBlob} 0x00000001151e24dc: hlt 0x00000001151e24dd: hlt 0x00000001151e24de: hlt 0x00000001151e24df: hlt [Exception Handler] [Stub Code] 0x00000001151e24e0: jmpq 0x000000010d749200 ; {no_reloc} 
[Deopt Handler Code] 0x00000001151e24e5: callq 0x00000001151e24ea 0x00000001151e24ea: subq $0x5,(%rsp) 0x00000001151e24ef: jmpq 0x000000010d713720 ; {runtime_call DeoptimizationBlob} 0x00000001151e24f4: hlt 0x00000001151e24f5: hlt 0x00000001151e24f6: hlt 0x00000001151e24f7: hlt ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org