rmuir commented on code in PR #12737: URL: https://github.com/apache/lucene/pull/12737#discussion_r1377027128
########## lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java: ########## @@ -17,72 +17,46 @@ package org.apache.lucene.internal.vectorization; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.SuppressForbidden; + final class DefaultVectorUtilSupport implements VectorUtilSupport { DefaultVectorUtilSupport() {} + // the way FMA should work! if available use it, otherwise fall back to mul/add + @SuppressForbidden(reason = "Uses FMA only where fast and carefully contained") + private static float fma(float a, float b, float c) { + if (Constants.HAS_FAST_FMA) { + return Math.fma(a, b, c); + } else { + return a * b + c; + } + } + @Override public float dotProduct(float[] a, float[] b) { float res = 0f; - /* - * If length of vector is larger than 8, we use unrolled dot product to accelerate the - * calculation. - */ - int i; - for (i = 0; i < a.length % 8; i++) { - res += b[i] * a[i]; - } - if (a.length < 8) { - return res; - } - for (; i + 31 < a.length; i += 32) { - res += - b[i + 0] * a[i + 0] - + b[i + 1] * a[i + 1] - + b[i + 2] * a[i + 2] - + b[i + 3] * a[i + 3] - + b[i + 4] * a[i + 4] - + b[i + 5] * a[i + 5] - + b[i + 6] * a[i + 6] - + b[i + 7] * a[i + 7]; - res += - b[i + 8] * a[i + 8] - + b[i + 9] * a[i + 9] - + b[i + 10] * a[i + 10] - + b[i + 11] * a[i + 11] - + b[i + 12] * a[i + 12] - + b[i + 13] * a[i + 13] - + b[i + 14] * a[i + 14] - + b[i + 15] * a[i + 15]; - res += - b[i + 16] * a[i + 16] - + b[i + 17] * a[i + 17] - + b[i + 18] * a[i + 18] - + b[i + 19] * a[i + 19] - + b[i + 20] * a[i + 20] - + b[i + 21] * a[i + 21] - + b[i + 22] * a[i + 22] - + b[i + 23] * a[i + 23]; - res += - b[i + 24] * a[i + 24] - + b[i + 25] * a[i + 25] - + b[i + 26] * a[i + 26] - + b[i + 27] * a[i + 27] - + b[i + 28] * a[i + 28] - + b[i + 29] * a[i + 29] - + b[i + 30] * a[i + 30] - + b[i + 31] * a[i + 31]; + int i = 0; + + // if the array is big, unroll it + if (a.length > 32) { + float acc1 = 0; + float acc2 = 0; + float acc3 = 0; + float acc4 = 0; + int upperBound = a.length & ~(4 - 1); Review Comment: this logic is from the javadocs of `Species.loopBound` of vector api where width is a power of 2. I used it in these functions for consistency, and because i assume it means the compiler will do a good job. we could maybe put in a static method for other places doing crap like this (e.g. stringhelper's murmurhash) as a followup? I'm guessing any other places do it ad-hoc like what was here before. I wanted to keep this PR minimally invasive though. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org