[ https://issues.apache.org/jira/browse/LUCENE-9101?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
kkewwei updated LUCENE-9101:
----------------------------
Description:

In CompressingTermVectorsWriter, we use lengthsBuf to save the length of every term at every position. For example, for "a a a b1 b1 b1 b1", lengthsBuf = [1, 1, 1, 2, 2, 2, 2]: "a" appears three times, so its length is stored three times, which seems a bit redundant.

We use it in CompressingTermVectorsWriter.flushOffsets:

{code:java}
private void flushOffsets(int[] fieldNums) throws IOException {
    ......
    // lengths
    writer.reset(vectorsStream);
    for (DocData dd : pendingDocs) {
      for (FieldData fd : dd.fields) {
        if ((fd.flags & OFFSETS) != 0) {
          int pos = 0;
          for (int i = 0; i < fd.numTerms; ++i) {
            for (int j = 0; j < fd.freqs[i]; ++j) {
              writer.add(lengthsBuf[fd.offStart + pos++] - fd.prefixLengths[i] - fd.suffixLengths[i]);
            }
          }
          assert pos == fd.totalPositions;
        }
      }
    }
    writer.finish();
}
{code}

We can simplify it so that lengthsBuf = [1, 2], storing each term's length only once. An `int count` can track which term is currently being processed, for example:

{code:java}
private void flushOffsets(int[] fieldNums) throws IOException {
    ......
    // lengths
    writer.reset(vectorsStream);
    int count = 0; // index of the current term in the deduplicated lengthsBuf
    for (DocData dd : pendingDocs) {
      for (FieldData fd : dd.fields) {
        if ((fd.flags & OFFSETS) != 0) {
          int pos = 0;
          for (int i = 0; i < fd.numTerms; ++i) {
            for (int j = 0; j < fd.freqs[i]; ++j) {
              writer.add(lengthsBuf[count] - fd.prefixLengths[i] - fd.suffixLengths[i]);
              pos++;
            }
            count++;
          }
          assert pos == fd.totalPositions;
        }
      }
    }
    writer.finish();
}
{code}
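For illustration only, here is a minimal standalone sketch (not part of the actual writer; the class name and arrays are hypothetical) that contrasts the current per-position layout with the proposed per-term layout for "a a a b1 b1 b1 b1" and checks that both yield the same length at every position:

{code:java}
// Hypothetical standalone illustration of the redundancy described above;
// it is not part of CompressingTermVectorsWriter.
public class LengthsBufDemo {
  public static void main(String[] args) {
    int[] freqs = {3, 4};                             // "a" occurs 3 times, "b1" occurs 4 times
    int[] perPositionLengths = {1, 1, 1, 2, 2, 2, 2}; // current layout: one entry per position
    int[] perTermLengths = {1, 2};                    // proposed layout: one entry per term

    int pos = 0;
    for (int i = 0; i < freqs.length; ++i) {
      for (int j = 0; j < freqs[i]; ++j) {
        // Every position of term i carries the same length, so the per-term
        // layout reproduces exactly what the per-position layout stores.
        assert perPositionLengths[pos] == perTermLengths[i];
        pos++;
      }
    }
    System.out.println("per-position entries: " + perPositionLengths.length
        + ", per-term entries: " + perTermLengths.length);
  }
}
{code}

Compile and run with assertions enabled (java -ea LengthsBufDemo) to check the two layouts agree.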
> lengthsBuf in CompressingTermVectorsWriter is a bit redundant
> --------------------------------------------------------------
>
>                 Key: LUCENE-9101
>                 URL: https://issues.apache.org/jira/browse/LUCENE-9101
>             Project: Lucene - Core
>          Issue Type: Improvement
>          Components: core/codecs
>    Affects Versions: 8.2
>            Reporter: kkewwei
>            Priority: Major
>


--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org