[GitHub] [lucene] jpountz commented on a change in pull request #103: Fix regression to account payloads while merging

GitBox Thu, 29 Apr 2021 01:39:30 -0700


jpountz commented on a change in pull request #103:
URL: https://github.com/apache/lucene/pull/103#discussion_r622844633




##########
File path: lucene/core/src/test/org/apache/lucene/index/TestTermVectors.java
##########
@@ -166,4 +118,97 @@ public void testFullMergeAddIndexesReader() throws 
Exception {
     verifyIndex(target);
     IOUtils.close(target, input[0], input[1]);
   }
+
+  /**
+   * Assert that a merged segment has payloads set up in fieldInfo, if at 
least 1 segment has
+   * payloads for this field.
+   */
+  public void testMergeWithPayloads() throws Exception {
+
+    final FieldType ft1 = new FieldType(TextField.TYPE_NOT_STORED);
+    ft1.setStoreTermVectors(true);
+    ft1.setStoreTermVectorOffsets(true);
+    ft1.setStoreTermVectorPositions(true);
+    ft1.setStoreTermVectorPayloads(true);
+    ft1.freeze();
+
+    Directory dir = newDirectory();
+    final int numDocsInSegment = 10;
+    IndexWriterConfig indexWriterConfig =
+        new IndexWriterConfig(new 
MockAnalyzer(random())).setMaxBufferedDocs(numDocsInSegment);
+    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
+
+    boolean hasPayloads1 = random().nextBoolean();
+    boolean hasPayloads2 = !hasPayloads1;

Review comment:
       Can you test both cases everytime instead of relying on randomization?

##########
File path: lucene/core/src/test/org/apache/lucene/index/TestTermVectors.java
##########
@@ -16,70 +16,22 @@
  */
 package org.apache.lucene.index;
 
+import static 
com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
+
 import java.io.IOException;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.English;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 
 public class TestTermVectors extends LuceneTestCase {
-  private static IndexReader reader;
-  private static Directory directory;
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    directory = newDirectory();
-    RandomIndexWriter writer =
-        new RandomIndexWriter(
-            random(),
-            directory,
-            newIndexWriterConfig(new MockAnalyzer(random(), 
MockTokenizer.SIMPLE, true))
-                .setMergePolicy(newLogMergePolicy()));
-    // writer.setNoCFSRatio(1.0);
-    // writer.infoStream = System.out;
-    for (int i = 0; i < 1000; i++) {
-      Document doc = new Document();
-      FieldType ft = new FieldType(TextField.TYPE_STORED);
-      int mod3 = i % 3;
-      int mod2 = i % 2;
-      if (mod2 == 0 && mod3 == 0) {
-        ft.setStoreTermVectors(true);
-        ft.setStoreTermVectorOffsets(true);
-        ft.setStoreTermVectorPositions(true);
-      } else if (mod2 == 0) {
-        ft.setStoreTermVectors(true);
-        ft.setStoreTermVectorPositions(true);
-      } else if (mod3 == 0) {
-        ft.setStoreTermVectors(true);
-        ft.setStoreTermVectorOffsets(true);
-      } else {
-        ft.setStoreTermVectors(true);
-      }
-      doc.add(new Field("field", English.intToEnglish(i), ft));
-      // test no term vectors too
-      doc.add(new TextField("noTV", English.intToEnglish(i), Field.Store.YES));
-      writer.addDocument(doc);
-    }
-    reader = writer.getReader();
-    writer.close();
-  }
-
-  @AfterClass
-  public static void afterClass() throws Exception {
-    reader.close();
-    directory.close();
-    reader = null;
-    directory = null;
-  }

Review comment:
       oh, this was unused?

##########
File path: lucene/core/src/test/org/apache/lucene/index/TestTermVectors.java
##########
@@ -166,4 +118,97 @@ public void testFullMergeAddIndexesReader() throws 
Exception {
     verifyIndex(target);
     IOUtils.close(target, input[0], input[1]);
   }
+
+  /**
+   * Assert that a merged segment has payloads set up in fieldInfo, if at 
least 1 segment has
+   * payloads for this field.
+   */
+  public void testMergeWithPayloads() throws Exception {
+
+    final FieldType ft1 = new FieldType(TextField.TYPE_NOT_STORED);
+    ft1.setStoreTermVectors(true);
+    ft1.setStoreTermVectorOffsets(true);
+    ft1.setStoreTermVectorPositions(true);
+    ft1.setStoreTermVectorPayloads(true);
+    ft1.freeze();
+
+    Directory dir = newDirectory();
+    final int numDocsInSegment = 10;
+    IndexWriterConfig indexWriterConfig =
+        new IndexWriterConfig(new 
MockAnalyzer(random())).setMaxBufferedDocs(numDocsInSegment);
+    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
+
+    boolean hasPayloads1 = random().nextBoolean();
+    boolean hasPayloads2 = !hasPayloads1;
+    TokenStreamGenerator tkg1 = new TokenStreamGenerator(hasPayloads1);
+    TokenStreamGenerator tkg2 = new TokenStreamGenerator(hasPayloads2);
+    // create one segment with payloads, and another without payloads
+    for (int i = 0; i < numDocsInSegment; i++) {
+      Document doc = new Document();
+      doc.add(new Field("c", tkg1.newTokenStream(), ft1));
+      writer.addDocument(doc);
+    }
+    for (int i = 0; i < numDocsInSegment; i++) {
+      Document doc = new Document();
+      doc.add(new Field("c", tkg2.newTokenStream(), ft1));
+      writer.addDocument(doc);
+    }
+
+    IndexReader reader1 = writer.getReader();
+    assertEquals(

Review comment:
       Can you assert on the number of leaves?
   
   ```suggestion
       assertEquals(2, reader1.leaves().size());
       assertEquals(
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [lucene] jpountz commented on a change in pull request #103: Fix regression to account payloads while merging

Reply via email to