[zeppelin] branch branch-0.10 updated: [ZEPPELIN-5507] Polish LuceneSerach

pdallig Fri, 10 Sep 2021 04:51:05 -0700

This is an automated email from the ASF dual-hosted git repository.

pdallig pushed a commit to branch branch-0.10
in repository https://gitbox.apache.org/repos/asf/zeppelin.git



The following commit(s) were added to refs/heads/branch-0.10 by this push:
     new 4dbcf38  [ZEPPELIN-5507] Polish LuceneSerach
4dbcf38 is described below

commit 4dbcf3830750db6738de3c442f780316a0fd0f10
Author: Philipp Dallig <philipp.dal...@gmail.com>
AuthorDate: Thu Sep 2 16:00:14 2021 +0200

    [ZEPPELIN-5507] Polish LuceneSerach
    
    ### What is this PR for?
    - This PR polish the LuceneSearch class.
    - RAMDirectory is deprecated, we are switching to ByteBuffersDirectory
    - Re-throw IOException instead of catching it so we get a better stacktrace
    
    ### What type of PR is it?
     - Refactoring
    
    ### What is the Jira issue?
    * https://issues.apache.org/jira/browse/ZEPPELIN-5507
    
    ### How should this be tested?
    * via CI
    
    ### Questions:
    * Does the licenses files need update? No
    * Is there breaking changes for older versions? No
    * Does this needs documentation? No
    
    Author: Philipp Dallig <philipp.dal...@gmail.com>
    
    Closes #4215 from Reamer/search_cleanup and squashes the following commits:
    
    12baa5d27 [Philipp Dallig] Polish LuceneSerach
    
    (cherry picked from commit 27609b9ae17372215f48c91c6cea546625372ae7)
    Signed-off-by: Philipp Dallig <philipp.dal...@gmail.com>
---
 .../org/apache/zeppelin/search/LuceneSearch.java   | 101 +++++++++++----------
 1 file changed, 52 insertions(+), 49 deletions(-)

diff --git 
a/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java 
b/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java
index 586b5d9..5c2110c 100644
--- 
a/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java
+++ 
b/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java
@@ -16,12 +16,11 @@
  */
 package org.apache.zeppelin.search;
 
-import com.google.common.base.Joiner;
 import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Lists;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Date;
 import java.util.List;
@@ -30,6 +29,7 @@ import java.util.stream.Stream;
 import javax.annotation.PreDestroy;
 import javax.inject.Inject;
 
+import org.apache.commons.lang3.StringUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -55,9 +55,9 @@ import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 import org.apache.lucene.search.highlight.TextFragment;
 import org.apache.lucene.search.highlight.TokenSources;
+import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.RAMDirectory;
 import org.apache.zeppelin.conf.ZeppelinConfiguration;
 import org.apache.zeppelin.notebook.Note;
 import org.apache.zeppelin.notebook.Paragraph;
@@ -77,34 +77,30 @@ public class LuceneSearch extends SearchService {
   private static final String PARAGRAPH = "paragraph";
   private static final String ID_FIELD = "id";
 
-  private Path indexPath;
-  private Directory indexDirectory;
-  private Analyzer analyzer;
-  private IndexWriterConfig indexWriterConfig;
-  private IndexWriter indexWriter;
+  private final Directory indexDirectory;
+  private final IndexWriter indexWriter;
 
   @Inject
-  public LuceneSearch(ZeppelinConfiguration conf) {
+  public LuceneSearch(ZeppelinConfiguration conf) throws IOException {
     super("LuceneSearch");
 
     if (conf.isZeppelinSearchUseDisk()) {
       try {
-        this.indexPath = Paths.get(conf.getZeppelinSearchIndexPath());
+        final Path indexPath = Paths.get(conf.getZeppelinSearchIndexPath());
         this.indexDirectory = FSDirectory.open(indexPath);
-        LOGGER.info("Use {} for storing lucene search index", this.indexPath);
+        LOGGER.info("Use {} for storing lucene search index", indexPath);
       } catch (IOException e) {
-        throw new RuntimeException(
-            "Failed to create index directory for search service. Use memory 
instead", e);
+        throw new IOException("Failed to create index directory for search 
service.", e);
       }
     } else {
-      this.indexDirectory = new RAMDirectory();
+      this.indexDirectory = new ByteBuffersDirectory();
     }
-    this.analyzer = new StandardAnalyzer();
-    this.indexWriterConfig = new IndexWriterConfig(analyzer);
+    final Analyzer analyzer = new StandardAnalyzer();
+    final IndexWriterConfig indexWriterConfig = new 
IndexWriterConfig(analyzer);
     try {
       this.indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
     } catch (IOException e) {
-      LOGGER.error("Failed to create new IndexWriter", e);
+      throw new IOException("Failed to create new IndexWriter", e);
     }
   }
 
@@ -125,7 +121,10 @@ public class LuceneSearch extends SearchService {
           new MultiFieldQueryParser(new String[] {SEARCH_FIELD_TEXT, 
SEARCH_FIELD_TITLE}, analyzer);
 
       Query query = parser.parse(queryStr);
-      LOGGER.debug("Searching for: {}", query.toString(SEARCH_FIELD_TEXT));
+
+      if (LOGGER.isDebugEnabled()) {
+        LOGGER.debug("Searching for: {}", query.toString(SEARCH_FIELD_TEXT));
+      }
 
       SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
       Highlighter highlighter = new Highlighter(htmlFormatter, new 
QueryScorer(query));
@@ -141,7 +140,7 @@ public class LuceneSearch extends SearchService {
 
   private List<Map<String, String>> doSearch(
       IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter 
highlighter) {
-    List<Map<String, String>> matchingParagraphs = Lists.newArrayList();
+    List<Map<String, String>> matchingParagraphs = new ArrayList<>();
     ScoreDoc[] hits;
     try {
       hits = searcher.search(query, 20).scoreDocs;
@@ -166,14 +165,14 @@ public class LuceneSearch extends SearchService {
             TokenStream tokenStream =
                 TokenSources.getTokenStream(
                     searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, 
analyzer);
-            TextFragment[] frag = 
highlighter.getBestTextFragments(tokenStream, text, true, 3);
-            LOGGER.debug("    {} fragments found for query '{}'", frag.length, 
query);
-            for (int j = 0; j < frag.length; j++) {
-              if ((frag[j] != null) && (frag[j].getScore() > 0)) {
-                LOGGER.debug("    Fragment: {}", frag[j].toString());
+            TextFragment[] frags = 
highlighter.getBestTextFragments(tokenStream, text, true, 3);
+            LOGGER.debug("    {} fragments found for query '{}'", 
frags.length, query);
+            for (TextFragment frag : frags) {
+              if ((frag != null) && (frag.getScore() > 0)) {
+                LOGGER.debug("    Fragment: {}", frag);
               }
             }
-            fragment = (frag != null && frag.length > 0) ? frag[0].toString() 
: "";
+            fragment = (frags != null && frags.length > 0) ? 
frags[0].toString() : "";
           }
 
           if (header != null) {
@@ -228,10 +227,10 @@ public class LuceneSearch extends SearchService {
    * Updates index for the given note: either note.name or a paragraph If 
paragraph is <code>null
    * </code> - updates only for the note.name
    *
-   * @param noteId
-   * @param noteName
-   * @param p
-   * @throws IOException
+   * @param noteId id of the note
+   * @param noteName name of the note
+   * @param p paragraph
+   * @throws IOException if there is a low-level IO error
    */
   private void updateDoc(String noteId, String noteName, Paragraph p) throws 
IOException {
     String id = formatId(noteId, p);
@@ -240,7 +239,7 @@ public class LuceneSearch extends SearchService {
       indexWriter.updateDocument(new Term(ID_FIELD, id), doc);
       indexWriter.commit();
     } catch (IOException e) {
-      LOGGER.error("Failed to update index of notebook {}", noteId, e);
+      throw new IOException("Failed to update index of notebook " + noteId, e);
     }
   }
 
@@ -251,7 +250,7 @@ public class LuceneSearch extends SearchService {
   static String formatId(String noteId, Paragraph p) {
     String id = noteId;
     if (null != p) {
-      id = Joiner.on('/').join(id, PARAGRAPH, p.getId());
+      id = String.join("/", id, PARAGRAPH, p.getId());
     }
     return id;
   }
@@ -259,7 +258,7 @@ public class LuceneSearch extends SearchService {
   static String formatDeleteId(String noteId, Paragraph p) {
     String id = noteId;
     if (null != p) {
-      id = Joiner.on('/').join(id, PARAGRAPH, p.getId());
+      id = String.join("/", id, PARAGRAPH, p.getId());
     } else {
       id = id + "*";
     }
@@ -272,7 +271,7 @@ public class LuceneSearch extends SearchService {
    * @param id id of the document, different for Note name and paragraph
    * @param noteName name of the note
    * @param p paragraph
-   * @return
+   * @return a document
    */
   private Document newDocument(String id, String noteName, Paragraph p) {
     Document doc = new Document();
@@ -300,12 +299,12 @@ public class LuceneSearch extends SearchService {
    * @see 
org.apache.zeppelin.search.Search#addIndexDoc(org.apache.zeppelin.notebook.Note)
    */
   @Override
-  public void addNoteIndex(Note note) {
+  public void addNoteIndex(Note note) throws IOException {
     try {
       addIndexDocAsync(note);
       indexWriter.commit();
     } catch (IOException e) {
-      LOGGER.error("Failed to add note {} to index", note, e);
+      throw new IOException("Failed to add note " + note + " to index", e);
     }
   }
 
@@ -317,11 +316,11 @@ public class LuceneSearch extends SearchService {
   /**
    * Indexes the given notebook, but does not commit changes.
    *
-   * @param note
-   * @throws IOException
+   * @param note Note to add the index
+   * @throws IOException if there is a low-level IO error
    */
   private void addIndexDocAsync(Note note) throws IOException {
-    indexNoteName(indexWriter, note.getId(), note.getName());
+    indexNoteName(note.getId(), note.getName());
     for (Paragraph paragraph : note.getParagraphs()) {
       updateDoc(note.getId(), note.getName(), paragraph);
     }
@@ -331,7 +330,7 @@ public class LuceneSearch extends SearchService {
    * @see 
org.apache.zeppelin.search.Search#deleteIndexDocs(org.apache.zeppelin.notebook.Note)
    */
   @Override
-  public void deleteNoteIndex(Note note) {
+  public void deleteNoteIndex(Note note) throws IOException{
     if (note == null) {
       return;
     }
@@ -346,24 +345,24 @@ public class LuceneSearch extends SearchService {
    *  #deleteIndexDoc(org.apache.zeppelin.notebook.Note, 
org.apache.zeppelin.notebook.Paragraph)
    */
   @Override
-  public void deleteParagraphIndex(String noteId, Paragraph p) {
+  public void deleteParagraphIndex(String noteId, Paragraph p) throws 
IOException{
     deleteDoc(noteId, p);
   }
 
   /**
    * Delete note index of paragraph index (when p is not null).
    *
-   * @param noteId
-   * @param p
+   * @param noteId id of the note
+   * @param p paragraph
    */
-  private void deleteDoc(String noteId, Paragraph p) {
+  private void deleteDoc(String noteId, Paragraph p) throws IOException {
     String fullNoteOrJustParagraph = formatDeleteId(noteId, p);
     LOGGER.debug("Deleting note {}, out of: {}", noteId, 
indexWriter.getDocStats().numDocs);
     try {
       indexWriter.deleteDocuments(new WildcardQuery(new Term(ID_FIELD, 
fullNoteOrJustParagraph)));
       indexWriter.commit();
     } catch (IOException e) {
-      LOGGER.error("Failed to delete {} from index by '{}'", noteId, 
fullNoteOrJustParagraph, e);
+      throw new IOException("Failed to delete " + noteId + " from index by '" 
+ fullNoteOrJustParagraph + "'", e);
     }
     LOGGER.debug("Done, index contains {} docs now", 
indexWriter.getDocStats().numDocs);
   }
@@ -387,12 +386,12 @@ public class LuceneSearch extends SearchService {
   /**
    * Indexes a notebook name
    *
-   * @throws IOException
+   * @throws IOException if there is a low-level IO error
    */
-  private void indexNoteName(IndexWriter w, String noteId, String noteName) 
throws IOException {
+  private void indexNoteName(String noteId, String noteName) throws 
IOException {
     LOGGER.debug("Indexing Notebook {}, '{}'", noteId, noteName);
-    if (null == noteName || noteName.isEmpty()) {
-      LOGGER.debug("Skipping empty notebook name");
+    if (StringUtils.isBlank(noteName)) {
+      LOGGER.debug("Skipping empty or blank notebook name");
       return;
     }
     updateDoc(noteId, noteName, null);
@@ -403,7 +402,11 @@ public class LuceneSearch extends SearchService {
     Thread thread = new Thread(() -> {
       LOGGER.info("Starting rebuild index");
       notes.forEach(note -> {
-        addNoteIndex(note);
+        try {
+          addNoteIndex(note);
+        } catch (IOException e) {
+          LOGGER.error("Unable to add Note " + note.getName() + " to Index", 
e);
+        }
         note.unLoad();
       });
       LOGGER.info("Finish rebuild index");

[zeppelin] branch branch-0.10 updated: [ZEPPELIN-5507] Polish LuceneSerach

Reply via email to