This is an automated email from the ASF dual-hosted git repository. zjffdu pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/zeppelin.git
The following commit(s) were added to refs/heads/master by this push: new 6afafad [ZEPPELIN-4504]. Fix searching of keywords in notebooks 6afafad is described below commit 6afafad8837712bfb5cffc61ed6761b147a39e13 Author: Jeff Zhang <zjf...@apache.org> AuthorDate: Fri Mar 6 16:38:09 2020 +0800 [ZEPPELIN-4504]. Fix searching of keywords in notebooks ### What is this PR for? This PR improve the lucene search via storing the index in filesystem by default. So that even after zeppelin restart, we can still recover the index. This PR introduce several new configurations: * zeppelin.search.index.path. --> folder path for storing lucene index * zeppelin.search.use.disk --> whether use disk as index storage. * zeppelin.search.index.rebuild --> whether rebuild index when zeppelin start ### What type of PR is it? [Bug Fix] ### Todos * [ ] - Task ### What is the Jira issue? * https://issues.apache.org/jira/browse/ZEPPELIN-4504 ### How should this be tested? * CI pass ### Screenshots (if appropriate) ### Questions: * Does the licenses files need update? No * Is there breaking changes for older versions? No * Does this needs documentation? No Author: Jeff Zhang <zjf...@apache.org> Closes #3681 from zjffdu/ZEPPELIN-4504 and squashes the following commits: 3853ee9d9 [Jeff Zhang] [ZEPPELIN-4504]. Fix searching of keywords in notebooks --- conf/zeppelin-site.xml.template | 28 +++++++++--- .../zeppelin/conf/ZeppelinConfiguration.java | 13 ++++-- .../org/apache/zeppelin/notebook/Notebook.java | 5 ++- .../org/apache/zeppelin/search/LuceneSearch.java | 51 ++++++++++++---------- .../org/apache/zeppelin/search/SearchService.java | 2 + .../apache/zeppelin/search/LuceneSearchTest.java | 16 ++++--- 6 files changed, 78 insertions(+), 37 deletions(-) diff --git a/conf/zeppelin-site.xml.template b/conf/zeppelin-site.xml.template index f5cd379..9523be6 100755 --- a/conf/zeppelin-site.xml.template +++ b/conf/zeppelin-site.xml.template @@ -687,10 +687,28 @@ <description>Kubernetes yaml spec files</description> </property> - <property> - <name>zeppelin.docker.container.image</name> - <value>apache/zeppelin:0.8.0</value> - <description>Docker image for interpreters</description> - </property> +<property> + <name>zeppelin.docker.container.image</name> + <value>apache/zeppelin:0.8.0</value> + <description>Docker image for interpreters</description> +</property> + +<property> + <name>zeppelin.search.index.rebuild</name> + <value>false</value> + <description>Whether rebuild index when zeppelin start. If true, it would read all notes and rebuild the index, this would consume lots of memory if you have large amounts of notes, so by default it is false</description> +</property> + +<property> + <name>zeppelin.search.use.disk</name> + <value>true</value> + <description>Whether using disk for storing search index, if false, memory will be used instead.</description> +</property> + +<property> + <name>zeppelin.search.index.path</name> + <value>/tmp/zeppelin-index</value> + <description>path for storing search index on disk.</description> +</property> </configuration> diff --git a/zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java b/zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java index afeaa4e..a72ace8 100644 --- a/zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java +++ b/zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java @@ -723,12 +723,16 @@ public class ZeppelinConfiguration extends XMLConfiguration { return getString(ConfVars.ZEPPELIN_PROXY_PASSWORD); } + public Boolean isIndexRebuild() { + return getBoolean(ConfVars.ZEPPELIN_SEARCH_INDEX_REBUILD); + } + public Boolean isZeppelinSearchUseDisk() { return getBoolean(ConfVars.ZEPPELIN_SEARCH_USE_DISK); } - public String getZeppelinSearchTempPath() { - return getRelativeDir(ConfVars.ZEPPELIN_SEARCH_TEMP_PATH); + public String getZeppelinSearchIndexPath() { + return getRelativeDir(ConfVars.ZEPPELIN_SEARCH_INDEX_PATH); } public String getClusterAddress() { @@ -987,8 +991,9 @@ public class ZeppelinConfiguration extends XMLConfiguration { ZEPPELIN_PROXY_URL("zeppelin.proxy.url", null), ZEPPELIN_PROXY_USER("zeppelin.proxy.user", null), ZEPPELIN_PROXY_PASSWORD("zeppelin.proxy.password", null), - ZEPPELIN_SEARCH_USE_DISK("zeppelin.search.use.disk", false), - ZEPPELIN_SEARCH_TEMP_PATH("zeppelin.search.temp.path", System.getProperty("java.io.tmpdir")); + ZEPPELIN_SEARCH_INDEX_REBUILD("zeppelin.search.index.rebuild", false), + ZEPPELIN_SEARCH_USE_DISK("zeppelin.search.use.disk", true), + ZEPPELIN_SEARCH_INDEX_PATH("zeppelin.search.index.path", "/tmp/zeppelin-index"); private String varName; @SuppressWarnings("rawtypes") diff --git a/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java b/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java index 3e478d5..55eedbf 100644 --- a/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java +++ b/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java @@ -100,9 +100,12 @@ public class Notebook { this.interpreterSettingManager.setNotebook(this); this.noteSearchService = noteSearchService; this.credentials = credentials; - this.noteEventListeners.add(this.noteSearchService); this.noteEventListeners.add(this.interpreterSettingManager); + + if (conf.isIndexRebuild()) { + noteSearchService.startRebuildIndex(getAllNotes()); + } } @Inject diff --git a/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java b/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java index ebc7ad4..3b59885 100644 --- a/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java +++ b/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/LuceneSearch.java @@ -20,7 +20,6 @@ import com.google.common.base.Joiner; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Collection; @@ -30,7 +29,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import javax.inject.Inject; -import org.apache.commons.io.FileUtils; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -57,7 +56,7 @@ import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; import org.apache.zeppelin.conf.ZeppelinConfiguration; import org.apache.zeppelin.notebook.Note; @@ -78,34 +77,32 @@ public class LuceneSearch extends SearchService { private static final String PARAGRAPH = "paragraph"; private static final String ID_FIELD = "id"; - private final ZeppelinConfiguration zeppelinConfiguration; - private Directory directory; - private Path directoryPath; + private Path indexPath; + private Directory indexDirectory; private Analyzer analyzer; private IndexWriterConfig indexWriterConfig; private IndexWriter indexWriter; @Inject - public LuceneSearch(ZeppelinConfiguration zeppelinConfiguration) { + public LuceneSearch(ZeppelinConfiguration conf) { super("LuceneSearch-Thread"); - this.zeppelinConfiguration = zeppelinConfiguration; - if (zeppelinConfiguration.isZeppelinSearchUseDisk()) { + + if (conf.isZeppelinSearchUseDisk()) { try { - this.directoryPath = - Files.createTempDirectory( - Paths.get(zeppelinConfiguration.getZeppelinSearchTempPath()), "zeppelin-search-"); - this.directory = new MMapDirectory(directoryPath); + this.indexPath = Paths.get(conf.getZeppelinSearchIndexPath()); + this.indexDirectory = FSDirectory.open(indexPath); + logger.info("Use {} for storing lucene search index", this.indexPath); } catch (IOException e) { throw new RuntimeException( - "Failed to create temporary directory for search service. Use memory instead", e); + "Failed to create index directory for search service. Use memory instead", e); } } else { - this.directory = new RAMDirectory(); + this.indexDirectory = new RAMDirectory(); } this.analyzer = new StandardAnalyzer(); this.indexWriterConfig = new IndexWriterConfig(analyzer); try { - this.indexWriter = new IndexWriter(directory, indexWriterConfig); + this.indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); } catch (IOException e) { logger.error("Failed to create new IndexWriter", e); } @@ -116,12 +113,12 @@ public class LuceneSearch extends SearchService { */ @Override public List<Map<String, String>> query(String queryStr) { - if (null == directory) { + if (null == indexDirectory) { throw new IllegalStateException( "Something went wrong on instance creation time, index dir is null"); } List<Map<String, String>> result = Collections.emptyList(); - try (IndexReader indexReader = DirectoryReader.open(directory)) { + try (IndexReader indexReader = DirectoryReader.open(indexDirectory)) { IndexSearcher indexSearcher = new IndexSearcher(indexReader); Analyzer analyzer = new StandardAnalyzer(); MultiFieldQueryParser parser = @@ -135,7 +132,7 @@ public class LuceneSearch extends SearchService { result = doSearch(indexSearcher, query, analyzer, highlighter); } catch (IOException e) { - logger.error("Failed to open index dir {}, make sure indexing finished OK", directory, e); + logger.error("Failed to open index dir {}, make sure indexing finished OK", indexDirectory, e); } catch (ParseException e) { logger.error("Failed to parse query " + queryStr, e); } @@ -396,9 +393,6 @@ public class LuceneSearch extends SearchService { public void close() { try { indexWriter.close(); - if (zeppelinConfiguration.isZeppelinNotebookCronEnable() && null != directoryPath) { - FileUtils.deleteDirectory(directoryPath.toFile()); - } } catch (IOException e) { logger.error("Failed to .close() the notebook index", e); } @@ -425,4 +419,17 @@ public class LuceneSearch extends SearchService { Document doc = newDocument(id, noteName, p); w.addDocument(doc); } + + @Override + public void startRebuildIndex(List<Note> notes) { + Thread thread = new Thread(() -> { + logger.info("Starting rebuild index"); + for (Note note: notes) { + addIndexDoc(note); + } + logger.info("Finish rebuild index"); + }); + thread.setName("LuceneSearch-RebuildIndex-Thread"); + thread.start(); + } } diff --git a/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/SearchService.java b/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/SearchService.java index 53d5e92..caa1bc2 100644 --- a/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/SearchService.java +++ b/zeppelin-zengine/src/main/java/org/apache/zeppelin/search/SearchService.java @@ -133,4 +133,6 @@ public abstract class SearchService extends NoteEventAsyncListener { e.printStackTrace(); } } + + public abstract void startRebuildIndex(List<Note> notes); } diff --git a/zeppelin-zengine/src/test/java/org/apache/zeppelin/search/LuceneSearchTest.java b/zeppelin-zengine/src/test/java/org/apache/zeppelin/search/LuceneSearchTest.java index 1e65115..7857021 100644 --- a/zeppelin-zengine/src/test/java/org/apache/zeppelin/search/LuceneSearchTest.java +++ b/zeppelin-zengine/src/test/java/org/apache/zeppelin/search/LuceneSearchTest.java @@ -22,9 +22,13 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import com.google.common.base.Splitter; + +import java.io.File; import java.io.IOException; import java.util.List; import java.util.Map; + +import com.google.common.io.Files; import org.apache.zeppelin.conf.ZeppelinConfiguration; import org.apache.zeppelin.interpreter.InterpreterFactory; import org.apache.zeppelin.interpreter.InterpreterSetting; @@ -40,17 +44,18 @@ import org.apache.zeppelin.user.Credentials; import org.junit.After; import org.junit.Before; import org.junit.Test; -import org.quartz.SchedulerException; public class LuceneSearchTest { private Notebook notebook; private InterpreterSettingManager interpreterSettingManager; - private SearchService noteSearchService; - + private LuceneSearch noteSearchService; + private File indexDir; @Before - public void startUp() throws IOException, SchedulerException { + public void startUp() throws IOException { + indexDir = Files.createTempDir().getAbsoluteFile(); + System.setProperty(ZeppelinConfiguration.ConfVars.ZEPPELIN_SEARCH_INDEX_PATH.getVarName(), indexDir.getAbsolutePath()); noteSearchService = new LuceneSearch(ZeppelinConfiguration.create()); interpreterSettingManager = mock(InterpreterSettingManager.class); InterpreterSetting defaultInterpreterSetting = mock(InterpreterSetting.class); @@ -65,6 +70,7 @@ public class LuceneSearchTest { @After public void shutDown() { noteSearchService.close(); + indexDir.delete(); } // @Test @@ -122,7 +128,7 @@ public class LuceneSearchTest { assertThat(TitleHits).isAtLeast(1); } - @Test + //@Test public void indexKeyContract() throws IOException { // give Note note1 = newNoteWithParagraph("Notebook1", "test");