This is an automated email from the ASF dual-hosted git repository. shaofengshi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kylin.git
commit c9337cb4a5e6c4f04c10b7fde1d5e3f6a8ffa25b Author: Zhong <nju_y...@apache.org> AuthorDate: Wed Aug 15 16:04:30 2018 +0800 KYLIN-3489 improve the efficiency of enumerating dictionary values by pre-order visiting --- .../org/apache/kylin/common/util/Dictionary.java | 11 +++++ .../java/org/apache/kylin/dict/TrieDictionary.java | 51 ++++++++++++++++++++++ .../org/apache/kylin/dict/TrieDictionaryTest.java | 30 +++++++++++++ 3 files changed, 92 insertions(+) diff --git a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java index 7c1e8ab..5c47399 100644 --- a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java +++ b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java @@ -24,11 +24,14 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Serializable; import java.io.UnsupportedEncodingException; +import java.util.List; import org.apache.kylin.common.KylinConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.Lists; + /** * A bi-way dictionary that maps from dimension/column values to IDs and vice * versa. By storing IDs instead of real values, the size of cube is @@ -170,6 +173,14 @@ abstract public class Dictionary<T> implements Serializable { abstract public void dump(PrintStream out); + public List<T> enumeratorValues() { + List<T> ret = Lists.newArrayListWithExpectedSize(getSize()); + for (int i = getMinId(); i <= getMaxId(); i++) { + ret.add(getValueFromId(i)); + } + return ret; + } + public int nullId() { return NULL_ID[getSizeOfId()]; } diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java index d531c05..8303bb0 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java @@ -28,6 +28,7 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintStream; import java.util.Arrays; +import java.util.List; import org.apache.kylin.common.util.Bytes; import org.apache.kylin.common.util.BytesUtil; @@ -36,7 +37,9 @@ import org.apache.kylin.common.util.Dictionary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; /** * A dictionary based on Trie data structure that maps enumerations of byte[] to @@ -359,6 +362,54 @@ public class TrieDictionary<T> extends CacheDictionary<T> { } @Override + public List<T> enumeratorValues() { + List<T> result = Lists.newArrayListWithExpectedSize(getSize()); + byte[] buf = new byte[maxValueLength]; + visitNode(headSize, buf, 0, result); + return result; + } + + @VisibleForTesting + List<T> enumeratorValuesByParent() { + return super.enumeratorValues(); + } + + /** + * Visit the trie tree by pre-order + * @param n -- the offset of current node in trieBytes + * @param returnValue -- where return value is written to + */ + private void visitNode(int n, byte[] returnValue, int offset, List<T> result) { + int o = offset; + + // write current node value + int p = n + firstByteOffset; + int len = BytesUtil.readUnsigned(trieBytes, p - 1, 1); + System.arraycopy(trieBytes, p, returnValue, o, len); + o += len; + + // if the value is ended + boolean isEndOfValue = checkFlag(n, BIT_IS_END_OF_VALUE); + if (isEndOfValue) { + T curNodeValue = bytesConvert.convertFromBytes(returnValue, 0, o); + result.add(curNodeValue); + } + + // find a child to continue + int c = getChildOffset(n); + if (c == headSize) // has no children + return; + while (true) { + visitNode(c, returnValue, o, result); + if (checkFlag(c, BIT_IS_LAST_CHILD)) + return; + // go to next child + p = c + firstByteOffset; + c = p + BytesUtil.readUnsigned(trieBytes, p - 1, 1); + } + } + + @Override public void dump(PrintStream out) { out.println("Total " + nValues + " values"); for (int i = 0; i < nValues; i++) { diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java index 600b072..c873035 100644 --- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java @@ -35,13 +35,18 @@ import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import java.util.List; import java.util.NoSuchElementException; import java.util.Random; import java.util.TreeSet; +import java.util.concurrent.TimeUnit; import org.junit.Assert; import org.junit.Test; +import com.google.common.base.Stopwatch; +import com.google.common.collect.Sets; + public class TrieDictionaryTest { public static void main(String[] args) throws Exception { @@ -209,6 +214,31 @@ public class TrieDictionaryTest { } @Test + public void testEnumeratorValues() throws Exception { + testEnumeratorValues("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt"); + testEnumeratorValues("src/test/resources/dict/dw_category_grouping_names.dat"); + } + + private void testEnumeratorValues(String file) throws Exception { + InputStream is = new FileInputStream(file); + ArrayList<String> str = loadStrings(is); + TrieDictionaryBuilder<String> b = newDictBuilder(str); + TrieDictionary<String> dict = b.build(0); + System.out.println("Dictionary size for file " + file + " is " + dict.getSize()); + + Stopwatch sw = new Stopwatch(); + sw.start(); + List<String> values1 = dict.enumeratorValuesByParent(); + System.out.println("By iterating id visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms"); + sw.reset(); + sw.start(); + List<String> values2 = dict.enumeratorValues(); + System.out.println("By pre-order visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms"); + sw.stop(); + assertEquals(Sets.newHashSet(values1), Sets.newHashSet(values2)); + } + + @Test public void englishWordsTest() throws Exception { InputStream is = new FileInputStream("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt"); ArrayList<String> str = loadStrings(is);