Repository: kylin Updated Branches: refs/heads/master 350547e6e -> 21a1c5c09
KYLIN-1851 code review TrieDictionaryForest and Builder Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/21a1c5c0 Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/21a1c5c0 Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/21a1c5c0 Branch: refs/heads/master Commit: 21a1c5c09e8e40adb4b11aed11ff2878fef18134 Parents: 350547e Author: Yang Li <[email protected]> Authored: Thu Nov 17 07:57:31 2016 +0800 Committer: Yang Li <[email protected]> Committed: Thu Nov 17 07:57:31 2016 +0800 ---------------------------------------------------------------------- .../apache/kylin/dict/TrieDictionaryForest.java | 153 ++++++++----------- .../kylin/dict/TrieDictionaryForestBuilder.java | 74 ++++----- 2 files changed, 90 insertions(+), 137 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/21a1c5c0/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java index 38cd0dc..9a26c4c 100755 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java @@ -17,7 +17,6 @@ */ package org.apache.kylin.dict; - import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.DataOutput; @@ -26,7 +25,6 @@ import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.List; import org.apache.kylin.common.util.ByteArray; @@ -35,11 +33,10 @@ import org.apache.kylin.common.util.BytesUtil; import org.apache.kylin.common.util.ClassUtil; import org.apache.kylin.common.util.Dictionary; - /** - * use trie forest to optimize trie dictionary + * Use trie forest to optimize trie dictionary * <p> - * the input data must in an increase order(sort by org.apache.kylin.dict.ByteComparator) + * The input data should be in an increase order (sort by org.apache.kylin.dict.ByteComparator) * <p> * Created by xiefan on 16-10-26. */ @@ -48,34 +45,19 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { private ArrayList<TrieDictionary<T>> trees; - //private ArrayList<byte[]> valueDivide; //find tree - private ArrayList<ByteArray> valueDivide; - private ArrayList<Integer> accuOffset; //find tree + private ArrayList<Integer> accuOffset; //find tree private BytesConverter<T> bytesConvert; private int baseId; - /*public AtomicLong getValueIndexTime = new AtomicLong(0); - - public AtomicLong getValueTime = new AtomicLong(0); - - public AtomicLong binarySearchTime = new AtomicLong(0); - - public AtomicLong copyTime = new AtomicLong(0); - - public AtomicLong getValueIndexTime2 = new AtomicLong(0); - - public AtomicLong getValueTime2 = new AtomicLong(0);*/ - public TrieDictionaryForest() { // default constructor for Writable interface - } - public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees, - ArrayList<ByteArray> valueDivide, ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) { + public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees, ArrayList<ByteArray> valueDivide, // + ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) { this.trees = trees; this.valueDivide = valueDivide; this.accuOffset = accuOffset; @@ -83,16 +65,17 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { this.baseId = baseId; } - @Override public int getMinId() { - if (trees.isEmpty()) return baseId; + if (trees.isEmpty()) + return baseId; return trees.get(0).getMinId() + baseId; } @Override public int getMaxId() { - if (trees.isEmpty()) return baseId - 1; + if (trees.isEmpty()) + return baseId - 1; int index = trees.size() - 1; int id = accuOffset.get(index) + trees.get(index).getMaxId() + baseId; return id; @@ -100,7 +83,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { @Override public int getSizeOfId() { - if (trees.isEmpty()) return -1; + if (trees.isEmpty()) + return -1; int maxOffset = accuOffset.get(accuOffset.size() - 1); TrieDictionary<T> lastTree = trees.get(trees.size() - 1); int sizeOfId = BytesUtil.sizeForValue(baseId + maxOffset + lastTree.getMaxId() + 1); @@ -115,15 +99,13 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { return maxValue; } - //value --> id + // value --> id @Override - protected int getIdFromValueImpl(T value, int roundingFlag) - throws IllegalArgumentException { + protected int getIdFromValueImpl(T value, int roundingFlag) throws IllegalArgumentException { byte[] valueBytes = bytesConvert.convertToBytes(value); return getIdFromValueBytesImpl(valueBytes, 0, valueBytes.length, roundingFlag); } - @Override protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException { @@ -132,13 +114,9 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { return result; } - //id = tree_inner_offset + accumulate_offset + baseId - protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) - throws IllegalArgumentException { - - //long startTime = System.currentTimeMillis(); + // id = tree_inner_offset + accumulate_offset + baseId + protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException { ByteArray search = new ByteArray(value, offset, len); - //copyTime.addAndGet(System.currentTimeMillis() - startTime); int index = findIndexByValue(search); if (index < 0) { if (roundingFlag > 0) { @@ -152,7 +130,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { T curTreeMax = trees.get(index).getValueFromId(trees.get(index).getMaxId()); byte[] b1 = bytesConvert.convertToBytes(curTreeMax); ByteArray ba1 = new ByteArray(b1, 0, b1.length); - //ByteArray ba2 = new ByteArray(value, 0, value.length); if (search.compareTo(ba1) > 0) index++; if (index >= trees.size()) @@ -162,9 +139,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { id = tree.getIdFromValueBytes(value, offset, len, roundingFlag); id = id + accuOffset.get(index); id += baseId; - if (id < 0) { - throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!"); - } return id; } @@ -179,9 +153,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { } @Override - protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) - throws IllegalArgumentException { - //long startTime = System.currentTimeMillis(); + protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) throws IllegalArgumentException { int index = findIndexById(id); int treeInnerOffset = getTreeInnerOffset(id, index); TrieDictionary<T> tree = trees.get(index); @@ -189,20 +161,15 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { return size; } - @Override protected byte[] getValueBytesFromIdImpl(int id) throws IllegalArgumentException { - int index = findIndexById(id); //lower bound - if (index < 0) { - throw new IllegalArgumentException("Tree Not Found. index < 0"); - } + int index = findIndexById(id); int treeInnerOffset = getTreeInnerOffset(id, index); TrieDictionary<T> tree = trees.get(index); byte[] result = tree.getValueBytesFromId(treeInnerOffset); return result; } - private int getTreeInnerOffset(int id, int index) { id -= baseId; id = id - accuOffset.get(index); @@ -259,7 +226,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { out.write(head); } - private void writeBody(DataOutput out) throws IOException { for (int i = 0; i < trees.size(); i++) { TrieDictionary<T> tree = trees.get(i); @@ -307,21 +273,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { } - /*@Override - public boolean equals(Object o) { - if ((o instanceof TrieDictionaryForest) == false) { - logger.info("Equals return false because it's not TrieDictionaryForest"); - return false; - } - TrieDictionaryForest that = (TrieDictionaryForest) o; - if(this.trees.size() != that.getTrees().size()) - return false; - for(int i=0;i<trees.size();i++){ - if(!trees.get(i).equals(that.getTrees().get(i))) return false; - } - return true; - }*/ - @Override public boolean contains(Dictionary other) { if (other.getSize() > this.getSize()) { @@ -337,35 +288,59 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { return true; } - public List<TrieDictionary<T>> getTrees() { + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + baseId; + result = prime * result + ((trees == null) ? 0 : trees.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + TrieDictionaryForest other = (TrieDictionaryForest) obj; + if (baseId != other.baseId) + return false; + if (trees == null) { + if (other.trees != null) + return false; + } else if (!trees.equals(other.trees)) + return false; + return true; + } + + List<TrieDictionary<T>> getTrees() { return Collections.unmodifiableList(this.trees); } + BytesConverter<T> getBytesConvert() { + return bytesConvert; + } + private int findIndexByValue(ByteArray value) { - int index = lowerBound(value, new Comparator<ByteArray>() { - @Override - public int compare(ByteArray o1, ByteArray o2) { - return o1.compareTo(o2); - } - }, this.valueDivide); + int index = lowerBound(value, this.valueDivide); return index; } private int findIndexById(Integer id) { id -= baseId; - int index = lowerBound(id, new Comparator<Integer>() { - @Override - public int compare(Integer o1, Integer o2) { - return o1.compareTo(o2); - } - }, this.accuOffset); + int index = lowerBound(id, this.accuOffset); + if (index < 0) { + throw new IllegalArgumentException("Tree Not Found. index < 0"); + } return index; } - - private static <K> int lowerBound(K lookfor, Comparator<K> comparator, ArrayList<K> list) { + private static <K extends Comparable> int lowerBound(K lookfor, ArrayList<K> list) { if (list == null || list.isEmpty()) - return 0; //return the first tree + return -1; // not found int left = 0; int right = list.size() - 1; int mid = 0; @@ -373,7 +348,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { int comp = 0; while (!found && left <= right) { mid = left + (right - left) / 2; - comp = comparator.compare(lookfor, list.get(mid)); + comp = lookfor.compareTo(list.get(mid)); if (comp < 0) right = mid - 1; else if (comp > 0) @@ -384,7 +359,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { if (found) { return mid; } else { - return Math.min(left, right); //value may be bigger than the right tree + return Math.min(left, right); // value may be bigger than the right tree (could return -1) } } @@ -402,16 +377,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { list.add("paint"); Collections.sort(list); for (String str : list) { - System.out.println("found value:" + str + " index:" + lowerBound(str, new Comparator<String>() { - @Override - public int compare(String o1, String o2) { - return o1.compareTo(o2); - } - }, list)); + System.out.println("found value:" + str + " index:" + lowerBound(str, list)); } } - public BytesConverter<T> getBytesConvert() { - return bytesConvert; - } } http://git-wip-us.apache.org/repos/asf/kylin/blob/21a1c5c0/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java index 1ceac27..8c577cf 100755 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java @@ -24,12 +24,14 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; - +/** + * Build a trie dictionary forest if the input values is ordered, or the forest falls back to a single trie. + */ public class TrieDictionaryForestBuilder<T> { public static int DEFAULT_MAX_TRIE_TREE_SIZE_MB = 500; - //public static int MaxTrieTreeSize = 1024;//1k + private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForestBuilder.class); private BytesConverter<T> bytesConverter; @@ -41,11 +43,9 @@ public class TrieDictionaryForestBuilder<T> { private ArrayList<ByteArray> valueDivide = new ArrayList<>(); //find tree - private ArrayList<Integer> accuOffset = new ArrayList<>(); //find tree - - private ByteArray previousValue = null; //value use for remove duplicate + private ArrayList<Integer> accuOffset = new ArrayList<>(); //find tree - private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForestBuilder.class); + private ByteArray previousValue = null; //value use for remove duplicate private int baseId; @@ -55,7 +55,6 @@ public class TrieDictionaryForestBuilder<T> { private boolean isOrdered = true; - public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter) { this(bytesConverter, 0); } @@ -68,46 +67,45 @@ public class TrieDictionaryForestBuilder<T> { this.bytesConverter = bytesConverter; this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter); this.baseId = baseId; - curOffset = 0; + this.curOffset = 0; this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024; - logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B"); } - public void addValue(T value) { - if (value == null) return; + if (value == null) + return; byte[] valueBytes = bytesConverter.convertToBytes(value); addValue(new ByteArray(valueBytes, 0, valueBytes.length)); } public void addValue(byte[] value) { - if (value == null) return; + if (value == null) + return; ByteArray array = new ByteArray(value, 0, value.length); addValue(array); } public void addValue(ByteArray value) { - //System.out.println("value length:"+value.length); - if (value == null) return; - //logger.info("going to add value:" + new String(value.array())); - if (previousValue == null) { - previousValue = value; - } else { + if (value == null) + return; + if (previousValue != null && isOrdered) { int comp = previousValue.compareTo(value); if (comp == 0) { - //logger.info("find duplicate value:" + new String(value.array())); return; //duplicate value } - if (comp > 0 && isOrdered) { - logger.info("values not in ascending order:" + new String(value.array())); + if (comp > 0) { + logger.info("values not in ascending order, previous '{}', current '{}'", previousValue, value); isOrdered = false; - //System.out.println("."); + if (trees.size() > 0) { + throw new IllegalStateException("Invalid input data. Unordered data cannot be split into multi trees"); + } } } - this.trieBuilder.addValue(value.array()); previousValue = value; - this.curTreeSize += value.length(); - if (curTreeSize >= this.maxTrieTreeSize) { + trieBuilder.addValue(value.array()); + curTreeSize += value.length(); + + if (curTreeSize >= maxTrieTreeSize && isOrdered) { TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); @@ -115,28 +113,17 @@ public class TrieDictionaryForestBuilder<T> { } public TrieDictionaryForest<T> build() { - if (curTreeSize != 0) { //last tree + if (curTreeSize != 0) { //last tree TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } - TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, - this.valueDivide, this.accuOffset, this.bytesConverter, baseId); - - //log - logger.info("tree num:" + forest.getTrees().size()); - StringBuilder sb = new StringBuilder(); - for (ByteArray ba : valueDivide) { - sb.append(new String(ba.array()) + " "); - } - logger.info("value divide:" + sb.toString()); - /* - If input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly. - */ + TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId); + + // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly. if (forest.getTrees().size() > 1 && !isOrdered) { - throw new IllegalStateException("Invalid input data.Unordered data can not be split into multi trees"); + throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees"); } - return forest; } @@ -144,7 +131,7 @@ public class TrieDictionaryForestBuilder<T> { return maxTrieTreeSize; } - public void setMaxTrieTreeSize(int maxTrieTreeSize) { + void setMaxTrieTreeSize(int maxTrieTreeSize) { this.maxTrieTreeSize = maxTrieTreeSize; logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B"); } @@ -156,7 +143,6 @@ public class TrieDictionaryForestBuilder<T> { byte[] valueBytes = tree.getValueBytesFromId(minId); valueDivide.add(new ByteArray(valueBytes, 0, valueBytes.length)); curOffset += (tree.getMaxId() + 1); - //System.out.println(" curOffset:"+ curOffset); } private void reset() { @@ -169,7 +155,7 @@ public class TrieDictionaryForestBuilder<T> { try { config = KylinConfig.getInstanceFromEnv(); } catch (RuntimeException e) { - logger.info("can not get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB"); + logger.info("cannot get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB"); } int maxTrieTreeSizeMB; if (config != null) {
