Repository: kylin Updated Branches: refs/heads/master 00ba74327 -> 0a0c5547d
KYLIN-1851 Refactor NumberDictionaryForest Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/0a0c5547 Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/0a0c5547 Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/0a0c5547 Branch: refs/heads/master Commit: 0a0c5547d4f7d87696822b322a9a78112ec26c40 Parents: 00ba743 Author: Li Yang <[email protected]> Authored: Thu Nov 17 11:34:43 2016 +0800 Committer: Li Yang <[email protected]> Committed: Thu Nov 17 11:34:54 2016 +0800 ---------------------------------------------------------------------- .../apache/kylin/dict/DictionaryGenerator.java | 2 +- .../kylin/dict/NumberDictionaryBuilder.java | 1 - .../kylin/dict/NumberDictionaryForest.java | 284 ------------------- .../dict/NumberDictionaryForestBuilder.java | 68 +++-- .../kylin/dict/TrieDictionaryForestTest.java | 260 ++++++++--------- .../mr/steps/NumberDictionaryForestTest.java | 19 +- .../mr/steps/SelfDefineSortableKeyTest.java | 22 +- 7 files changed, 180 insertions(+), 476 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java index 8eafe5f..ad07423 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java @@ -156,7 +156,7 @@ public class DictionaryGenerator { private static class NumberDictBuilder implements IDictionaryBuilder { @Override public Dictionary<String> build(DictionaryInfo dictInfo, IDictionaryValueEnumerator valueEnumerator, int baseId, int nSamples, 
ArrayList<String> returnSamples) throws IOException { - NumberDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(new StringBytesConverter(), baseId); + NumberDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(baseId); byte[] value; while (valueEnumerator.moveNext()) { value = valueEnumerator.current(); http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryBuilder.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryBuilder.java index 6d7d0db..27d81ba 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryBuilder.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryBuilder.java @@ -22,7 +22,6 @@ import org.apache.kylin.common.util.Bytes; /** * @author yangli9 - * */ public class NumberDictionaryBuilder<T> extends TrieDictionaryBuilder<T> { http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java deleted file mode 100644 index fdf1e68..0000000 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -package org.apache.kylin.dict; - -import org.apache.kylin.common.util.Bytes; -import org.apache.kylin.common.util.Dictionary; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.io.PrintStream; - -/** - * Created by xiefan on 16-11-1. - * <p> - * notice:number dictionary forest currently could not handle - * very big or very small double and float value such as 4.9E-324 - */ -public class NumberDictionaryForest<T> extends Dictionary<T> { - - public static final int MAX_DIGITS_BEFORE_DECIMAL_POINT = 19; - - // encode a number into an order preserving byte sequence - // for positives -- padding '0' - // for negatives -- '-' sign, padding '9', invert digits, and terminate by ';' - static class NumberBytesCodec { - int maxDigitsBeforeDecimalPoint; - byte[] buf; - int bufOffset; - int bufLen; - - NumberBytesCodec(int maxDigitsBeforeDecimalPoint) { - this.maxDigitsBeforeDecimalPoint = maxDigitsBeforeDecimalPoint; - this.buf = new byte[maxDigitsBeforeDecimalPoint * 3]; - this.bufOffset = 0; - this.bufLen = 0; - } - - void encodeNumber(byte[] value, int offset, int len) { - if (len == 0) { - bufOffset = 0; - bufLen = 0; - return; - } - - if (len > buf.length) { - throw new IllegalArgumentException("Too many digits for NumberDictionary: " + Bytes.toString(value, offset, len) + ". 
Internal buffer is only " + buf.length + " bytes"); - } - - boolean negative = value[offset] == '-'; - - // terminate negative ';' - int start = buf.length - len; - int end = buf.length; - if (negative) { - start--; - end--; - buf[end] = ';'; - } - - // copy & find decimal point - int decimalPoint = end; - for (int i = start, j = offset; i < end; i++, j++) { - buf[i] = value[j]; - if (buf[i] == '.' && i < decimalPoint) { - decimalPoint = i; - } - } - // remove '-' sign - if (negative) { - start++; - } - - // prepend '0' - int nZeroPadding = maxDigitsBeforeDecimalPoint - (decimalPoint - start); - if (nZeroPadding < 0 || nZeroPadding + 1 > start) - throw new IllegalArgumentException("Too many digits for NumberDictionary: " + Bytes.toString(value, offset, len) + ". Expect " + maxDigitsBeforeDecimalPoint + " digits before decimal point at max."); - for (int i = 0; i < nZeroPadding; i++) { - buf[--start] = '0'; - } - - // consider negative - if (negative) { - buf[--start] = '-'; - for (int i = start + 1; i < buf.length; i++) { - int c = buf[i]; - if (c >= '0' && c <= '9') { - buf[i] = (byte) ('9' - (c - '0')); - } - } - } else { - buf[--start] = '0'; - } - - bufOffset = start; - bufLen = buf.length - start; - } - - int decodeNumber(byte[] returnValue, int offset) { - if (bufLen == 0) { - return 0; - } - - int in = bufOffset; - int end = bufOffset + bufLen; - int out = offset; - - // sign - boolean negative = buf[in] == '-'; - if (negative) { - returnValue[out++] = '-'; - in++; - end--; - } - - // remove padding - byte padding = (byte) (negative ? 
'9' : '0'); - for (; in < end; in++) { - if (buf[in] != padding) - break; - } - - // all paddings before '.', special case for '0' - if (in == end || !(buf[in] >= '0' && buf[in] <= '9')) { - returnValue[out++] = '0'; - } - - // copy the rest - if (negative) { - for (; in < end; in++, out++) { - int c = buf[in]; - if (c >= '0' && c <= '9') { - c = '9' - (c - '0'); - } - returnValue[out] = (byte) c; - } - } else { - System.arraycopy(buf, in, returnValue, out, end - in); - out += end - in; - } - - return out - offset; - } - } - - static ThreadLocal<NumberBytesCodec> localCodec = - new ThreadLocal<NumberBytesCodec>(); - - // ============================================================================ - - private TrieDictionaryForest<T> dict; - - private BytesConverter<T> converter; - - public NumberDictionaryForest() { - } - - public NumberDictionaryForest(TrieDictionaryForest<T> dict, BytesConverter<T> converter) { - this.dict = dict; - this.converter = converter; - } - - protected NumberBytesCodec getCodec() { - NumberBytesCodec codec = localCodec.get(); - if (codec == null) { - codec = new NumberBytesCodec(MAX_DIGITS_BEFORE_DECIMAL_POINT); - localCodec.set(codec); - } - return codec; - } - - @Override - public int getMinId() { - return dict.getMinId(); - } - - @Override - public int getMaxId() { - return dict.getMaxId(); - } - - @Override - public int getSizeOfId() { - return dict.getSizeOfId(); - } - - @Override - public int getSizeOfValue() { - return dict.getSizeOfValue(); - } - - @Override - public boolean contains(Dictionary<?> another) { - return dict.contains(another); - } - - @Override - protected int getIdFromValueImpl(T value, int roundingFlag) { - if (value == null) return -1; - byte[] data = converter.convertToBytes(value); - return getIdFromValueBytesImpl(data, 0, data.length, roundingFlag); - } - - @Override - protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) { - NumberBytesCodec codec = getCodec(); - 
codec.encodeNumber(value, offset, len); - return this.dict.getIdFromValueBytesImpl(codec.buf, codec.bufOffset, codec.bufLen, roundingFlag); - } - - @Override - protected T getValueFromIdImpl(int id) { - byte[] data = getValueBytesFromIdImpl(id); - if (data == null) return null; - else return converter.convertFromBytes(data, 0, data.length); - } - - @Override - protected byte[] getValueBytesFromIdImpl(int id) { - NumberBytesCodec codec = getCodec(); - codec.bufOffset = 0; - byte[] buf = new byte[dict.getSizeOfValue()]; - codec.bufLen = getValueBytesFromIdImpl(id, buf, 0); - - if (codec.bufLen == buf.length) { - return buf; - } else { - byte[] result = new byte[codec.bufLen]; - System.arraycopy(buf, 0, result, 0, codec.bufLen); - return result; - } - } - - @Override - protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) { - NumberBytesCodec codec = getCodec(); - codec.bufOffset = 0; - codec.bufLen = this.dict.getValueBytesFromIdImpl(id, codec.buf, 0); - return codec.decodeNumber(returnValue, offset); - } - - @Override - public void dump(PrintStream out) { - dict.dump(out); - } - - @Override - public void write(DataOutput out) throws IOException { - dict.write(out); - } - - @Override - public void readFields(DataInput in) throws IOException { - this.dict = new TrieDictionaryForest<>(); - dict.readFields(in); - this.converter = this.dict.getBytesConvert(); - } - - public BytesConverter<T> getConverter() { - return converter; - } - - public int getTreeSize(){ - return this.dict.getTrees().size(); - } - - -} http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java index 519d4c3..4bd6c0f 100644 --- 
a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java @@ -19,49 +19,57 @@ package org.apache.kylin.dict; import org.apache.kylin.common.util.Bytes; +import org.apache.kylin.dict.NumberDictionary.NumberBytesCodec; /** * Created by xiefan on 16-11-2. */ -public class NumberDictionaryForestBuilder<T> { +public class NumberDictionaryForestBuilder extends TrieDictionaryForestBuilder<String> { - private TrieDictionaryForestBuilder<T> trieBuilder; + public static class Number2BytesConverter implements BytesConverter<String> { - private BytesConverter<T> bytesConverter; + static final int MAX_DIGITS_BEFORE_DECIMAL_POINT = NumberDictionary.MAX_DIGITS_BEFORE_DECIMAL_POINT; + static final ThreadLocal<NumberBytesCodec> LOCAL = new ThreadLocal<NumberBytesCodec>(); - private NumberDictionaryForest.NumberBytesCodec codec = new NumberDictionaryForest.NumberBytesCodec(NumberDictionaryForest.MAX_DIGITS_BEFORE_DECIMAL_POINT); + static NumberBytesCodec getCodec() { + NumberBytesCodec codec = LOCAL.get(); + if (codec == null) { + codec = new NumberBytesCodec(MAX_DIGITS_BEFORE_DECIMAL_POINT); + LOCAL.set(codec); + } + return codec; + } + + @Override + public byte[] convertToBytes(String v) { + NumberBytesCodec codec = getCodec(); + byte[] num = Bytes.toBytes(v); + codec.encodeNumber(num, 0, num.length); + return Bytes.copy(codec.buf, codec.bufOffset, codec.bufLen); + } - public NumberDictionaryForestBuilder(BytesConverter<T> bytesConverter) { - this(bytesConverter, 0); + @Override + public String convertFromBytes(byte[] b, int offset, int length) { + NumberBytesCodec codec = getCodec(); + byte[] backup = codec.buf; + codec.buf = b; + codec.bufOffset = offset; + codec.bufLen = length; + int len = codec.decodeNumber(backup, 0); + codec.buf = backup; + return Bytes.toString(backup, 0, len); + } } - public NumberDictionaryForestBuilder(BytesConverter<T> 
bytesConverter, int baseId) { - this.trieBuilder = new TrieDictionaryForestBuilder<T>(bytesConverter, baseId); - this.bytesConverter = bytesConverter; + public NumberDictionaryForestBuilder() { + super(new Number2BytesConverter()); } - public NumberDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId, int maxTrieSizeMB) { - this.trieBuilder = new TrieDictionaryForestBuilder<T>(bytesConverter, baseId, maxTrieSizeMB); - this.bytesConverter = bytesConverter; + public NumberDictionaryForestBuilder(int baseId) { + super(new Number2BytesConverter(), 0); } - public void addValue(T value) { - addValue(bytesConverter.convertToBytes(value)); - } - - public void addValue(byte[] value) { - codec.encodeNumber(value, 0, value.length); - byte[] copy = Bytes.copy(codec.buf, codec.bufOffset, codec.bufLen); - this.trieBuilder.addValue(copy); - } - - //TODO:ensure ordered - public NumberDictionaryForest<T> build() { - TrieDictionaryForest<T> forest = trieBuilder.build(); - return new NumberDictionaryForest<T>(forest, bytesConverter); - } - - public void setMaxTrieSize(int size) { - this.trieBuilder.setMaxTrieTreeSize(size); + public NumberDictionaryForestBuilder(int baseId, int maxTrieSizeMB) { + super(new Number2BytesConverter(), 0, maxTrieSizeMB); } } http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java index 07511d1..c4c0fd8 100755 --- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java @@ -16,29 +16,41 @@ * limitations under the License. 
*/ - package org.apache.kylin.dict; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Random; +import java.util.TreeSet; -import org.apache.kylin.common.util.Array; -import org.apache.kylin.common.util.MemoryBudgetController; import org.junit.Ignore; import org.junit.Test; -import java.io.*; -import java.util.*; - -import static org.junit.Assert.*; -import static org.junit.Assert.assertEquals; - /** * Created by xiefan on 16-10-26. 
*/ public class TrieDictionaryForestTest { - - @Test public void testBasicFound() { ArrayList<String> strs = new ArrayList<String>(); @@ -62,7 +74,7 @@ public class TrieDictionaryForestTest { System.out.println("test ok"); } - @Test //one string one tree + @Test //one string one tree public void testMultiTree() { ArrayList<String> strs = new ArrayList<String>(); strs.add("part"); @@ -71,7 +83,7 @@ public class TrieDictionaryForestTest { strs.add("party"); strs.add("parties"); strs.add("paint"); - strs.add("一二三"); //Chinese test + strs.add("一二三"); //Chinese test strs.add("四五六"); strs.add(""); Collections.sort(strs, new ByteComparator<String>(new StringBytesConverter())); @@ -91,7 +103,7 @@ public class TrieDictionaryForestTest { } @Test - public void testNullValue(){ + public void testNullValue() { //encounter null value when building dictionary ArrayList<String> strs = new ArrayList<String>(); strs.add(null); @@ -102,15 +114,14 @@ public class TrieDictionaryForestTest { TrieDictionaryForest<String> dict = builder.build(); dict.dump(System.out); //null value query - int id = dict.getIdFromValue(null,0); + int id = dict.getIdFromValue(null, 0); System.out.println(id); - id = dict.getIdFromValue(null,1); + id = dict.getIdFromValue(null, 1); System.out.println(id); - id = dict.getIdFromValue(null,-1); + id = dict.getIdFromValue(null, -1); System.out.println(id); } - @Test public void testBigDataSet() { //h=generate data @@ -209,7 +220,6 @@ public class TrieDictionaryForestTest { testStringDictionary(str, notFound); } - @Test public void dictionaryContainTest() { ArrayList<String> str = new ArrayList<String>(); @@ -263,103 +273,102 @@ public class TrieDictionaryForestTest { } } - @Test - public void roundingFlagTest(){ + public void roundingFlagTest() { ArrayList<String> testData = new ArrayList<>(); testData.add("b"); testData.add("bdd"); testData.add("ccc"); int baseId = 10; - TrieDictionaryForestBuilder<String> b = 
TrieDictionaryForestTest.newDictBuilder(testData,baseId, 0); + TrieDictionaryForestBuilder<String> b = TrieDictionaryForestTest.newDictBuilder(testData, baseId, 0); TrieDictionaryForest<String> dict = b.build(); //left String smallerStr = "a"; int id; - try{ - id = dict.getIdFromValue(smallerStr,0); - fail("should throw IllegalArgumentException,but id is:"+id); - }catch (IllegalArgumentException e){ + try { + id = dict.getIdFromValue(smallerStr, 0); + fail("should throw IllegalArgumentException,but id is:" + id); + } catch (IllegalArgumentException e) { //correct } - try{ - id = dict.getIdFromValue(smallerStr,-1); - fail("should throw IllegalArgumentException,but id is:"+id); - }catch (IllegalArgumentException e){ + try { + id = dict.getIdFromValue(smallerStr, -1); + fail("should throw IllegalArgumentException,but id is:" + id); + } catch (IllegalArgumentException e) { //correct } - id = dict.getIdFromValue(smallerStr,1); - assertEquals(baseId,id); + id = dict.getIdFromValue(smallerStr, 1); + assertEquals(baseId, id); //middle String middleStr = "bd"; - try{ - id = dict.getIdFromValue(middleStr,0); - fail("should throw IllegalArgumentException,but id is:"+id); - }catch (IllegalArgumentException e){ + try { + id = dict.getIdFromValue(middleStr, 0); + fail("should throw IllegalArgumentException,but id is:" + id); + } catch (IllegalArgumentException e) { //correct } - id = dict.getIdFromValue(middleStr,-1); - assertEquals(baseId,id); - id = dict.getIdFromValue(middleStr,1); - assertEquals(baseId+1,id); + id = dict.getIdFromValue(middleStr, -1); + assertEquals(baseId, id); + id = dict.getIdFromValue(middleStr, 1); + assertEquals(baseId + 1, id); //right String rightStr = "e"; - try{ - id = dict.getIdFromValue(rightStr,0); - fail("should throw IllegalArgumentException,but id is:"+id); - }catch (IllegalArgumentException e){ + try { + id = dict.getIdFromValue(rightStr, 0); + fail("should throw IllegalArgumentException,but id is:" + id); + } catch (IllegalArgumentException 
e) { //correct } - id = dict.getIdFromValue(rightStr,-1); - assertEquals(baseId+2,id); - try{ - id = dict.getIdFromValue(rightStr,1); - fail("should throw IllegalArgumentException,but id is:"+id); - }catch (IllegalArgumentException e){ + id = dict.getIdFromValue(rightStr, -1); + assertEquals(baseId + 2, id); + try { + id = dict.getIdFromValue(rightStr, 1); + fail("should throw IllegalArgumentException,but id is:" + id); + } catch (IllegalArgumentException e) { //correct } } @Test - public void stringDictRoundFlagTest(){ + public void stringDictRoundFlagTest() { TreeSet<String> set = new TreeSet<>(new ByteComparator<>(new StringBytesConverter())); - Iterator<String> it = new RandomStrings(10*10000).iterator(); + Iterator<String> it = new RandomStrings(10 * 10000).iterator(); int size = 0; - while(it.hasNext()){ + while (it.hasNext()) { BytesConverter converter = new StringBytesConverter(); String str = it.next(); set.add(str); size += converter.convertToBytes(str).length; } int treeNum = 5; - TrieDictionaryForestBuilder<String> builder = newDictBuilder(set.iterator(),0,size / treeNum); + TrieDictionaryForestBuilder<String> builder = newDictBuilder(set.iterator(), 0, size / treeNum); TrieDictionaryForest<String> dict = builder.build(); //dict.dump(System.out); //test roundingFlag > 0 - Iterator<String> it2 = new RandomStrings(100*10000).iterator(); - while(it2.hasNext()){ + Iterator<String> it2 = new RandomStrings(100 * 10000).iterator(); + while (it2.hasNext()) { String query = it2.next(); //System.out.println("query:"+query); try { int id = dict.getIdFromValue(query, 1); - assertEquals(set.ceiling(query),dict.getValueFromId(id)); - }catch(IllegalArgumentException e){ + assertEquals(set.ceiling(query), dict.getValueFromId(id)); + } catch (IllegalArgumentException e) { assertNull(set.ceiling(query)); } } //test roundingFlag < 0 - Iterator<String> it3 = new RandomStrings(100*10000).iterator(); - while(it3.hasNext()){ + Iterator<String> it3 = new RandomStrings(100 * 
10000).iterator(); + while (it3.hasNext()) { String query = it3.next(); try { int id = dict.getIdFromValue(query, -1); - assertEquals(set.floor(query),dict.getValueFromId(id)); - }catch(IllegalArgumentException e){ + assertEquals(set.floor(query), dict.getValueFromId(id)); + } catch (IllegalArgumentException e) { assertNull(set.floor(query)); } } @@ -367,15 +376,15 @@ public class TrieDictionaryForestTest { } @Test - public void longDictRoundingFlagTest(){ + public void longDictRoundingFlagTest() { TreeSet<String> set = new TreeSet<>(new Comparator<String>() { @Override public int compare(String o1, String o2) { - try{ + try { Long l1 = Long.parseLong(o1); Long l2 = Long.parseLong(o2); return l1.compareTo(l2); - }catch(NumberFormatException e){ + } catch (NumberFormatException e) { e.printStackTrace(); return 0; } @@ -385,49 +394,45 @@ public class TrieDictionaryForestTest { int k = -48481; int size = 0; StringBytesConverter converter = new StringBytesConverter(); - for(int i=0;i<num;i++) - { - String value = k+""; + for (int i = 0; i < num; i++) { + String value = k + ""; set.add(value); k += 1; String basic = "-9999999999999952517"; size += converter.convertToBytes(basic).length; } - System.out.println("tree num:"+size); + System.out.println("tree num:" + size); int treeNum = 5; //TrieDictionaryForestBuilder<String> builder = newDictBuilder(set.iterator(),0,size / treeNum); //TrieDictionaryForest<String> dict = builder.build(); - NumberDictionaryForestBuilder<String> builder = new NumberDictionaryForestBuilder<String>(new StringBytesConverter(),0); - builder.setMaxTrieSize(size / treeNum); + TrieDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(0); + builder.setMaxTrieTreeSize(size / treeNum); Iterator<String> it = set.iterator(); - while(it.hasNext()) + while (it.hasNext()) builder.addValue(it.next()); - NumberDictionaryForest<String> dict = builder.build(); - System.out.println(dict.getTreeSize()); + TrieDictionaryForest<String> dict = 
builder.build(); + System.out.println(dict.getTrees().size()); int testTimes = 100 * 10000; Random rand = new Random(System.currentTimeMillis()); //test roundingFlag > 0 - for(int i=0;i<testTimes;i++) - { - String query = rand.nextInt(2*num)+""; + for (int i = 0; i < testTimes; i++) { + String query = rand.nextInt(2 * num) + ""; try { int id = dict.getIdFromValue(query, 1); - assertEquals(set.ceiling(query),dict.getValueFromId(id)); - }catch(IllegalArgumentException e){ + assertEquals(set.ceiling(query), dict.getValueFromId(id)); + } catch (IllegalArgumentException e) { assertNull(set.ceiling(query)); } } - //test roundingFlag < 0 - for(int i=0;i<testTimes;i++) - { - String query = rand.nextInt(2*num)+""; + for (int i = 0; i < testTimes; i++) { + String query = rand.nextInt(2 * num) + ""; try { int id = dict.getIdFromValue(query, -1); - assertEquals(set.floor(query),dict.getValueFromId(id)); - }catch(IllegalArgumentException e){ + assertEquals(set.floor(query), dict.getValueFromId(id)); + } catch (IllegalArgumentException e) { assertNull(set.floor(query)); } } @@ -480,15 +485,15 @@ public class TrieDictionaryForestTest { */ @Ignore @Test - public void doubleDictRoundingFlagTest(){ + public void doubleDictRoundingFlagTest() { TreeSet<String> set = new TreeSet<>(new Comparator<String>() { @Override public int compare(String o1, String o2) { - try{ + try { Double d1 = Double.parseDouble(o1); Double d2 = Double.parseDouble(o2); return d1.compareTo(d2); - }catch(NumberFormatException e){ + } catch (NumberFormatException e) { e.printStackTrace(); return 0; } @@ -498,9 +503,8 @@ public class TrieDictionaryForestTest { double k = -0.0; int size = 0; StringBytesConverter converter = new StringBytesConverter(); - for(int i=0;i<num;i++) - { - String value = k+""; + for (int i = 0; i < num; i++) { + String value = k + ""; set.add(value); k += 1.55; String basic = "-9999999999999952517"; @@ -509,21 +513,20 @@ public class TrieDictionaryForestTest { int treeNum = 5; 
//TrieDictionaryForestBuilder<String> builder = newDictBuilder(set.iterator(),0,size / treeNum); //TrieDictionaryForest<String> dict = builder.build(); - NumberDictionaryForestBuilder<String> builder = new NumberDictionaryForestBuilder<String>(new StringBytesConverter(),0); - builder.setMaxTrieSize(size / treeNum); + NumberDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(0); + builder.setMaxTrieTreeSize(size / treeNum); Iterator<String> it = set.iterator(); - while(it.hasNext()){ + while (it.hasNext()) { String str = it.next(); - if(str.contains("E")){ + if (str.contains("E")) { set.remove(str); - } - else{ + } else { builder.addValue(str); } } - NumberDictionaryForest<String> dict = builder.build(); - System.out.println("tree size:"+dict.getTreeSize()); + TrieDictionaryForest<String> dict = builder.build(); + System.out.println("tree size:" + dict.getTrees().size()); System.out.println("--------------dict-----------------"); dict.dump(System.out); System.out.println("--------------set-------------------"); @@ -531,7 +534,7 @@ public class TrieDictionaryForestTest { //test special value String query1 = "183.82499999999996"; - int id1 = dict.getIdFromValue(query1,1); + int id1 = dict.getIdFromValue(query1, 1); String actualValue = dict.getValueFromId(id1); //System.out.println("id:"+id1+" value:"+actualValue); //System.out.println(set.ceiling(query1)); @@ -540,40 +543,36 @@ public class TrieDictionaryForestTest { int testTimes = 1000000; double queryBasic = -145.355; //test roundingFlag > 0 - for(int i=0;i<testTimes;i++) - { - String query = queryBasic+""; + for (int i = 0; i < testTimes; i++) { + String query = queryBasic + ""; //System.out.println("query:"+query); queryBasic += 1.51; - if(query.contains("E")) + if (query.contains("E")) continue; try { int id = dict.getIdFromValue(query, 1); - assertEquals(set.ceiling(query),dict.getValueFromId(id)); - }catch(IllegalArgumentException e){ + assertEquals(set.ceiling(query), 
dict.getValueFromId(id)); + } catch (IllegalArgumentException e) { assertNull(set.ceiling(query)); } } - //test roundingFlag < 0 queryBasic = -551.3588; - for(int i=0;i<testTimes;i++) - { - String query = queryBasic+""; + for (int i = 0; i < testTimes; i++) { + String query = queryBasic + ""; queryBasic += 1.0; - if(query.contains("E")) + if (query.contains("E")) continue; try { int id = dict.getIdFromValue(query, -1); - assertEquals(set.floor(query),dict.getValueFromId(id)); - }catch(IllegalArgumentException e){ + assertEquals(set.floor(query), dict.getValueFromId(id)); + } catch (IllegalArgumentException e) { assertNull(set.floor(query)); } } } - private static TrieDictionaryForest<String> testSerialize(TrieDictionaryForest<String> dict) { try { ByteArrayOutputStream bout = new ByteArrayOutputStream(); @@ -603,7 +602,7 @@ public class TrieDictionaryForestTest { dict.dump(System.out); byte[] data = converter.convertToBytes(value); int id = dict.getIdFromValueBytes(data,0,data.length); - + }*/ /* @@ -612,14 +611,14 @@ public class TrieDictionaryForestTest { @Ignore @Test public void memoryUsageBenchmarkOldDictTest() throws Exception { - System.out.println("max memory:"+Runtime.getRuntime().maxMemory()); + System.out.println("max memory:" + Runtime.getRuntime().maxMemory()); System.gc(); Thread.currentThread().sleep(1000); NumberDictionaryBuilder<String> b = new NumberDictionaryBuilder<>(new StringBytesConverter()); int k = 0; - while(true){ - b.addValue(k+""); - if(k%100000 == 0) + while (true) { + b.addValue(k + ""); + if (k % 100000 == 0) System.out.println(k); k++; } @@ -629,14 +628,14 @@ public class TrieDictionaryForestTest { @Ignore @Test public void memoryUsageBenchmarkNewDictForestTest() throws Exception { - System.out.println("max memory:"+Runtime.getRuntime().maxMemory()); + System.out.println("max memory:" + Runtime.getRuntime().maxMemory()); System.gc(); Thread.currentThread().sleep(3000); - NumberDictionaryForestBuilder<String> b = new 
NumberDictionaryForestBuilder<>(new StringBytesConverter(),0,0); + NumberDictionaryForestBuilder b = new NumberDictionaryForestBuilder(0, 0); int k = 0; - while(true){ - b.addValue(k+""); - if(k%100000 == 0) + while (true) { + b.addValue(k + ""); + if (k % 100000 == 0) System.out.println(k); k++; } @@ -648,12 +647,11 @@ public class TrieDictionaryForestTest { maxTrieSize:50M entry:128400000 maxTrieSize:25M entry:148100000 maxTrieSize:0M entry: 5000000 - + 5-8 */ } - @Deprecated private long getSystemCurUsedMemory() throws Exception { System.gc(); @@ -703,19 +701,15 @@ public class TrieDictionaryForestTest { System.out.println("times:" + i); } - System.out.println("compare build time. Old trie : " + oldDictTotalBuildTime / 1000.0 + "s.New trie : " + newDictTotalBuildTime / 1000.0 + "s"); } - @Test public void queryTimeBenchmarkTest() throws Exception { int count = (int) (Integer.MAX_VALUE * 0.8 / 640); - //int count = (int) (2); benchmarkStringDictionary(new RandomStrings(count)); } - private void evaluateDataSize(ArrayList<String> list) { long size = 0; for (String str : list) @@ -763,7 +757,6 @@ public class TrieDictionaryForestTest { array[id] = converter.convertToBytes(value); } - // System.out.println("Dict size in bytes: " + //MemoryUtil.deepMemoryUsageOf(dict)); // System.out.println("Map size in bytes: " + @@ -879,7 +872,7 @@ public class TrieDictionaryForestTest { } } int maxId = dict.getMaxId(); - int[] notExistIds = {-10, -20, -Integer.MIN_VALUE, -Integer.MAX_VALUE, maxId + 1, maxId + 2}; + int[] notExistIds = { -10, -20, -Integer.MIN_VALUE, -Integer.MAX_VALUE, maxId + 1, maxId + 2 }; for (Integer i : notExistIds) { try { dict.getValueFromId(i); @@ -925,7 +918,7 @@ public class TrieDictionaryForestTest { public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterator<String> strs, int baseId, int treeSize) { TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId); 
b.setMaxTrieTreeSize(treeSize); - while(strs.hasNext()) + while (strs.hasNext()) b.addValue(strs.next()); return b; } @@ -1000,7 +993,6 @@ public class TrieDictionaryForestTest { return r; } - private ArrayList<String> getTestData(int count) { RandomStrings rs = new RandomStrings(count); Iterator<String> itr = rs.iterator(); http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java ---------------------------------------------------------------------- diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java index 66946b7..33dca01 100644 --- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java +++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java @@ -20,9 +20,9 @@ import org.apache.hadoop.io.Text; import org.apache.kylin.common.util.Bytes; import org.apache.kylin.dict.NumberDictionary; import org.apache.kylin.dict.NumberDictionaryBuilder; -import org.apache.kylin.dict.NumberDictionaryForest; import org.apache.kylin.dict.NumberDictionaryForestBuilder; import org.apache.kylin.dict.StringBytesConverter; +import org.apache.kylin.dict.TrieDictionaryForest; import org.apache.kylin.engine.mr.steps.fdc2.SelfDefineSortableKey; import org.apache.kylin.engine.mr.steps.fdc2.TypeFlag; import org.junit.Test; @@ -51,14 +51,13 @@ public class NumberDictionaryForestTest { ArrayList<SelfDefineSortableKey> keyList = createKeyList(list, (byte) flag.ordinal()); Collections.sort(keyList); //build tree - NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<String>( - new StringBytesConverter(), 0, 0); + NumberDictionaryForestBuilder b = new NumberDictionaryForestBuilder(0, 0); for (SelfDefineSortableKey key : keyList) { String fieldValue = printKey(key); 
b.addValue(fieldValue); } - NumberDictionaryForest<String> dict = b.build(); + TrieDictionaryForest<String> dict = b.build(); dict.dump(System.out); ArrayList<Integer> resultIds = new ArrayList<>(); for (SelfDefineSortableKey key : keyList) { @@ -81,10 +80,10 @@ public class NumberDictionaryForestTest { testData.add("2"); testData.add("100"); //TrieDictionaryForestBuilder.MaxTrieTreeSize = 0; - NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<String>(new StringBytesConverter()); + NumberDictionaryForestBuilder b = new NumberDictionaryForestBuilder(); for (String str : testData) b.addValue(str); - NumberDictionaryForest<String> dict = b.build(); + TrieDictionaryForest<String> dict = b.build(); dict = testSerialize(dict); dict.dump(System.out); for (String str : testData) { @@ -99,10 +98,10 @@ public class NumberDictionaryForestTest { testData.add(Double.MIN_VALUE + ""); testData.add("1.01"); testData.add("2.0"); - NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<String>(new StringBytesConverter()); + NumberDictionaryForestBuilder b = new NumberDictionaryForestBuilder(); for (String str : testData) b.addValue(str); - NumberDictionaryForest<String> dict = b.build(); + TrieDictionaryForest<String> dict = b.build(); dict.dump(System.out); NumberDictionaryBuilder<String> b2 = new NumberDictionaryBuilder<>(new StringBytesConverter()); @@ -113,7 +112,7 @@ public class NumberDictionaryForestTest { } - private static NumberDictionaryForest<String> testSerialize(NumberDictionaryForest<String> dict) { + private static TrieDictionaryForest<String> testSerialize(TrieDictionaryForest<String> dict) { try { ByteArrayOutputStream bout = new ByteArrayOutputStream(); DataOutputStream dataout = new DataOutputStream(bout); @@ -121,7 +120,7 @@ public class NumberDictionaryForestTest { dataout.close(); ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); DataInputStream datain = new DataInputStream(bin); - 
NumberDictionaryForest<String> r = new NumberDictionaryForest<>(); + TrieDictionaryForest<String> r = new TrieDictionaryForest<>(); //r.dump(System.out); r.readFields(datain); //r.dump(System.out); http://git-wip-us.apache.org/repos/asf/kylin/blob/0a0c5547/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/SelfDefineSortableKeyTest.java ---------------------------------------------------------------------- diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/SelfDefineSortableKeyTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/SelfDefineSortableKeyTest.java index 858bba4..81aa836 100644 --- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/SelfDefineSortableKeyTest.java +++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/SelfDefineSortableKeyTest.java @@ -1,31 +1,21 @@ package org.apache.kylin.engine.mr.steps; -import org.apache.hadoop.hbase.KeyValue; -import org.apache.hadoop.io.Text; -import org.apache.kylin.common.util.Array; -import org.apache.kylin.common.util.Bytes; -import org.apache.kylin.dict.NumberDictionaryForest; -import org.apache.kylin.dict.NumberDictionaryForestBuilder; -import org.apache.kylin.dict.StringBytesConverter; -import org.apache.kylin.dict.TrieDictionary; -import org.apache.kylin.dict.TrieDictionaryBuilder; -import org.apache.kylin.dict.TrieDictionaryForest; -import org.apache.kylin.dict.TrieDictionaryForestBuilder; -import org.apache.kylin.engine.mr.steps.fdc2.SelfDefineSortableKey; -import org.apache.kylin.engine.mr.steps.fdc2.TypeFlag; -import org.junit.Test; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Random; import java.util.UUID; +import org.apache.hadoop.io.Text; +import org.apache.kylin.common.util.Bytes; +import 
org.apache.kylin.engine.mr.steps.fdc2.SelfDefineSortableKey; +import org.apache.kylin.engine.mr.steps.fdc2.TypeFlag; +import org.junit.Test; + /** * Created by xiefan on 16-11-2. */
