This is an automated email from the ASF dual-hosted git repository.

xxyu pushed a commit to branch kylin-on-parquet-v2
in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this push:
     new 3405ab2  KYLIN-4719 Refine kylin-defaults.properties for parquet Storage
3405ab2 is described below

commit 3405ab2609da1fe8e95c5e0fb9603d429c3e81b6
Author: Zhichao Zhang <441586...@qq.com>
AuthorDate: Fri Sep 4 11:56:26 2020 +0800

    KYLIN-4719 Refine kylin-defaults.properties for parquet Storage
---
 .../src/main/resources/kylin-defaults.properties | 202 +++------------------
 .../kylin/rest/service/AdminServiceTest.java     |   4 +-
 2 files changed, 31 insertions(+), 175 deletions(-)

diff --git a/core-common/src/main/resources/kylin-defaults.properties b/core-common/src/main/resources/kylin-defaults.properties
index 8903487..565d4e9 100644
--- a/core-common/src/main/resources/kylin-defaults.properties
+++ b/core-common/src/main/resources/kylin-defaults.properties
@@ -18,7 +18,7 @@
 ### METADATA | ENV ###
 
 # The metadata store in hbase
-kylin.metadata.url=kylin_metadata@hbase
+kylin.metadata.url=kylin_metadata@jdbc,url=jdbc:mysql://localhost:3306/kylin,username=XXXX,password=XXXXXX,maxActive=10,maxIdle=10
 
 # metadata cache sync retry times
 kylin.metadata.sync-retries=3
@@ -27,7 +27,7 @@ kylin.metadata.sync-retries=3
 kylin.env.hdfs-working-dir=/kylin
 
 # DEV|QA|PROD. DEV will turn on some dev features, QA and PROD has no difference in terms of functions.
-kylin.env=QA
+kylin.env=DEV
 
 # kylin zk base path
 kylin.env.zookeeper-base-path=/kylin
@@ -87,29 +87,6 @@ kylin.server.external-acl-provider=
 kylin.web.default-time-filter=1
 
 ### SOURCE ###
-
-# Hive client, valid value [cli, beeline]
-kylin.source.hive.client=cli
-
-# Absolute path to beeline shell, can be set to spark beeline instead of the default hive beeline on PATH
-kylin.source.hive.beeline-shell=beeline
-
-# Parameters for beeline client, only necessary if hive client is beeline
-#kylin.source.hive.beeline-params=-n root --hiveconf hive.security.authorization.sqlstd.confwhitelist.append='mapreduce.job.*|dfs.*' -u jdbc:hive2://localhost:10000
-
-# While hive client uses above settings to read hive table metadata,
-# table operations can go through a separate SparkSQL command line, given SparkSQL connects to the same Hive metastore.
-kylin.source.hive.enable-sparksql-for-table-ops=false
-#kylin.source.hive.sparksql-beeline-shell=/path/to/spark-client/bin/beeline
-#kylin.source.hive.sparksql-beeline-params=-n root --hiveconf hive.security.authorization.sqlstd.confwhitelist.append='mapreduce.job.*|dfs.*' -u jdbc:hive2://localhost:10000
-
-kylin.source.hive.keep-flat-table=false
-
-# Hive database name for putting the intermediate flat tables
-kylin.source.hive.database-for-flat-table=default
-
-# Whether redistribute the intermediate flat table before building
-kylin.source.hive.redistribute-flat-table=true
 
 # Define how to access to hive metadata
 # When user deploy kylin on AWS EMR and Glue is used as external metadata, use gluecatalog instead
 kylin.source.hive.metadata-type=hcatalog
@@ -119,47 +96,6 @@ kylin.source.hive.metadata-type=hcatalog
 
 # The storage for final cube file in hbase
 kylin.storage.url=hbase
 
-# The prefix of hbase table
-kylin.storage.hbase.table-name-prefix=KYLIN_
-
-# The namespace for hbase storage
-kylin.storage.hbase.namespace=default
-
-# Compression codec for htable, valid value [none, snappy, lzo, gzip, lz4]
-kylin.storage.hbase.compression-codec=none
-
-# HBase Cluster FileSystem, which serving hbase, format as hdfs://hbase-cluster:8020
-# Leave empty if hbase running on same cluster with hive and mapreduce
-#kylin.storage.hbase.cluster-fs=
-
-# The cut size for hbase region, in GB.
-kylin.storage.hbase.region-cut-gb=5
-
-# The hfile size of GB, smaller hfile leading to the converting hfile MR has more reducers and be faster.
-# Set 0 to disable this optimization.
-kylin.storage.hbase.hfile-size-gb=2
-
-kylin.storage.hbase.min-region-count=1
-kylin.storage.hbase.max-region-count=500
-
-# Optional information for the owner of kylin platform, it can be your team's email
-# Currently it will be attached to each kylin's htable attribute
-kylin.storage.hbase.owner-tag=who...@kylin.apache.org
-
-kylin.storage.hbase.coprocessor-mem-gb=3
-
-# By default kylin can spill query's intermediate results to disks when it's consuming too much memory.
-# Set it to false if you want query to abort immediately in such condition.
-kylin.storage.partition.aggr-spill-enabled=true
-
-# The maximum number of bytes each coprocessor is allowed to scan.
-# To allow arbitrary large scan, you can set it to 0.
-kylin.storage.partition.max-scan-bytes=3221225472
-
-# The default coprocessor timeout is (hbase.rpc.timeout * 0.9) / 1000 seconds,
-# You can set it to a smaller value. 0 means use default.
-# kylin.storage.hbase.coprocessor-timeout-seconds=0
-
 # clean real storage after delete operation
 # if you want to delete the real storage like htable of deleting segment, you can set it to true
 kylin.storage.clean-after-delete-operation=false
@@ -186,27 +122,6 @@ kylin.job.sampling-percentage=100
 kylin.job.scheduler.provider.100=org.apache.kylin.job.impl.curator.CuratorScheduler
 kylin.job.scheduler.default=0
 
-### ENGINE ###
-
-# Time interval to check hadoop job status
-kylin.engine.mr.yarn-check-interval-seconds=10
-
-kylin.engine.mr.reduce-input-mb=500
-
-kylin.engine.mr.max-reducer-number=500
-
-kylin.engine.mr.mapper-input-rows=1000000
-
-# Enable dictionary building in MR reducer
-kylin.engine.mr.build-dict-in-reducer=true
-
-# Number of reducers for fetching UHC column distinct values
-kylin.engine.mr.uhc-reducer-count=3
-
-# Whether using an additional step to build UHC dictionary
-kylin.engine.mr.build-uhc-dict-in-additional-step=false
-
-
 ### CUBE | DICTIONARY ###
 
 kylin.cube.cuboid-scheduler=org.apache.kylin.cube.cuboid.DefaultCuboidScheduler
@@ -227,15 +142,14 @@ kylin.cube.aggrgroup.max-combination=32768
 
 kylin.snapshot.max-mb=300
 
-kylin.cube.cubeplanner.enabled=true
-kylin.cube.cubeplanner.enabled-for-existing-cube=true
+kylin.cube.cubeplanner.enabled=false
+kylin.cube.cubeplanner.enabled-for-existing-cube=false
 kylin.cube.cubeplanner.expansion-threshold=15.0
 kylin.cube.cubeplanner.recommend-cache-max-size=200
 kylin.cube.cubeplanner.mandatory-rollup-threshold=1000
 kylin.cube.cubeplanner.algorithm-threshold-greedy=8
 kylin.cube.cubeplanner.algorithm-threshold-genetic=23
-
 
 ### QUERY ###
 
 # Controls the maximum number of bytes a query is allowed to scan storage.
@@ -244,6 +158,10 @@ kylin.cube.cubeplanner.algorithm-threshold-genetic=23
 kylin.query.max-scan-bytes=0
 
 kylin.query.cache-enabled=true
+kylin.query.cache-threshold-scan-count=10240
+kylin.query.cache-threshold-duration=2000
+kylin.query.cache-threshold-scan-bytes=1048576
+kylin.query.large-query-threshold=1000000
 
 # Controls extras properties for Calcite jdbc driver
 # all extras properties should undder prefix "kylin.query.calcite.extras-props."
@@ -309,45 +227,31 @@ kylin.security.saml.context-server-name=hostname
 kylin.security.saml.context-server-port=443
 kylin.security.saml.context-path=/kylin
 
-### SPARK ENGINE CONFIGS ###
+### SPARK BUILD/MERGE ENGINE CONFIGS ###
 
 # Hadoop conf folder, will export this as "HADOOP_CONF_DIR" to run spark-submit
 # This must contain site xmls of core, yarn, hive, and hbase in one folder
 #kylin.env.hadoop-conf-dir=/etc/hadoop/conf
 
-# Estimate the RDD partition numbers
-kylin.engine.spark.rdd-partition-cut-mb=10
-
-# Minimal partition numbers of rdd
-kylin.engine.spark.min-partition=1
-
-# Max partition numbers of rdd
-kylin.engine.spark.max-partition=5000
-
 # Spark conf (default is in spark/conf/spark-defaults.conf)
 kylin.engine.spark-conf.spark.master=yarn
-#kylin.engine.spark-conf.spark.submit.deployMode=cluster
+#kylin.engine.spark-conf.spark.submit.deployMode=client
 kylin.engine.spark-conf.spark.yarn.queue=default
+kylin.engine.spark-conf.spark.executor.cores=1
 kylin.engine.spark-conf.spark.executor.memory=4G
-kylin.engine.spark-conf.spark.executor.instances=40
+kylin.engine.spark-conf.spark.executor.instances=1
+kylin.engine.spark-conf.spark.executor.memoryOverhead=1024M
 kylin.engine.spark-conf.spark.driver.cores=1
-kylin.engine.spark-conf.spark.executor.memoryOverhead=1024
+kylin.engine.spark-conf.spark.driver.memory=1G
 kylin.engine.spark-conf.spark.shuffle.service.enabled=true
 kylin.engine.spark-conf.spark.eventLog.enabled=true
 kylin.engine.spark-conf.spark.eventLog.dir=hdfs\:///kylin/spark-history
 kylin.engine.spark-conf.spark.history.fs.logDirectory=hdfs\:///kylin/spark-history
 kylin.engine.spark-conf.spark.hadoop.yarn.timeline-service.enabled=false
-
-### Spark conf for specific job
-kylin.engine.spark-conf-mergedict.spark.executor.memory=6G
-kylin.engine.spark-conf-mergedict.spark.memory.fraction=0.2
-
-### Spark conf overwrite for query engine
-kylin.query.spark-conf.spark.executor.cores=5
-kylin.query.spark-conf.spark.executor.instances=4
+#kylin.engine.spark-conf.spark.sql.shuffle.partitions=1
 
 # manually upload spark-assembly jar to HDFS and then set this property will avoid repeatedly uploading jar at runtime
-#kylin.engine.spark-conf.spark.yarn.archive=hdfs://namenode:8020/kylin/spark/spark-libs.jar
+#kylin.engine.spark-conf.spark.yarn.jars=hdfs://localhost:9000/spark2_jars/*
 #kylin.engine.spark-conf.spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec
 
 # uncomment for HDP
@@ -355,69 +259,21 @@ kylin.query.spark-conf.spark.executor.instances=4
 #kylin.engine.spark-conf.spark.driver.extraJavaOptions=-Dhdp.version=current
 #kylin.engine.spark-conf.spark.yarn.am.extraJavaOptions=-Dhdp.version=current
 #kylin.engine.spark-conf.spark.executor.extraJavaOptions=-Dhdp.version=current
-
-### FLINK ENGINE CONFIGS ###
-
-## Flink conf (default is in flink/conf/flink-conf.yaml)
-kylin.engine.flink-conf.jobmanager.heap.size=2G
-kylin.engine.flink-conf.taskmanager.heap.size=4G
-kylin.engine.flink-conf.taskmanager.numberOfTaskSlots=1
-kylin.engine.flink-conf.taskmanager.memory.preallocate=false
-kylin.engine.flink-conf.job.parallelism=1
-kylin.engine.flink-conf.program.enableObjectReuse=false
-kylin.engine.flink-conf.yarn.queue=
-kylin.engine.flink-conf.yarn.nodelabel=
+### SPARK QUERY ENGINE CONFIGS ###
+kylin.query.spark-conf.spark.master=yarn
+#kylin.query.spark-conf.spark.submit.deployMode=client
+kylin.query.spark-conf.spark.driver.cores=1
+kylin.query.spark-conf.spark.driver.memory=4G
+kylin.query.spark-conf.spark.driver.memoryOverhead=1G
+kylin.query.spark-conf.spark.executor.cores=5
+kylin.query.spark-conf.spark.executor.instances=4
+kylin.query.spark-conf.spark.executor.memory=20G
+kylin.query.spark-conf.spark.executor.memoryOverhead=2G
+kylin.query.spark-conf.spark.serializer=org.apache.spark.serializer.JavaSerializer
+#kylin.query.spark-conf.spark.sql.shuffle.partitions=40
+#kylin.query.spark-conf.spark.yarn.jars=hdfs://localhost:9000/spark2_jars/*
 
 ### QUERY PUSH DOWN ###
 
 #kylin.query.pushdown.runner-class-name=org.apache.kylin.query.pushdown.PushDownRunnerSparkImpl
-
 #kylin.query.pushdown.update-enabled=false
-
-### JDBC Data Source
-#kylin.source.jdbc.connection-url=
-#kylin.source.jdbc.driver=
-#kylin.source.jdbc.dialect=
-#kylin.source.jdbc.user=
-#kylin.source.jdbc.pass=
-#kylin.source.jdbc.sqoop-home=
-#kylin.source.jdbc.filed-delimiter=|
-
-### Livy with Kylin
-#kylin.engine.livy-conf.livy-enabled=false
-#kylin.engine.livy-conf.livy-url=http://LivyHost:8998
-#kylin.engine.livy-conf.livy-key.file=hdfs:///path-to-kylin-job-jar
-#kylin.engine.livy-conf.livy-arr.jars=hdfs:///path-to-hadoop-dependency-jar
-
-
-### Realtime OLAP ###
-
-# Where should local segment cache located, for absolute path, the real path will be ${KYLIN_HOME}/${kylin.stream.index.path}
-kylin.stream.index.path=stream_index
-
-# The timezone for Derived Time Column like hour_start, try set to GMT+N, please check detail at KYLIN-4010
-kylin.stream.event.timezone=
-
-# Debug switch for print realtime global dict encode information, please check detail at KYLIN-4141
-kylin.stream.print-realtime-dict-enabled=false
-
-# Should enable latest coordinator, please check detail at KYLIN-4167
-kylin.stream.new.coordinator-enabled=true
-
-# In which way should we collect receiver's metrics info
-#kylin.stream.metrics.option=console/csv/jmx
-
-# When enable a streaming cube, should cousme from earliest offset or least offset
-kylin.stream.consume.offsets.latest=true
-
-# The parallelism of scan in receiver side
-kylin.stream.receiver.use-threads-per-query=8
-
-# How coordinator/receiver register itself into StreamMetadata, there are three option:
-# 1. hostname:port, then kylin will set the config ip and port as the currentNode;
-# 2. port, then kylin will get the node's hostname and append port as the currentNode;
-# 3. not set, then kylin will get the node hostname address and set the hostname and defaultPort(7070 for coordinator or 9090 for receiver) as the currentNode.
-#kylin.stream.node=
-
-# Auto resubmit after job be discarded
-kylin.stream.auto-resubmit-after-discard-enabled=true
diff --git a/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java b/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java
index 34fb8b4..7bcb8a2 100644
--- a/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java
+++ b/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java
@@ -61,7 +61,7 @@ public class AdminServiceTest extends ServiceTestBase {
                 "kylin.server.external-acl-provider=\n" +
                 "kylin.tool.auto-migrate-cube.enabled=\n" +
                 "kylin.storage.default=4\n" +
-                "kylin.cube.cubeplanner.enabled=true\n" +
+                "kylin.cube.cubeplanner.enabled=false\n" +
                 "kylin.web.help=\n" +
                 "kylin.web.export-allow-other=true\n" +
                 "kylin.web.link-hadoop=\n" +
@@ -69,7 +69,7 @@ public class AdminServiceTest extends ServiceTestBase {
                 "kylin.htrace.show-gui-trace-toggle=false\n" +
                 "kylin.security.additional-profiles=\n" +
                 "kylin.web.export-allow-admin=true\n" +
-                "kylin.env=QA\n" +
+                "kylin.env=DEV\n" +
                 "kylin.web.hive-limit=20\n" +
                 "kylin.engine.default=6\n" +
                 "kylin.web.help.3=onboard|Cube Design Tutorial|http://kylin.apache.org/docs/howto/howto_optimize_cubes.html\n" +
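
Note for anyone adapting these defaults by hand: the new kylin.metadata.url default moves the metadata store from HBase to a JDBC (MySQL) database, and it ships with XXXX/XXXXXX placeholders that must be replaced before Kylin can start. A minimal override sketch for conf/kylin.properties, assuming a MySQL instance on localhost; the database name and credentials below are illustrative, not values from this commit:

    # Metadata store in MySQL; replace host, database, and credentials with your own
    kylin.metadata.url=kylin_metadata@jdbc,url=jdbc:mysql://localhost:3306/kylin,username=kylin,password=changeit,maxActive=10,maxIdle=10

The matching MySQL JDBC driver jar also has to be on Kylin's classpath (typically placed under $KYLIN_HOME/ext). Likewise, the hdfs://localhost:9000/spark2_jars/* path in the commented spark.yarn.jars lines reflects a single-node setup and should be adjusted to wherever the Spark jars were uploaded.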