This is an automated email from the ASF dual-hosted git repository.

xxyu pushed a commit to branch kylin-on-parquet-v2
in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this push:
     new 3405ab2  KYLIN-4719 Refine kylin-defaults.properties for parquet Storage
3405ab2 is described below

commit 3405ab2609da1fe8e95c5e0fb9603d429c3e81b6
Author: Zhichao Zhang <441586...@qq.com>
AuthorDate: Fri Sep 4 11:56:26 2020 +0800

    KYLIN-4719 Refine kylin-defaults.properties for parquet Storage
---
 .../src/main/resources/kylin-defaults.properties | 202 +++------------------
 .../kylin/rest/service/AdminServiceTest.java     |   4 +-
 2 files changed, 31 insertions(+), 175 deletions(-)

diff --git a/core-common/src/main/resources/kylin-defaults.properties b/core-common/src/main/resources/kylin-defaults.properties
index 8903487..565d4e9 100644
--- a/core-common/src/main/resources/kylin-defaults.properties
+++ b/core-common/src/main/resources/kylin-defaults.properties
@@ -18,7 +18,7 @@
 ### METADATA | ENV ###
 
 # The metadata store in hbase
-kylin.metadata.url=kylin_metadata@hbase
+kylin.metadata.url=kylin_metadata@jdbc,url=jdbc:mysql://localhost:3306/kylin,username=XXXX,password=XXXXXX,maxActive=10,maxIdle=10
 
 # metadata cache sync retry times
 kylin.metadata.sync-retries=3
@@ -27,7 +27,7 @@ kylin.metadata.sync-retries=3
 kylin.env.hdfs-working-dir=/kylin
 
 # DEV|QA|PROD. DEV will turn on some dev features, QA and PROD has no difference in terms of functions.
-kylin.env=QA
+kylin.env=DEV
 
 # kylin zk base path
 kylin.env.zookeeper-base-path=/kylin
@@ -87,29 +87,6 @@ kylin.server.external-acl-provider=
 kylin.web.default-time-filter=1
 
 ### SOURCE ###
-
-# Hive client, valid value [cli, beeline]
-kylin.source.hive.client=cli
-
-# Absolute path to beeline shell, can be set to spark beeline instead of the default hive beeline on PATH
-kylin.source.hive.beeline-shell=beeline
-
-# Parameters for beeline client, only necessary if hive client is beeline
-#kylin.source.hive.beeline-params=-n root --hiveconf hive.security.authorization.sqlstd.confwhitelist.append='mapreduce.job.*|dfs.*' -u jdbc:hive2://localhost:10000
-
-# While hive client uses above settings to read hive table metadata,
-# table operations can go through a separate SparkSQL command line, given SparkSQL connects to the same Hive metastore.
-kylin.source.hive.enable-sparksql-for-table-ops=false
-#kylin.source.hive.sparksql-beeline-shell=/path/to/spark-client/bin/beeline
-#kylin.source.hive.sparksql-beeline-params=-n root --hiveconf hive.security.authorization.sqlstd.confwhitelist.append='mapreduce.job.*|dfs.*' -u jdbc:hive2://localhost:10000
-
-kylin.source.hive.keep-flat-table=false
-
-# Hive database name for putting the intermediate flat tables
-kylin.source.hive.database-for-flat-table=default
-
-# Whether redistribute the intermediate flat table before building
-kylin.source.hive.redistribute-flat-table=true
 
 # Define how to access to hive metadata
 # When user deploy kylin on AWS EMR and Glue is used as external metadata, use gluecatalog instead
 kylin.source.hive.metadata-type=hcatalog
@@ -119,47 +96,6 @@ kylin.source.hive.metadata-type=hcatalog
 
 # The storage for final cube file in hbase
 kylin.storage.url=hbase
 
-# The prefix of hbase table
-kylin.storage.hbase.table-name-prefix=KYLIN_
-
-# The namespace for hbase storage
-kylin.storage.hbase.namespace=default
-
-# Compression codec for htable, valid value [none, snappy, lzo, gzip, lz4]
-kylin.storage.hbase.compression-codec=none
-
-# HBase Cluster FileSystem, which serving hbase, format as hdfs://hbase-cluster:8020
-# Leave empty if hbase running on same cluster with hive and mapreduce
-#kylin.storage.hbase.cluster-fs=
-
-# The cut size for hbase region, in GB.
-kylin.storage.hbase.region-cut-gb=5
-
-# The hfile size of GB, smaller hfile leading to the converting hfile MR has more reducers and be faster.
-# Set 0 to disable this optimization.
-kylin.storage.hbase.hfile-size-gb=2
-
-kylin.storage.hbase.min-region-count=1
-kylin.storage.hbase.max-region-count=500
-
-# Optional information for the owner of kylin platform, it can be your team's email
-# Currently it will be attached to each kylin's htable attribute
-kylin.storage.hbase.owner-tag=who...@kylin.apache.org
-
-kylin.storage.hbase.coprocessor-mem-gb=3
-
-# By default kylin can spill query's intermediate results to disks when it's consuming too much memory.
-# Set it to false if you want query to abort immediately in such condition.
-kylin.storage.partition.aggr-spill-enabled=true
-
-# The maximum number of bytes each coprocessor is allowed to scan.
-# To allow arbitrary large scan, you can set it to 0.
-kylin.storage.partition.max-scan-bytes=3221225472
-
-# The default coprocessor timeout is (hbase.rpc.timeout * 0.9) / 1000 seconds,
-# You can set it to a smaller value. 0 means use default.
-# kylin.storage.hbase.coprocessor-timeout-seconds=0
-
 # clean real storage after delete operation
 # if you want to delete the real storage like htable of deleting segment, you can set it to true
 kylin.storage.clean-after-delete-operation=false
@@ -186,27 +122,6 @@ kylin.job.sampling-percentage=100
 kylin.job.scheduler.provider.100=org.apache.kylin.job.impl.curator.CuratorScheduler
 kylin.job.scheduler.default=0
 
-### ENGINE ###
-
-# Time interval to check hadoop job status
-kylin.engine.mr.yarn-check-interval-seconds=10
-
-kylin.engine.mr.reduce-input-mb=500
-
-kylin.engine.mr.max-reducer-number=500
-
-kylin.engine.mr.mapper-input-rows=1000000
-
-# Enable dictionary building in MR reducer
-kylin.engine.mr.build-dict-in-reducer=true
-
-# Number of reducers for fetching UHC column distinct values
-kylin.engine.mr.uhc-reducer-count=3
-
-# Whether using an additional step to build UHC dictionary
-kylin.engine.mr.build-uhc-dict-in-additional-step=false
-
-
 ### CUBE | DICTIONARY ###
 
 kylin.cube.cuboid-scheduler=org.apache.kylin.cube.cuboid.DefaultCuboidScheduler
@@ -227,15 +142,14 @@ kylin.cube.aggrgroup.max-combination=32768
 
 kylin.snapshot.max-mb=300
 
-kylin.cube.cubeplanner.enabled=true
-kylin.cube.cubeplanner.enabled-for-existing-cube=true
+kylin.cube.cubeplanner.enabled=false
+kylin.cube.cubeplanner.enabled-for-existing-cube=false
 kylin.cube.cubeplanner.expansion-threshold=15.0
 kylin.cube.cubeplanner.recommend-cache-max-size=200
 kylin.cube.cubeplanner.mandatory-rollup-threshold=1000
 kylin.cube.cubeplanner.algorithm-threshold-greedy=8
 kylin.cube.cubeplanner.algorithm-threshold-genetic=23
-
 
 ### QUERY ###
 
 # Controls the maximum number of bytes a query is allowed to scan storage.
@@ -244,6 +158,10 @@ kylin.cube.cubeplanner.algorithm-threshold-genetic=23
 kylin.query.max-scan-bytes=0
 
 kylin.query.cache-enabled=true
+kylin.query.cache-threshold-scan-count=10240
+kylin.query.cache-threshold-duration=2000
+kylin.query.cache-threshold-scan-bytes=1048576
+kylin.query.large-query-threshold=1000000
 
 # Controls extras properties for Calcite jdbc driver
 # all extras properties should undder prefix "kylin.query.calcite.extras-props."
@@ -309,45 +227,31 @@ kylin.security.saml.context-server-name=hostname
 kylin.security.saml.context-server-port=443
 kylin.security.saml.context-path=/kylin
 
-### SPARK ENGINE CONFIGS ###
+### SPARK BUILD/MERGE ENGINE CONFIGS ###
 
 # Hadoop conf folder, will export this as "HADOOP_CONF_DIR" to run spark-submit
 # This must contain site xmls of core, yarn, hive, and hbase in one folder
 #kylin.env.hadoop-conf-dir=/etc/hadoop/conf
 
-# Estimate the RDD partition numbers
-kylin.engine.spark.rdd-partition-cut-mb=10
-
-# Minimal partition numbers of rdd
-kylin.engine.spark.min-partition=1
-
-# Max partition numbers of rdd
-kylin.engine.spark.max-partition=5000
-
 # Spark conf (default is in spark/conf/spark-defaults.conf)
 kylin.engine.spark-conf.spark.master=yarn
-#kylin.engine.spark-conf.spark.submit.deployMode=cluster
+#kylin.engine.spark-conf.spark.submit.deployMode=client
 kylin.engine.spark-conf.spark.yarn.queue=default
+kylin.engine.spark-conf.spark.executor.cores=1
 kylin.engine.spark-conf.spark.executor.memory=4G
-kylin.engine.spark-conf.spark.executor.instances=40
+kylin.engine.spark-conf.spark.executor.instances=1
+kylin.engine.spark-conf.spark.executor.memoryOverhead=1024M
 kylin.engine.spark-conf.spark.driver.cores=1
-kylin.engine.spark-conf.spark.executor.memoryOverhead=1024
+kylin.engine.spark-conf.spark.driver.memory=1G
 kylin.engine.spark-conf.spark.shuffle.service.enabled=true
 kylin.engine.spark-conf.spark.eventLog.enabled=true
 kylin.engine.spark-conf.spark.eventLog.dir=hdfs\:///kylin/spark-history
 kylin.engine.spark-conf.spark.history.fs.logDirectory=hdfs\:///kylin/spark-history
 kylin.engine.spark-conf.spark.hadoop.yarn.timeline-service.enabled=false
-
-### Spark conf for specific job
-kylin.engine.spark-conf-mergedict.spark.executor.memory=6G
-kylin.engine.spark-conf-mergedict.spark.memory.fraction=0.2
-
-### Spark conf overwrite for query engine
-kylin.query.spark-conf.spark.executor.cores=5
-kylin.query.spark-conf.spark.executor.instances=4
+#kylin.engine.spark-conf.spark.sql.shuffle.partitions=1
 
 # manually upload spark-assembly jar to HDFS and then set this property will avoid repeatedly uploading jar at runtime
-#kylin.engine.spark-conf.spark.yarn.archive=hdfs://namenode:8020/kylin/spark/spark-libs.jar
+#kylin.engine.spark-conf.spark.yarn.jars=hdfs://localhost:9000/spark2_jars/*
 #kylin.engine.spark-conf.spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec
 
 # uncomment for HDP
@@ -355,69 +259,21 @@ kylin.query.spark-conf.spark.executor.instances=4
 #kylin.engine.spark-conf.spark.driver.extraJavaOptions=-Dhdp.version=current
 #kylin.engine.spark-conf.spark.yarn.am.extraJavaOptions=-Dhdp.version=current
 #kylin.engine.spark-conf.spark.executor.extraJavaOptions=-Dhdp.version=current
-
-### FLINK ENGINE CONFIGS ###
-
-## Flink conf (default is in flink/conf/flink-conf.yaml)
-kylin.engine.flink-conf.jobmanager.heap.size=2G
-kylin.engine.flink-conf.taskmanager.heap.size=4G
-kylin.engine.flink-conf.taskmanager.numberOfTaskSlots=1
-kylin.engine.flink-conf.taskmanager.memory.preallocate=false
-kylin.engine.flink-conf.job.parallelism=1
-kylin.engine.flink-conf.program.enableObjectReuse=false
-kylin.engine.flink-conf.yarn.queue=
-kylin.engine.flink-conf.yarn.nodelabel=
+### SPARK QUERY ENGINE CONFIGS ###
+kylin.query.spark-conf.spark.master=yarn
+#kylin.query.spark-conf.spark.submit.deployMode=client
+kylin.query.spark-conf.spark.driver.cores=1
+kylin.query.spark-conf.spark.driver.memory=4G
+kylin.query.spark-conf.spark.driver.memoryOverhead=1G
+kylin.query.spark-conf.spark.executor.cores=5
+kylin.query.spark-conf.spark.executor.instances=4
+kylin.query.spark-conf.spark.executor.memory=20G
+kylin.query.spark-conf.spark.executor.memoryOverhead=2G
+kylin.query.spark-conf.spark.serializer=org.apache.spark.serializer.JavaSerializer
+#kylin.query.spark-conf.spark.sql.shuffle.partitions=40
+#kylin.query.spark-conf.spark.yarn.jars=hdfs://localhost:9000/spark2_jars/*
 
 ### QUERY PUSH DOWN ###
 
 #kylin.query.pushdown.runner-class-name=org.apache.kylin.query.pushdown.PushDownRunnerSparkImpl
-
 #kylin.query.pushdown.update-enabled=false
-
-### JDBC Data Source
-#kylin.source.jdbc.connection-url=
-#kylin.source.jdbc.driver=
-#kylin.source.jdbc.dialect=
-#kylin.source.jdbc.user=
-#kylin.source.jdbc.pass=
-#kylin.source.jdbc.sqoop-home=
-#kylin.source.jdbc.filed-delimiter=|
-
-### Livy with Kylin
-#kylin.engine.livy-conf.livy-enabled=false
-#kylin.engine.livy-conf.livy-url=http://LivyHost:8998
-#kylin.engine.livy-conf.livy-key.file=hdfs:///path-to-kylin-job-jar
-#kylin.engine.livy-conf.livy-arr.jars=hdfs:///path-to-hadoop-dependency-jar
-
-
-### Realtime OLAP ###
-
-# Where should local segment cache located, for absolute path, the real path will be ${KYLIN_HOME}/${kylin.stream.index.path}
-kylin.stream.index.path=stream_index
-
-# The timezone for Derived Time Column like hour_start, try set to GMT+N, please check detail at KYLIN-4010
-kylin.stream.event.timezone=
-
-# Debug switch for print realtime global dict encode information, please check detail at KYLIN-4141
-kylin.stream.print-realtime-dict-enabled=false
-
-# Should enable latest coordinator, please check detail at KYLIN-4167
-kylin.stream.new.coordinator-enabled=true
-
-# In which way should we collect receiver's metrics info
-#kylin.stream.metrics.option=console/csv/jmx
-
-# When enable a streaming cube, should cousme from earliest offset or least offset
-kylin.stream.consume.offsets.latest=true
-
-# The parallelism of scan in receiver side
-kylin.stream.receiver.use-threads-per-query=8
-
-# How coordinator/receiver register itself into StreamMetadata, there are three option:
-# 1. hostname:port, then kylin will set the config ip and port as the currentNode;
-# 2. port, then kylin will get the node's hostname and append port as the currentNode;
-# 3. not set, then kylin will get the node hostname address and set the hostname and defaultPort(7070 for coordinator or 9090 for receiver) as the currentNode.
-#kylin.stream.node=
-
-# Auto resubmit after job be discarded
-kylin.stream.auto-resubmit-after-discard-enabled=true
diff --git a/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java b/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java
index 34fb8b4..7bcb8a2 100644
--- a/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java
+++ b/server/src/test/java/org/apache/kylin/rest/service/AdminServiceTest.java
@@ -61,7 +61,7 @@ public class AdminServiceTest extends ServiceTestBase {
                 "kylin.server.external-acl-provider=\n" +
                 "kylin.tool.auto-migrate-cube.enabled=\n" +
                 "kylin.storage.default=4\n" +
-                "kylin.cube.cubeplanner.enabled=true\n" +
+                "kylin.cube.cubeplanner.enabled=false\n" +
                 "kylin.web.help=\n" +
                 "kylin.web.export-allow-other=true\n" +
                 "kylin.web.link-hadoop=\n" +
@@ -69,7 +69,7 @@ public class AdminServiceTest extends ServiceTestBase {
                 "kylin.htrace.show-gui-trace-toggle=false\n" +
                 "kylin.security.additional-profiles=\n" +
                 "kylin.web.export-allow-admin=true\n" +
-                "kylin.env=QA\n" +
+                "kylin.env=DEV\n" +
                 "kylin.web.hive-limit=20\n" +
                 "kylin.engine.default=6\n" +
                 "kylin.web.help.3=onboard|Cube Design Tutorial|http://kylin.apache.org/docs/howto/howto_optimize_cubes.html\n" +
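
Note for anyone adapting these defaults by hand: the new kylin.metadata.url default moves the metadata store from HBase to a JDBC (MySQL) database, and it ships with XXXX/XXXXXX placeholders that must be replaced before Kylin can start. A minimal override sketch for conf/kylin.properties, assuming a MySQL instance on localhost; the database name and credentials below are illustrative, not values from this commit:

    # Metadata store in MySQL; replace host, database, and credentials with your own
    kylin.metadata.url=kylin_metadata@jdbc,url=jdbc:mysql://localhost:3306/kylin,username=kylin,password=changeit,maxActive=10,maxIdle=10

The matching MySQL JDBC driver jar also has to be on Kylin's classpath (typically placed under $KYLIN_HOME/ext). Likewise, the hdfs://localhost:9000/spark2_jars/* path in the commented spark.yarn.jars lines reflects a single-node setup and should be adjusted to wherever the Spark jars were uploaded.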