prepare docs for 2.0

Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/7ea64f38
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/7ea64f38
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/7ea64f38

Branch: refs/heads/document
Commit: 7ea64f38af437893648622dc86ed62d29bd1ca79
Parents: 55b1670
Author: Li Yang <liy...@apache.org>
Authored: Sat Mar 25 09:22:16 2017 +0800
Committer: Li Yang <liy...@apache.org>
Committed: Sat Mar 25 09:22:16 2017 +0800

----------------------------------------------------------------------
 website/_config.yml                             |   10 +-
 website/_data/docs20-cn.yml                     |   20 +
 website/_data/docs20.yml                        |   65 +
 website/_docs16/index.md                        |    1 -
 website/_docs16/tutorial/cube_spark.md          |  166 ---
 .../_docs20/gettingstarted/best_practices.md    |   27 +
 website/_docs20/gettingstarted/concepts.md      |   64 +
 website/_docs20/gettingstarted/events.md        |   24 +
 website/_docs20/gettingstarted/faq.md           |  119 ++
 website/_docs20/gettingstarted/terminology.md   |   25 +
 website/_docs20/howto/howto_backup_metadata.md  |   60 +
 .../howto/howto_build_cube_with_restapi.md      |   53 +
 website/_docs20/howto/howto_cleanup_storage.md  |   22 +
 website/_docs20/howto/howto_jdbc.md             |   92 ++
 website/_docs20/howto/howto_ldap_and_sso.md     |  128 ++
 website/_docs20/howto/howto_optimize_build.md   |  190 +++
 website/_docs20/howto/howto_optimize_cubes.md   |  212 +++
 .../_docs20/howto/howto_update_coprocessor.md   |   14 +
 website/_docs20/howto/howto_upgrade.md          |   66 +
 website/_docs20/howto/howto_use_beeline.md      |   14 +
 .../howto/howto_use_distributed_scheduler.md    |   16 +
 website/_docs20/howto/howto_use_restapi.md      | 1113 +++++++++++++++
 .../_docs20/howto/howto_use_restapi_in_js.md    |   46 +
 website/_docs20/index.cn.md                     |   26 +
 website/_docs20/index.md                        |   59 +
 website/_docs20/install/advance_settings.md     |   98 ++
 website/_docs20/install/hadoop_evn.md           |   40 +
 website/_docs20/install/index.cn.md             |   46 +
 website/_docs20/install/index.md                |   35 +
 website/_docs20/install/kylin_cluster.md        |   32 +
 website/_docs20/install/kylin_docker.md         |   10 +
 .../_docs20/install/manual_install_guide.cn.md  |   48 +
 website/_docs20/release_notes.md                | 1333 ++++++++++++++++++
 website/_docs20/tutorial/acl.cn.md              |   35 +
 website/_docs20/tutorial/acl.md                 |   32 +
 website/_docs20/tutorial/create_cube.cn.md      |  129 ++
 website/_docs20/tutorial/create_cube.md         |  198 +++
 website/_docs20/tutorial/cube_build_job.cn.md   |   66 +
 website/_docs20/tutorial/cube_build_job.md      |   67 +
 website/_docs20/tutorial/cube_spark.md          |  166 +++
 website/_docs20/tutorial/cube_streaming.md      |  219 +++
 website/_docs20/tutorial/flink.md               |  249 ++++
 .../_docs20/tutorial/kylin_client_tool.cn.md    |   97 ++
 website/_docs20/tutorial/kylin_sample.md        |   21 +
 website/_docs20/tutorial/odbc.cn.md             |   34 +
 website/_docs20/tutorial/odbc.md                |   49 +
 website/_docs20/tutorial/powerbi.cn.md          |   56 +
 website/_docs20/tutorial/powerbi.md             |   54 +
 website/_docs20/tutorial/squirrel.md            |  112 ++
 website/_docs20/tutorial/tableau.cn.md          |  116 ++
 website/_docs20/tutorial/tableau.md             |  113 ++
 website/_docs20/tutorial/tableau_91.cn.md       |   51 +
 website/_docs20/tutorial/tableau_91.md          |   50 +
 website/_docs20/tutorial/web.cn.md              |  134 ++
 website/_docs20/tutorial/web.md                 |  123 ++
 website/_includes/docs20_nav.cn.html            |   33 +
 website/_includes/docs20_nav.html               |   33 +
 website/_includes/docs20_ul.cn.html             |   28 +
 website/_includes/docs20_ul.html                |   29 +
 website/_includes/header.cn.html                |    2 +-
 website/_includes/header.html                   |    2 +-
 website/_layouts/docs20-cn.html                 |   46 +
 website/_layouts/docs20.html                    |   50 +
 63 files changed, 6496 insertions(+), 172 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_config.yml
----------------------------------------------------------------------
diff --git a/website/_config.yml b/website/_config.yml
index c58bc26..efeed85 100644
--- a/website/_config.yml
+++ b/website/_config.yml
@@ -61,10 +61,14 @@ collections:
   docs15:
     output: true
   docs15-cn:
-    output: true   
+    output: true
   docs16:
     output: true
   docs16-cn:
-    output: true     
+    output: true
+  docs20:
+    output: true
+  docs20-cn:
+    output: true
   dev:
-    output: true  
+    output: true

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_data/docs20-cn.yml
----------------------------------------------------------------------
diff --git a/website/_data/docs20-cn.yml b/website/_data/docs20-cn.yml
new file mode 100644
index 0000000..f69fbe5
--- /dev/null
+++ b/website/_data/docs20-cn.yml
@@ -0,0 +1,20 @@
+- title: 开始
+  docs:
+  - index
+
+- title: 安装
+  docs:
+  - install/install_guide
+  - install/manual_install_guide
+
+- title: 教程
+  docs:
+  - tutorial/create_cube_cn
+  - tutorial/cube_build_job
+  - tutorial/acl
+  - tutorial/web
+  - tutorial/tableau
+  - tutorial/tableau_91
+  - tutorial/powerbi
+  - tutorial/odbc
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_data/docs20.yml
----------------------------------------------------------------------
diff --git a/website/_data/docs20.yml b/website/_data/docs20.yml
new file mode 100644
index 0000000..1d4501d
--- /dev/null
+++ b/website/_data/docs20.yml
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Docs menu items for the English site; docs20-cn.yml is for the Chinese one.
+# The docs menu is constructed in docs20_nav.html from this data.
+- title: Getting Started
+  docs:
+  - index
+  - release_notes
+  - gettingstarted/faq
+  - gettingstarted/events
+  - gettingstarted/best_practices
+  - gettingstarted/terminology
+  - gettingstarted/concepts
+
+- title: Installation
+  docs:
+  - install/index
+  - install/hadoop_env
+  - install/manual_install_guide
+  - install/kylin_cluster
+  - install/advance_settings
+  - install/kylin_docker
+
+- title: Tutorial
+  docs:
+  - tutorial/kylin_sample
+  - tutorial/create_cube
+  - tutorial/cube_build_job
+  - tutorial/cube_spark
+  - tutorial/acl
+  - tutorial/web
+  - tutorial/tableau
+  - tutorial/tableau_91
+  - tutorial/powerbi
+  - tutorial/odbc
+  - tutorial/flink
+  - tutorial/squirrel
+
+- title: How To
+  docs:
+  - howto/howto_build_cube_with_restapi
+  - howto/howto_use_restapi_in_js
+  - howto/howto_use_restapi
+  - howto/howto_optimize_cubes
+  - howto/howto_optimize_build
+  - howto/howto_backup_metadata
+  - howto/howto_cleanup_storage
+  - howto/howto_jdbc
+  - howto/howto_upgrade
+  - howto/howto_ldap_and_sso
+  - howto/howto_use_beeline
+  - howto/howto_update_coprocessor

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs16/index.md
----------------------------------------------------------------------
diff --git a/website/_docs16/index.md b/website/_docs16/index.md
index 87c97b4..b4eee3b 100644
--- a/website/_docs16/index.md
+++ b/website/_docs16/index.md
@@ -32,7 +32,6 @@ Tutorial
 4. [Web Interface](tutorial/web.html)
 5. [SQL reference: by Apache Calcite](http://calcite.apache.org/docs/reference.html)
 6. [Build Cube with Streaming Data (beta)](tutorial/cube_streaming.html)
-6. [Build Cube with Spark engine (v2.0 beta)](tutorial/cube_spark.html)
 
 
 Connectivity and APIs

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs16/tutorial/cube_spark.md
----------------------------------------------------------------------
diff --git a/website/_docs16/tutorial/cube_spark.md 
b/website/_docs16/tutorial/cube_spark.md
deleted file mode 100644
index 743eb51..0000000
--- a/website/_docs16/tutorial/cube_spark.md
+++ /dev/null
@@ -1,166 +0,0 @@
----
-layout: docs16
-title:  Build Cube with Spark (beta)
-categories: tutorial
-permalink: /docs16/tutorial/cube_spark.html
----
-Kylin v2.0 introduces the Spark cube engine, which uses Apache Spark to replace MapReduce in the cube build step; see [this blog](/blog/2017/02/23/by-layer-spark-cubing/) for an overall picture. This document uses the sample cube to demonstrate how to try the new engine.
-
-## Preparation
-To finish this tutorial, you need a Hadoop environment with Kylin v2.0.0 or above installed. Here we will use the Hortonworks HDP 2.4 Sandbox VM, in which the Hadoop components as well as Hive/HBase have already been started. 
-
-## Install Kylin v2.0.0 beta
-
-Download Kylin v2.0.0 beta for HBase 1.x from Kylin's download page, and then uncompress the tarball into the */usr/local/* folder:
-
-{% highlight Groff markup %}
-
-wget https://dist.apache.org/repos/dist/dev/kylin/apache-kylin-2.0.0-beta/apache-kylin-2.0.0-beta-hbase1x.tar.gz -P /tmp
-
-tar -zxvf /tmp/apache-kylin-2.0.0-beta-hbase1x.tar.gz -C /usr/local/
-
-export KYLIN_HOME=/usr/local/apache-kylin-2.0.0-SNAPSHOT-bin
-{% endhighlight %}
-
-## Prepare "kylin.env.hadoop-conf-dir"
-
-To run Spark on Yarn, you need to specify the **HADOOP_CONF_DIR** environment variable, which is the directory that contains the (client side) configuration files for Hadoop. In many Hadoop distributions the directory is "/etc/hadoop/conf"; but Kylin needs to access not only HDFS, Yarn and Hive, but also HBase, so the default directory might not have all the necessary files. In this case, you need to create a new directory and then copy or link the client files (core-site.xml, yarn-site.xml, hive-site.xml and hbase-site.xml) there. In HDP 2.4, there is a conflict between hive-tez and Spark, so you need to change the default engine from "tez" to "mr" in the copy made for Kylin.
-
-{% highlight Groff markup %}
-
-mkdir $KYLIN_HOME/hadoop-conf
-ln -s /etc/hadoop/conf/core-site.xml $KYLIN_HOME/hadoop-conf/core-site.xml 
-ln -s /etc/hadoop/conf/yarn-site.xml $KYLIN_HOME/hadoop-conf/yarn-site.xml 
-ln -s /etc/hbase/2.4.0.0-169/0/hbase-site.xml $KYLIN_HOME/hadoop-conf/hbase-site.xml 
-cp /etc/hive/2.4.0.0-169/0/hive-site.xml $KYLIN_HOME/hadoop-conf/hive-site.xml 
-vi $KYLIN_HOME/hadoop-conf/hive-site.xml (change "hive.execution.engine" value from "tez" to "mr")
-
-{% endhighlight %}
-
-Now, point Kylin to this directory with the property "kylin.env.hadoop-conf-dir" in kylin.properties:
-
-{% highlight Groff markup %}
-kylin.env.hadoop-conf-dir=/usr/local/apache-kylin-2.0.0-SNAPSHOT-bin/hadoop-conf
-{% endhighlight %}
-
-If this property isn't set, Kylin will use the directory that "hive-site.xml" is located in; since that folder may have no "hbase-site.xml", you will get an HBase/ZK connection error in Spark.
-
-## Check Spark configuration
-
-Kylin embeds a Spark binary (v1.6.3) in $KYLIN_HOME/spark; all the Spark configurations can be managed in $KYLIN_HOME/conf/kylin.properties with the prefix *"kylin.engine.spark-conf."*. These properties are extracted and applied when Kylin submits a Spark job; e.g., if you configure "kylin.engine.spark-conf.spark.executor.memory=4G", Kylin will pass "--conf spark.executor.memory=4G" as a parameter when executing "spark-submit".
-
-Before you run Spark cubing, we suggest taking a look at these configurations and customizing them for your cluster. Below is the default configuration, which is also the minimal config for a sandbox (1 executor with 1GB memory); usually a normal cluster needs many more executors, each with at least 4GB memory and 2 cores:
-
-{% highlight Groff markup %}
-kylin.engine.spark-conf.spark.master=yarn
-kylin.engine.spark-conf.spark.submit.deployMode=cluster
-kylin.engine.spark-conf.spark.yarn.queue=default
-kylin.engine.spark-conf.spark.executor.memory=1G
-kylin.engine.spark-conf.spark.executor.cores=2
-kylin.engine.spark-conf.spark.executor.instances=1
-kylin.engine.spark-conf.spark.eventLog.enabled=true
-kylin.engine.spark-conf.spark.eventLog.dir=hdfs\:///kylin/spark-history
-kylin.engine.spark-conf.spark.history.fs.logDirectory=hdfs\:///kylin/spark-history
-#kylin.engine.spark-conf.spark.yarn.jar=hdfs://namenode:8020/kylin/spark/spark-assembly-1.6.3-hadoop2.6.0.jar
-#kylin.engine.spark-conf.spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec
-
-## uncomment for HDP
-#kylin.engine.spark-conf.spark.driver.extraJavaOptions=-Dhdp.version=current
-#kylin.engine.spark-conf.spark.yarn.am.extraJavaOptions=-Dhdp.version=current
-#kylin.engine.spark-conf.spark.executor.extraJavaOptions=-Dhdp.version=current
-
-{% endhighlight %}
-
-For running on the Hortonworks platform, you need to specify "hdp.version" in the Java options for the Yarn containers, so please uncomment the last three lines in kylin.properties. 
-
-Besides, in order to avoid repeatedly uploading the Spark assembly jar to Yarn, you can upload it manually once and then configure the jar's HDFS location; please note, the HDFS location needs to be a fully qualified name.
-
-{% highlight Groff markup %}
-hadoop fs -mkdir -p /kylin/spark/
-hadoop fs -put $KYLIN_HOME/spark/lib/spark-assembly-1.6.3-hadoop2.6.0.jar /kylin/spark/
-{% endhighlight %}
-
-After doing that, the config in kylin.properties will be:
-{% highlight Groff markup %}
-kylin.engine.spark-conf.spark.yarn.jar=hdfs://sandbox.hortonworks.com:8020/kylin/spark/spark-assembly-1.6.3-hadoop2.6.0.jar
-kylin.engine.spark-conf.spark.driver.extraJavaOptions=-Dhdp.version=current
-kylin.engine.spark-conf.spark.yarn.am.extraJavaOptions=-Dhdp.version=current
-kylin.engine.spark-conf.spark.executor.extraJavaOptions=-Dhdp.version=current
-{% endhighlight %}
-
-All the "kylin.engine.spark-conf.*" parameters can be overwritten at Cube or 
Project level, this gives more flexibility to the user.
-
-## Create and modify sample cube
-
-Run sample.sh to create the sample cube, and then start the Kylin server:
-
-{% highlight Groff markup %}
-
-$KYLIN_HOME/bin/sample.sh
-$KYLIN_HOME/bin/kylin.sh start
-
-{% endhighlight %}
-
-After Kylin is started, open the Kylin web UI and edit the "kylin_sales" cube; on the "Advanced Setting" page, change the "Cube Engine" from "MapReduce" to "Spark (Beta)":
-
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/1_cube_engine.png)
-
-Click "Next" to the "Configuration Overwrites" page, click "+Property" to add 
property "kylin.engine.spark.rdd-partition-cut-mb" with value "100" (reasons 
below):
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/2_overwrite_partition.png)
-
-The sample cube has two memory-hungry measures: a "COUNT DISTINCT" and a "TOPN(100)". Their size estimation can be inaccurate when the source data is small: the estimated size is much larger than the real size, which causes many more RDD partitions to be split and slows down the build. Here 100 is a more reasonable number. Click "Next" and "Save" to save the cube.
-
-
-## Build Cube with Spark
-
-Click "Build", select current date as the build end date. Kylin generates a 
build job in the "Monitor" page, in which the 7th step is the Spark cubing. The 
job engine starts to execute the steps in sequence. 
-
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/2_job_with_spark.png)
-
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/3_spark_cubing_step.png)
-
-When Kylin executes this step, you can monitor the status in the Yarn resource manager. Clicking the "Application Master" link opens the Spark web UI, which shows the progress of each stage and detailed information.
-
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/4_job_on_rm.png)
-
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/5_spark_web_gui.png)
-
-
-After all steps are successfully executed, the cube becomes "Ready" and you can query it as normal.
-
-## Troubleshooting
-
-When you get an error, check "logs/kylin.log" first. It contains the full Spark command that Kylin executes, e.g.:
-
-{% highlight Groff markup %}
-2017-03-06 14:44:38,574 INFO  [Job 2d5c1178-c6f6-4b50-8937-8e5e3b39227e-306] 
spark.SparkExecutable:121 : cmd:export 
HADOOP_CONF_DIR=/usr/local/apache-kylin-2.0.0-SNAPSHOT-bin/hadoop-conf && 
/usr/local/apache-kylin-2.0.0-SNAPSHOT-bin/spark/bin/spark-submit --class 
org.apache.kylin.common.util.SparkEntry  --conf spark.executor.instances=1  
--conf 
spark.yarn.jar=hdfs://sandbox.hortonworks.com:8020/kylin/spark/spark-assembly-1.6.3-hadoop2.6.0.jar
  --conf spark.yarn.queue=default  --conf 
spark.yarn.am.extraJavaOptions=-Dhdp.version=current  --conf 
spark.history.fs.logDirectory=hdfs:///kylin/spark-history  --conf 
spark.driver.extraJavaOptions=-Dhdp.version=current  --conf spark.master=yarn  
--conf spark.executor.extraJavaOptions=-Dhdp.version=current  --conf 
spark.executor.memory=1G  --conf spark.eventLog.enabled=true  --conf 
spark.eventLog.dir=hdfs:///kylin/spark-history  --conf spark.executor.cores=2  
--conf spark.submit.deployMode=cluster --files 
/etc/hbase/2.4.0.0-169/0/hbase-site.xml
  --jars 
/usr/local/apache-kylin-2.0.0-SNAPSHOT-bin/spark/lib/spark-assembly-1.6.3-hadoop2.6.0.jar,/usr/hdp/2.4.0.0-169/hbase/lib/htrace-core-3.1.0-incubating.jar,/usr/hdp/2.4.0.0-169/hbase/lib/hbase-client-1.1.2.2.4.0.0-169.jar,/usr/hdp/2.4.0.0-169/hbase/lib/hbase-common-1.1.2.2.4.0.0-169.jar,/usr/hdp/2.4.0.0-169/hbase/lib/hbase-protocol-1.1.2.2.4.0.0-169.jar,/usr/hdp/2.4.0.0-169/hbase/lib/metrics-core-2.2.0.jar,/usr/hdp/2.4.0.0-169/hbase/lib/guava-12.0.1.jar,
 /usr/local/apache-kylin-2.0.0-SNAPSHOT-bin/lib/kylin-job-2.0.0-SNAPSHOT.jar 
-className org.apache.kylin.engine.spark.SparkCubingByLayer -hiveTable 
kylin_intermediate_kylin_sales_cube_555c4d32_40bb_457d_909a_1bb017bf2d9e 
-segmentId 555c4d32-40bb-457d-909a-1bb017bf2d9e -confPath 
/usr/local/apache-kylin-2.0.0-SNAPSHOT-bin/conf -output 
hdfs:///kylin/kylin_metadata/kylin-2d5c1178-c6f6-4b50-8937-8e5e3b39227e/kylin_sales_cube/cuboid/
 -cubename kylin_sales_cube
-
-{% endhighlight %}
-
-You can copy the command and execute it manually in a shell to tune the parameters quickly; during execution, you can check the Yarn resource manager for more details. If the job has already finished, you can check the history in the Spark history server. 
-
-By default Kylin outputs the history to "hdfs:///kylin/spark-history"; you need to start a Spark history server on that directory, or change it to your existing Spark history server's event directory in conf/kylin.properties with the parameters "kylin.engine.spark-conf.spark.eventLog.dir" and "kylin.engine.spark-conf.spark.history.fs.logDirectory".
-
-The following command starts a Spark history server instance on Kylin's output directory; before running it, make sure you have stopped any existing Spark history server in the sandbox:
-
-{% highlight Groff markup %}
-$KYLIN_HOME/spark/sbin/start-history-server.sh hdfs://sandbox.hortonworks.com:8020/kylin/spark-history 
-{% endhighlight %}
-
-In a web browser, access "http://sandbox:18080"; it shows the job history:
-
-   ![](/images/tutorial/2.0/Spark-Cubing-Tutorial/9_spark_history.png)
-
-Click a specific job, and you will see the detailed runtime information; this is very helpful for troubleshooting and performance tuning.
-
-## Go further
-
-If you're a Kylin administrator but new to Spark, we suggest going through the [Spark documents](https://spark.apache.org/docs/1.6.3/), and don't forget to update the configurations accordingly. Spark's performance relies on the cluster's memory and CPU resources, while Kylin's cube build is a heavy task when a complex data model and a huge dataset are built at one time. If your cluster resources can't keep up, errors like "OutOfMemoryError" will be thrown in Spark executors, so please use it appropriately. For a cube with UHC dimensions, many combinations (e.g., a full cube with more than 12 dimensions), or memory-hungry measures (Count Distinct, Top-N), we suggest using the MapReduce engine. If your cube model is simple, all measures are SUM/MIN/MAX/COUNT, and the source data is of small to medium scale, the Spark engine would be a good choice. Besides, streaming build isn't supported in this engine so far (KYLIN-2484).
-
-The Spark engine is now in public beta; if you have any questions, comments, or bug fixes, you are welcome to discuss them at d...@kylin.apache.org.

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/gettingstarted/best_practices.md
----------------------------------------------------------------------
diff --git a/website/_docs20/gettingstarted/best_practices.md 
b/website/_docs20/gettingstarted/best_practices.md
new file mode 100644
index 0000000..59e9005
--- /dev/null
+++ b/website/_docs20/gettingstarted/best_practices.md
@@ -0,0 +1,27 @@
+---
+layout: docs20
+title:  "Community Best Practices"
+categories: gettingstarted
+permalink: /docs20/gettingstarted/best_practices.html
+since: v1.3.x
+---
+
+A list of articles about Kylin best practices contributed by the community. Some of them are from the Chinese community. Many thanks!
+
+* [The Practice of Apache Kylin at Baidu Map](http://www.infoq.com/cn/articles/practis-of-apache-kylin-in-baidu-map)
+
+* [Apache Kylin: an OLAP Powerhouse for the Big Data Era](http://www.bitstech.net/2016/01/04/kylin-olap/) (NetEase case study)
+
+* [The Practice of Apache Kylin at Yunhai](http://www.csdn.net/article/2015-11-27/2826343) (JD.com case study)
+
+* [Integrating Kylin, Mondrian and Saiku](http://tech.youzan.com/kylin-mondrian-saiku/) (Youzan case study)
+
+* [Big Data MDX with Mondrian and Apache Kylin](https://www.inovex.de/fileadmin/files/Vortraege/2015/big-data-mdx-with-mondrian-and-apache-kylin-sebastien-jelsch-pcm-11-2015.pdf)
+
+* [Kylin and Mondrian Interaction](https://github.com/mustangore/kylin-mondrian-interaction) (Thanks to [mustangore](https://github.com/mustangore))
+
+* [Kylin And Tableau Tutorial](https://github.com/albertoRamon/Kylin/tree/master/KylinWithTableau) (Thanks to [Ramón Portolés, Alberto](https://www.linkedin.com/in/alberto-ramon-portoles-a02b523b))
+
+* [Kylin and Qlik Integration](https://github.com/albertoRamon/Kylin/tree/master/KylinWithQlik) (Thanks to [Ramón Portolés, Alberto](https://www.linkedin.com/in/alberto-ramon-portoles-a02b523b))
+
+* [How to use Hue with Kylin](https://github.com/albertoRamon/Kylin/tree/master/KylinWithHue) (Thanks to [Ramón Portolés, Alberto](https://www.linkedin.com/in/alberto-ramon-portoles-a02b523b))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/gettingstarted/concepts.md
----------------------------------------------------------------------
diff --git a/website/_docs20/gettingstarted/concepts.md 
b/website/_docs20/gettingstarted/concepts.md
new file mode 100644
index 0000000..138a7f1
--- /dev/null
+++ b/website/_docs20/gettingstarted/concepts.md
@@ -0,0 +1,64 @@
+---
+layout: docs20
+title:  "Technical Concepts"
+categories: gettingstarted
+permalink: /docs20/gettingstarted/concepts.html
+since: v1.2
+---
+ 
+Here are some basic technical concepts used in Apache Kylin; please check them for reference.
+For domain terminology, please refer to [Terminology](terminology.html).
+
+## CUBE
+* __Table__ - The definition of the Hive tables that serve as the source of cubes; tables must be synced before building cubes.
+![](/images/docs/concepts/DataSource.png)
+
+* __Data Model__ - This describes a [STAR SCHEMA](https://en.wikipedia.org/wiki/Star_schema) data model, which defines the fact/lookup tables and filter conditions.
+![](/images/docs/concepts/DataModel.png)
+
+* __Cube Descriptor__ - This describes the definition and settings of a cube instance: which data model to use, what dimensions and measures it has, how to partition it into segments, how to handle auto-merge, etc.
+![](/images/docs/concepts/CubeDesc.png)
+
+* __Cube Instance__ - An instance of a cube, built from one cube descriptor and consisting of one or more cube segments according to the partition settings.
+![](/images/docs/concepts/CubeInstance.png)
+
+* __Partition__ - The user can define a DATE/STRING column as the partition column in the cube descriptor, to separate one cube into several segments covering different date periods.
+![](/images/docs/concepts/Partition.png)
+
+* __Cube Segment__ - The actual carrier of cube data, mapping to an HTable in HBase. One build job creates one new segment for the cube instance. Once data changes in a given period, we can refresh the related segments to avoid rebuilding the whole cube.
+![](/images/docs/concepts/CubeSegment.png)
+
+* __Aggregation Group__ - Each aggregation group is a subset of the dimensions, and cuboids are built for the combinations inside it. It aims at pruning for optimization (see the illustration below).
+![](/images/docs/concepts/AggregationGroup.png)
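+
+As a rough illustration of the pruning effect (the numbers here are for intuition only, not from this document): a cube with 10 dimensions has up to 2^10 = 1024 cuboids, while splitting the dimensions into two independent aggregation groups of 5 caps the combinations at about 2 * 2^5 = 64.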
+
+## DIMENSION & MEASURE
+* __Mandatory__ - This dimension type is used for cuboid pruning: if a dimension is specified as "mandatory", then all combinations without this dimension are pruned.
+* __Hierarchy__ - This dimension type is used for cuboid pruning: if dimensions A, B, C form a "hierarchy" relation, then only the combinations A, AB, and ABC are retained. 
+* __Derived__ - On lookup tables, some dimensions can be derived from the primary key, so there is a specific mapping between them and the FK of the fact table. These dimensions are DERIVED and don't participate in cuboid generation.
+![](/images/docs/concepts/Dimension.png)
+
+* __Count Distinct (HyperLogLog)__ - Exact COUNT DISTINCT is expensive to calculate, so an approximate algorithm, [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog), is introduced to keep the error rate at a low level. 
+* __Count Distinct (Precise)__ - Precise COUNT DISTINCT is pre-calculated based on RoaringBitmap; currently only int and bigint are supported.
+* __Top N__ - With this measure type, the user can easily get, for example, the top N sellers/buyers. 
+![](/images/docs/concepts/Measure.png)
+
+## CUBE ACTIONS
+* __BUILD__ - Given an interval of the partition column, this action builds a new cube segment.
+* __REFRESH__ - This action rebuilds a cube segment in a given partition period; it is used when the source table has changed.
+* __MERGE__ - This action merges multiple contiguous cube segments into a single one. It can be automated with the auto-merge settings in the cube descriptor.
+* __PURGE__ - Clears all segments under a cube instance. This only updates metadata; it won't delete the cube data from HBase.
+![](/images/docs/concepts/CubeAction.png)
+
+## JOB STATUS
+* __NEW__ - The job has just been created.
+* __PENDING__ - The job is paused by the job scheduler and is waiting for resources.
+* __RUNNING__ - The job is in progress.
+* __FINISHED__ - The job finished successfully.
+* __ERROR__ - The job aborted with errors.
+* __DISCARDED__ - The job was cancelled by the end user.
+![](/images/docs/concepts/Job.png)
+
+## JOB ACTION
+* __RESUME__ - When a job is in ERROR status, this action tries to restore it from the latest successful point.
+* __DISCARD__ - Regardless of a job's status, the user can end it and release resources with the DISCARD action.
+![](/images/docs/concepts/JobAction.png)

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/gettingstarted/events.md
----------------------------------------------------------------------
diff --git a/website/_docs20/gettingstarted/events.md 
b/website/_docs20/gettingstarted/events.md
new file mode 100644
index 0000000..db72c61
--- /dev/null
+++ b/website/_docs20/gettingstarted/events.md
@@ -0,0 +1,24 @@
+---
+layout: docs20
+title:  "Events and Conferences"
+categories: gettingstarted
+permalink: /docs20/gettingstarted/events.html
+---
+
+__Conferences__
+
+* [The Evolution of Apache Kylin: Realtime and Plugin Architecture in 
Kylin](https://www.youtube.com/watch?v=n74zvLmIgF0)([slides](http://www.slideshare.net/YangLi43/apache-kylin-15-updates))
 by [Li Yang](https://github.com/liyang-gmt8), at [Hadoop Summit 2016 
Dublin](http://hadoopsummit.org/dublin/agenda/), Ireland, 2016-04-14
+* [Apache Kylin - Balance Between Space and 
Time](http://www.chinahadoop.com/2015/July/Shanghai/agenda.php) 
([slides](http://www.slideshare.net/qhzhou/apache-kylin-china-hadoop-summit-2015-shanghai))
 by [Qianhao Zhou](https://github.com/qhzhou), at Hadoop Summit 2015 in 
Shanghai, China, 2015-07-24
+* [Apache Kylin - Balance Between Space and Time](https://www.youtube.com/watch?v=jgvZSFaXPgI) ([video](https://www.youtube.com/watch?v=jgvZSFaXPgI), [slides](http://www.slideshare.net/DebashisSaha/apache-kylin-balance-between-space-and-time-hadop-summit-2015)) by [Debashis Saha](https://twitter.com/debashis_saha) & [Luke Han](https://twitter.com/lukehq), at Hadoop Summit 2015 in San Jose, US, 2015-06-09
+* [HBaseCon 2015: Apache Kylin; Extreme OLAP Engine for 
Hadoop](https://vimeo.com/128152444) ([video](https://vimeo.com/128152444), 
[slides](http://www.slideshare.net/HBaseCon/ecosystem-session-3b)) by [Seshu 
Adunuthula](https://twitter.com/SeshuAd) at HBaseCon 2015 in San Francisco, US, 
2015-05-07
+* [Apache Kylin - Extreme OLAP Engine for 
Hadoop](http://strataconf.com/big-data-conference-uk-2015/public/schedule/detail/40029)
 
([slides](http://www.slideshare.net/lukehan/apache-kylin-extreme-olap-engine-for-big-data))
 by [Luke Han](https://twitter.com/lukehq) & [Yang 
Li](https://github.com/liyang-gmt8), at Strata+Hadoop World in London, UK, 
2015-05-06
+* [Apache Kylin Open Source 
Journey](http://www.infoq.com/cn/presentations/open-source-journey-of-apache-kylin)
 
([slides](http://www.slideshare.net/lukehan/apache-kylin-open-source-journey-for-qcon2015-beijing))
 by [Luke Han](https://twitter.com/lukehq), at QCon Beijing in Beijing, China, 
2015-04-23
+* [Apache Kylin - OLAP on 
Hadoop](http://cio.it168.com/a2015/0418/1721/000001721404.shtml) by [Yang 
Li](https://github.com/liyang-gmt8), at Database Technology Conference China 
2015 in Beijing, China, 2015-04-18
+* [Apache Kylin – Cubes on 
Hadoop](https://www.youtube.com/watch?v=U0SbrVzuOe4) 
([video](https://www.youtube.com/watch?v=U0SbrVzuOe4), 
[slides](http://www.slideshare.net/Hadoop_Summit/apache-kylin-cubes-on-hadoop)) 
by [Ted Dunning](https://twitter.com/ted_dunning), at Hadoop Summit 2015 Europe 
in Brussels, Belgium, 2015-04-16
+* [Apache Kylin - Large-Scale OLAP Platform on Hadoop](http://bdtc2014.hadooper.cn/m/zone/bdtc_2014/schedule3) ([slides](http://www.slideshare.net/lukehan/apache-kylin-big-data-technology-conference-2014-beijing-v2)) by [Luke Han](https://twitter.com/lukehq), at Big Data Technology Conference China in Beijing, China, 2014-12-14
+* [Apache Kylin: OLAP Engine on Hadoop - Tech Deep 
Dive](http://v.csdn.hudong.com/s/article.html?arcid=15820707) 
([video](http://v.csdn.hudong.com/s/article.html?arcid=15820707), 
[slides](http://www.slideshare.net/XuJiang2/kylin-hadoop-olap-engine)) by 
[Jiang Xu](https://www.linkedin.com/pub/xu-jiang/4/5a8/230), at Shanghai Big 
Data Summit 2014 in Shanghai, China , 2014-10-25
+
+__Meetup__
+
+* [Apache Kylin Meetup @Bay 
Area](http://www.meetup.com/Cloud-at-ebayinc/events/218914395/), in San Jose, 
US, 6:00PM - 7:30PM, Thursday, 2014-12-04
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/gettingstarted/faq.md
----------------------------------------------------------------------
diff --git a/website/_docs20/gettingstarted/faq.md 
b/website/_docs20/gettingstarted/faq.md
new file mode 100644
index 0000000..d1455b4
--- /dev/null
+++ b/website/_docs20/gettingstarted/faq.md
@@ -0,0 +1,119 @@
+---
+layout: docs20
+title:  "FAQ"
+categories: gettingstarted
+permalink: /docs20/gettingstarted/faq.html
+since: v0.6.x
+---
+
+#### 1. "bin/find-hive-dependency.sh" can locate hive/hcat jars in local, but 
Kylin reports error like "java.lang.NoClassDefFoundError: 
org/apache/hive/hcatalog/mapreduce/HCatInputFormat"
+
+  * Kylin needs many dependent jars (hadoop/hive/hcat/hbase/kafka) on the classpath to work, but it doesn't ship them. It seeks these jars on your local machine by running commands like `hbase classpath`, `hive -e set`, etc. The found jars' paths are appended to the environment variable *HBASE_CLASSPATH* (Kylin starts up with the `hbase` shell command, which reads this variable). But in some Hadoop distributions (like EMR 5.0), the `hbase` shell doesn't keep the original `HBASE_CLASSPATH` value, which causes the "NoClassDefFoundError".
+
+  * To fix this, find the hbase shell script (in the hbase/bin folder), search for *HBASE_CLASSPATH*, and check whether it overwrites the value like:
+
+  {% highlight Groff markup %}
+  export HBASE_CLASSPATH=$HADOOP_CONF:$HADOOP_HOME/*:$HADOOP_HOME/lib/*:$ZOOKEEPER_HOME/*:$ZOOKEEPER_HOME/lib/*
+  {% endhighlight %}
+
+  * If so, change it to keep the original value, like:
+
+   {% highlight Groff markup %}
+  export HBASE_CLASSPATH=$HADOOP_CONF:$HADOOP_HOME/*:$HADOOP_HOME/lib/*:$ZOOKEEPER_HOME/*:$ZOOKEEPER_HOME/lib/*:$HBASE_CLASSPATH
+  {% endhighlight %}
+
+#### 2. Get "java.lang.IllegalArgumentException: Too high cardinality is not 
suitable for dictionary -- cardinality: 5220674" in "Build Dimension 
Dictionary" step
+
+  * Kylin uses "Dictionary" encoding to encode/decode the dimension values 
(check [this blog](/blog/2015/08/13/kylin-dictionary/)); Usually a dimension's 
cardinality is less than millions, so the "Dict" encoding is good to use. As 
dictionary need be persisted and loaded into memory, if a dimension's 
cardinality is very high, the memory footprint will be tremendous, so Kylin add 
a check on this. If you see this error, suggest to identify the UHC dimension 
first and then re-evaluate the design (whether need to make that as 
dimension?). If must keep it, you can by-pass this error with couple ways: 1) 
change to use other encoding (like `fixed_length`, `integer`) 2) or set a 
bigger value for `kylin.dictionary.max.cardinality` in `conf/kylin.properties`.
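+
+  A hypothetical snippet for the second workaround in conf/kylin.properties (the value is an assumption; pick one above your dimension's cardinality):
+
+  {% highlight Groff markup %}
+  kylin.dictionary.max.cardinality=10000000
+  {% endhighlight %}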
+
+#### 3. Build cube failed due to "error check status"
+
+  * Check whether `kylin.log` contains *yarn.resourcemanager.webapp.address:http://0.0.0.0:8088* and *java.net.ConnectException: Connection refused*
+  * If yes, the problem is that the address of the resource manager is not available in yarn-site.xml
+  * A workaround is to update `kylin.properties` and set `kylin.job.yarn.app.rest.check.status.url=http://YOUR_RM_NODE:8088/ws/v1/cluster/apps/${job_id}?anonymous=true`
+
+#### 4. HBase cannot get master address from ZooKeeper on Hortonworks Sandbox
+   
+  * By default Hortonworks disables HBase; you'll have to start HBase from the Ambari homepage first.
+
+#### 5. Map Reduce Job information cannot display on Hortonworks Sandbox
+   
+  * Check out 
[https://github.com/KylinOLAP/Kylin/issues/40](https://github.com/KylinOLAP/Kylin/issues/40)
+
+#### 6. How to Install Kylin on CDH 5.2 or Hadoop 2.5.x
+
+  * Check out discussion: 
[https://groups.google.com/forum/?utm_medium=email&utm_source=footer#!msg/kylin-olap/X0GZfsX1jLc/nzs6xAhNpLkJ](https://groups.google.com/forum/?utm_medium=email&utm_source=footer#!msg/kylin-olap/X0GZfsX1jLc/nzs6xAhNpLkJ)
+
+  {% highlight Groff markup %}
+  I was able to deploy Kylin with following option in POM.
+  <hadoop2.version>2.5.0</hadoop2.version>
+  <yarn.version>2.5.0</yarn.version>
+  <hbase-hadoop2.version>0.98.6-hadoop2</hbase-hadoop2.version>
+  <zookeeper.version>3.4.5</zookeeper.version>
+  <hive.version>0.13.1</hive.version>
+  My Cluster is running on Cloudera Distribution CDH 5.2.0.
+  {% endhighlight %}
+
+
+#### 7. SUM(field) returns a negative result while all the numbers in this field are > 0
+  * If a column is declared as integer in Hive, the SQL engine (Calcite) will use the column's type (integer) as the data type for "SUM(field)", while the aggregated value on this field may exceed the integer range; in that case the cast causes a negative value to be returned. The workaround is to alter that column's type to BIGINT in Hive and then sync the table schema to Kylin (the cube doesn't need to be rebuilt); see the sketch below. Keep in mind: always declare a column as BIGINT in Hive if it is an integer column that will be used as a measure in Kylin. See Hive numeric types: [https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types#LanguageManualTypes-NumericTypes](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types#LanguageManualTypes-NumericTypes)
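+
+  A hypothetical HiveQL sketch of the workaround (table and column names are placeholders):
+
+  {% highlight Groff markup %}
+  -- assumption: column "price" of table "fact_table" was declared as INT
+  ALTER TABLE fact_table CHANGE COLUMN price price BIGINT;
+  {% endhighlight %}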
+
+#### 8. Why does Kylin need to extract the distinct columns from the fact table before building the cube?
+  * Kylin uses a dictionary to encode the values in each column; this greatly reduces the cube's storage size. To build the dictionary, Kylin needs to fetch the distinct values of each column.
+
+#### 9. Why does Kylin calculate the Hive table cardinality?
+  * The cardinality of dimensions is an important measure of cube complexity. The higher the cardinality, the bigger the cube, and thus the longer it takes to build and the slower it is to query. Cardinality > 1,000 is worth attention and > 1,000,000 should be avoided at best effort. For optimal cube performance, try to reduce high cardinality by categorizing values or deriving features.
+
+#### 10. How to add a new user or change the default password?
+  * Kylin web's security is implemented with the Spring Security framework, where kylinSecurity.xml is the main configuration file:
+
+   {% highlight Groff markup %}
+   ${KYLIN_HOME}/tomcat/webapps/kylin/WEB-INF/classes/kylinSecurity.xml
+   {% endhighlight %}
+
+  * The password hashes of the pre-defined test users can be found in the "sandbox,testing" profile section. To change the default password, you need to generate a new hash and then update it there; please refer to the code snippet in: [https://stackoverflow.com/questions/25844419/spring-bcryptpasswordencoder-generate-different-password-for-same-input](https://stackoverflow.com/questions/25844419/spring-bcryptpasswordencoder-generate-different-password-for-same-input), for example:
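+
+  A minimal sketch of generating such a hash with Spring Security's BCryptPasswordEncoder (per the link above; "NEW_PASSWORD" is a placeholder):
+
+   {% highlight Groff markup %}
+   // import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder;
+   BCryptPasswordEncoder encoder = new BCryptPasswordEncoder();
+   System.out.println(encoder.encode("NEW_PASSWORD"));
+   {% endhighlight %}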
+  * When you deploy Kylin for more users, switching to LDAP authentication is recommended.
+
+#### 11. Using a sub-query for unsupported SQL
+
+{% highlight Groff markup %}
+Original SQL:
+select fact.slr_sgmt,
+sum(case when cal.RTL_WEEK_BEG_DT = '2015-09-06' then gmv else 0 end) as W36,
+sum(case when cal.RTL_WEEK_BEG_DT = '2015-08-30' then gmv else 0 end) as W35
+from ih_daily_fact fact
+inner join dw_cal_dt cal on fact.cal_dt = cal.cal_dt
+group by fact.slr_sgmt
+{% endhighlight %}
+
+{% highlight Groff markup %}
+Rewritten using a sub-query:
+select a.slr_sgmt,
+sum(case when a.RTL_WEEK_BEG_DT = '2015-09-06' then a.gmv else 0 end) as W36,
+sum(case when a.RTL_WEEK_BEG_DT = '2015-08-30' then a.gmv else 0 end) as W35
+from (
+    select fact.slr_sgmt as slr_sgmt,
+    cal.RTL_WEEK_BEG_DT as RTL_WEEK_BEG_DT,
+    sum(gmv) as gmv
+    from ih_daily_fact fact
+    inner join dw_cal_dt cal on fact.cal_dt = cal.cal_dt
+    group by fact.slr_sgmt, cal.RTL_WEEK_BEG_DT
+) a
+group by a.slr_sgmt
+{% endhighlight %}
+
+#### 12. Build Kylin meets NPM errors (this issue particularly affects users in mainland China)
+
+  * Please add a proxy for your NPM:  
+  `npm config set proxy http://YOUR_PROXY_IP`
+
+  * Please update your local NPM repository to use a mirror of npmjs.org, like Taobao NPM:  
+  [http://npm.taobao.org](http://npm.taobao.org)
+
+#### 13. Failed to run BuildCubeWithEngineTest, saying it failed to connect to HBase while HBase is active
+  * Users may get this error the first time they run the hbase client; please check the error trace to see whether there is an error saying it couldn't access a folder like "/hadoop/hbase/local/jars"; if that folder doesn't exist, create it as sketched below.
+
+
+
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/gettingstarted/terminology.md
----------------------------------------------------------------------
diff --git a/website/_docs20/gettingstarted/terminology.md 
b/website/_docs20/gettingstarted/terminology.md
new file mode 100644
index 0000000..5d7ecf6
--- /dev/null
+++ b/website/_docs20/gettingstarted/terminology.md
@@ -0,0 +1,25 @@
+---
+layout: docs20
+title:  "Terminology"
+categories: gettingstarted
+permalink: /docs20/gettingstarted/terminology.html
+since: v0.5.x
+---
+ 
+
+Here are some domain terms used in Apache Kylin; please check them for reference.   
+They are basic knowledge of Apache Kylin and will also help you understand the concepts, terms, and theory of data warehousing and business intelligence for analytics. 
+
+* __Data Warehouse__: a data warehouse (DW or DWH), also known as an 
enterprise data warehouse (EDW), is a system used for reporting and data 
analysis, [wikipedia](https://en.wikipedia.org/wiki/Data_warehouse)
+* __Business Intelligence__: Business intelligence (BI) is the set of 
techniques and tools for the transformation of raw data into meaningful and 
useful information for business analysis purposes, 
[wikipedia](https://en.wikipedia.org/wiki/Business_intelligence)
+* __OLAP__: OLAP is an acronym for [online analytical 
processing](https://en.wikipedia.org/wiki/Online_analytical_processing)
+* __OLAP Cube__: an OLAP cube is an array of data understood in terms of its 0 
or more dimensions, [wikipedia](http://en.wikipedia.org/wiki/OLAP_cube)
+* __Star Schema__: the star schema consists of one or more fact tables 
referencing any number of dimension tables, 
[wikipedia](https://en.wikipedia.org/wiki/Star_schema)
+* __Fact Table__: a Fact table consists of the measurements, metrics or facts 
of a business process, [wikipedia](https://en.wikipedia.org/wiki/Fact_table)
+* __Lookup Table__: a lookup table is an array that replaces runtime 
computation with a simpler array indexing operation, 
[wikipedia](https://en.wikipedia.org/wiki/Lookup_table)
+* __Dimension__: A dimension is a structure that categorizes facts and 
measures in order to enable users to answer business questions. Commonly used 
dimensions are people, products, place and time, 
[wikipedia](https://en.wikipedia.org/wiki/Dimension_(data_warehouse))
+* __Measure__: a measure is a property on which calculations (e.g., sum, 
count, average, minimum, maximum) can be made, 
[wikipedia](https://en.wikipedia.org/wiki/Measure_(data_warehouse))
+* __Join__: a SQL join clause combines records from two or more tables in a 
relational database, [wikipedia](https://en.wikipedia.org/wiki/Join_(SQL))
+
+
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_backup_metadata.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_backup_metadata.md 
b/website/_docs20/howto/howto_backup_metadata.md
new file mode 100644
index 0000000..f742e01
--- /dev/null
+++ b/website/_docs20/howto/howto_backup_metadata.md
@@ -0,0 +1,60 @@
+---
+layout: docs20
+title:  Backup Metadata
+categories: howto
+permalink: /docs20/howto/howto_backup_metadata.html
+---
+
+Kylin organizes all of its metadata (including cube descriptions and instances, projects, inverted index descriptions and instances, jobs, tables and dictionaries) as a hierarchical file system. However, Kylin uses HBase to store it rather than a normal file system. If you check your Kylin configuration file (kylin.properties), you will find such a line:
+
+{% highlight Groff markup %}
+## The metadata store in hbase
+kylin.metadata.url=kylin_metadata@hbase
+{% endhighlight %}
+
+This indicates that the metadata is saved as an HTable called `kylin_metadata`. You can scan the HTable in the HBase shell to check it out.
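+
+For a quick look, you can scan a couple of rows from the HBase shell (the LIMIT option just keeps the output short):
+
+{% highlight Groff markup %}
+hbase shell
+scan 'kylin_metadata', {LIMIT => 2}
+{% endhighlight %}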
+
+## Backup Metadata Store with binary package
+
+Sometimes you need to back up Kylin's metadata store from HBase to your disk file system.
+In such cases, assuming you're on the Hadoop CLI (or sandbox) where you deployed Kylin, you can go to KYLIN_HOME and run:
+
+{% highlight Groff markup %}
+./bin/metastore.sh backup
+{% endhighlight %}
+
+to dump your metadata to a local folder under KYLIN_HOME/meta_backups; the folder is named after the current time with the syntax: KYLIN_HOME/meta_backups/meta_year_month_day_hour_minute_second
+
+## Restore Metadata Store with binary package
+
+In case your metadata store gets messed up and you want to restore a previous backup:
+
+Firstly, reset the metadata store (this cleans everything in the Kylin metadata store in HBase, so make sure you have a backup):
+
+{% highlight Groff markup %}
+./bin/metastore.sh reset
+{% endhighlight %}
+
+Then upload the backup metadata to Kylin's metadata store:
+{% highlight Groff markup %}
+./bin/metastore.sh restore $KYLIN_HOME/meta_backups/meta_xxxx_xx_xx_xx_xx_xx
+{% endhighlight %}
+
+## Backup/restore metadata in development env (available since 0.7.3)
+
+When developing/debugging Kylin, you typically have a dev machine with an IDE and a backend sandbox. Usually you'll write code and run test cases on the dev machine. It would be troublesome if you always had to put a binary package in the sandbox to check the metadata. There is a helper class called SandboxMetastoreCLI that helps you download/upload metadata locally on your dev machine. Follow its usage information and run it in your IDE.
+
+## Cleanup unused resources from Metadata Store (available since 0.7.3)
+As time goes on, some resources like dictionaries and table snapshots become useless (as their cube segments get dropped or merged), but they still take up space; you can run a command to find and clean them up from the metadata store:
+
+Firstly, run a check; this is safe as it will not change anything:
+{% highlight Groff markup %}
+./bin/metastore.sh clean
+{% endhighlight %}
+
+The resources that would be dropped are listed;
+
+Next, add the "--delete true" parameter to cleanup those resources; before 
this, make sure you have made a backup of the metadata store;
+{% highlight Groff markup %}
+./bin/metastore.sh clean --delete true
+{% endhighlight %}

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_build_cube_with_restapi.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_build_cube_with_restapi.md 
b/website/_docs20/howto/howto_build_cube_with_restapi.md
new file mode 100644
index 0000000..42df9cd
--- /dev/null
+++ b/website/_docs20/howto/howto_build_cube_with_restapi.md
@@ -0,0 +1,53 @@
+---
+layout: docs20
+title:  Build Cube with RESTful API
+categories: howto
+permalink: /docs20/howto/howto_build_cube_with_restapi.html
+---
+
+### 1. Authentication
+*   Currently, Kylin uses [basic authentication](http://en.wikipedia.org/wiki/Basic_access_authentication).
+*   Add an `Authorization` header to the first request for authentication
+*   Or you can do a specific request by `POST http://localhost:7070/kylin/api/user/authentication`
+*   Once authenticated, the client can issue subsequent requests with cookies.
+{% highlight Groff markup %}
+POST http://localhost:7070/kylin/api/user/authentication
+    
+Authorization:Basic xxxxJD124xxxGFxxxSDF
+Content-Type: application/json;charset=UTF-8
+{% endhighlight %}
+
+### 2. Get details of the cube. 
+*   `GET http://localhost:7070/kylin/api/cubes?cubeName={cube_name}&limit=15&offset=0`
+*   The client can find the cube segment date ranges in the returned cube detail.
+{% highlight Groff markup %}
+GET 
http://localhost:7070/kylin/api/cubes?cubeName=test_kylin_cube_with_slr&limit=15&offset=0
+
+Authorization:Basic xxxxJD124xxxGFxxxSDF
+Content-Type: application/json;charset=UTF-8
+{% endhighlight %}
+### 3. Then submit a build job for the cube. 
+*   `PUT http://localhost:7070/kylin/api/cubes/{cube_name}/rebuild`
+*   For the PUT request body details, please refer to [Build Cube API](howto_use_restapi.html#build-cube). 
+    *   `startTime` and `endTime` should be UTC timestamps in milliseconds.
+    *   `buildType` can be `BUILD`, `MERGE` or `REFRESH`. `BUILD` is for building a new segment, `REFRESH` for refreshing an existing segment, and `MERGE` for merging multiple existing segments into one bigger segment.
+*   This method returns a newly created job instance, whose `uuid` is the unique id used to track job status.
+{% highlight Groff markup %}
+PUT http://localhost:7070/kylin/api/cubes/test_kylin_cube_with_slr/rebuild
+
+Authorization:Basic xxxxJD124xxxGFxxxSDF
+Content-Type: application/json;charset=UTF-8
+    
+{
+    "startTime": 0,
+    "endTime": 1388563200000,
+    "buildType": "BUILD"
+}
+{% endhighlight %}
+
+### 4. Track job status. 
+*   `GET http://localhost:7070/kylin/api/jobs/{job_uuid}`
+*   The returned `job_status` represents the current status of the job.
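+
+A request example in the same style as the steps above ({job_uuid} is the uuid returned in step 3):
+{% highlight Groff markup %}
+GET http://localhost:7070/kylin/api/jobs/{job_uuid}
+
+Authorization:Basic xxxxJD124xxxGFxxxSDF
+Content-Type: application/json;charset=UTF-8
+{% endhighlight %}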
+
+### 5. If the job encounters errors, you can resume it. 
+*   `PUT http://localhost:7070/kylin/api/jobs/{job_uuid}/resume`
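+
+And the corresponding request, following the same pattern:
+{% highlight Groff markup %}
+PUT http://localhost:7070/kylin/api/jobs/{job_uuid}/resume
+
+Authorization:Basic xxxxJD124xxxGFxxxSDF
+Content-Type: application/json;charset=UTF-8
+{% endhighlight %}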

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_cleanup_storage.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_cleanup_storage.md 
b/website/_docs20/howto/howto_cleanup_storage.md
new file mode 100644
index 0000000..badede1
--- /dev/null
+++ b/website/_docs20/howto/howto_cleanup_storage.md
@@ -0,0 +1,22 @@
+---
+layout: docs20
+title:  Cleanup Storage (HDFS & HBase)
+categories: howto
+permalink: /docs20/howto/howto_cleanup_storage.html
+---
+
+Kylin generates intermediate files in HDFS during cube building; besides, when you purge/drop/merge cubes, some HBase tables may be left in HBase and will no longer be queried. Although Kylin has started to do some
+automated garbage collection, it might not cover all cases; you can run an offline storage cleanup periodically:
+
+Steps:
+1. Check which resources can be cleaned up; this will not remove anything:
+{% highlight Groff markup %}
+export KYLIN_HOME=/path/to/kylin_home
+${KYLIN_HOME}/bin/kylin.sh org.apache.kylin.storage.hbase.util.StorageCleanupJob --delete false
+{% endhighlight %}
+If a (version) placeholder appears in the command in your installation, replace it with the specific Kylin jar version;
+2. You can pick one or two resources to check whether they are no longer referenced; then add the "--delete true" option to start the cleanup:
+{% highlight Groff markup %}
+${KYLIN_HOME}/bin/kylin.sh org.apache.kylin.storage.hbase.util.StorageCleanupJob --delete true
+{% endhighlight %}
+When it finishes, the intermediate HDFS locations and HTables should be dropped;

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_jdbc.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_jdbc.md 
b/website/_docs20/howto/howto_jdbc.md
new file mode 100644
index 0000000..9e6deeb
--- /dev/null
+++ b/website/_docs20/howto/howto_jdbc.md
@@ -0,0 +1,92 @@
+---
+layout: docs20
+title:  Use JDBC Driver
+categories: howto
+permalink: /docs20/howto/howto_jdbc.html
+---
+
+### Authentication
+
+###### Built on the Apache Kylin authentication RESTful service. Supported parameters:
+* user : username 
+* password : password
+* ssl : true/false. Default is false; if true, all service calls will use HTTPS.
+
+### Connection URL format:
+{% highlight Groff markup %}
+jdbc:kylin://<hostname>:<port>/<kylin_project_name>
+{% endhighlight %}
+* If "ssl" = true, the "port" should be Kylin server's HTTPS port; 
+* If "port" is not specified, the driver will use default port: HTTP 80, HTTPS 
443;
+* The "kylin_project_name" must be specified and user need ensure it exists in 
Kylin server;
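+
+A hypothetical example, assuming Kylin runs locally on its default port 7070 with a project named "learn_kylin" (the project name is a placeholder):
+
+{% highlight Groff markup %}
+jdbc:kylin://localhost:7070/learn_kylin
+{% endhighlight %}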
+
+### 1. Query with Statement
+{% highlight Groff markup %}
+Driver driver = (Driver) Class.forName("org.apache.kylin.jdbc.Driver").newInstance();
+
+Properties info = new Properties();
+info.put("user", "ADMIN");
+info.put("password", "KYLIN");
+Connection conn = driver.connect("jdbc:kylin://localhost:7070/kylin_project_name", info);
+Statement state = conn.createStatement();
+ResultSet resultSet = state.executeQuery("select * from test_table");
+
+while (resultSet.next()) {
+    assertEquals("foo", resultSet.getString(1));
+    assertEquals("bar", resultSet.getString(2));
+    assertEquals("tool", resultSet.getString(3));
+}
+{% endhighlight %}
+
+### 2. Query with PreparedStatement
+
+###### Supported prepared statement parameters:
+* setString
+* setInt
+* setShort
+* setLong
+* setFloat
+* setDouble
+* setBoolean
+* setByte
+* setDate
+* setTime
+* setTimestamp
+
+{% highlight Groff markup %}
+Driver driver = (Driver) Class.forName("org.apache.kylin.jdbc.Driver").newInstance();
+Properties info = new Properties();
+info.put("user", "ADMIN");
+info.put("password", "KYLIN");
+Connection conn = driver.connect("jdbc:kylin://localhost:7070/kylin_project_name", info);
+PreparedStatement state = conn.prepareStatement("select * from test_table where id=?");
+state.setInt(1, 10);
+ResultSet resultSet = state.executeQuery();
+
+while (resultSet.next()) {
+    assertEquals("foo", resultSet.getString(1));
+    assertEquals("bar", resultSet.getString(2));
+    assertEquals("tool", resultSet.getString(3));
+}
+{% endhighlight %}
+
+### 3. Get query result set metadata
+The Kylin JDBC driver supports metadata listing methods:
+list catalogs, schemas, tables and columns with SQL pattern filters (such as %).
+
+{% highlight Groff markup %}
+Driver driver = (Driver) Class.forName("org.apache.kylin.jdbc.Driver").newInstance();
+Properties info = new Properties();
+info.put("user", "ADMIN");
+info.put("password", "KYLIN");
+Connection conn = driver.connect("jdbc:kylin://localhost:7070/kylin_project_name", info);
+Statement state = conn.createStatement();
+ResultSet resultSet = state.executeQuery("select * from test_table");
+
+ResultSet tables = conn.getMetaData().getTables(null, null, "dummy", null);
+while (tables.next()) {
+    for (int i = 0; i < 10; i++) {
+        assertEquals("dummy", tables.getString(i + 1));
+    }
+}
+{% endhighlight %}

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_ldap_and_sso.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_ldap_and_sso.md 
b/website/_docs20/howto/howto_ldap_and_sso.md
new file mode 100644
index 0000000..8085f39
--- /dev/null
+++ b/website/_docs20/howto/howto_ldap_and_sso.md
@@ -0,0 +1,128 @@
+---
+layout: docs20
+title: Enable Security with LDAP and SSO
+categories: howto
+permalink: /docs20/howto/howto_ldap_and_sso.html
+---
+
+## Enable LDAP authentication
+
+Kylin supports LDAP authentication for enterprise or production deployments; this is implemented with the Spring Security framework. Before enabling LDAP, please contact your LDAP administrator to get the necessary information, like the LDAP server URL, username/password, and search patterns.
+
+#### Configure LDAP server info
+
+Firstly, provide the LDAP URL, plus a username/password if the LDAP server is secured. The password in kylin.properties needs to be encrypted; you can run the following command to get the encrypted value (please note, the password's length should be less than 16 characters, see [KYLIN-2416](https://issues.apache.org/jira/browse/KYLIN-2416)):
+
+```
+cd $KYLIN_HOME/tomcat/webapps/kylin/WEB-INF/lib
+java -classpath kylin-server-base-1.6.0.jar:spring-beans-3.2.17.RELEASE.jar:spring-core-3.2.17.RELEASE.jar:commons-codec-1.7.jar org.apache.kylin.rest.security.PasswordPlaceholderConfigurer AES <your_password>
+```
+
+Configure them in conf/kylin.properties:
+
+```
+ldap.server=ldap://<your_ldap_host>:<port>
+ldap.username=<your_user_name>
+ldap.password=<your_password_encrypted>
+```
+
+Secondly, provide the user search patterns; this depends on your LDAP design, and here is just a sample:
+
+```
+ldap.user.searchBase=OU=UserAccounts,DC=mycompany,DC=com
+ldap.user.searchPattern=(&(cn={0})(memberOf=CN=MYCOMPANY-USERS,DC=mycompany,DC=com))
+ldap.user.groupSearchBase=OU=Group,DC=mycompany,DC=com
+```
+
+If you have service accounts (e.g., for system integration) which also need to be authenticated, configure them in ldap.service.*; otherwise, leave those empty.
+
+#### Configure the administrator group and default role
+
+To map an LDAP group to the admin group in Kylin, set "acl.adminRole" to "ROLE_" + GROUP_NAME. For example, if the LDAP group "KYLIN-ADMIN-GROUP" contains the administrators, set it as:
+
+```
+acl.adminRole=ROLE_KYLIN-ADMIN-GROUP
+acl.defaultRole=ROLE_ANALYST,ROLE_MODELER
+```
+
+The "acl.defaultRole" is a list of the default roles that grant to everyone, 
keep it as-is.
+
+#### Enable LDAP
+
+Set "kylin.security.profile=ldap" in conf/kylin.properties, then restart Kylin 
server.
+
+## Enable SSO authentication
+
+From v1.5, Kylin provides SSO with SAML. The implementation is based on the Spring Security SAML Extension. You can read [this reference](http://docs.spring.io/autorepo/docs/spring-security-saml/1.0.x-SNAPSHOT/reference/htmlsingle/) to get an overall understanding.
+
+Before trying this, you should have successfully enabled LDAP and managed users with it. As the SSO server may only do authentication, Kylin needs to search LDAP to get the user's detailed information.
+
+### Generate IDP metadata xml
+Contact your IDP (identity provider) and ask it to generate the SSO metadata file; usually you need to provide three pieces of information:
+
+  1. Partner entity ID, which is a unique ID of your app, e.g.: https://host-name/kylin/saml/metadata 
+  2. App callback endpoint, to which the SAML assertion will be posted; it needs to be: https://host-name/kylin/saml/SSO
+  3. Public certificate of the Kylin server; the SSO server will encrypt the message with it.
+
+### Generate JKS keystore for Kylin
+As Kylin needs to send encrypted messages (signed with Kylin's private key) to the SSO server, a keystore (JKS) needs to be provided. There are a couple of ways to generate the keystore; below is a sample.
+
+Assume kylin.crt is the public certificate file and kylin.key is the private key file; firstly create a PKCS#12 file with openssl, then convert it to JKS with keytool: 
+
+```
+$ openssl pkcs12 -export -in kylin.crt -inkey kylin.key -out kylin.p12
+Enter Export Password: <export_pwd>
+Verifying - Enter Export Password: <export_pwd>
+
+
+$ keytool -importkeystore -srckeystore kylin.p12 -srcstoretype PKCS12 
-srcstorepass <export_pwd> -alias 1 -destkeystore samlKeystore.jks -destalias 
kylin -destkeypass changeit
+
+Enter destination keystore password:  changeit
+Re-enter new password: changeit
+```
+
+It will put the keys into "samlKeystore.jks" with the alias "kylin".
+
+### Enable Higher Ciphers
+
+Make sure your environment is ready to handle higher-strength crypto keys; you may need to download the Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files and copy local_policy.jar and US_export_policy.jar to $JAVA_HOME/jre/lib/security.
+
+### Deploy IDP xml file and keystore to Kylin
+
+The IDP metadata and keystore files need to be deployed on the Kylin web app's classpath, in $KYLIN_HOME/tomcat/webapps/kylin/WEB-INF/classes:
+
+  1. Rename the IDP file to sso_metadata.xml and then copy it to Kylin's classpath;
+  2. Rename the keystore to "samlKeystore.jks" and then copy it to Kylin's classpath;
+  3. If you use another alias or password, remember to update kylinSecurity.xml accordingly:
+
+```
+<!-- Central storage of cryptographic keys -->
+<bean id="keyManager" 
class="org.springframework.security.saml.key.JKSKeyManager">
+       <constructor-arg value="classpath:samlKeystore.jks"/>
+       <constructor-arg type="java.lang.String" value="changeit"/>
+       <constructor-arg>
+               <map>
+                       <entry key="kylin" value="changeit"/>
+               </map>
+       </constructor-arg>
+       <constructor-arg type="java.lang.String" value="kylin"/>
+</bean>
+
+```
+
+### Other configurations
+In conf/kylin.properties, add the following properties with your server 
information:
+
+```
+saml.metadata.entityBaseURL=https://host-name/kylin
+saml.context.scheme=https
+saml.context.serverName=host-name
+saml.context.serverPort=443
+saml.context.contextPath=/kylin
+```
+
+Please note, Kylin assumes the SAML message contains an "email" attribute representing the login user, and the name before the @ will be used to search LDAP. 
+
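+A minimal sketch of that mapping (illustrative only; the email value is hypothetical):
+
+```
+String email = "analyst@mycompany.com";                    // value of the SAML "email" attribute
+String ldapName = email.substring(0, email.indexOf('@'));  // "analyst" is then searched in LDAP
+```
+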
+### Enable SSO
+Set "kylin.security.profile=saml" in conf/kylin.properties, then restart Kylin 
server; After that, type a URL like "/kylin" or "/kylin/cubes" will redirect to 
SSO for login, and jump back after be authorized. While login with LDAP is 
still available, you can type "/kylin/login" to use original way. The Rest API 
(/kylin/api/*) still use LDAP + basic authentication, no impact.
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_optimize_build.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_optimize_build.md 
b/website/_docs20/howto/howto_optimize_build.md
new file mode 100644
index 0000000..8b1ff65
--- /dev/null
+++ b/website/_docs20/howto/howto_optimize_build.md
@@ -0,0 +1,190 @@
+---
+layout: docs20
+title:  Optimize Cube Build
+categories: howto
+permalink: /docs20/howto/howto_optimize_build.html
+---
+
+Kylin decomposes a Cube build task into several steps and then executes them in sequence. These steps include Hive operations, MapReduce jobs, and other types of jobs. When you have many Cubes to build daily, you definitely want to speed up this process. Here are some practices that you probably want to know, organized in the same order as the step sequence.
+
+
+
+## Create Intermediate Flat Hive Table
+
+This step extracts data from the source Hive tables (with all tables joined) and inserts it into an intermediate flat table. If the Cube is partitioned, Kylin adds a time condition so that only the data in range is fetched. You can check the related Hive command in the log of this step, e.g.: 
+
+```
+hive -e "USE default;
+DROP TABLE IF EXISTS kylin_intermediate_airline_cube_v3610f668a3cdb437e8373c034430f6c34;
+
+CREATE EXTERNAL TABLE IF NOT EXISTS kylin_intermediate_airline_cube_v3610f668a3cdb437e8373c034430f6c34
+(AIRLINE_FLIGHTDATE date,AIRLINE_YEAR int,AIRLINE_QUARTER int,...,AIRLINE_ARRDELAYMINUTES int)
+STORED AS SEQUENCEFILE
+LOCATION 'hdfs:///kylin/kylin200instance/kylin-0a8d71e8-df77-495f-b501-03c06f785b6c/kylin_intermediate_airline_cube_v3610f668a3cdb437e8373c034430f6c34';
+
+SET dfs.replication=2;
+SET hive.exec.compress.output=true;
+SET hive.auto.convert.join.noconditionaltask=true;
+SET hive.auto.convert.join.noconditionaltask.size=100000000;
+SET mapreduce.job.split.metainfo.maxsize=-1;
+
+INSERT OVERWRITE TABLE kylin_intermediate_airline_cube_v3610f668a3cdb437e8373c034430f6c34 SELECT
+AIRLINE.FLIGHTDATE
+,AIRLINE.YEAR
+,AIRLINE.QUARTER
+,...
+,AIRLINE.ARRDELAYMINUTES
+FROM AIRLINE.AIRLINE as AIRLINE
+WHERE (AIRLINE.FLIGHTDATE >= '1987-10-01' AND AIRLINE.FLIGHTDATE < '2017-01-01');
+"
+
+```
+
+Kylin applies the configuration in conf/kylin\_hive\_conf.xml while the Hive commands are running, for instance, to use less replication and enable Hive's mapper-side join. If needed, you can add other configurations which are good for your cluster.
+
+If the Cube's partition column ("FLIGHTDATE" in this case) is the same as the Hive table's partition column, then filtering on it will let Hive smartly skip the non-matched partitions. So it is highly recommended to use the Hive table's partition column (if it is a date column) as the Cube's partition column. This is almost required for very large tables; otherwise Hive has to scan all files each time in this step, which costs a terribly long time.
+
+If your Hive enables file merge, you can disable it in "conf/kylin\_hive\_conf.xml", as Kylin has its own way to merge files (in the next step): 
+
+    <property>
+        <name>hive.merge.mapfiles</name>
+        <value>false</value>
+        <description>Disable Hive's auto merge</description>
+    </property>
+
+
+## Redistribute intermediate table
+
+After the previous step, Hive generates the data files in an HDFS folder: while some files are large, some are small or even empty. The imbalanced file distribution would lead subsequent MR jobs to be imbalanced as well: some mappers finish quickly while others are very slow. To balance them, Kylin adds this step to "redistribute" the data; here is a sample output:
+
+```
+total input rows = 159869711
+expected input rows per mapper = 1000000
+num reducers for RedistributeFlatHiveTableStep = 160
+
+```
+
+
+The redistribute command looks like: 
+
+```
+hive -e "USE default;
+SET dfs.replication=2;
+SET hive.exec.compress.output=true;
+SET hive.auto.convert.join.noconditionaltask=true;
+SET hive.auto.convert.join.noconditionaltask.size=100000000;
+SET mapreduce.job.split.metainfo.maxsize=-1;
+set mapreduce.job.reduces=160;
+set hive.merge.mapredfiles=false;
+
+INSERT OVERWRITE TABLE kylin_intermediate_airline_cube_v3610f668a3cdb437e8373c034430f6c34 SELECT * FROM kylin_intermediate_airline_cube_v3610f668a3cdb437e8373c034430f6c34 DISTRIBUTE BY RAND();
+"
+
+```
+
+
+
+Firstly, Kylin gets the row count of this intermediate table; then, based on the row count, it calculates the number of files needed to redistribute the data. By default, Kylin allocates one file per 1 million rows. In this sample, there are 160 million rows, so 160 reducers are used, and each reducer writes 1 file. In the following MR step over this table, Hadoop will start the same number of mappers as there are files to process (usually 1 million rows' data size is smaller than an HDFS block). If your daily data scale isn't so large or your Hadoop cluster has enough resources, you may want more concurrency. Setting "kylin.job.mapreduce.mapper.input.rows" in "conf/kylin.properties" to a smaller value will achieve that, e.g.:
+
+`kylin.job.mapreduce.mapper.input.rows=500000`
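+
+For intuition, here is a hedged sketch of that arithmetic (a plain ceiling division; Kylin's actual logic may differ in details):
+
+```
+long totalRows = 159_869_711L;       // row count of the intermediate table (from the sample log)
+long rowsPerMapper = 1_000_000L;     // kylin.job.mapreduce.mapper.input.rows (default)
+int files = (int) Math.ceil((double) totalRows / rowsPerMapper);  // = 160 files / reducers
+```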
+
+
+Secondly, Kylin runs an *"INSERT OVERWRITE TABLE .... DISTRIBUTE BY "* HiveQL statement to distribute the rows among a specified number of reducers.
+
+In most cases, Kylin asks Hive to randomly distribute the rows among the reducers, which yields files very close in size. The distribute clause in this case is "DISTRIBUTE BY RAND()".
+
+If your Cube has specified a "shard by" dimension (in the Cube's "Advanced setting" page), which is a high-cardinality column (like "USER\_ID"), Kylin will ask Hive to redistribute data by that column's value. Then the rows that have the same value in this column will go to the same file. This is much better than "by random", because the data is not only redistributed but also pre-categorized without additional cost, which benefits the subsequent Cube build process. Under a typical scenario, this optimization can cut 40% off the build time. In this case the distribute clause will be "DISTRIBUTE BY USER_ID":
+
+**Please note:** 1) The "shard by" column should be a high-cardinality dimension column, and it should appear in many cuboids (not just a few). Distributing by it properly can yield an even distribution in every time range; otherwise it will cause data skew, which will reduce the build speed. Typical good candidates are "USER\_ID", "SELLER\_ID", "PRODUCT", "CELL\_NUMBER", and so forth, whose cardinality is higher than one thousand (and should be much higher than the reducer count). 2) Using "shard by" has other advantages in Cube storage, but that is out of this doc's scope.
+
+
+
+## Extract Fact Table Distinct Columns
+
+In this step Kylin runs an MR job to fetch the distinct values of the dimensions that use dictionary encoding. 
+
+Actually this step does more: it collects the Cube statistics by using HyperLogLog counters to estimate the row count of each Cuboid. If you find that the mappers work incredibly slowly, it usually indicates that the Cube design is too complex; please check [optimize cube design](howto_optimize_cubes.html) to make the Cube thinner. If the reducers get an OutOfMemory error, it indicates that the Cuboid combinations do explode, or that the default YARN memory allocation cannot meet demands. If this step can't finish in a reasonable time by any means, you can give up and revisit the design, as the real build will take even longer.
+
+You can reduce the sampling percentage (kylin.job.cubing.inmem.sampling.percent in kylin.properties) to accelerate this step, but this may not help much and will impact the accuracy of the Cube statistics, so we don't recommend it.  
+
+
+
+## Build Dimension Dictionary
+
+With the distinct values fetched in the previous step, Kylin builds the dictionaries in memory (in the next version this will be moved to MR). Usually this step is fast, but if the value set is large, Kylin may report an error like "Too high cardinality is not suitable for dictionary". For such ultra-high-cardinality (UHC) columns, please use another encoding method, such as "fixed_length", "integer" and so on.
+
+
+
+## Save Cuboid Statistics and Create HTable
+
+These two steps are lightweight and fast.
+
+
+
+## Build Base Cuboid 
+
+This step builds the base cuboid from the intermediate table; it is the first round of MR in the "by-layer" cubing algorithm. The mapper number equals the reducer number of step 2; the reducer number is estimated from the cube statistics: by default it uses 1 reducer for every 500MB of output. If you observe that the reducer number is small, you can set "kylin.job.mapreduce.default.reduce.input.mb" in kylin.properties to a smaller value to get more resources, e.g.: 
+`kylin.job.mapreduce.default.reduce.input.mb=200`
+
+
+## Build N-Dimension Cuboid 
+
+These steps are the "by-layer" cubing process; each step uses the output of the previous step as its input, and then removes one dimension to aggregate to a child cuboid. For example, from cuboid ABCD, removing A gets BCD, removing B gets ACD, etc. 
+
+Some cuboids can be aggregated from more than 1 parent cuboid; in this case, Kylin will select the minimal parent cuboid. For example, AB can be generated from ABC (id: 1110) and ABD (id: 1101), so ABD will be used as its id is smaller than ABC's. Based on this, if D's cardinality is small, the aggregation will be cost-efficient. So, when you design the Cube rowkey sequence, please remember to put low-cardinality dimensions at the tail. This not only benefits the Cube build, but also benefits the Cube query, as the post-aggregation follows the same rule.
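+
+A brief sketch of that selection rule, assuming cuboids are modeled as plain bitmasks over the dimensions (this mirrors the rule above, not Kylin's actual code):
+
+```
+// a parent has exactly one more dimension bit than the child; the smallest id wins
+static long selectMinimalParent(long child, int numDims, java.util.Set<Long> cuboids) {
+    long best = -1;
+    for (int d = 0; d < numDims; d++) {
+        long parent = child | (1L << d);
+        if (parent != child && cuboids.contains(parent) && (best == -1 || parent < best)) {
+            best = parent;  // e.g. for AB (1100): ABD (1101) beats ABC (1110)
+        }
+    }
+    return best;
+}
+```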
+
+Usually from the N-D to (N/2)-D layers the build is slow, because it is the cuboid explosion process: the N-D layer has 1 cuboid, (N-1)-D has N cuboids, (N-2)-D has N*(N-1)/2 cuboids, etc. After the (N/2)-D layer, the build gradually gets faster.
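+
+In general, the layer that keeps k of N dimensions holds "N choose k" cuboids; a tiny sketch of that count:
+
+```
+// number of cuboids in the layer that keeps k of N dimensions: C(N, k)
+static long choose(int n, int k) {
+    long r = 1;
+    for (int i = 1; i <= k; i++) r = r * (n - i + 1) / i;
+    return r;
+}
+// e.g. for N = 10: choose(10, 9) = 10, choose(10, 8) = 45, peaking at choose(10, 5) = 252
+```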
+
+
+
+## Build Cube
+
+This step uses a new algorithm to build the Cube: "by-split" cubing (also called "in-mem" cubing). It uses one round of MR to calculate all cuboids, but it requests more memory than normal. The "conf/kylin\_job\_conf\_inmem.xml" is made for this step. By default it requests 3GB memory for each mapper. If your cluster has enough memory, you can allocate more in "conf/kylin\_job\_conf\_inmem.xml" so it will use as much memory as possible to hold the data and gain better performance, e.g.:
+
+    <property>
+        <name>mapreduce.map.memory.mb</name>
+        <value>6144</value>
+        <description></description>
+    </property>
+    
+    <property>
+        <name>mapreduce.map.java.opts</name>
+        <value>-Xmx5632m</value>
+        <description></description>
+    </property>
+
+
+Please note, Kylin will automatically select the best algorithm based on the data distribution (obtained from the Cube statistics). The steps of the non-selected algorithm will be skipped. You don't need to select the algorithm explicitly.
+
+
+
+## Convert Cuboid Data to HFile
+
+This step starts an MR job to convert the Cuboid files (sequence file format) into HBase's HFile format. Kylin calculates the HBase region number from the Cube statistics, by default 1 region per 5GB. The more regions, the more reducers would be utilized. If you observe that the reducer number is small and performance is poor, you can set the following parameters in "conf/kylin.properties" to smaller values, as follows:
+
+```
+kylin.hbase.region.cut=2
+kylin.hbase.hfile.size.gb=1
+```
+
+If you're not sure what size a region should be, contact your HBase 
administrator. 
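+
+A hedged sketch of that default sizing rule (simple ceiling division over an illustrative cube size):
+
+```
+double cubeSizeGB = 50;    // estimated from the cube statistics (illustrative number)
+double regionCutGB = 5;    // kylin.hbase.region.cut (default)
+int regions = (int) Math.ceil(cubeSizeGB / regionCutGB);  // 10 regions -> 10 reducers
+```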
+
+
+## Load HFile to HBase Table
+
+This step uses the HBase API to load the HFiles into the region servers; it is lightweight and fast.
+
+
+
+## Update Cube Info
+
+After loading data into HBase, Kylin marks this Cube segment as ready in 
metadata. This step is very fast.
+
+
+
+## Cleanup
+
+Drop the intermediate table from Hive. This step doesn't block anything, as the segment has been marked ready in the previous step. If this step errors out, no need to worry; the garbage can be collected later when Kylin executes the [StorageCleanupJob](howto_cleanup_storage.html).
+
+
+## Summary
+There are also many other methods to boost performance. If you have practices to share, you are welcome to discuss them at [d...@kylin.apache.org](mailto:d...@kylin.apache.org).
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_optimize_cubes.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_optimize_cubes.md 
b/website/_docs20/howto/howto_optimize_cubes.md
new file mode 100644
index 0000000..171149d
--- /dev/null
+++ b/website/_docs20/howto/howto_optimize_cubes.md
@@ -0,0 +1,212 @@
+---
+layout: docs20
+title:  Optimize Cube Design
+categories: howto
+permalink: /docs20/howto/howto_optimize_cubes.html
+---
+
+## Hierarchies:
+
+Theoretically for N dimensions you'll end up with 2^N dimension combinations. However, for some groups of dimensions there is no need to create so many combinations. For example, if you have three dimensions: continent, country, city (in hierarchies, the "bigger" dimension comes first), you will only need the following three group-by combinations when you do drill-down analysis:
+
+group by continent
+group by continent, country
+group by continent, country, city
+
+In such cases the combination count is reduced from 2^3=8 to 3, which is a great optimization. The same goes for the YEAR,QUARTER,MONTH,DATE case.
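+
+The valid combinations of a hierarchy are exactly its non-empty prefixes, which a few lines can enumerate (a sketch; the dimension names are the ones from the example above):
+
+```
+String[] hierarchy = {"continent", "country", "city"};  // "bigger" dimension first
+// the valid group-bys are the non-empty prefixes: 3 instead of 2^3 = 8
+for (int len = 1; len <= hierarchy.length; len++) {
+    String[] prefix = java.util.Arrays.copyOf(hierarchy, len);
+    System.out.println("group by " + String.join(", ", prefix));
+}
+```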
+
+If we denote the hierarchy dimensions as H1,H2,H3, typical scenarios would be:
+
+
+A. Hierarchies on lookup table
+
+
+<table>
+  <tr>
+    <td align="center">Fact table</td>
+    <td align="center">(joins)</td>
+    <td align="center">Lookup Table</td>
+  </tr>
+  <tr>
+    <td>column1,column2,,,,,, FK</td>
+    <td></td>
+    <td>PK,,H1,H2,H3,,,,</td>
+  </tr>
+</table>
+
+---
+
+B. Hierarchies on fact table
+
+
+<table>
+  <tr>
+    <td align="center">Fact table</td>
+  </tr>
+  <tr>
+    <td>column1,column2,,,H1,H2,H3,,,,,,, </td>
+  </tr>
+</table>
+
+---
+
+
+There is a special case for scenario A, where the PK of the lookup table happens to be part of the hierarchy. For example, we have a calendar lookup table where cal_dt is the primary key:
+
+A*. Hierarchies on lookup table over its primary key
+
+
+<table>
+  <tr>
+    <td align="center">Lookup Table(Calendar)</td>
+  </tr>
+  <tr>
+    <td>cal_dt(PK), week_beg_dt, month_beg_dt, quarter_beg_dt,,,</td>
+  </tr>
+</table>
+
+---
+
+
+For cases like A*, what you need is another optimization called "Derived Columns".
+
+## Derived Columns:
+
+Derived columns are used when one or more dimensions (they must be dimensions on a lookup table; these columns are called "derived") can be deduced from another (usually the corresponding FK; this is called the "host column").
+
+For example, suppose we have a lookup table that we join with the fact table by "where DimA = DimX". Notice that in Kylin, if you choose an FK as a dimension, the corresponding PK becomes automatically queryable, without any extra cost. The secret is that since FK and PK are always identical, Kylin can apply filters/group-bys on the FK first, and transparently replace them with the PK. This indicates that if we want DimA(FK), DimX(PK), DimB and DimC in our cube, we can safely choose DimA, DimB, DimC only.
+
+<table>
+  <tr>
+    <td align="center">Fact table</td>
+    <td align="center">(joins)</td>
+    <td align="center">Lookup Table</td>
+  </tr>
+  <tr>
+    <td>column1,column2,,,,,, DimA(FK) </td>
+    <td></td>
+    <td>DimX(PK),,DimB, DimC</td>
+  </tr>
+</table>
+
+---
+
+
+Let's say that DimA (the dimension representing the FK/PK) has a special mapping to DimB:
+
+
+<table>
+  <tr>
+    <th>dimA</th>
+    <th>dimB</th>
+    <th>dimC</th>
+  </tr>
+  <tr>
+    <td>1</td>
+    <td>a</td>
+    <td>?</td>
+  </tr>
+  <tr>
+    <td>2</td>
+    <td>b</td>
+    <td>?</td>
+  </tr>
+  <tr>
+    <td>3</td>
+    <td>c</td>
+    <td>?</td>
+  </tr>
+  <tr>
+    <td>4</td>
+    <td>a</td>
+    <td>?</td>
+  </tr>
+</table>
+
+
+In this case, given a value of DimA, the value of DimB is determined, so we say DimB can be derived from DimA. When we build a cube that contains both DimA and DimB, we simply include DimA, and mark DimB as derived. A derived column (DimB) does not participate in cuboid generation:
+
+original combinations:
+ABC,AB,AC,BC,A,B,C
+
+combinations when deriving B from A:
+AC,A,C
+
+At runtime, for a query like "select count(*) from fact_table inner join lookup1 group by lookup1.dimB", a cuboid containing DimB would be expected to answer it. However, DimB appears in NONE of the cuboids due to the derived optimization. In this case, Kylin modifies the execution plan to group by DimA (its host column) first, and we get an intermediate answer like:
+
+
+<table>
+  <tr>
+    <th>DimA</th>
+    <th>count(*)</th>
+  </tr>
+  <tr>
+    <td>1</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>2</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>3</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>4</td>
+    <td>1</td>
+  </tr>
+</table>
+
+
+Afterwards, Kylin will replace the DimA values with DimB values (since both of their values are in the lookup table, Kylin can load the whole lookup table into memory and build a mapping for them), and the intermediate result becomes:
+
+
+<table>
+  <tr>
+    <th>DimB</th>
+    <th>count(*)</th>
+  </tr>
+  <tr>
+    <td>a</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>b</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>c</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>a</td>
+    <td>1</td>
+  </tr>
+</table>
+
+
+After this, the runtime SQL engine (Calcite) will further aggregate the intermediate result to:
+
+
+<table>
+  <tr>
+    <th>DimB</th>
+    <th>count(*)</th>
+  </tr>
+  <tr>
+    <td>a</td>
+    <td>2</td>
+  </tr>
+  <tr>
+    <td>b</td>
+    <td>1</td>
+  </tr>
+  <tr>
+    <td>c</td>
+    <td>1</td>
+  </tr>
+</table>
+
+
+This step happens at query runtime, and it is what "at the cost of extra runtime aggregation" means.
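+
+A compact sketch of the whole rewrite (illustrative data and types, not Kylin internals):
+
+```
+import java.util.HashMap;
+import java.util.Map;
+
+public class DerivedRollup {
+    public static void main(String[] args) {
+        // 1) the cuboid answers "group by DimA"; 2) DimA values are mapped to DimB via the
+        //    in-memory lookup table; 3) the counts are re-aggregated by DimB
+        Map<Integer, String> dimAToDimB = Map.of(1, "a", 2, "b", 3, "c", 4, "a");
+        Map<Integer, Long> countsByDimA = Map.of(1, 1L, 2, 1L, 3, 1L, 4, 1L);
+
+        Map<String, Long> countsByDimB = new HashMap<>();
+        countsByDimA.forEach((a, cnt) -> countsByDimB.merge(dimAToDimB.get(a), cnt, Long::sum));
+        System.out.println(countsByDimB);  // {a=2, b=1, c=1}
+    }
+}
+```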

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_update_coprocessor.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_update_coprocessor.md 
b/website/_docs20/howto/howto_update_coprocessor.md
new file mode 100644
index 0000000..8f83d70
--- /dev/null
+++ b/website/_docs20/howto/howto_update_coprocessor.md
@@ -0,0 +1,14 @@
+---
+layout: docs20
+title:  How to Update HBase Coprocessor
+categories: howto
+permalink: /docs20/howto/howto_update_coprocessor.html
+---
+
+Kylin leverages HBase coprocessors to optimize query performance. After a new version is released, the RPC protocol may change, so users need to redeploy the coprocessor to the HTables.
+
+There's a CLI tool to update HBase Coprocessor:
+
+{% highlight Groff markup %}
+$KYLIN_HOME/bin/kylin.sh org.apache.kylin.storage.hbase.util.DeployCoprocessorCLI $KYLIN_HOME/lib/kylin-coprocessor-*.jar all
+{% endhighlight %}

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_upgrade.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_upgrade.md 
b/website/_docs20/howto/howto_upgrade.md
new file mode 100644
index 0000000..811b6c6
--- /dev/null
+++ b/website/_docs20/howto/howto_upgrade.md
@@ -0,0 +1,66 @@
+---
+layout: docs20
+title:  Upgrade From Old Versions
+categories: howto
+permalink: /docs20/howto/howto_upgrade.html
+since: v1.5.1
+---
+
+Running as a Hadoop client, Apache Kylin's metadata and Cube data are persisted in Hadoop (HBase and HDFS), so the upgrade is relatively easy and the user doesn't need to worry about data loss. The upgrade can be performed in the following steps:
+
+* Download the new Apache Kylin binary package for your Hadoop version from 
Kylin's download page;
+* Uncompress the new version Kylin package to a new folder, e.g., /usr/local/kylin/apache-kylin-1.6.0/ (directly overwriting the old instance is not recommended);
+* Copy the configuration files (`$KYLIN_HOME/conf/*`) from the old instance (e.g. /usr/local/kylin/apache-kylin-1.5.4/) to the new instance's `conf` folder if you have customized configurations; it is recommended to compare and merge, since new parameters might have been introduced. If you have modified the tomcat configuration ($KYLIN_HOME/tomcat/conf/), remember to do the same there.
+* Stop the current Kylin instance with `./bin/kylin.sh stop`;
+* Set the `KYLIN_HOME` env variable to the new instance's installation folder. 
If you have set `KYLIN_HOME` in `~/.bash_profile` or other scripts, remember to 
update them as well.
+* Start the new Kylin instance with `$KYLIN_HOME/bin/kylin.sh start`; after it starts, log in to the Kylin web UI to check whether your cubes are loaded correctly.
+* [Upgrade coprocessor](howto_update_coprocessor.html) to ensure the HBase 
region servers use the latest Kylin coprocessor.
+* Verify your SQL queries can be performed successfully.
+
+Below are version-specific guides:
+
+## Upgrade from v1.5.4 to v1.6.0
+Kylin v1.5.4 and v1.6.0 are compatible in metadata; please follow the common upgrade steps above.
+
+## Upgrade from v1.5.3 to v1.5.4
+Kylin v1.5.3 and v1.5.4 are compatible in metadata; please follow the common upgrade steps above.
+
+## Upgrade from 1.5.2 to v1.5.3
+Kylin v1.5.3 metadata is compatible with v1.5.2 and your cubes don't need to be rebuilt; as usual, though, some actions need to be performed:
+
+#### 1. Update HBase coprocessor
+The HBase tables for existing cubes need to be updated to the latest coprocessor; follow [this guide](howto_update_coprocessor.html) to update them.
+
+#### 2. Update conf/kylin_hive_conf.xml
+From 1.5.3, Kylin no longer needs Hive to merge small files. For users who copied conf/ from a previous version, please remove the "merge" related properties in kylin_hive_conf.xml, including "hive.merge.mapfiles", "hive.merge.mapredfiles", and "hive.merge.size.per.task"; this will save time when extracting data from Hive.
+
+
+## Upgrade from 1.5.1 to v1.5.2
+Kylin v1.5.2 metadata is compatible with v1.5.1 and your cubes don't need to be upgraded, while some actions need to be performed:
+
+#### 1. Update HBase coprocessor
+The HBase tables for existing cubes need to be updated to the latest coprocessor; follow [this guide](howto_update_coprocessor.html) to update them.
+
+#### 2. Update conf/kylin.properties
+In v1.5.2 several properties are deprecated, and several new one are added:
+
+Deprecated:
+
+* kylin.hbase.region.cut.small=5
+* kylin.hbase.region.cut.medium=10
+* kylin.hbase.region.cut.large=50
+
+New:
+
+* kylin.hbase.region.cut=5
+* kylin.hbase.hfile.size.gb=2
+
+These new parameters determine how HBase regions are split; to use a different size you can override these params at the Cube level. 
+
+When copying from an old kylin.properties file, it is suggested to remove the deprecated ones and add the new ones.
+
+#### 3. Add conf/kylin\_job\_conf\_inmem.xml
+A new job conf file named "kylin\_job\_conf\_inmem.xml" has been added to the "conf" folder. Kylin 1.5 introduced the "fast cubing" algorithm, which leverages more memory for in-mem aggregation; Kylin will use this new conf file when submitting the in-mem cube build job, which requests different memory than a normal job. Please update it properly according to your cluster capacity.
+
+Besides, if you have used separate config files for cubes of different capacities, for example "kylin\_job\_conf\_small.xml", "kylin\_job\_conf\_medium.xml" and "kylin\_job\_conf\_large.xml", please note that they are deprecated now; only "kylin\_job\_conf.xml" and "kylin\_job\_conf\_inmem.xml" will be used for submitting cube jobs. If you have cube-level job configurations (like using a different YARN job queue), you can customize them at the cube level; check [KYLIN-1706](https://issues.apache.org/jira/browse/KYLIN-1706).
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_use_beeline.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_use_beeline.md 
b/website/_docs20/howto/howto_use_beeline.md
new file mode 100644
index 0000000..1effdca
--- /dev/null
+++ b/website/_docs20/howto/howto_use_beeline.md
@@ -0,0 +1,14 @@
+---
+layout: docs20
+title:  Use Beeline for Hive Commands
+categories: howto
+permalink: /docs20/howto/howto_use_beeline.html
+---
+
+[Beeline](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients) is recommended by many vendors to replace the Hive CLI. By default Kylin uses the Hive CLI to synchronize Hive tables, create flattened intermediate tables, etc. With simple configuration changes you can make Kylin use Beeline instead.
+
+Edit $KYLIN_HOME/conf/kylin.properties by:
+
+  1. change kylin.hive.client=cli to kylin.hive.client=beeline
+  2. add "kylin.hive.beeline.params", this is where you can specifiy beeline 
commmand parameters. Like username(-n), JDBC URL(-u),etc. There's a sample 
kylin.hive.beeline.params included in default kylin.properties, however it's 
commented. You can modify the sample based on your real environment.
+

http://git-wip-us.apache.org/repos/asf/kylin/blob/7ea64f38/website/_docs20/howto/howto_use_distributed_scheduler.md
----------------------------------------------------------------------
diff --git a/website/_docs20/howto/howto_use_distributed_scheduler.md 
b/website/_docs20/howto/howto_use_distributed_scheduler.md
new file mode 100644
index 0000000..4cdac8a
--- /dev/null
+++ b/website/_docs20/howto/howto_use_distributed_scheduler.md
@@ -0,0 +1,16 @@
+---
+layout: docs20
+title:  Use distributed job scheduler
+categories: howto
+permalink: /docs20/howto/howto_use_distributed_scheduler.html
+---
+
+Since Kylin 2.0, Kylin supports a distributed job scheduler, which is more extensible, available and reliable than the default job scheduler. To enable it, you need to set or update three configs in kylin.properties:
+
+```
+1. kylin.job.scheduler.default=2
+2. kylin.job.lock=org.apache.kylin.storage.hbase.util.ZookeeperDistributedJobLock
+3. add all job servers and query servers to the kylin.server.cluster-servers
+```
