This is an automated email from the ASF dual-hosted git repository. xxyu pushed a commit to branch doc5.0 in repository https://gitbox.apache.org/repos/asf/kylin.git
commit 5221ec3de31e39c43183af24d5809f704295114b Author: Mukvin <boyboys...@163.com> AuthorDate: Tue Aug 16 18:10:20 2022 +0800 KYLIN-5221 add operations doc --- website/docs/configuration/configuration.md | 2 +- website/docs/configuration/hadoop_queue_config.md | 2 +- website/docs/configuration/query_cache.md | 2 +- website/docs/configuration/spark_rpc_encryption.md | 5 +- .../data-access-control/acl_table.md | 75 +++++ .../access-control/data-access-control/intro.md | 41 +++ .../data-access-control/project_acl.md | 54 ++++ .../operations/access-control/group_management.md | 52 ++++ .../operations/access-control/images/acl_5_en.png | Bin 0 -> 337881 bytes .../access-control/images/table_acl_1_en.png | Bin 0 -> 70055 bytes .../access-control/images/table_acl_2_en.png | Bin 0 -> 33907 bytes .../access-control/images/table_acl_3_en.png | Bin 0 -> 81429 bytes website/docs/operations/access-control/intro.md | 24 ++ .../operations/access-control/user_management.md | 93 +++++++ .../operations/access-control/users/user_w1.png | Bin 0 -> 30698 bytes .../operations/access-control/users/user_w2.png | Bin 0 -> 11792 bytes .../operations/access-control/users/user_w3.png | Bin 0 -> 7887 bytes .../operations/access-control/users/user_w4.png | Bin 0 -> 9204 bytes .../operations/access-control/users/user_w5.png | Bin 0 -> 11144 bytes .../operations/access-control/users/user_w6.png | Bin 0 -> 16649 bytes .../operations/access-control/users/user_w7.png | Bin 0 -> 16242 bytes .../operations/access-control/users/user_w8.png | Bin 0 -> 27691 bytes website/docs/operations/logs/audit_log.md | 208 ++++++++++++++ .../operations/logs/images/after_update_model.png | Bin 0 -> 332033 bytes .../operations/logs/images/after_update_pwd.png | Bin 0 -> 284440 bytes .../operations/logs/images/before_update_model.png | Bin 0 -> 339841 bytes .../operations/logs/images/before_update_pwd.png | Bin 0 -> 292514 bytes .../docs/operations/logs/images/metadata_table.png | Bin 0 -> 565143 bytes 
website/docs/operations/logs/intro.md | 21 ++ website/docs/operations/logs/system_log.md | 249 +++++++++++++++++ website/docs/operations/overview.md | 22 ++ .../images/project_settings_acceleration.png | Bin 213058 -> 0 bytes .../images/project_settings_acceleration_tips.png | Bin 150813 -> 0 bytes .../images/project_settings_storage_low_usage.png | Bin 240252 -> 0 bytes .../images/recommendation_preferences_en.png | Bin 153741 -> 0 bytes .../images/service_state.en.png | Bin 58508 -> 0 bytes .../images/storage_quota.en.png | Bin 34171 -> 0 bytes .../alerting.md | 0 .../project-operation/images/add_project.png | Bin 0 -> 66925 bytes .../project-operation/images/project_list.png | Bin 0 -> 262909 bytes .../images/project_settings.png | Bin .../images/project_settings_basic_general.png | Bin .../images/project_settings_cc_expose.png | Bin .../images/project_settings_custom_config.png | Bin .../project_settings_default_database_en.png | Bin .../images/project_settings_job_notification.png | Bin .../project_settings_job_notification_page.png | Bin .../images/project_settings_model_rewrite.png | Bin .../images/project_settings_pushdown.png | Bin .../images/project_settings_segment_en.png | Bin .../images/project_settings_storage.png | Bin .../images/project_settings_yarn_queue.png | Bin .../images/toolbar.en.png | Bin .../intro.md | 0 .../project-operation/project_management.md | 62 +++++ .../project_settings.md | 0 .../{ => system-operation}/cli_tool/diagnosis.md | 0 .../cli_tool/environment_dependency_check.md | 48 ++++ .../system-operation/cli_tool/images/accept_en.png | Bin 0 -> 76977 bytes .../system-operation/cli_tool/images/button_en.png | Bin 0 -> 195535 bytes .../cli_tool/images/rollback_en.png | Bin 0 -> 80422 bytes .../{ => system-operation}/cli_tool/intro.md | 0 .../cli_tool/metadata_tool/intro.md | 18 ++ .../metadata_tool/metadata_backup_restore.md | 88 ++++++ .../system-operation/cli_tool/rollback.md | 123 +++++++++ .../diagnosis/build_flame_graph.md | 78 
++++++ .../system-operation/diagnosis/diagnosis.md | 86 ++++++ .../diagnosis/images/diagnosis_web_en.png | Bin 0 -> 179703 bytes .../diagnosis/images/job_diagnosis_web_en.png | Bin 0 -> 141217 bytes .../diagnosis/images/query_diagnosis_web.png | Bin 0 -> 127898 bytes .../operations/system-operation/diagnosis/intro.md | 17 ++ .../diagnosis/query_flame_graph.md | 58 ++++ .../docs/operations/system-operation/guardian.md | 62 +++++ ...den_modify_metadata_during_maintenance_mode.png | Bin 0 -> 189044 bytes ..._metadata_promoting_during_maintenance_mode.png | Bin 0 -> 188242 bytes .../system-operation/images/maintenance_mode.png | Bin 0 -> 246534 bytes .../system_recover_from_maintenance_mode.png | Bin 0 -> 178347 bytes website/docs/operations/system-operation/intro.md | 17 ++ .../operations/system-operation/junk_file_clean.md | 90 ++++++ .../operations/system-operation/limit_query.md | 72 +++++ .../system-operation/maintenance_mode.md | 76 +++++ .../system-operation/update-session-table.md | 46 ++++ website/sidebars.js | 306 +++++++++++++++++---- 83 files changed, 2044 insertions(+), 53 deletions(-) diff --git a/website/docs/configuration/configuration.md b/website/docs/configuration/configuration.md index 979e92303f..3046f38233 100644 --- a/website/docs/configuration/configuration.md +++ b/website/docs/configuration/configuration.md @@ -82,7 +82,7 @@ The file **kylin.properties** occupies some of the most important configurations | kylin.engine.spark-conf.spark.driver.host | Configure the IP of the node where the Kylin is located | | kylin.engine.sanity-check-enabled | Configure Kylin whether to open Sanity Check during indexes building. The default value is `true` | | kylin.job.finished-notifier-url | When the building job is completed, the job status information will be sent to the url via HTTP request | -| kylin.diag.obf.level | The desensitization level of the diagnostic package. `RAW` means no desensitization, `OBF` means desensitization. 
Configuring `OBF` will desensitize sensitive information such as usernames and passwords in the `kylin.properties` file (please refer to the [Diagnosis Kit Tool](../operations/cli_tool/diagnosis.md) chapter), The default value is `OBF`. | +| kylin.diag.obf.level | The desensitization level of the diagnostic package. `RAW` means no desensitization, `OBF` means desensitization. Configuring `OBF` will desensitize sensitive information such as usernames and passwords in the `kylin.properties` file (please refer to the [Diagnosis Kit Tool](../operations/system-operation/cli_tool/diagnosis.md) chapter), The default value is `OBF`. | | kylin.diag.task-timeout | The subtask timeout time for the diagnostic package, whose default value is 3 minutes | | kylin.diag.task-timeout-black-list | Diagnostic package subtask timeout blacklist (the values are separated by commas). The subtasks in the blacklist will be skipped by the timeout settings and will run until it finished. The default value is `METADATA`, `LOG` <br />The optional value is as below: <br />METADATA, AUDIT_LOG, CLIENT, JSTACK, CONF, HADOOP_CONF, BIN, HADOOP_ENV, CATALOG_INFO, SYSTEM_METRICS, MONITOR_METRICS, SPARK_LOGS, SPARDER_HISTORY, KG_LOGS, L [...] 
| kylin.query.queryhistory.max-size | The total number of records in the query history of all projects, the default is 10000000 | diff --git a/website/docs/configuration/hadoop_queue_config.md b/website/docs/configuration/hadoop_queue_config.md index b252ec2350..f85925250b 100644 --- a/website/docs/configuration/hadoop_queue_config.md +++ b/website/docs/configuration/hadoop_queue_config.md @@ -50,4 +50,4 @@ Similarly, you may set up YARN queue for other Kylin instances to achieve comput ###<span id="project">Project-level YARN Queue Setting</span> -The system admin user can set the YARN Application Queue of the project in **Setting -> Advanced Settings -> YARN Application Queue**, please refer to the [Project Settings](../operations/project-maintenance/project_settings.md) for more information. +The system admin user can set the YARN Application Queue of the project in **Setting -> Advanced Settings -> YARN Application Queue**, please refer to the [Project Settings](../operations/project-operation/project_settings.md) for more information. diff --git a/website/docs/configuration/query_cache.md b/website/docs/configuration/query_cache.md index 27e09396f1..f767537f57 100644 --- a/website/docs/configuration/query_cache.md +++ b/website/docs/configuration/query_cache.md @@ -19,7 +19,7 @@ By default, Kylin enables query cache in each process to improve query performan > **Note**: In order to ensure data consistency, query cache is not available > in pushdown. -###Use Default Cache +### Use Default Cache Kylin enables query cache by default at each node/process level. The configuration details are described below. You can change them in `$KYLIN_HOME/conf/kylin.properties` under Kylin installation directory. 
diff --git a/website/docs/configuration/spark_rpc_encryption.md b/website/docs/configuration/spark_rpc_encryption.md index 0e696b829e..e6e3b9bb17 100644 --- a/website/docs/configuration/spark_rpc_encryption.md +++ b/website/docs/configuration/spark_rpc_encryption.md @@ -21,8 +21,11 @@ For more details about Spark RPC communication encryption, please see [Spark Sec This function is disabled by default. If you need to enable it, please refer to the following method for configuration. ### Spark RPC Communication Encryption Configuration -1、1. Please refer to [Spark Security](http://spark.apache.org/docs/1.6.2/job-scheduling.html#dynamic-resource-allocation) to ensure that RPC communication encryption is enabled in the Spark cluster. + +1、Please refer to [Spark Security](http://spark.apache.org/docs/1.6.2/job-scheduling.html#dynamic-resource-allocation) to ensure that RPC communication encryption is enabled in the Spark cluster. + 2、Add the following configurations in `$KYLIN_HOME/conf/kylin.properties`, to To enable Kylin nodes and Spark cluster communication encryption + ``` ### spark rpc encryption for build jobs kylin.storage.columnar.spark-conf.spark.authenticate=true diff --git a/website/docs/operations/access-control/data-access-control/acl_table.md b/website/docs/operations/access-control/data-access-control/acl_table.md new file mode 100644 index 0000000000..5978d8aabc --- /dev/null +++ b/website/docs/operations/access-control/data-access-control/acl_table.md @@ -0,0 +1,75 @@ +--- +title: Table Access Control +language: en +sidebar_label: Table Access Control +pagination_label: Table Access Control +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - table access control +draft: true +last_update: + date: Aug 16, 2022 +--- + +### <span id="table">Table ACL</span> + +**Introduction** + +Table ACLs determines whether a user/user group can access a certain table loaded into Kyligence Enterprise. 
When a user/group is restricted to a table, the user/user group cannot query the table. + +- The system administrator can grant table-level data access to a user/user group. All users/user groups that have access rights to the current project can read or query all tables in the current project by default. +- When the access rights of the user/user group are deleted, or the user/user group is removed from the system, the corresponding table-level permissions are also deleted. +- When the table is deleted from the project, the Table ACLs for all users/user groups are also deleted. +- When the table is reloaded in the project, the Table ACLs for all users/user groups are retained. + +**Example** + +Now we will use specific examples to show the product behavior after setting table ACLs. + +If a user does not have access to table `SSB.SUPPLIER`, they cannot see the corresponding table under the project or in the connected BI tool. + + + + + +If you query the table, the result will show an error. + + + +### <span id="set">Grant Table ACL</span> + +1. In the **Admin** -> Project page, the system administrator can grant user/user group project-level access rights. For details, please refer to [Project ACL](project_acl.md). + +2. Select the project you want to authorize, then click **Authorization** under the right **Action** bar and go to the authorization page. + +3. Click the left arrow button and expand the user/user group that requires authorization, and the interface displays the tables / columns / rows that the user/user group has access to. + +  + +4. Click the **Edit** button and check the tables and columns that need to be granted access. There is no row-level access restriction by default, which means that users can access all the rows in the column. + +  + +5. Click the **+ Add** button on the right of the **Row Access List**, select the column in the pop-up window and enter the rows that can be accessed. 
**IN** and **LIKE** rules are supported for setting row-level permissions. Increase or decrease row-level access via the **+/-** button on the right, and click **Submit**. + +  + +6. For granted row-level access, click the **...** button on the right to modify the settings, or click **Delete** on the right. + +  + +7. Confirm that the current settings for Table ACLs are correct and click **Submit**. + + + +### <span id="notice">Notices</span> + +- The authorization operation is a whitelist display for the user/user group, that is, only the tables / rows / columns permissions added for the user/user group are accessible. +- By default, a user/user group will be automatically granted all access permissions on all tables in this project after being added to this project. You can modify the configuration item `kylin.acl.project-internal-default-permission-granted=true` in the configuration file. After setting it to `false`, when the user/user group is added to the project, there is no table granted by default. System administrators can manually select tables and set access permissions. +- The access granted to the user/user group takes the largest set of results that can be accessed (union). + - If user `user1` has access to table `table1`, user `user1` is in user group `group1`, and user group `group1` has access to table `table2`. At this time user `user1` can also access the tables `table1` and `table2`. + - If user group `group1` has access to tables `table1` and `table2`, user group `group2` has access to tables `table2` and `table3`, user `user1` belongs to user group `group1` and `group2`. At this time `user1` has access to the tables `table1`, `table2` and `table3`. +- Only system administrator users can perform **add/delete/change** operations on the Table / Column / Row ACLs. 
diff --git a/website/docs/operations/access-control/data-access-control/intro.md b/website/docs/operations/access-control/data-access-control/intro.md new file mode 100644 index 0000000000..b84e0958ab --- /dev/null +++ b/website/docs/operations/access-control/data-access-control/intro.md @@ -0,0 +1,41 @@ +--- +title: Data Access Control +language: en +sidebar_label: Data Access Control +pagination_label: Data Access Control +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - data access control +draft: true +last_update: + date: Aug 16, 2022 +--- + + +Kylin provides a rich set of access control features for large enterprises. Starting from Kylin 5, every user action must satisfy both **Operation Permission** and **Data Access Permission** before it can be performed. + +- Operation Permission: Defined at project level, specifies what operations a user can perform within a project. A user can have one of the four permissions, from weakest to most powerful: + - *QUERY*: Allows running queries in a project. + - *OPERATION*: Allows operating models, such as building, refreshing, and managing Segments. Implies the QUERY permission. + - *MANAGEMENT*: Allows managing models and cubes, such as creating and editing them. Implies the OPERATION permission. + - *ADMIN*: Project-level administrator permission; allows managing source tables and all other operations in a project. + + See [Project ACL](project_acl.md) for more details. + +- Data Access Permission: Defined on data, specifies which tables, columns, and rows a user can access. See [Table](acl_table.md) for more details. + +### Examples of Permission Check + +To perform an action, a user must have both operation permission and data access permission. Below are a few examples. + +- To manage source tables, a user needs the ADMIN permission, and only the tables the user can access can be seen and acted on. (Column and row ACLs do not impact the source table management.) 
+- To edit a model, user must have the MANAGEMENT permission and have access to all the tables and columns in the model. +- Running queries is mostly about data access control, since all users in a project have at least QUERY permission. First user must have access to all the tables and columns in the query, or the system will prompt permission error and refuse to execute. Second the system will only return rows that are accessible to a user. If different row ACLs are set for users, they may see different results from a same query. + +### Other Notes + +- The system administrator is not restricted by the data access controls by default, he/she has access to all data. +- The system does not provide operation permissions at model level yet. diff --git a/website/docs/operations/access-control/data-access-control/project_acl.md b/website/docs/operations/access-control/data-access-control/project_acl.md new file mode 100755 index 0000000000..2b38cac6e7 --- /dev/null +++ b/website/docs/operations/access-control/data-access-control/project_acl.md @@ -0,0 +1,54 @@ +--- +title: Project Access Control +language: en +sidebar_label: Project Access Control +pagination_label: Project Access Control +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - project access control +draft: true +last_update: + date: Aug 16, 2022 +--- + +Project ACLs determines whether a user/user group can access a certain project in Kylin. Kylin has four built-in project level permissions, *Admin*, *Management*, *Operation* and *Query*. *Admin* includes the other three permissions, *Management* includes *Operation* and *Query* permissions, *Operation* includes *Query* permissions. + +- *QUERY*: Permission to query tables/models in the project. If pushdown is enabled, user/group can query tables loaded to the project when there's no ready model to answer the query. 
+- *OPERATION*: Permission to build a model in the project, including rebuild a segment, resume or discard jobs. +- *MANAGEMENT*: Permission to edit/delete models in the project. +- *ADMIN*: Permission to manage data sources, models in the project. + +After the system administrator logs in to Kylin, click the **Admin** button in the global toolbar to enter the system management page, and click the **Project** field to enter the Project Management page. + +After the system administrator assigns project access permission to a group, users in the group will inherit the access permission on data source, models and segments accordingly. + + +### Grant Project ACL + +1. Select the project and click the **Authorization** icon under the **Actions** column on the right to enter the authorization page. +2. Expand a project on the project list. +3. Click **+ User / Group** to grant access for a user / user group. +4. Select the grant type : by **User** or by **User Group**. Then select the user / user group and access permission to be granted, and click **Submit**. + + + + +### Modify Project ACL + +1. Select the project and click the **Authorization** icon under the **Actions** column on the right to enter the authorization page. +2. Select the user / user group in the list and click the **Edit** icon under the **Actions** column on the right. +3. Modify user / user group's access permission and click **Submit**. + +### Revoke Project ACL + +1. Select the project and click the **Authorization** icon under the **Actions** column on the right to enter the authorization page. +2. Select the user / user group in the list and click the **Delete** icon under the **Actions** column on the right. + +> **Caution**: When user/user group's project access permission has been revoked, all access permission on this project including table-level, row-level and column-level will be revoked subsequently. 
+ +### Data authority control + +After setting the system level configuration parameter **kylin.security.allow-project-admin-grant-acl=false**, the project administrator in each project has no right to assign table / row / column access rights to users or user groups, but can still view the table / row / column access rights of users or user groups. diff --git a/website/docs/operations/access-control/group_management.md b/website/docs/operations/access-control/group_management.md new file mode 100644 index 0000000000..1045c03d45 --- /dev/null +++ b/website/docs/operations/access-control/group_management.md @@ -0,0 +1,52 @@ +--- +title: User Group Management +language: en +sidebar_label: User Group Management +pagination_label: User Group Management +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - user group management +draft: true +last_update: + date: Aug 16, 2022 +--- + +This chapter provides an overview of what a user group is and how a user group can be managed. + +### <span id="group">About User Group</span> + +A user group is a collection of users, and users in a user group share the same ACL. By default, Kylin initializes four user groups, namely ALL_USERS, ROLE_ADMIN, ROLE_ANALYST, and ROLE_MODELER, and ALL_USERS group is a default user group, all users are included in the ALL_USERS user group. ALL_USERS and ROLE_ADMIN user group cannot be modified or deleted. System administrators can add or remove users in user groups except ALL_USERS, or add a user to multiple groups except ALL_USERS. Use [...] + + +### <span id="authority">About User Group Permissions</span> + +The system administrator can grant the project-level access permissions to a user group. When a user group has been granted the project-level permissions, users in this group will inherit the corresponding permissions from the group. 
+ +When a user belongs to multiple groups, the user will inherit the project-level permissions from the groups he/she belongs to. + +### <span id="management">Manage user groups</span> + +After the system administrator logs in to Kylin, click the **Admin** button in the top toolbar to enter the system management page, and click the **Group** field to enter the User Group Management page. + +### <span id="add">Create a user group</span> + +On the User Group Management page, click the **+ User Group** button to create a new group. In the pop-up window, the system administrator can fill in the group name and click **OK** to save a new user group. + +### <span id="drop">Delete a user group</span> + +On the User Group Management page, select the user group to be deleted and click the **Drop** button under the **Actions** bar on the right. In the pop-up window, the system administrator can confirm the deletion of the user group. Once a user group is deleted, the users in this user group will not be deleted, but the permissions granted to this user group will be removed. + +### <span id="assign">Assign users to a user group</span> + +1. On the User Group Management page, select the user group to assign users to. +2. Click **Assign Users** under the **Actions** bar on the right. +3. In the pop-up window, check the users who need to be assigned to the group and click the right arrow **>**; the users will be moved to **Assigned Users**. +4. Click **OK** and the users will be assigned to this group. 
+ +### <span id="update_group">Modify user's user group</span> + +Please refer to [User Management](user_management.md) + diff --git a/website/docs/operations/access-control/images/acl_5_en.png b/website/docs/operations/access-control/images/acl_5_en.png new file mode 100644 index 0000000000..ae06dff693 Binary files /dev/null and b/website/docs/operations/access-control/images/acl_5_en.png differ diff --git a/website/docs/operations/access-control/images/table_acl_1_en.png b/website/docs/operations/access-control/images/table_acl_1_en.png new file mode 100644 index 0000000000..b2baa58ea3 Binary files /dev/null and b/website/docs/operations/access-control/images/table_acl_1_en.png differ diff --git a/website/docs/operations/access-control/images/table_acl_2_en.png b/website/docs/operations/access-control/images/table_acl_2_en.png new file mode 100644 index 0000000000..abd00a9022 Binary files /dev/null and b/website/docs/operations/access-control/images/table_acl_2_en.png differ diff --git a/website/docs/operations/access-control/images/table_acl_3_en.png b/website/docs/operations/access-control/images/table_acl_3_en.png new file mode 100644 index 0000000000..0e7e47e31d Binary files /dev/null and b/website/docs/operations/access-control/images/table_acl_3_en.png differ diff --git a/website/docs/operations/access-control/intro.md b/website/docs/operations/access-control/intro.md new file mode 100644 index 0000000000..fcc139d327 --- /dev/null +++ b/website/docs/operations/access-control/intro.md @@ -0,0 +1,24 @@ +--- +title: Access Control +language: en +sidebar_label: Access Control +pagination_label: Access Control +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - access control +draft: true +last_update: + date: Aug 16, 2022 +--- + + +This chapter will describe how to grant roles to users/groups in Kyligence to control their access rights, and how to perform fine-grained data access control, it will 
cover: + +* [User Management](user_management.md) +* [User Group Management](group_management.md) + +* [Data Access Control](data-access-control/intro.md) + * [Project Access Control](data-access-control/project_acl.md) diff --git a/website/docs/operations/access-control/user_management.md b/website/docs/operations/access-control/user_management.md new file mode 100644 index 0000000000..0b7a4fdfd6 --- /dev/null +++ b/website/docs/operations/access-control/user_management.md @@ -0,0 +1,93 @@ +--- +title: User Management +language: en +sidebar_label: User Management +pagination_label: User Management +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - user management +draft: true +last_update: + date: Aug 16, 2022 +--- + + +This chapter introduces what a user is and how a user can be managed. + +### <span id="user">About Users</span> + +To use Kylin, a user must log in to the system using a user name and corresponding password. Every user is unique in a Kylin instance, which is to say, it is not necessary to create the same user for every project in a single instance. +By default, Kylin initializes one user, namely `ADMIN`. The user `ADMIN` is a built-in system administrator, and the system administrator has all the permissions of the entire system. + + +### <span id="management">Manage Users</span> + +After the system administrator logs in to Kylin, click the **Admin** button in the top toolbar to enter the system management page, and click the **User** field to enter the User Management page. + +**Note**: + +1. Except for the system administrator, simply creating a user does not give the user access to any project. +2. Except for the system administrator, other users need to be given access at the project level. + + +### <span id="add">Add a User</span> + +On the User Management page, the system administrator can click the **+User** button to add new users. 
In the pop-up window, please fill in the user name, password, confirm new password, select whether the user role is a system administrator or a normal user, and click **OK**. + +> **tips:** username is case insensitive, so duplicate names with existing user names are not allowed. + +### <span id="edit">Edit a User Role</span> + +On the User Management page, select a user to be edited, click the **...** (More Actions) button under the **Actions** bar on the right. Then click **Edit Role**. + +In the pop-up window, the system administrator can modify user role to administrator or user. + +### <span id="drop">Delete a User</span> + +On the User Management page, select a user to be deleted, click the **...** (More Actions) button under the **Actions** bar on the right. Then click **Delete**. The system administrator can confirm to delete a user in the prompted window. User can not be restored after deleting, and user's access permission on all projects will be removed. + +### <span id="disable">Enable/Disable a User</span> + +On the User Management page, select a user, and click the **...** (More Actions) button under the **Actions** bar on the right. Then click **Enable / Disable**. The system administrator can enable or disable a user, and disabled users cannot login to the system. + +### <span id="adminpwd">Reset Password for ADMIN</span> + +On the User Management page, select a user, click **Reset Password** under the **Actions** bar on the right. + +In the pop-up window, the system administrator can change the password and need to enter the new password twice. + +The initial ADMIN account password needs to be modified after the first login. To reset the password, you can execute the following command. After successful execution, the ADMIN account will regenerate a random password and display it on the console. 
When you log in, you need to change the password: + +```sh +$KYLIN_HOME/bin/admin-tool.sh admin-password-reset +``` + +When the parameter `kylin.metadata.random-admin-password.enabled=false`, it will not regenerate a random password but use the fixed password `KYLIN`. If the parameter `kylin.metadata.random-admin-password.enabled` is set from `false` to `true`, it will regenerate a random password and display it on the console after all the Kylin nodes are restarted. + +**Caution**: When this command is run, Kylin will enter maintenance mode. If the command is interrupted by force, you may need to exit maintenance mode manually. Refer to [Maintenance Mode](../system-operation/maintenance_mode.md). + + +### <span id="pwd">Reset password for Non-admin</span> + +Click **<username\>**-->**Setup** on the top right corner of the navigation bar. In the pop-up window, the user needs to provide the old password and repeat the new password twice to reset the password. + + +### <span id="group">Assign a User to a Group</span> + +To assign a user to a group, please do the following: +1. On the User Management page, select a user to be grouped. +2. Click **Assign to Group** under the **Actions** bar on the right. +3. Select a group to assign the user to under **Candidates**, and then click the right arrow **>**. The group will enter into **Selected**. +4. Click **OK** and the user will be in the selected group. + + +### <span id="update_group">Modify User Group</span> + +To modify a user's group, please do the following steps: +1. On the User Management page, select a user to modify the group membership. +2. Click **Assign to Group** under the **Actions** bar on the right. +3. Select the group to be modified under **Selected**, and then click the left arrow **<**. The group will enter into **Candidates**. +4. Click **OK** and the user group membership will be modified. 
diff --git a/website/docs/operations/access-control/users/user_w1.png b/website/docs/operations/access-control/users/user_w1.png new file mode 100755 index 0000000000..82c4ef5d29 Binary files /dev/null and b/website/docs/operations/access-control/users/user_w1.png differ diff --git a/website/docs/operations/access-control/users/user_w2.png b/website/docs/operations/access-control/users/user_w2.png new file mode 100755 index 0000000000..d4fd0d31cd Binary files /dev/null and b/website/docs/operations/access-control/users/user_w2.png differ diff --git a/website/docs/operations/access-control/users/user_w3.png b/website/docs/operations/access-control/users/user_w3.png new file mode 100755 index 0000000000..442612fbdc Binary files /dev/null and b/website/docs/operations/access-control/users/user_w3.png differ diff --git a/website/docs/operations/access-control/users/user_w4.png b/website/docs/operations/access-control/users/user_w4.png new file mode 100755 index 0000000000..52c313cfc2 Binary files /dev/null and b/website/docs/operations/access-control/users/user_w4.png differ diff --git a/website/docs/operations/access-control/users/user_w5.png b/website/docs/operations/access-control/users/user_w5.png new file mode 100755 index 0000000000..c7722851e7 Binary files /dev/null and b/website/docs/operations/access-control/users/user_w5.png differ diff --git a/website/docs/operations/access-control/users/user_w6.png b/website/docs/operations/access-control/users/user_w6.png new file mode 100755 index 0000000000..a436ebbbc3 Binary files /dev/null and b/website/docs/operations/access-control/users/user_w6.png differ diff --git a/website/docs/operations/access-control/users/user_w7.png b/website/docs/operations/access-control/users/user_w7.png new file mode 100755 index 0000000000..e88c9701db Binary files /dev/null and b/website/docs/operations/access-control/users/user_w7.png differ diff --git a/website/docs/operations/access-control/users/user_w8.png 
b/website/docs/operations/access-control/users/user_w8.png new file mode 100755 index 0000000000..46b7233371 Binary files /dev/null and b/website/docs/operations/access-control/users/user_w8.png differ diff --git a/website/docs/operations/logs/audit_log.md b/website/docs/operations/logs/audit_log.md new file mode 100644 index 0000000000..3cd52a124f --- /dev/null +++ b/website/docs/operations/logs/audit_log.md @@ -0,0 +1,208 @@ +--- +title: Audit Log +language: en +sidebar_label: Audit Log +pagination_label: Audit Log +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - audit log +draft: false +last_update: + date: 08/16/2022 +--- + +### Basic Concept + +In the database, Audit Log is mainly used to monitor and record the operating behavior of data, which can be simply understood as a log. + +Kylin instances are stateless services. All state information is stored in metadata. All operations that change data will create or modify metadata. Changes to metadata are included in a database transaction. At the same time, every modification to metadata is recorded in the audit log, which is equivalent to a snapshot of each version of the metadata. The Audit Log mechanism can be used not only to monitor metadata changes, but also to help with disaster recovery. + + + +> Caution: The Audit Log can only be considered as a log of metadata, used to monitor and troubleshoot problems, and can be used to repair some metadata. If you want to ensure the stability of the system, you need to ensure that the metadata is correct and back up the metadata in a timely manner. + + + +When Kylin is started for the first time, it creates a table named `{identifier}_audit_log` in the metadata database, where `{identifier}` is the metadata table name you filled in for the configuration item `kylin.metadata.url` in the configuration file `kylin.properties`. The audit table has a suffix `_audit_log` compared to the metadata table. 
+ + + +e.g.: + +`kylin.metadata.url=ke_metadata@jdbc,driverClassName=org.postgresql.Driver,url=jdbc:postgresql://sandbox:5432/kylin,username=postgres,password=` + +The metadata database is named `kylin`, the metadata table is named `ke_metadata`, and the Audit Log table is named `ke_metadata_audit_log`. + + + +In Kylin, PostgreSQL is used as the metastore by default, and it is used as the example in the rest of this section. + + + +**Audit Log table field descriptions** + +| name | type(postgresql) | type(mysql) | description | +| :----------- | ------------------ | ------------ | :----------------------------------------------------------- | +| id | bigserial | bigint | Auto incremental id | +| meta_key | varchar(255) | varchar(255) | The key of metadata, corresponding to the META_TABLE_KEY field in the metadata table | +| meta_content | bytea | longblob | The content of the current metadata, when the operation is deletion, the value is NULL | +| meta_ts | bigint | bigint | Update timestamp, when the operation is deletion, the value is NULL | +| meta_mvcc | bigint | bigint | The version of the current metadata, when the operation is deletion, the value is NULL | +| unit_id | varchar(255) | varchar(255) | Transaction id | +| operator | varchar(255) | varchar(255) | The username of operator | +| instance | varchar | varchar | The instance of operator | + + + +### Version Information + +The `meta_table_key` field in the metadata table is associated with the `meta_key` in the audit log table. + +- Metadata version: In the metadata table, the `meta_table_mvcc` field records the latest version number of each item of metadata; + +- Audit log version: In the audit log table, the `meta_mvcc` field records the version number.
You can filter and view all historical versions of a certain metadata item according to the `meta_key` field; + + + +### Functions + +The following functions can be implemented through audit logs: + +- View the change history of all metadata or of a certain metadata item, and the corresponding operator +- View the change of metadata in a transaction +- View the audit log over a period of time +- Facilitates metadata disaster recovery + + + +### Sample + +**Tool** + +PostgreSQL client tool: DBeaver + + + +**Table description** + + + + + +As shown in the table, the meta_key field is a metadata item and meta_content is a metadata value. A meta_key value starting with `/_global` indicates global metadata, a value starting with `/project_name` indicates metadata for a project, and `/UUID` is a globally unique identifier, which serves as an identifier for a piece of metadata. + +e.g.: + +- `/_global/user/ADMIN` Represents metadata information of the ADMIN user, the specific information is in the meta_content field; +- `/_global/project/kylin.json` Represents metadata information for a project named kylin; +- `/${project_name}/model_desc/${model_id}` Represents the description information of a model in a project; + + + +**Basic Operations** + +1. View the history of an item of metadata + + ``` + select * from ke_metadata_audit_log where meta_key = '/_global/project/default.json'; + ``` + +2. View the history of a model + + ``` + select * from ke_metadata_audit_log where meta_key = '/project/model_desc/49529000-c161-4013-bb80-9a78f4f0248d.json' + ``` + +3. View the change of metadata in a transaction + + ``` + select * from ke_metadata_audit_log where unit_id = '6090bfb5-2401-4176-8475-fe6fd82bc439'; + ``` + +4. View audit logs for metadata over a period of time + + ``` + select * from ke_metadata_audit_log where meta_ts > 1325376000000 and meta_ts < 1328054400000 ; + ``` + +5.
Associate the metadata table to view the history changes of a user + + ``` + select a.meta_mvcc, a.meta_content, b.meta_table_mvcc, b.meta_table_content from ke_metadata_audit_log a left join ke_metadata b on a.meta_key = b.meta_table_key where a.meta_key = '/_global/user/ADMIN' + ``` + + + +**Examples of actual scenarios** + +- Monitor if user password is changed + + The ADMIN user is very important. No one other than the system administrator is allowed to modify its password. You can monitor the content of meta_content whose meta_key is `/_global/user/ADMIN`. One of the fields is password. If this value changes, the password has been modified. + + + + Comparing the records before and after the password change, the version number in the meta_mvcc field increases, the password value in the meta_content field changes, and the value of default_password also changes from true to false: + +  + +  + + + +- Monitor if the model has been modified + + Assume that the project name is kylin, and the table join relationship of the model named test_model is not allowed to be modified. You can view the JSON format of a model on the Kylin model page, where uuid represents the model id; the corresponding meta_key is then `/kylin/model_desc/${model_id}`. Monitor whether the corresponding meta_mvcc field value increases; if it does, the model has been modified. + + + + The second record shows a change to the format of the time partition column of the model. The unit_id field values of the two records are different, indicating that the two changes were made in different transactions, and the operator field shows that the user who made the changes was ADMIN: + +  + +  + + + +### Configuration instructions + + +In the Kylin configuration file `kylin.properties`, there are the following configuration items about the audit log, which can be modified as needed.
**Please make sure the disk space of the audit log node is always sufficient.** + + +- `kylin.metadata.audit-log.max-size=500000` The audit log stores the latest 500,000 rows by default. By default, the redundant operation log is cleared every morning. You can modify this configuration item to adjust it. + + + +### Export Audit Log + +The Audit Log is stored in the database. You can use the tools provided by Kylin to export the data within the specified time range to a local file for backup, or export it as an attachment to the Kylin ticket when encountering problems, which makes it easier for technical support personnel to locate the problem. + +There are two ways to execute commands on the Kylin node: + +1. Use the diagnostic package command: `${KYLIN_HOME}/bin/diag.sh` + + - The Audit Log of the last 3 days will be obtained by default and stored in the `audit_log/${starttime}_${endtime}.jsonl` file in the diagnostic package directory; + + + +2. Using the AuditLogTool tool: `${KYLIN_HOME}/bin/kylin.sh io.kyligence.kap.tool.AuditLogTool -startTime ${starttime} -endTime ${endtime} -dir ${target_dir}` + + - `${starttime}` and `${endtime}` specify the time range of the Audit Log to retrieve. The format is a timestamp in milliseconds, e.g. `1579868382749`; + - `${target_dir}` specifies the directory where your Audit Log files are stored. The generated Audit Log is stored in the `${target_dir}/${starttime}_${endtime}` file; + + + +### Import Audit Log + +If you have the Audit Log file exported locally and want to view and analyze it through the database, you can use the following method to import it. + +On a machine that already has a Kylin environment, use the AuditLogTool tool: `${KYLIN_HOME}/bin/kylin.sh io.kyligence.kap.tool.AuditLogTool -restore -table ${target_table_name} -dir ${auditlog_dir}` + +- The `${target_table_name}` parameter specifies the name of the Audit Log table to be generated. Make sure it does not duplicate the name of an Audit Log table that already exists in the Kylin environment.
+- The `${auditlog_dir}` parameter specifies the directory where the Audit Log file is located; + +After the execution of the command is completed, the Audit Log table is generated under the metastore specified by the configuration item `kylin.metadata.url` in the `${KYLIN_HOME}/conf/kylin.properties` file. + diff --git a/website/docs/operations/logs/images/after_update_model.png b/website/docs/operations/logs/images/after_update_model.png new file mode 100644 index 0000000000..c0da4545a8 Binary files /dev/null and b/website/docs/operations/logs/images/after_update_model.png differ diff --git a/website/docs/operations/logs/images/after_update_pwd.png b/website/docs/operations/logs/images/after_update_pwd.png new file mode 100644 index 0000000000..4bcfc0c801 Binary files /dev/null and b/website/docs/operations/logs/images/after_update_pwd.png differ diff --git a/website/docs/operations/logs/images/before_update_model.png b/website/docs/operations/logs/images/before_update_model.png new file mode 100644 index 0000000000..e60309d83a Binary files /dev/null and b/website/docs/operations/logs/images/before_update_model.png differ diff --git a/website/docs/operations/logs/images/before_update_pwd.png b/website/docs/operations/logs/images/before_update_pwd.png new file mode 100644 index 0000000000..88fea477c6 Binary files /dev/null and b/website/docs/operations/logs/images/before_update_pwd.png differ diff --git a/website/docs/operations/logs/images/metadata_table.png b/website/docs/operations/logs/images/metadata_table.png new file mode 100644 index 0000000000..85baef4deb Binary files /dev/null and b/website/docs/operations/logs/images/metadata_table.png differ diff --git a/website/docs/operations/logs/intro.md b/website/docs/operations/logs/intro.md new file mode 100644 index 0000000000..9c2f2ebec6 --- /dev/null +++ b/website/docs/operations/logs/intro.md @@ -0,0 +1,21 @@ +--- +title: Logs +language: en +sidebar_label: Logs +pagination_label: Logs 
+toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - logs +draft: false +last_update: + date: 08/16/2022 +--- + +This chapter mainly introduces different log types: + +* [System Log](system_log.md) +* [Audit Log](audit_log.md) + diff --git a/website/docs/operations/logs/system_log.md b/website/docs/operations/logs/system_log.md new file mode 100755 index 0000000000..d80321a73d --- /dev/null +++ b/website/docs/operations/logs/system_log.md @@ -0,0 +1,249 @@ +--- +title: System Log +language: en +sidebar_label: System Log +pagination_label: System Log +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - system log +draft: false +last_update: + date: 08/16/2022 +--- + + +## System Log + +After being successfully started, Kylin will create a directory named `logs/` by default. All logs generated during Kylin runtime will be stored in this directory. + +### Log files +The log files generated by Kylin are as follows. + +#### `kylin.log` +This file is Kylin's main log file, whose default logging level is DEBUG. + +#### `kylin.out` +The standard output of Kylin process will be redirected to this file, including the output of Tomcat and Hive. + +#### `kylin.gc` +This file is the GC (Garbage Collection) log of the Kylin Java process. The pid is appended as a filename suffix to avoid the file being overwritten. + +#### `access.log` +This file stores the Tomcat access log. It records all HTTP request and response information, for example, User-Agent, access URL, etc. + +#### `jstack.timed.log` + +This file records Java stack traces of the Java threads of Kylin, which are used to record the running status of threads. To avoid excessive storage usage, only 20 log files are kept. When the maximum number is exceeded, the new file replaces the oldest one.
+ +> Note: Because the execution of jstack depends on the .java_pid file written by the JVM in the /tmp directory, if the file is deleted (for example, by a scheduled clean-up script), jstack will not run properly and jstack.timed.log will not be generated. + +#### `check-env.out` +The standard output of executing `check-env.sh` script will be redirected to this file. + +#### `check-env.error` +The error message of executing `check-env.sh` script will be redirected to this file. + +#### `shell.stderr` +The result of running command lines will be stored in this file. + +#### `shell.stdout` +The standard output of running command lines will be redirected to this file. + +#### `kylin.security.log` + +The log of system start, stop, upgrade, login and logout will be redirected to this file. + +> Notes: When using LDAP service to implement user authentication, two logs will be recorded for each login failure, because when a login through the LDAP service fails, another method will be used for authentication. + +#### `kylin.schedule.log` +This file records logs related to task scheduling, whose default logging level is DEBUG. + +#### `kylin.query.log` +This file records query related logs, whose default logging level is DEBUG. + +#### `kylin.smart.log` +This file records recommendation-related logs, whose default logging level is DEBUG. + +#### `dump.hprof` +When Out of Memory (OOM) occurs in Kylin, it will dump the entire heap, which is convenient for checking the cause. +> Note: When you have a large memory setting and an Out of Memory (OOM) error occurs, the file dump.hprof will occupy a large amount of storage space, which may cause your disk space to be insufficient and the node to be abnormal. You can manually clean up the historical files.
+ +### Logging Analysis +Take query as an example, submitting a query on Web UI, and we'll see the following information in `kylin.query.log` + + +``` +==========================[QUERY]=============================== +Query Id: 8586e718-67b4-c840-61b4-a8898415a154 +SQL: select lo_revenue as from p_lineorder; +User: ADMIN +Success: true +Duration: 1.243 +Project: ssb100_10 +Realization Names: [AUTO_MODEL_P_LINEORDER_1] +Index Layout Ids: [30001] +Snapshot Names: [] +Is Partial Match Model: [false] +Scan rows: [35000] +Total Scan rows: 35000 +Scan bytes: [246530] +Total Scan Bytes: 246530 +Result Row Count: 280 +Shuffle partitions: 1 +Hit Exception Cache: false +Storage Cache Used: false +Storage Cache Type: null +Is Query Push-Down: false +Is Prepare: false +Is Timeout: false +Time Line Schema: massage,end calcite parse sql,end_convert_to_relnode,end_calcite_optimize,end_plan,collect_olap_context_info,end select realization,end_rewrite,to_spark_plan,seg_pruning,fetch_file_status,shard_pruning,executed_plan,collect_result +Time Line: 6,1,4,11,0,0,1,1,14,6,0,0,1,1198 +Message: null +Is forced to Push-Down: false +User Agent: null +Scan Segment Count: 1 +Scan File Count: 1 +==========================[QUERY]=============================== +``` +The main fields in the above clip are described as follows: + +* `Query Id`: Query id +* `SQL`: Query statement +* `User`: The user name to execute the query +* `Success`: Status flag of query result, true execution succeeded, false execution failed +* `Duration`: Query time (unit: seconds) +* `Project`: The name of the project used in the query +* `Realization Names`: The name of the model hit by the query +* `Index Layout Ids`: ID of the layout hit by the query +* `Snapshot Names`: Query the hit snapshot +* `Is Partial Match Model`: Partial match model, such as a left B, you can check table a alone +* `Scan rows`: Query the number of data rows scanned +* `Total Scan rows`: Total rows of data scanned by query +* `Scan bytes`: 
Number of data bytes scanned by the query +* `Total Scan Bytes`: Total number of data bytes scanned by the query +* `Result Row Count`: The number of data rows returned by the query +* `Shuffle partitions`: A Spark query parameter that affects how many partitions / tasks are generated after a shuffle. It is either set by `kylin.query.engine.spark-sql-shuffle-partitions` or calculated dynamically. The calculation formula is min(the estimated value of data size, the total number of cores of the Spark cluster) +* `Hit Exception Cache`: Whether the cache of failed queries is hit +* `Storage Cache Used`: Whether the cache of successful queries is hit +* `Storage Cache Type`: The cache type of the hit query +* `Is Query Push-Down`: Whether it is a push-down query +* `Is Prepare`: Whether it is a probe query (this item will be true for the query sent by BI) +* `Is Timeout`: Whether the query timed out +* `Time Line Schema`: Steps in query module +* `Time Line`: Time spent in each step of the query module (ms) +* `Message`: The prompt information shown on the query page. If the query is successful, this item is null +* `Is forced to Push-Down`: Whether the query is forced to be pushed down +* `User Agent`: The environment information used to submit the query +* `Scan Segment Count`: Number of scanned segments +* `Scan File Count`: Number of scanned files + +### Logging Configuration + +Kylin uses log4j2 to configure logs. Users can edit the `kylin-server-log4j.xml` file in the `$KYLIN_HOME/server/conf/` directory to modify the log level, path, etc. +After modification, you need to restart Kylin for the configuration to take effect. + +The configuration of all logs starting with kylin and ending with log is in `kylin-server-log4j.xml`. The configuration code is as follows.
+ +```xml + <Routing name="routing"> + <Routes pattern="$${ctx:logCategory}"> + <Route> + <RollingFile name="rolling-${ctx:logCategory}" + fileName="${env:KYLIN_HOME}/logs/kylin.${ctx:logCategory}.log" + filePattern="${env:KYLIN_HOME}/logs/kylin.${ctx:logCategory}.log.%i"> + <Policies> + <SizeBasedTriggeringPolicy size="268435456"/> + </Policies> + <DefaultRolloverStrategy max="10"/> + <PatternLayout pattern="%d{ISO8601} %-5p %X{request.project}[%t] %c{2} : %mask{%m}%n"/> + </RollingFile> + </Route> + + <Route ref="server" key="$${ctx:logCategory}"/> + </Routes> + </Routing> + +``` + +In the default configuration, log rolling is triggered when the log file reaches 256MB, keeping the last 10 log files. + +If you need to configure one of the log files (such as kylin.query.log) separately, you need to add a new Route under the Routes configuration in the above configuration code, and configure the key as the corresponding log file name (query, schedule). +It should be noted that the new route needs to be configured before the existing route, otherwise it will not take effect. + +The following is an example, modify the rolling strategy of kylin.query.log to trigger at 0:00 every day, back up the last 5 logs. 
+ +```xml +<Route key="query"> + <RollingFile name="rolling-${ctx:logCategory}" fileName="${env:KYLIN_HOME}/logs/kylin.${ctx:logCategory}.log" filePattern="${env:KYLIN_HOME}/logs/kylin.${ctx:logCategory}.log.%i"> + <Policies> + <CronTriggeringPolicy schedule="0 0 0 * * ?"/> + </Policies> + <DefaultRolloverStrategy max="5" /> + <PatternLayout pattern="%d{ISO8601} %-5p %X{request.project}[%t] %c{2} : %mask{%m}%n" /> + </RollingFile> +</Route> +``` + +If you need to configure kylin.log, you can modify the RollingRandomAccessFile configuration, for example, change the number of reserved files to 5 + +```xml +<RollingRandomAccessFile name="server" fileName="${env:KYLIN_HOME}/logs/kylin.log" append="true" + filePattern="${env:KYLIN_HOME}/logs/kylin.log.%i" immediateFlush="false" > + <Policies> + <SizeBasedTriggeringPolicy size="268435456"/> + </Policies> + <DefaultRolloverStrategy max="5"/> + <PatternLayout pattern="%d{ISO8601} %-5p %X{request.project}[%t] %c{2} : %mask{%m}%n"/> +</RollingRandomAccessFile> +``` + +### Error Code In Log + +The format of log error code is KE-AABBBCCC, AA refers to the error reporting module, BBB refers to the more detailed business error reporting and CCC refers to the error number. 
+ +| AA | Description | +| ----------------------------------------------------------- | ------------------------------------------------------------ | +| 00 | common | +| 10 | server | +| 20 | query | +| 30 | build | +| 40 | system | +| 50 | tool | + + + +| BBB | Description | +| ----------------------------------------------------------- | ------------------------------------------------------------ | +| 000 | general | +| 001 | project | +| 002 | model | +| 003 | user | +| 004 | user group | +| 005 | password | +| 006 | column | +| 007 | table | +| 008 | database | +| 009 | measure | +| 010 | dimension | +| 011 | cc | +| 012 | index | +| 013 | job | +| 014 | sql expression | +| 015 | license | +| 016 | email | +| 017 | file | +| 018 | kerberos | +| 019 | catalog | +| 020 | recommendation | +| 021 | server | +| 022 | segment | +| 023 | diag | +| 024 | auth | +| 025 | shell | +| 026 | metadata | +| 027 | frequency query | +| 028 | json | + diff --git a/website/docs/operations/overview.md b/website/docs/operations/overview.md new file mode 100755 index 0000000000..d3c9b22883 --- /dev/null +++ b/website/docs/operations/overview.md @@ -0,0 +1,22 @@ +--- +title: Overview +language: en +sidebar_label: Overview +pagination_label: Overview +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - operation overview +draft: false +last_update: + date: 08/16/2022 +--- + +As Kylin system administrator, typical daily operations include: + +- To ensure Kylin service running smoothly, system administrator should monitor system logs on a regular basis. +- To ensure building jobs run successfully, system administrator needs to monitor jobs execution status via email notification or system web UI. +- To ensure there are enough cluster resources for Kylin, system administrator should check the YARN queue and storage utilization frequently. 
+- To prevent any data loss or system failure, system administrator should make plans for system backup and disaster recovery. diff --git a/website/docs/operations/project-maintenance/images/project_settings_acceleration.png b/website/docs/operations/project-maintenance/images/project_settings_acceleration.png deleted file mode 100644 index e88b5e7167..0000000000 Binary files a/website/docs/operations/project-maintenance/images/project_settings_acceleration.png and /dev/null differ diff --git a/website/docs/operations/project-maintenance/images/project_settings_acceleration_tips.png b/website/docs/operations/project-maintenance/images/project_settings_acceleration_tips.png deleted file mode 100644 index e5d611f514..0000000000 Binary files a/website/docs/operations/project-maintenance/images/project_settings_acceleration_tips.png and /dev/null differ diff --git a/website/docs/operations/project-maintenance/images/project_settings_storage_low_usage.png b/website/docs/operations/project-maintenance/images/project_settings_storage_low_usage.png deleted file mode 100644 index e11df66530..0000000000 Binary files a/website/docs/operations/project-maintenance/images/project_settings_storage_low_usage.png and /dev/null differ diff --git a/website/docs/operations/project-maintenance/images/recommendation_preferences_en.png b/website/docs/operations/project-maintenance/images/recommendation_preferences_en.png deleted file mode 100644 index 6538741e6c..0000000000 Binary files a/website/docs/operations/project-maintenance/images/recommendation_preferences_en.png and /dev/null differ diff --git a/website/docs/operations/project-maintenance/images/service_state.en.png b/website/docs/operations/project-maintenance/images/service_state.en.png deleted file mode 100644 index 69dd4e44b1..0000000000 Binary files a/website/docs/operations/project-maintenance/images/service_state.en.png and /dev/null differ diff --git 
a/website/docs/operations/project-maintenance/images/storage_quota.en.png b/website/docs/operations/project-maintenance/images/storage_quota.en.png deleted file mode 100644 index 40953f6ebc..0000000000 Binary files a/website/docs/operations/project-maintenance/images/storage_quota.en.png and /dev/null differ diff --git a/website/docs/operations/project-maintenance/alerting.md b/website/docs/operations/project-operation/alerting.md similarity index 100% rename from website/docs/operations/project-maintenance/alerting.md rename to website/docs/operations/project-operation/alerting.md diff --git a/website/docs/operations/project-operation/images/add_project.png b/website/docs/operations/project-operation/images/add_project.png new file mode 100644 index 0000000000..474e307aac Binary files /dev/null and b/website/docs/operations/project-operation/images/add_project.png differ diff --git a/website/docs/operations/project-operation/images/project_list.png b/website/docs/operations/project-operation/images/project_list.png new file mode 100644 index 0000000000..f9763fe6b1 Binary files /dev/null and b/website/docs/operations/project-operation/images/project_list.png differ diff --git a/website/docs/operations/project-maintenance/images/project_settings.png b/website/docs/operations/project-operation/images/project_settings.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings.png rename to website/docs/operations/project-operation/images/project_settings.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_basic_general.png b/website/docs/operations/project-operation/images/project_settings_basic_general.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_basic_general.png rename to website/docs/operations/project-operation/images/project_settings_basic_general.png diff --git 
a/website/docs/operations/project-maintenance/images/project_settings_cc_expose.png b/website/docs/operations/project-operation/images/project_settings_cc_expose.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_cc_expose.png rename to website/docs/operations/project-operation/images/project_settings_cc_expose.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_custom_config.png b/website/docs/operations/project-operation/images/project_settings_custom_config.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_custom_config.png rename to website/docs/operations/project-operation/images/project_settings_custom_config.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_default_database_en.png b/website/docs/operations/project-operation/images/project_settings_default_database_en.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_default_database_en.png rename to website/docs/operations/project-operation/images/project_settings_default_database_en.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_job_notification.png b/website/docs/operations/project-operation/images/project_settings_job_notification.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_job_notification.png rename to website/docs/operations/project-operation/images/project_settings_job_notification.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_job_notification_page.png b/website/docs/operations/project-operation/images/project_settings_job_notification_page.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_job_notification_page.png rename to 
website/docs/operations/project-operation/images/project_settings_job_notification_page.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_model_rewrite.png b/website/docs/operations/project-operation/images/project_settings_model_rewrite.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_model_rewrite.png rename to website/docs/operations/project-operation/images/project_settings_model_rewrite.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_pushdown.png b/website/docs/operations/project-operation/images/project_settings_pushdown.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_pushdown.png rename to website/docs/operations/project-operation/images/project_settings_pushdown.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_segment_en.png b/website/docs/operations/project-operation/images/project_settings_segment_en.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_segment_en.png rename to website/docs/operations/project-operation/images/project_settings_segment_en.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_storage.png b/website/docs/operations/project-operation/images/project_settings_storage.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_storage.png rename to website/docs/operations/project-operation/images/project_settings_storage.png diff --git a/website/docs/operations/project-maintenance/images/project_settings_yarn_queue.png b/website/docs/operations/project-operation/images/project_settings_yarn_queue.png similarity index 100% rename from website/docs/operations/project-maintenance/images/project_settings_yarn_queue.png rename to 
website/docs/operations/project-operation/images/project_settings_yarn_queue.png diff --git a/website/docs/operations/project-maintenance/images/toolbar.en.png b/website/docs/operations/project-operation/images/toolbar.en.png similarity index 100% rename from website/docs/operations/project-maintenance/images/toolbar.en.png rename to website/docs/operations/project-operation/images/toolbar.en.png diff --git a/website/docs/operations/project-maintenance/intro.md b/website/docs/operations/project-operation/intro.md similarity index 100% rename from website/docs/operations/project-maintenance/intro.md rename to website/docs/operations/project-operation/intro.md diff --git a/website/docs/operations/project-operation/project_management.md b/website/docs/operations/project-operation/project_management.md new file mode 100644 index 0000000000..ba84002070 --- /dev/null +++ b/website/docs/operations/project-operation/project_management.md @@ -0,0 +1,62 @@ +--- +title: Project Management +language: en +sidebar_label: Project Management +pagination_label: Project Management +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - project management +draft: true +last_update: + date: Aug 16, 2022 +--- + +This chapter introduces what a project is and how to manage projects. + +### <span id="about">About Projects</span> + +A project is the primary management unit of Kylin. In a project, you can design multiple models and perform query analysis. + +System settings are isolated at the project level, so you can set different operational preferences for different projects. + +### <span id="management">Manage Projects</span> + +After the system administrator logs in to Kylin, click the **Admin** button in the top toolbar to enter the system management page, and click the **Project** field to enter the Project Management page. + +**Note**: if no project exists, you will not be allowed to access the **Admin** page.
Please add a project firstly and then enter the **Admin** page. + +On the project management page, the system administrator can view the project information in the project list, and can also add projects, delete projects, backup projects, or grant user access permissions. + + + +### <span id="add">Add a Project</span> + +The system administrator has two ways to add a new project: + +- On the **Admin -> Project Management** page, click the **+ Project** button above the project list. +- On the product normal page, click the **+**(Add Project) button at the top toolbar. + +Fill in the project name and description in the pop-up window. The project name is mandatory; the project description is optional. A good project description will help with the maintenance of the project in the future. + + + +> **tips:** project name is case insensitive, so duplicate names with existing project names are not allowed. + +### <span id="delete">Delete a Project</span> + +On the Project Management page, select a project to be deleted, click the **...**(More Actions) button under the **Actions** bar on the right, then click **Delete**. + +The system administrator can confirm to delete a project in the prompted window. After the project is deleted, it will not be restored and the related data will be cleared. + +If there are jobs in **RUNNING, PENDING, or PAUSED** status in the project, it is needed to terminate the jobs before deleting the project. + +### <span id="change_owner">Change Project Owner</span> + +On the Project Management page, select a project to change owner, click the **...**(More Actions) button under the **Actions** bar on the right, then click **Change Owner**. Only the system administrator has permission to change the project owner. 
+ +### <span id="grant">Grant Project-level Access Rights to User</span> + +On the Project Management page, select a project to authorize, click the **Authorization** button under the **Actions** bar on the right and set permissions for the project. You can find out more at the [Project ACL](../access-control/project_acl.md) section. diff --git a/website/docs/operations/project-maintenance/project_settings.md b/website/docs/operations/project-operation/project_settings.md similarity index 100% rename from website/docs/operations/project-maintenance/project_settings.md rename to website/docs/operations/project-operation/project_settings.md diff --git a/website/docs/operations/cli_tool/diagnosis.md b/website/docs/operations/system-operation/cli_tool/diagnosis.md similarity index 100% rename from website/docs/operations/cli_tool/diagnosis.md rename to website/docs/operations/system-operation/cli_tool/diagnosis.md diff --git a/website/docs/operations/system-operation/cli_tool/environment_dependency_check.md b/website/docs/operations/system-operation/cli_tool/environment_dependency_check.md new file mode 100644 index 0000000000..c6bf7a05a9 --- /dev/null +++ b/website/docs/operations/system-operation/cli_tool/environment_dependency_check.md @@ -0,0 +1,48 @@ +--- +title: Environment Dependency Check +language: en +sidebar_label: Environment Dependency Check +pagination_label: Environment Dependency Check +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - environment dependency check +draft: false +last_update: + date: 08/16/2022 +--- + +Before you start Kylin, we provide an environment dependency checking tool to help you spot the potential problems in advance. This tool will be automatically executed by the startup script when you run Kylin for the first time. + +### How To Use + +As mentioned above, if you start Kylin for the first time, the startup script will automatically run this tool. 
If the check fails, this tool will be executed again when you start this product. Once the check has passed successfully, the tool will not be executed automatically. + +If you need to check the environment dependency manually, just run the below command: + +```sh +$KYLIN_HOME/bin/check-env.sh +``` + +### What To Check + +The following table describes what will be checked in the tool. + +| Check Item | Description | +| ---------------------- | ------------------------------------------------------------ | +| Kerberos | Check whether the user has enabled Kerberos in the settings. If not, the check will be skipped. Otherwise, it will execute the following operations: <br /> 1. check if Kerberos command exists <br /> 2. initialize Kerberos| +| OS version and command | Kylin only supports Linux operating systems. Besides operating system, this tool will also check if `hadoop` and `yarn` commands exist. If these two commands are not available, please make sure the Hadoop cluster is available. | +| Hadoop configuration files | Kylin copies Hadoop configuration files to Kylin installation directory `$KYLIN_HOME/hadoop_conf`. For instance, `core-site.xml`, `hdfs-site.xml`, `yarn-site.xml`, `hive-site.xml`, etc. This tool will check if `$KYLIN_HOME/hadoop_conf` exists and contains necessary configuration files. | +| HDFS working directory | 1. Check if HDFS working directory exists <br /> 2. If yes, check whether current user has write privilege | +| Java version | Currently, we only support Java versions above 1.8 | +| Server port | Check if the port is in use | +| Spark | 1. Check if the configured resource size exceeds the cluster's actual resource size, such as, executor cores and executor instances. <br /> 2. Check if Spark is available <br /> 3. Check if the configured yarn queues for submitting query jobs and build jobs are legal <br /> 4. 
Check if the configured driver host address is legal| +| Spark log directory | Users can configure a HDFS directory to store Spark logs, so it checks if the directory exists and current user has read and write privileges. | +| Metastore | Check if the metastore is accessible and current user can perform necessary operations on metadata. | +| InfluxDB | 1. Check if InfluxDB is accessible <br /> 2. Check if current user has read and write privileges | +| ZooKeeper | Check if the service discovery is available. | +| Clickhouse| Checking Clickhouse cluster health. | +| KylinConfig | Checking kylin config, must starts with kylin / spring / server. | +| Query history | Check whether the current user has permissions of reading and writing on the `query_history` and `query_history_realization` tables in the RDBMS database| diff --git a/website/docs/operations/system-operation/cli_tool/images/accept_en.png b/website/docs/operations/system-operation/cli_tool/images/accept_en.png new file mode 100644 index 0000000000..f79cbb094e Binary files /dev/null and b/website/docs/operations/system-operation/cli_tool/images/accept_en.png differ diff --git a/website/docs/operations/system-operation/cli_tool/images/button_en.png b/website/docs/operations/system-operation/cli_tool/images/button_en.png new file mode 100644 index 0000000000..cfdcb1966d Binary files /dev/null and b/website/docs/operations/system-operation/cli_tool/images/button_en.png differ diff --git a/website/docs/operations/system-operation/cli_tool/images/rollback_en.png b/website/docs/operations/system-operation/cli_tool/images/rollback_en.png new file mode 100644 index 0000000000..c3f5742818 Binary files /dev/null and b/website/docs/operations/system-operation/cli_tool/images/rollback_en.png differ diff --git a/website/docs/operations/cli_tool/intro.md b/website/docs/operations/system-operation/cli_tool/intro.md similarity index 100% rename from website/docs/operations/cli_tool/intro.md rename to 
website/docs/operations/system-operation/cli_tool/intro.md diff --git a/website/docs/operations/system-operation/cli_tool/metadata_tool/intro.md b/website/docs/operations/system-operation/cli_tool/metadata_tool/intro.md new file mode 100644 index 0000000000..683caa8623 --- /dev/null +++ b/website/docs/operations/system-operation/cli_tool/metadata_tool/intro.md @@ -0,0 +1,18 @@ +--- +title: Metadata Tool +language: en +sidebar_label: Metadata Tool +pagination_label: Metadata Tool +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - metadata tool +draft: true +last_update: + date: 08/16/2022 +--- + +This chapter introduces the metadata tool. + diff --git a/website/docs/operations/system-operation/cli_tool/metadata_tool/metadata_backup_restore.md b/website/docs/operations/system-operation/cli_tool/metadata_tool/metadata_backup_restore.md new file mode 100644 index 0000000000..2669da34e5 --- /dev/null +++ b/website/docs/operations/system-operation/cli_tool/metadata_tool/metadata_backup_restore.md @@ -0,0 +1,88 @@ +--- +title: System Metadata Backup and Restore +language: en +sidebar_label: System Metadata Backup and Restore +pagination_label: System Metadata Backup and Restore +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - system metadata backup and restore +draft: true +last_update: + date: 08/16/2022 +--- + +Kylin instances are stateless services, and all state information is stored in metadata. Therefore, backing up and restoring metadata is a crucial part of operation and maintenance. + +Metadata is divided into system level and project level. + +### Metadata Backup {#metadata_backup} + +In general, it is a good practice to back up metadata before each failure recovery or system upgrade. This can guarantee the possibility of rollback after the operation fails, and still maintain the stability of the system in the worst case. 
+ +In addition, metadata backup is also a tool for fault finding. When the system fails, the frontend frequently reports errors. By downloading and viewing metadata, it is often helpful to determine whether there is a problem with the metadata or not. + +Metadata can be backed up via the command line, as follows: + +- Metadata backup via **command line** + + Kylin provides a command line tool for backing up metadata, using the following methods: + + - Backup **system level** metadata + + ```sh + $KYLIN_HOME/bin/metastore.sh backup METADATA_BACKUP_PATH + ``` + Parameter Description: + + - `METADATA_BACKUP_PATH` - optional, represents the metadata storage path of the backup, the default value is `${KYLIN_HOME}/meta_backups/` + + - Backup **project level** metadata + + ```sh + $KYLIN_HOME/bin/metastore.sh backup-project PROJECT_NAME METADATA_BACKUP_PATH + ``` + + Parameter Description: + + - `PROJECT_NAME` - required, the name of the project to be backed up, such as learn_kylin + - `METADATA_BACKUP_PATH` - optional, represents the metadata storage path of the backup, the default value is `${KYLIN_HOME}/meta_backups/` + + +### Metadata Restore {#metadata_restore} + +Metadata restore in Kylin must be performed with the **command line**. + +- Restore **system level** metadata + + ```sh + $KYLIN_HOME/bin/metastore.sh restore METADATA_BACKUP_PATH [--after-truncate] + ``` + Example: + ```sh + ./bin/metastore.sh restore meta_backups/2019-12-19-14-18-01_backup/ + ``` + + Parameter Description: + - `METADATA_BACKUP_PATH` - required, represents the metadata path that is going to be recovered, the default value is `${KYLIN_HOME}/meta_backups/` + - `--after-truncate` - optional, if this parameter is added, the system metadata will be completely restored, otherwise only the deleted and modified metadata will be restored, and the new metadata will still be retained. 
+ +- Restore **project level** metadata + + ```sh + $KYLIN_HOME/bin/metastore.sh restore-project PROJECT_NAME METADATA_BACKUP_PATH [--after-truncate] + ``` + Example: + ```sh + ./bin/metastore.sh restore-project projectA meta_backups/2019-12-19-14-18-01_backup/ + ``` + + Parameter Description: + + - `PROJECT_NAME` - required, represents the project name + - `METADATA_BACKUP_PATH` - required, represents the metadata path that is going to be recovered, the default value is `${KYLIN_HOME}/meta_backups/` + - `--after-truncate` - optional, if this parameter is added, the project metadata will be completely restored, otherwise only the deleted and modified metadata will be restored, and the new metadata will still be retained. + +**Caution** When running this command, Kylin will enter maintenance mode. If the command is interrupted by force, you may need to exit maintenance mode manually. Refer to [maintenance_mode](../../maintenance_mode.en.md). diff --git a/website/docs/operations/system-operation/cli_tool/rollback.md b/website/docs/operations/system-operation/cli_tool/rollback.md new file mode 100644 index 0000000000..a27193153c --- /dev/null +++ b/website/docs/operations/system-operation/cli_tool/rollback.md @@ -0,0 +1,123 @@ +--- +title: Rollback Tool +language: en +sidebar_label: Rollback Tool +pagination_label: Rollback Tool +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - rollback tool +draft: false +last_update: + date: 08/12/2022 +--- + +When a user error causes metadata or data loss, or when Kylin is unavailable due to an unknown issue, you can roll back to a specified moment with the rollback tool to ensure production stability. + +> Note: The rollback tool is intended for emergency cases; please read the materials carefully before using it. + +### Introduction {#tools_Introduction} + +**How to Use** + +- Stop all Kylin services. 
+ +- Use tools to roll back + + ```bash + $KYLIN_HOME/bin/rollback.sh --project project_example --time '2020-09-01 15:20:19' + ``` + +- Observe logs to identify differences of metadata such as project, model, user, Segment, task, etc. + +- Complete the interaction and confirmation to ensure that you are aware of the impact of the rollback. + +- After completing the rollback, start Kylin service. + +##### Parameters: + +- `-p, --project <arg>`: Project name [optional], `<arg>` is the project name. +- `-t, --time <arg>`: Historical time point to roll back [required]. `<arg>` is the specific time to roll back to, the format is `yyyy-MM-dd HH:mm:ss`. The available value is the time point from the earliest backup version time to date. +- `--skip-check-data`: Skip checking whether the resource file is available [optional]. + +### Use Case {#user_case} + +Below is a common scenario of using the rollback tool in the operational process, describing how the tool can be used to roll back and downgrade issues in the event of product agnostic issues. + +- In AI augmented mode, the `rollback_example` project has recommendation with recommendation mode turned on. If the user accepts the recommendations under the model at 16:06:05 on September 1, 2020, then the original model of this project and the metadata related to the index will be updated. +  +- At this point, it can be seen that the index recommended by the system has been generated, but at this point there is an unknown problem, which results in the index not being built properly and affects the health of the entire cluster, and the product is also in an inoperable state. +- To guarantee production availability, the rollback tool can be used to roll back the metadata and data for this item to 15:20 on September 1. 
+ +```sh +$KYLIN_HOME/bin/rollback.sh --project rollback_example --time '2020-09-01 15:20:19' +``` + +- It can be seen that the item has returned to the state before the accident, the index recommended by the system has disappeared, and the cluster has returned to a stable state.. + + + +- After the rollback is complete, a downgrade is also required for issues that occur. In this example, you can click **Setting**, temporarily turn off the smart recommendation feature under the **Basic Settings** tab first and communicate with the Kylin technical support team in a timely manner. +  +- After the product problem is fixed, you can turn on the recommendation mode again. + +### Difference from Metadata Backup and Rollback Tool{#difference} + +Kylin now offers metadata backup and restore tools that go some way to protecting metadata from loss. However, the tool has some limitations. + +- There is no way to rewind to the specified moment, you can only rewind to the version that has been backed up in the past. +- The metadata backup tool is straightforward, and it is possible that metadata may not be available after being backed up. For example, some files are cleaned up as garbage because the metadata does not exist, and the corresponding backup metadata is not available. + +### Configurations{#related_configurations} + + +The premise of using the rollback tool is that the resource data (cube data, dictionary data, snapshot data, etc.) must be guaranteed not to be deleted within the rollback time range. The retention period of the resource data involves two configurations + +* `kylin.storage.time-machine-enabled` After this configuration is enabled, the resources in the retention period will not be deleted in the KE service. After being enabled, the snapshot data retention time will be the same as the time configured in `kylin.storage.resource-survival-time-threshold`, the default value is False. 
+* `kylin.storage.resource-survival-time-threshold` Resource data retention time, the default value is `7d`, unit description: `d` (day), `h` (hour), `m` (minute). + +### Caution and Common Errors {#caution_and_common_errors} + +The following are some errors and points of attention that may be encountered during the use of the tool. + +**Points of attention** + +- Using the rollback tool will roll back the state of the task execution to the state of the historical moment, and will restart the execution after the KE service is started. +- After the rollback tool configuration is turned on, more garbage files may be saved and more storage space may be taken up. Using the garbage cleaning tool during the retention period cannot clean up the expired resource data during the retention period. +- During the execution of the tool, if it is run multiple times, each run will keep a backup of the current metadata in the `{working-dir}/_current_backup` directory, the file names are distinguished by time. +- The time specified by the user cannot be greater than the current time. +- All service nodes must be shut down before using the tool, otherwise it will cause data inconsistency. +- If the user manually deletes the dictionary data of the project and then regenerates the dictionary data again, using the rollback tool will cause the dictionary data and the index data to be inconsistent. +- After opening the `kylin.storage.time-machine-enabled` configuration after upgrading, users need to wait for a configured retention period before they can be rolled back. +- The user rolls back to the historical moment, and the snapshot data used is also the snapshot data of the historical moment instead of using the latest snapshot data. +- If the rollback time specified by the user is less than the minimum time of the metadata backup, the rollback cannot be performed. 
+ +**Possible error results** + +- Using the rollback tool reverts the state of the task execution back to the historical moment, and the execution is triggered again when the KE is started. +- Turning on the time machine causes more junk files to be saved, taking up more storage space, and using the junk cleanup tool during the retention period does not clean up resource data that has expired during the retention period. +- During tool execution, if there are multiple runs, each run keeps a backup of the current metadata in the `{working-dir}/_current_backup` directory, distinguishing the file name by time. +- The time specified by the user cannot be greater than the current time. +- All service nodes must be turned off before using the tool, otherwise data inconsistencies will result. +- If a user manually deletes the dictionary data for an item and then regenerates the dictionary data, using the rollback tool can cause inconsistencies between the dictionary data and CUBE data. +- A user who has just upgraded a `kylin.storage.time-machine-enabled` configuration needs to wait until a configuration's retention period has passed before being guaranteed to roll back any time within the retention period. +- The user rolls back to the historical moment and the snapshot data used is also the snapshot data for the historical moment, not the latest snapshot data used. +- If the user-specified rollback time is less than the minimum time for metadata backup, it cannot be rolled back. +- `dectect port available failed` -> Failure to detect user ports requires shutting down the service nodes of the cluster. +- `check storage data available failed` -> Failed to detect resource file, user can use `--skip-check-data` parameter to force rollback。 +- `restore current metadata failed, please restore the metadata database manually` -> The metadata rollback fails, and overwriting with the current backup also fails. Manual intervention is required to solve the problem. 
This situation must be handled carefully to avoid loss of metadata. +- The rollback scope of the rollback tool does not include historical recommendations and projects manually deleted by the user + +### Appendix {#appendix} + +The following is a detailed process for the tool to perform rollback. + +- Backup metadata +- Check if the cluster is stopped +- Find the snapshot file of metadata from the backup directory, and then replay the `auditlog` log to the time specified by the user +- Compare the metadata differences and remind the user +- Wait for confirmation +- Check if the resource referenced to by metadata is available +- Roll back the metadata. If the rollback fails, it will be overwritten with a backup of the current metadata diff --git a/website/docs/operations/system-operation/diagnosis/build_flame_graph.md b/website/docs/operations/system-operation/diagnosis/build_flame_graph.md new file mode 100644 index 0000000000..bfea568978 --- /dev/null +++ b/website/docs/operations/system-operation/diagnosis/build_flame_graph.md @@ -0,0 +1,78 @@ +--- +title: Build Flame Graph +language: en +sidebar_label: Build Flame Graph +pagination_label: Build Flame Graph +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - build flame graph +draft: false +last_update: + date: 08/12/2022 +--- + +Kylin has built-in async-profiler. When flame graphs are needed to diagnose build tasks performance, users can generate query flame graphs for Spark Driver and Executor by calling the API interface. + +Since the flame graph is generated at the system level, it will affect all projects. Only the Admin user has the authority to use this function. 
+ +### Configs +| Config | Comment | +|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| +| kylin.engine.async-profiler-enabled | enable the profiling feature (default to FALSE). Once enabled, you can trigger the generation and download of the flame graph by calling the API | +| kylin.engine.async-profiler-result-timeout | the timeout for the result collection (default to 60s) | +| kylin.engine.async-profiler-profile-timeout | the timeout for the profiling (default to 5min) | + + +### Start Profiling +invoke the below HTTP API to start generating flame graph + +- HTTP Header + - `Accept: application/vnd.apache.kylin-v4-public+json` + - `Accept-Language: en` + - `Accept-Language: zh` + - `Content-Type: application/json;charset=utf-8` + +**There are two ways to generate** + +- GET `http://host:port/kylin/api/jobs/profile/start_project?project={projectName}&step_id={jobStepId}&params={params}` + - URL Parameters + - `project`, Required, String, specifies the projectName where the current build task is located, no default value + - `step_id`, Required, String, specifies the jobStepId of the current build task, which can be found in the YARN interface, copying the rest of its Name except `job_step_`. 
+ - `params`, Optional, String, specify async-profiler parameter, default is `start,event=cpu` (profile cpu usage) + +- GET `http://host:port/kylin/api/jobs/profile/start_appid?app_id={yarnAppId}&params={params}` + - URL Parameters + - `app_id`, Required, String, specifies the Application ID of the current build task submitted to YARN + - `params`, Optional, String, specify async-profiler parameter, default is `start,event=cpu` (profile cpu usage) + +### Stop Profiling +invoke the below HTTP API to stop generating flame graph and download flame graph + +- HTTP Header + - `Accept: application/vnd.apache.kylin-v4-public+json` + - `Accept-Language: en` + - `Accept-Language: zh` + - `Content-Type: application/json;charset=utf-8` + +**There are two ways to get it** + +- GET `http://host:port/kylin/api/jobs/profile/dump_project?project={projectName}&step_id={jobStepId}&params={params}` + - URL Parameters + - `project`, Required, String, specifies the projectName where the current build task is located, no default value + - `step_id`, Required, String, specifies the jobStepId of the current build task, which can be found in the YARN interface, copying the rest of its Name except `job_step_`. + - `params`, Optional, String, specify the async-profiler parameter, default is `flamegraph` (collects the results as a flame graph) + +- GET `http://host:port/kylin/api/jobs/profile/dump_appid?app_id={yarnAppId}&params={params}` + - URL Parameters + - `app_id`, Required, String, specifies the Application ID of the current build task submitted to YARN + - `params`, Optional, String, specify the async-profiler parameter, the default is `flamegraph` (collects the results as a flame graph) + +### Limitations +1. The async-profiler version that comes with Kylin is Linux x64 (glibc): async-profiler-2.5-linux-x64.tar.gz. Other platforms are not supported. +2. The profiling may have some impact on the performance of Kylin, so avoid long-running profiling in production. +3. 
The parameters involved in building the flame chart are system level parameters, please do not set them at other levels as they may cause abnormal behavior. +4. The `params` support configuration parameters can be found at https://github.com/jvm-profiling-tools/async-profiler +5. On some machines, the required native libraries may fail to load when the `/tmp` disk has the noexec attribute, causing Spark initialization to fail and affecting normal build tasks, so this feature is disabled by default. diff --git a/website/docs/operations/system-operation/diagnosis/diagnosis.md b/website/docs/operations/system-operation/diagnosis/diagnosis.md new file mode 100644 index 0000000000..7f16a0475c --- /dev/null +++ b/website/docs/operations/system-operation/diagnosis/diagnosis.md @@ -0,0 +1,86 @@ +--- +title: System, Job and Query Diagnosis +language: en +sidebar_label: System, Job and Query Diagnosis +pagination_label: System, Job and Query Diagnosis +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - diagnosis + - system diagnosis + - job diagnosis + - query diagnosis +draft: false +last_update: + date: 08/12/2022 +--- + + +## System, Job and Query Diagnosis + +Kylin users may face with many problems during usage, such as job failure, SQL query failure, SQL query overtime, etc. In order to help solve these problems efficiently, Kylin provides a *Diagnosis* function on Web UI to pack related information into a zip package to help operation staff and Kyligence Support team to analyze the root cause. + +Diagnosis function includes System, Job and Query Diagnosis. In addition to the method of generating diagnostic package on web UI described in this chapter, you can also generate the diagnostic package through the bash script. For details, please refer to [Diagnosis Tool](cli_tool/diagnosis.en.md). 
+ + +### Generate System Diagnostic Package in Web UI + +System diagnostic package includes all diagnostic information of a Kylin instance, and users can generate system diagnostic package on Web UI following the following steps: + + + +1. Enter the **Admin** page and click the **Diagnosis** button in the lower left corner. + + > **Note**: Only the system administrator can generate the system diagnostic package in web ui for the time being. + +2. Select **Time Range**: You can select *last one hour*, *last one day*, *last three days* and *last one month*, or self-defined. + + > **Note:** The selected time range must include the period of incidents. + +3. Select **Server**. + + > **Note:** If Kylin is deployed on multiple nodes, please locate the specific node on which your issue happened by selecting the right server name, otherwise the generated system diagnostic package may not include useful information about the issue. + +4. Click **Generate and Download** button: After the diagnostic package is generated, the downloads will be triggered automatically. If the diagnostic package generation fails, you can view the details of the failure on the interface. + +### Generate Job Diagnostic Package in Web UI + +Job diagnostic package includes all diagnostic information of a specific job, and users can generate job diagnostic package on Web UI by following the following steps: + + + +1. After logging in to Kylin, click **Monitor** in the left navigation bar. In the action bar of a job on the **Jobs List** page, click the **Download Job Diagnostic Package** button. +2. Select **Server**. +3. Click **Generate and Download** button: After the diagnostic package is generated, the downloads will be triggered automatically. If the diagnostic package generation fails, you can view the details of the failure on the interface. 
+ +### Generate Query Diagnosis Package in Web UI + +Query diagnosis package includes all diagnostic information of a specific query, and users can generate query diagnosis package on Web UI by following the following steps: + + + +1. After logging in to Kylin, click **Query** in the left navigation bar. In the action bar of a query on the **History** page, click the **Download Query Diagnostic Package** button in **Action**. +2. Click **Generate and Download** button: After the diagnosis package is generated, the downloads will be triggered automatically. If the diagnosis package generation fails, you can view the details of the failure on the interface. + +By default, all users with project query permissions can download the query diagnostic package to facilitate query problem diagnosis. +Since the query diagnostic package contains some configuration information, if you want to reduce the relevant permissions for downloading the query diagnostic package, you can add configuration `kylin.security.allow-non-admin-generate-query-diag-package=false` in `$KYLIN_HOME/conf/kylin.properties`, only system administrators and users with project ADMIN permissions are allowed to download the query diagnostic package. + +### FAQ + +**Q: What are the differences in the contents of the three diagnostic packages?** + +For the contents of the three diagnostic packages, please refer to [Diagnosis Package Tool](../cli_tool/diagnosis.md). + +**Q: If I failed to generate diagnostic packages because of timeout.** + +Please change parameter `kylin.diag.package.timeout-seconds`(in seconds, the default value is one hour) in `$KYLIN_HOME/conf/kylin.properties` and restart Kylin. 
+ +**Q: What should I do if the page does not download the diagnostic package automatically after the packaging is completed?** + +If the system has generated the diagnostic package successfully but fails to download it automatically, you can click **Download Manually** at the bottom left, select the diagnostic package you want to download, and then click the download button to download manually. + +**Q: What should I do if the hostname used by Kylin contains an underscore (`_`) and generates a diagnostic package has exception?** + +Please add service discovery parameter `spring.cloud.zookeeper.discovery.instance-host=IP` in `$KYLIN_HOME/kylin.properties.override` and restart Kylin. diff --git a/website/docs/operations/system-operation/diagnosis/images/diagnosis_web_en.png b/website/docs/operations/system-operation/diagnosis/images/diagnosis_web_en.png new file mode 100644 index 0000000000..329ee9c778 Binary files /dev/null and b/website/docs/operations/system-operation/diagnosis/images/diagnosis_web_en.png differ diff --git a/website/docs/operations/system-operation/diagnosis/images/job_diagnosis_web_en.png b/website/docs/operations/system-operation/diagnosis/images/job_diagnosis_web_en.png new file mode 100644 index 0000000000..5a6f0457e2 Binary files /dev/null and b/website/docs/operations/system-operation/diagnosis/images/job_diagnosis_web_en.png differ diff --git a/website/docs/operations/system-operation/diagnosis/images/query_diagnosis_web.png b/website/docs/operations/system-operation/diagnosis/images/query_diagnosis_web.png new file mode 100644 index 0000000000..9c84ffd914 Binary files /dev/null and b/website/docs/operations/system-operation/diagnosis/images/query_diagnosis_web.png differ diff --git a/website/docs/operations/system-operation/diagnosis/intro.md b/website/docs/operations/system-operation/diagnosis/intro.md new file mode 100644 index 0000000000..d4a4705068 --- /dev/null +++ b/website/docs/operations/system-operation/diagnosis/intro.md @@ -0,0 
+1,17 @@ +--- +title: Diagnosis +language: en +sidebar_label: Diagnosis +pagination_label: Diagnosis +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - diagnosis +draft: false +last_update: + date: 08/12/2022 +--- + +Kylin provides a diagnosis function to help users solve problems they may encounter, such as job failure, SQL query failure, SQL query overtime, etc. diff --git a/website/docs/operations/system-operation/diagnosis/query_flame_graph.md b/website/docs/operations/system-operation/diagnosis/query_flame_graph.md new file mode 100644 index 0000000000..117b28c8fa --- /dev/null +++ b/website/docs/operations/system-operation/diagnosis/query_flame_graph.md @@ -0,0 +1,58 @@ +--- +title: Query Flame Graph +language: en +sidebar_label: Query Flame Graph +pagination_label: Query Flame Graph +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - query flame graph +draft: false +last_update: + date: 08/12/2022 +--- + +Kylin has built-in async-profiler. When flame graphs are needed to diagnose query performance, users can generate query flame graphs for Spark Driver and Executor by calling the API interface. + +Since the flame graph is generated at the system level, it will affect all projects. Only the Admin user has the authority to use this function. + +### Configs +| Config | Comment | +| ----------------------------------------------------------- | ------------------------------------------------------------ | +| kylin.query.async-profiler-enabled | enable the profiling feature (default to TRUE). 
Once enabled, you can trigger the generation and download of the flame graph by calling the API | +| kylin.query.async-profiler-result-timeout | the timeout for the result collection (default to 60s) | +| kylin.query.async-profiler-profile-timeout | the timeout for the profiling (default to 5min) | + + +### Start Profiling +Invoke the HTTP API below to start generating the flame graph + +- GET `http://host:port/kylin/api/query/profile/start?params={params}` + +- URL Parameters + - `params`, Optional, String, specify async-profiler params to start profiling, default to `start,event=cpu` (profile the cpu only), ref to https://github.com/jvm-profiling-tools/async-profiler for more parameters + +- HTTP Header + - `Accept: application/vnd.apache.kylin-v4-public+json` + - `Accept-Language: en` + - `Content-Type: application/json;charset=utf-8` + +### Stop Profiling +Invoke the HTTP API below to stop profiling and download the flame graph + +- GET `http://host:port/kylin/api/query/profile/dump?params={params}` + +- URL Parameters + - `params`, Optional, String, specify async-profiler params to dump the profiling result, default to `flamegraph` (dump the result as flamegraph), ref to https://github.com/jvm-profiling-tools/async-profiler for more parameters + +- HTTP Header + - `Accept: application/vnd.apache.kylin-v4-public+json` + - `Accept-Language: en` + - `Content-Type: application/json;charset=utf-8` + +### Limitations +1. The async-profiler version bundled with Kylin is Linux x64 (glibc): async-profiler-2.5-linux-x64.tar.gz. Other platforms are not supported. +2. The profiling may have some impact on the performance of Kylin, so avoid profiling for a long time in production. +3. The flame graph result can only be downloaded once. 
diff --git a/website/docs/operations/system-operation/guardian.md b/website/docs/operations/system-operation/guardian.md new file mode 100755 index 0000000000..ec80d2fc7e --- /dev/null +++ b/website/docs/operations/system-operation/guardian.md @@ -0,0 +1,62 @@ +--- +title: Kylin Guardian Process +language: en +sidebar_label: Kylin Guardian Process +pagination_label: Kylin Guardian Process +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - guardian process +draft: false +last_update: + date: 08/12/2022 +--- + +Since Kylin 5.0, the system added a function of a daemon process for monitoring the health state of Kylin. This function is called **Kylin Guardian Process**. If the Kylin Guardian Process detects Kylin is in an unhealthy state, it will restart Kylin server or downgrade service. + +### Usage + +#### Turn On +Kylin Guardian Process is **disabled** by default. If you want to enable it, you need to add the configuration `kylin.guardian.enabled = true` in the global configuration file `$KYLIN_HOME/conf/kylin.properties`. +> **Note**: All the following configurations take effect if `kylin.guardian.enabled = true` + +If Kylin Guardian Process is enabled, a daemon process will be automatically started after starting Kylin. This process is bound to environment variable `KYLIN_HOME`, which means each Kylin instance has only one Kylin Guardian Process corresponding to it. + +Kylin Guardian Process description: +- The process ID is recorded in `$KYLIN_HOME/kgid`. + +- The log of the process is output in `$KYLIN_HOME/logs/guardian.log`. + +- Kylin Guardian Process will periodically check the health status of Kylin. The time delay of the first check is configured by the parameter `kylin.guardian.check-init-delay` (Unit: minutes), the default is 5 minutes, and the check interval is set by the parameter `kylin.guardian.check-interval` (Unit: minutes), the default is 1 minute. 
+ + +#### Check Items +Kylin Guardian Process currently checks the following 4 aspects of Kylin instance's health. + +- Kylin process status + + If the process number file `$KYLIN_HOME/pid` exists and the corresponding process does not exist, it means Kylin server is in an abnormal down state, and Kylin Guardian Process will restart it. + +- Spark Context restart failure check + + If the number of Spark Context restart failure times is greater than or equals to the value of configuration `kylin.guardian.restart-spark-fail-threshold`, which is 3 times by default, Kylin Guardian Process will restart Kylin. This function is enabled by default. If you want to disable it, please add the configuration `kylin.guardian.restart-spark-fail-restart-enabled = false` in `$KYLIN_HOME/conf/kylin.properties`. + +- **Bad Query** canceled failed check + + >**Note**: Some queries will be forcibly closed due to abnormal reasons. At this time, the query is **Bad Query**, and the common case is a timeout query. + + If Kylin Guardian Process detects the number of Bad Query cancellation times is greater than or equals to the value of configuration `kylin.guardian.kill-slow-query-fail-threshold`, which is 3 times by default, Kylin Guardian Process will restart Kylin. It is enabled by default. If you want to disable it, you can add the configuration `kylin.guardian.kill-slow-query-fail-restart-enabled = false` in `$KYLIN_HOME/conf/kylin.properties`. + +- Full GC(Garbage Collection, Garbage collection mechanism in Java) duration check + + If the Full GC duration ratio in most recent period (default is value of `kylin.guardian.full-gc-check-factor` * value of `kylin.guardian.check-interval`) is greater than or equals to the value of configuration `kylin.guardian.full-gc-duration-ratio-threshold` which is 75% by default, Kylin Guardian Process will restart Kylin. It is enabled by default. 
If you want to disable it, you can add the configuration `kylin.guardian.full-gc-duration-ratio-restart-enabled = false` in `$KYLIN_HOM [...] + + +#### Kylin Guardian Process High Availability +To ensure the high availability of Kylin Guardian Process, Kylin will also periodically check the status of Kylin Guardian Process. If Kylin detects the Kylin Guardian Process does not exist, it will automatically start it. The feature is enabled by default. If you want to disable it, you can add the configuration `kylin.guardian.ha-enabled=false` in `$KYLIN_HOME/conf/kylin.properties`. The time delay of the first check is configured by the parameter `kylin.guardian.ha-check-init-delay` [...] + + +#### Kylin OOM(Out of Memory) restarts automatically +Kylin Guardian Process supports restarting Kylin when the Kylin JVM runs out of memory (OOM). diff --git a/website/docs/operations/system-operation/images/forbidden_modify_metadata_during_maintenance_mode.png b/website/docs/operations/system-operation/images/forbidden_modify_metadata_during_maintenance_mode.png new file mode 100644 index 0000000000..b5fdfd120d Binary files /dev/null and b/website/docs/operations/system-operation/images/forbidden_modify_metadata_during_maintenance_mode.png differ diff --git a/website/docs/operations/system-operation/images/forbidden_modify_metadata_promoting_during_maintenance_mode.png b/website/docs/operations/system-operation/images/forbidden_modify_metadata_promoting_during_maintenance_mode.png new file mode 100644 index 0000000000..6f5c7afdd2 Binary files /dev/null and b/website/docs/operations/system-operation/images/forbidden_modify_metadata_promoting_during_maintenance_mode.png differ diff --git a/website/docs/operations/system-operation/images/maintenance_mode.png b/website/docs/operations/system-operation/images/maintenance_mode.png new file mode 100644 index 0000000000..2becffa013 Binary files /dev/null and b/website/docs/operations/system-operation/images/maintenance_mode.png differ diff --git 
a/website/docs/operations/system-operation/images/system_recover_from_maintenance_mode.png b/website/docs/operations/system-operation/images/system_recover_from_maintenance_mode.png new file mode 100644 index 0000000000..23b1050322 Binary files /dev/null and b/website/docs/operations/system-operation/images/system_recover_from_maintenance_mode.png differ diff --git a/website/docs/operations/system-operation/intro.md b/website/docs/operations/system-operation/intro.md new file mode 100644 index 0000000000..c78f5ee144 --- /dev/null +++ b/website/docs/operations/system-operation/intro.md @@ -0,0 +1,17 @@ +--- +title: System Operation +language: en +sidebar_label: System Operation +pagination_label: System Operation +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - system operation +draft: false +last_update: + date: 08/12/2022 +--- + +This chapter introduces how to do system operation. diff --git a/website/docs/operations/system-operation/junk_file_clean.md b/website/docs/operations/system-operation/junk_file_clean.md new file mode 100644 index 0000000000..6bd6008ed8 --- /dev/null +++ b/website/docs/operations/system-operation/junk_file_clean.md @@ -0,0 +1,90 @@ +--- +title: Junk File Cleanup +language: en +sidebar_label: Junk File Cleanup +pagination_label: Junk File Cleanup +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - junk file cleanup +draft: false +last_update: + date: 08/12/2022 +--- + + +## Junk File Cleanup + +After Kylin runs for a period of time, the system may generate a certain number of junk files, which may occupy a large amount of storage space. At this time, junk cleaning is required. + +Junk file cleaning can improve the stability and performance of the Kylin system. Effective junk file cleaning can not only save storage space, but also ensure the ecological health of the cluster where Kylin is located. 
+ + +### Default recommended regular junk file cleaning method + +By default, the system will automatically clean up junk every day at 0:00 AM. + +- To modify the time and frequency of regular junk file cleaning, adjust the parameters in `$KYLIN_HOME/conf/kylin.properties`. The default configuration is `kylin.metadata.ops-cron=0 0 0 * * *`, which refers to junk file cleaning at 0:00 a.m. every day. The parameters from left to right in the configuration items represent: seconds, minutes, hours, Day, month, day of the week. By modifying the cron configuration, users can customize the junk file cleaning time, for example, every Saturd [...] + +- Automatic junk file cleaning has a default timeout of 4 hours, and it will automatically terminate after the timeout. The default configuration is `kylin.metadata.ops-cron-timeout=4h`. + +- Before the system regularly cleans up junk files, the metadata will be automatically backed up to the HDFS path `{kylin.env.hdfs-working-dir}/{MetadataIdentity}/_backup/{yyyy-MM-dd-HH-mm-ss}_backup/metadata.zip`. + +- The system regularly cleans up junk files and will not enter system maintenance mode. + +- For more details on cron configuration, please refer to [Introduction to CronTrigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html). + +### Junk file cleanup range + +The scope of junk file cleanup includes: +- Invalid or expired metadata: + - Query history. + - The total number of query history for all projects. The query history that exceeds this threshold number `kylin.query.queryhistory.max-size=10000000` (default) will be cleared. + - The query history of a single project. The query history that exceeds this threshold `kylin.query.queryhistory.project-max-size=1000000` (default) will be cleared. + - The query history time of all projects. The query history that exceeds this threshold `kylin.query.queryhistory.survival-time-threshold=30d` (default 30 days) will be cleared. 
This configuration also supports units: milliseconds ms, microseconds us, minutes m or min, hours h. + - Real-time job status/record table. Realtime jobs that exceed this threshold `kylin.streaming.jobstats.survival-time-threshold=7d` (default 7 days) will be cleaned up. + - Invalid optimization suggestion table data. + - Expired capacity billing metadata. Capacity billing information that exceeds this threshold `kylin.garbage.storage.sourceusage-survival-time-threshold=90d` (default 90 days) will be cleaned up. + - Invalid or out-of-date item-related metadata. + - `kylin.garbage.storage.executable-survival-time-threshold=30d` (default 30 days) above this threshold and completed metadata tasks are cleaned up. + - Audit log. Audit logs that exceed this threshold `kylin.metadata.audit-log.max-size=500000` (default) will be cleaned up. +- Invalid or expired HDFS data: + - Asynchronous query result file. HDFS asynchronous query result files that exceed this threshold `kylin.query.async.result-retain-days=7d` (default 7 days) will be cleaned up. + - Invalid or expired files on HDFS. Include invalid or expired indexes, snapshots, dictionaries, etc. + - Invalid files on HDFS that exceed this threshold `kylin.garbage.storage.cuboid-layout-survival-time-threshold=7d` (default 7 days) are cleaned up. + - Low Usage indexes on HDFS. + - Low usage storage refers to indexes whose usage frequency is lower than a certain threshold and data built under them within a certain time interval. You can configure the definition of low usage storage under a project in the project's Settings > Basic Settings > Index Optimization > Low Usage Storage . + - If recommendation is turned off, indexes with low cost performance will be cleaned according to the index optimization strategy during junk file cleaning. You can also manually clean up by clicking the **Clear** button under **Dashboard > Storage Quota > Low Usage Storage**. 
+ - If recommendation is turned on, the cleanup of Low Usage storage will no longer be triggered during junk file cleaning, and the corresponding inefficient index will be converted to **model optimization suggestions**, and the button to clean up junk file will not appear in the dashboard. + + > Note: The default timed junk file cleaning method starts from Kylin 5.0 and later, will clean up invalid or expired HDFS data. + +### Compatible with historically supported junk cleaning tools + +> Note: In order to be compatible with the command-line tool cleanup that has been provided in history, the behavior of the previously provided tools has not changed. Users who have used this method can gradually abandon this method according to the actual situation. Users who are not using this tool could not pay attention to this section. + +Kylin provides a junk file cleaning command line tool for checking and cleaning HDFS data, so as to ensure that the system is in a good running state. Please execute the following command in the terminal: + +````sh +$KYLIN_HOME/bin/kylin.sh org.apache.kylin.tool.routine.RoutineTool +```` + +When executing this command without any parameters, it will only list the data in HDFS that can be cleaned, but will not perform the actual cleaning action. + +This command supports standard short and long parameters. The parameter descriptions are as follows: +- `-m, --metadata`: Perform metadata junk file cleaning. +- `-c, --cleanup`: Perform data junk file cleanup. Without this parameter, the tool will not make any modification to the HDFS data. +- `-p [project...], --projects=[project...]`: Specifies the projects to clean. When specifying multiple items, separate them with commas. Without this parameter, the tool will clean up all items. +- `-h, --help`: Print help information. +- `-r number`: The number of requests per second when accessing cloud environment object storage. `-r 10` means 10 requests per second. 
You can use this parameter to limit the frequency of requests for object storage in the cloud environment by the junk file cleaning tool to avoid errors due to exceeding the request frequency limit. +- `-t number`: The number of request retries when accessing the cloud environment object storage fails. `-t 3` means to retry 3 times. + +**Note**: Whether this command executes metadata junk file cleaning through -m or data junk file cleaning through -c, Kylin will enter maintenance mode. If junk file cleaning is forcibly interrupted, you need to manually exit maintenance mode. Refer to [ Maintenance Mode](system-operation/maintenance_mode.en.md). + +In addition, from Kylin 4.5.6, the new command line tool `FastRoutineTool` +````sh +$KYLIN_HOME/bin/kylin.sh org.apache.kylin.tool.routine.FastRoutineTool +```` +The only difference compared to `RoutineTool` is that when performing data junk file cleaning with the `-c` parameter, it does not enter maintenance mode. Maintenance mode is still entered when performing metadata junk file cleanup via -m. diff --git a/website/docs/operations/system-operation/limit_query.md b/website/docs/operations/system-operation/limit_query.md new file mode 100644 index 0000000000..b53e46b151 --- /dev/null +++ b/website/docs/operations/system-operation/limit_query.md @@ -0,0 +1,72 @@ +--- +title: Limit query current capacity, protect query stability +language: en +sidebar_label: Limit query current capacity, protect query stability +pagination_label: Limit query current capacity, protect query stability +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - limit query current capacity + - protect query stability +draft: false +last_update: + date: 08/12/2022 +--- + +Query resources are usually limited. 
During certain periods of time, the query volume suddenly increases, or when a small number of large queries occupy too many resources, query resource competition may occur, resulting in large fluctuations in overall query performance. + +In order to avoid the above situation, we can adopt the query current capacity limiting strategy, by rejecting or limiting the performance of part of large queries, to ensure that most of the small queries are not affected, and to ensure the overall stability of the query. + + +### Query classification + +Through long-term observation, we can roughly divide queries into two categories: large queries and small queries. They have different typical characteristics: + +- **Big query**: The number is small and the resources are occupied. The fluctuation of the big query has a great impact on the overall stability of the query. +- **Small queries**: The number is large, each small query occupies less resources, protecting small queries can effectively ensure the overall stability of the query. + +According to the characteristics of these two types of queries, we have designed different query current limiting strategies, which can be selected as needed. For details, see **two query current limiting strategies** below. + +At the same time, for the judgment of large and small queries, we also provide parameters, allowing users to fine-tune according to the actual situation. See below for **Determination of Large Query**. + +### Two query current capacity limiting strategies + +**Strategy 1: Small query priority scheduling strategy** + +After enabling the priority scheduling policy for small queries, small queries will be scheduled first, and large queries will be scheduled later. + +Set `kylin.query.query-limit-enabled = true` in `kylin.properties`, the default value is **false**. Also configure Ops Plan to enable large query rejection policy. 
+ +**Strategy 2: Large query rejection strategy** + +Different from the post-scheduling of large queries in strategy 1, when strategy 2 is used, large queries will be rejected directly after reaching the upper limit of Spark task load. Spark task load refers to the ratio of the number of tasks in the Pending state to the number of tasks in the Running state in the Spark cluster. This strategy requires the cooperation of the Ops Plan to collect the task load indicator, and when the indicator value reaches the upper limit, it triggers the re [...] + +Set `kylin.query.share-state-switch-implement=jdbc` in `kylin.properties`, and configure Ops Plan to enable large query rejection policy. The default value is **close**. + +Among them, the default value of Spark task load is 50. Generally, it is not recommended to modify it. + +To configure the Ops Plan to turn on the large query rejection strategy, please contact the [Kyligence Technical Support Team](https://support.kyligence.io/#/) for help. + + +### Judgment of large query + +An important factor that affects the effect of the above query current limiting strategy is the determination of large queries. We provide both default values and allow flexible adjustments based on actual queries and system conditions. + +**Main principle:** + +The system mainly uses **data scan rows** as the basis for judging whether it is a large query. The sum of the number of rows scanned for a query data, when this value exceeds the threshold, it is determined as a large query, otherwise it is a small query. This value may be different from the "Number of records scanned by query" displayed on the page of the query result. This number of rows refers to the number of rows of the parquet file scanned after **pruning**. 
+ +**Judgment settings for large queries:** + +The system provides the initial threshold setting for determining whether it is a large query and the number of data scan rows, and also provides a mechanism to automatically update this threshold. The configuration of related parameters will be described in detail below. + +To adjust, adjust the following parameters in `kylin.properties`: +- `kylin.query.big-query-source-scan-rows-threshold`: Determines whether it is a big query, the initial threshold of the number of rows to be scanned. The default value is `-1` , which means that the user does not specify, and the system automatically calculates the initial threshold at startup. In addition, this threshold can be automatically updated to suit the cluster environment by collecting query information during system operation. +- `kylin.query.auto-adjust-big-query-rows-threshold-enabled`: Whether to automatically update the above thresholds. The default value is false, set to true to enable automatic update. +- `kylin.query.big-query-threshold-update-interval-second`: Interval to automatically update the above threshold. The default value is 10800, in seconds. +- `kylin.query.big-query-second`: The time limit that the big query needs to meet when the above threshold is automatically updated, the default value is 10, in seconds. + +In addition, when the query contains limit, the following optimizations can also be enabled to make the automatic update threshold more accurate and avoid misjudgment of large queries. +- `kylin.query.apply-limit-info-to-source-scan-rows-enabled`: Whether to apply limit information to optimize scan row count estimation. The default value is false. 
diff --git a/website/docs/operations/system-operation/maintenance_mode.md b/website/docs/operations/system-operation/maintenance_mode.md new file mode 100644 index 0000000000..1765bb97d9 --- /dev/null +++ b/website/docs/operations/system-operation/maintenance_mode.md @@ -0,0 +1,76 @@ +--- +title: Maintenance Mode +language: en +sidebar_label: Maintenance Mode +pagination_label: Maintenance Mode +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - maintenance mode +draft: true +last_update: + date: 08/16/2022 +--- + +## Basic Concepts + +When in maintenance mode, Kylin nodes will not be able to modify metadata, and only read operations such as queries will not be affected. + +## Entering maintenance mode + Run the following command to enter maintenance mode: + ```sh + ${KYLIN_HOME}/bin/kylin.sh org.apache.kylin.tool.MaintainModeTool -on -reason ${reason} + ``` + + **Note**: + +- You need to use the argument -reason to explain why you turn on maintenance mode. + + - When it is in maintenance mode, it is not allowed to enter maintenance mode again. You need to manually exit maintenance mode first + +- ${reason} refers to the reason for entering maintenance mode. There are no fixed characters. You can enter any word, such as `upgrade` + + When in maintenance mode, you will get the following prompt: + +  + + metadata modification operations, such as create project, load/unload tables, create/update models, etc., + can't be executed. + + If you do the above, the following errors will occur: + +  + +  + +## Exiting Maintenance mode + Run the following command to exit maintenance mode: + ```sh + ${KYLIN_HOME}/bin/kylin.sh org.apache.kylin.tool.MaintainModeTool -off + ``` + **Note**: While the system is already in maintenance mode, the system will prompt you after executing the command. In particular, you can use the `-f` or `--force` parameter to force maintenance mode off, + to release all occupation of project. 
+ + After the command was executed, you may not modify metadata immediately. It should be waiting Kylin nodes to + get the resources. You can refer following properties: + ```properties + kylin.server.leader-race.heart-beat-interval=30 + ``` + Its unit is seconds. You should wait up to 30 seconds. + + During the maintenance, if you try to modify metadata, you will get following prompt: + +  + + +## Recommendations for the use of maintenance mode + +Maintenance mode is provided to prevent metadata write conflicts in the system. +In principle, users are not recommended to turn on or turn off this function manually. +When running the following commands, the system will automatically turn on the maintenance mode, and turn off the maintenance mode after the operation ends. + + - [reset admin password](../access-control/user_management.md) + - [junk file clean](junk_file_clean.md) + - [metadata restore](cli_tool/metadata_tool/metadata_backup_restore.en.md#metadata-restore----metadata_restore) diff --git a/website/docs/operations/system-operation/update-session-table.md b/website/docs/operations/system-operation/update-session-table.md new file mode 100644 index 0000000000..01f1b15af5 --- /dev/null +++ b/website/docs/operations/system-operation/update-session-table.md @@ -0,0 +1,46 @@ +--- +title: Update Session Table Tool +language: en +sidebar_label: Update Session Table Tool +pagination_label: Update Session Table Tool +toc_min_heading_level: 2 +toc_max_heading_level: 6 +pagination_prev: null +pagination_next: null +keywords: + - update session table tool +draft: false +last_update: + date: 08/12/2022 +--- + +When configured `kylin.web.session.secure-random-create-enabled=true` or `kylin.web.session.jdbc-encode-enabled=true`, the session table needs to be upgraded, otherwise the user cannot log in. 
+ +**How to Use** + +- Use tools to update + + ```bash + $KYLIN_HOME/bin/kylin.sh org.apache.kylin.tool.upgrade.UpdateSessionTableCLI + ``` + +> Note: During the upgrade process, the update may fail due to permission reasons. At this time, the operation and maintenance personnel need to manually execute the statement to update the session table. + +### Sql + +**Use PostgreSQL as Metastore** + +```sql +ALTER TABLE spring_session ALTER COLUMN SESSION_ID TYPE VARCHAR(180) , ALTER COLUMN SESSION_ID SET NOT NULL; + +ALTER TABLE spring_session_ATTRIBUTES ALTER COLUMN SESSION_ID TYPE VARCHAR(180) , ALTER COLUMN SESSION_ID SET NOT NULL; +``` + +**Use MySQL as Metastore** + +```sql +ALTER TABLE spring_session MODIFY COLUMN SESSION_ID VARCHAR(180) NOT NULL; + +ALTER TABLE spring_session_ATTRIBUTES MODIFY COLUMN SESSION_ID VARCHAR(180) NOT NULL; +``` + diff --git a/website/sidebars.js b/website/sidebars.js index 37f85b5db4..ec86ae733f 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -30,7 +30,6 @@ const sidebars = { { type: 'category', label: 'Tutorial', - items: [ { type: 'doc', @@ -143,9 +142,265 @@ const sidebars = { id: 'deployment/deploy_mode/rw_separation' }, ] + }, + { + type: 'category', + label: 'System Configuration', + link: { + type: 'doc', + id: 'configuration/intro', + }, + items: [ + { + type: 'doc', + id: 'configuration/configuration' + }, + { + type: 'doc', + id: 'configuration/spark_dynamic_allocation' + }, + { + type: 'doc', + id: 'configuration/hadoop_queue_config' + }, + { + type: 'doc', + id: 'configuration/query_cache' + }, + { + type: 'doc', + id: 'configuration/https' + }, + { + type: 'doc', + id: 'configuration/spark_rpc_encryption' + }, + { + type: 'doc', + id: 'configuration/log_rotate' + }, + ], + }, + { + type: 'doc', + id: 'deployment/installation/uninstallation' } ], }, + { + type: 'category', + label: 'Operation and Maintenance Guide', + link: { + type: 'doc', + id: 'operations/intro', + }, + items: [ + { + type: 'doc', + id: 
'operations/overview' + }, + { + type: 'category', + label: 'Project Operation', + link: { + type: 'doc', + id: 'operations/project-operation/intro' + }, + items: [ + { + type: 'doc', + id: 'operations/project-operation/project_management' + }, + { + type: 'doc', + id: 'operations/project-operation/project_settings' + }, + { + type: 'doc', + id: 'operations/project-operation/alerting' + }, + ], + }, + { + type: 'category', + label: 'Access Control', + link: { + type: 'doc', + id: 'operations/access-control/intro' + }, + items: [ + { + type: 'doc', + id: 'operations/access-control/user_management' + }, + { + type: 'doc', + id: 'operations/access-control/group_management' + }, + { + type: 'category', + label: 'Data Access Control', + link: { + type: 'doc', + id: 'operations/access-control/data-access-control/intro' + }, + items: [ + { + type: 'doc', + id: 'operations/access-control/data-access-control/project_acl' + }, + { + type: 'doc', + id: 'operations/access-control/data-access-control/acl_table' + }, + ], + }, + ], + }, + { + type: 'category', + label: 'System Operation', + link: { + type: 'doc', + id: 'operations/system-operation/intro', + }, + items: [ + { + type: 'category', + label: 'Diagnosis', + link: { + type: 'doc', + id: 'operations/system-operation/diagnosis/intro', + }, + items: [ + { + type: 'doc', + id: 'operations/system-operation/diagnosis/diagnosis', + }, + { + type: 'doc', + id: 'operations/system-operation/diagnosis/query_flame_graph', + }, + { + type: 'doc', + id: 'operations/system-operation/diagnosis/build_flame_graph', + }, + ], + }, + { + type: 'doc', + id: 'operations/system-operation/update-session-table', + }, + { + type: 'category', + label: 'CLI Operation Tool', + link: { + type: 'doc', + id: 'operations/system-operation/cli_tool/intro', + }, + items: [ + { + type: 'doc', + id: 'operations/system-operation/cli_tool/environment_dependency_check', + }, + { + type: 'doc', + id: 'operations/system-operation/cli_tool/diagnosis' + }, + { + 
type: 'category', + label: 'Metadata Tool', + link: { + type: 'doc', + id: 'operations/system-operation/cli_tool/metadata_tool/intro', + }, + items: [ + { + type: 'doc', + id: 'operations/system-operation/cli_tool/metadata_tool/metadata_backup_restore' + }, + ], + }, + { + type: 'doc', + id: 'operations/system-operation/cli_tool/rollback' + }, + ], + }, + { + type: 'doc', + id: 'operations/system-operation/maintenance_mode' + }, + { + type: 'doc', + id: 'operations/system-operation/guardian', + }, + { + type: 'doc', + id: 'operations/system-operation/junk_file_clean', + }, + { + type: 'doc', + id: 'operations/system-operation/limit_query', + }, + ], + }, + { + type: 'category', + label: 'Monitoring', + link: { + type: 'doc', + id: 'operations/monitoring/intro', + }, + items: [ + { + type: 'category', + label: 'InfluxDB', + link: { + type: 'doc', + id: 'operations/monitoring/influxdb/intro', + }, + items: [ + { + type: 'doc', + id: 'operations/monitoring/influxdb/influxdb' + }, + { + type: 'doc', + id: 'operations/monitoring/influxdb/influxdb_maintenance' + }, + ], + }, + { + type: 'doc', + id: 'operations/monitoring/metrics_intro', + }, + { + type: 'doc', + id: 'operations/monitoring/service' + }, + ], + }, + { + type: 'category', + label: 'Logs', + link: { + type: 'doc', + id: 'operations/logs/intro', + }, + items: [ + { + type: 'doc', + id: 'operations/logs/system_log' + }, + { + type: 'doc', + id: 'operations/logs/audit_log' + }, + ], + }, + ], + }, { type: 'category', label: 'Modeling', @@ -272,54 +527,7 @@ const sidebars = { }, ], }, - { - type: 'category', - label: 'Operation and Maintenance Guide', - link: { - type: 'doc', - id: 'operations/intro', - }, - items: [ - { - type: 'category', - label: 'Monitoring', - link: { - type: 'doc', - id: 'operations/monitoring/intro', - }, - items: [ - { - type: 'category', - label: 'InfluxDB', - link: { - type: 'doc', - id: 'operations/monitoring/influxdb/intro', - }, - items: [ - { - type: 'doc', - id: 
'operations/monitoring/influxdb/influxdb' - }, - { - type: 'doc', - id: 'operations/monitoring/influxdb/influxdb_maintenance' - }, - ], - }, - ], - }, - ], - }, - { - type: 'category', - label: 'Configuration Guide', - items: [ - { - type: 'doc', - id: 'configuration/intro' - }, - ], - }, + ], DevelopmentSideBar: [ {