This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit ffdfa567c7722f7a035af82c91f23d37319b11d8
Author: Zhengguo Yang <[email protected]>
AuthorDate: Fri Apr 28 17:42:00 2023 +0800

    [chore](recover) add a config to recover remaining data in emergency 
(#18986)
---
 docs/en/docs/admin-manual/config/fe-config.md      | 16 ++++++++++++++--
 docs/zh-CN/docs/admin-manual/config/fe-config.md   | 20 ++++++++++++++++++--
 .../main/java/org/apache/doris/common/Config.java  | 14 ++++++++++++++
 .../org/apache/doris/planner/OlapScanNode.java     | 22 +++++++++++++++++++++-
 4 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/docs/en/docs/admin-manual/config/fe-config.md 
b/docs/en/docs/admin-manual/config/fe-config.md
index 3ca3c32c66..6a9e8b4a0b 100644
--- a/docs/en/docs/admin-manual/config/fe-config.md
+++ b/docs/en/docs/admin-manual/config/fe-config.md
@@ -1504,8 +1504,6 @@ MasterOnly:true
 
 Default non-streaming mini load timeout
 
-### broker_load_default_timeout_second
-
 Default:14400   (4 hour)
 
 IsMutable:true
@@ -2400,3 +2398,17 @@ If false, when select from tables in information_schema 
database,
 the result will not contain the information of the table in external catalog.
 This is to avoid query time when external catalog is not reachable.
 
+#### `recover_with_skip_missing_version`
+
+Default:disable
+
+IsMutable:true
+
+MasterOnly:true
+
+In some scenarios, there is an unrecoverable metadata problem in the cluster, 
and the visibleVersion of the data does not match the BE. In this case, it is still 
necessary to restore the remaining data (which may cause problems with the 
correctness of the data). Like `recover_with_empty_tablet`, this configuration 
should only be used in emergency situations.
+This configuration has three values:
+* disable: If an exception occurs, an error is reported as usual.
+* ignore_version: Ignore the visibleVersion information recorded in the FE 
partition and use the replica version instead.
+* ignore_all: In addition to the behavior of ignore_version, when a tablet has no queryable 
replica, skip that tablet instead of throwing an exception.
+
diff --git a/docs/zh-CN/docs/admin-manual/config/fe-config.md 
b/docs/zh-CN/docs/admin-manual/config/fe-config.md
index 01d6abbdc7..eb1fbe7ed8 100644
--- a/docs/zh-CN/docs/admin-manual/config/fe-config.md
+++ b/docs/zh-CN/docs/admin-manual/config/fe-config.md
@@ -1643,8 +1643,6 @@ Thrift Server最大工作线程数
 
 两个发布版本操作之间的最小间隔
 
-### `publish_version_timeout_second`
-
 默认值:30 (s)
 
 是否可以动态配置:true
@@ -2464,3 +2462,21 @@ broker load job 保存的失败tablet 信息的最大数量
 
 这个参数主要用于避免因 external catalog 无法访问、信息过多等原因导致的查询 `information_schema` 超时的问题。
 
+#### `recover_with_skip_missing_version`
+
+默认值:disable
+
+是否可以动态配置:true
+
+是否为 Master FE 节点独有的配置项:true
+
+有些场景下集群出现了不可恢复的元数据问题,数据的 visibleVersion 已经和 BE 不匹配,
+
+这种情况下仍然需要恢复剩余的数据(可能会导致数据的正确性有问题),这个配置同 `recover_with_empty_tablet` 一样只能在紧急情况下使用。
+
+这个配置有三个值:
+
+   * disable :出现异常会正常报错。
+   * ignore_version: 忽略 fe partition 中记录的visibleVersion 信息, 使用replica version 
+   * ignore_all: 除了ignore_version, 在遇到找不到可查询的replica 时,直接跳过而不是抛出异常
+
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 145b9fb894..f40e7233cd 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -1398,6 +1398,20 @@ public class Config extends ConfigBase {
     @ConfField(mutable = true, masterOnly = true)
     public static boolean recover_with_empty_tablet = false;
 
+    /**
+     * In some scenarios, there is an unrecoverable metadata problem in the 
cluster,
+     * and the visibleVersion of the data does not match be. In this case, it 
is still
+     * necessary to restore the remaining data (which may cause problems with 
the correctness of the data).
+     * Like {@code recover_with_empty_tablet}, this configuration should 
only be used in emergency situations.
+     * This configuration has three values:
+     *   disable : If an exception occurs, an error will be reported normally.
+     *   ignore_version: ignore the visibleVersion information recorded in fe 
partition, use replica version
+     *   ignore_all: In addition to ignore_version, when encountering no 
queryable replica,
+     *   skip it directly instead of throwing an exception
+     */
+    @ConfField(mutable = true, masterOnly = true)
+    public static String recover_with_skip_missing_version = "disable";
+
     /**
      * Whether to add a delete sign column when create unique table
      */
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java 
b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
index 14577c1967..bf0751ff9a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
@@ -48,6 +48,7 @@ import org.apache.doris.catalog.PartitionType;
 import org.apache.doris.catalog.Replica;
 import org.apache.doris.catalog.Tablet;
 import org.apache.doris.common.AnalysisException;
+import org.apache.doris.common.Config;
 import org.apache.doris.common.ErrorCode;
 import org.apache.doris.common.ErrorReport;
 import org.apache.doris.common.UserException;
@@ -633,6 +634,20 @@ public class OlapScanNode extends ScanNode {
         }
         for (Tablet tablet : tablets) {
             long tabletId = tablet.getId();
+            if 
(!Config.recover_with_skip_missing_version.equalsIgnoreCase("disable")) {
+                long tabletVersion = -1L;
+                for (Replica replica : tablet.getReplicas()) {
+                    if (replica.getVersion() > tabletVersion) {
+                        tabletVersion = replica.getVersion();
+                    }
+                }
+                if (tabletVersion != visibleVersion) {
+                    LOG.warn("tablet {} version {} is not equal to partition 
{} version {}",
+                            tabletId, tabletVersion, partition.getId(), 
visibleVersion);
+                    visibleVersion = tabletVersion;
+                    visibleVersionStr = String.valueOf(visibleVersion);
+                }
+            }
             TScanRangeLocations scanRangeLocations = new TScanRangeLocations();
             TPaloScanRange paloRange = new TPaloScanRange();
             paloRange.setDbName("");
@@ -708,7 +723,12 @@ public class OlapScanNode extends ScanNode {
                 scanBackendIds.add(backend.getId());
             }
             if (tabletIsNull) {
-                throw new UserException(tabletId + " have no queryable 
replicas. err: " + Joiner.on(", ").join(errs));
+                if 
(Config.recover_with_skip_missing_version.equalsIgnoreCase("ignore_all")) {
+                    continue;
+                } else {
+                    throw new UserException(tabletId + " have no queryable 
replicas. err: "
+                            + Joiner.on(", ").join(errs));
+                }
             }
             TScanRange scanRange = new TScanRange();
             scanRange.setPaloScanRange(paloRange);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to