Apache9 commented on code in PR #7375:
URL: https://github.com/apache/hbase/pull/7375#discussion_r2506976818
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java:
##########
@@ -440,7 +440,7 @@ boolean checkAndRecordNewServer(final ServerName
serverName, final ServerMetrics
* @param deadServersFromPE the region servers which already have a SCP
associated.
* @param liveServersFromWALDir the live region servers from wal directory.
*/
- void findDeadServersAndProcess(Set<ServerName> deadServersFromPE,
+ void findDeadServersAndProcess(Map<ServerName, Long> deadServersFromPE,
Review Comment:
Why just a parameter type change without chaging any real logic?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java:
##########
@@ -3081,7 +3082,7 @@ public ClusterMetrics
getClusterMetricsWithoutCoprocessor(EnumSet<Option> option
case REGIONS_IN_TRANSITION: {
if (assignmentManager != null) {
builder.setRegionsInTransition(
-
assignmentManager.getRegionStates().getRegionsStateInTransition());
+ new
ArrayList<>(assignmentManager.getRegionsStateInTransition()));
Review Comment:
Why this change?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java:
##########
@@ -116,22 +117,23 @@ private RegionServerInfo getServerInfo(ServerName
serverName)
* In this method, we will also construct the region server sets in {@link
ServerManager}. If a
* region server is dead between the crash of the previous master instance
and the start of the
* current master instance, we will schedule a SCP for it. This is done in
- * {@link ServerManager#findDeadServersAndProcess(Set, Set)}, we call it
here under the lock
+ * {@link ServerManager#findDeadServersAndProcess(Map, Set)}, we call it
here under the lock
* protection to prevent concurrency issues with server expiration operation.
* @param deadServersFromPE the region servers which already have
SCP associated.
* @param liveServersBeforeRestart the live region servers we recorded
before master restarts.
* @param splittingServersFromWALDir Servers whose WALs are being actively
'split'.
*/
- public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName>
liveServersBeforeRestart,
- Set<ServerName> splittingServersFromWALDir) throws KeeperException,
IOException {
+ public void upgrade(Map<ServerName, Long> deadServersFromPE,
+ Set<ServerName> liveServersBeforeRestart, Set<ServerName>
splittingServersFromWALDir)
+ throws KeeperException, IOException {
LOG.info(
"Upgrading RegionServerTracker to active master mode; {} have existing"
+ "ServerCrashProcedures, {} possibly 'live' servers, and {}
'splitting'.",
deadServersFromPE.size(), liveServersBeforeRestart.size(),
splittingServersFromWALDir.size());
// deadServersFromPE is made from a list of outstanding
ServerCrashProcedures.
// splittingServersFromWALDir are being actively split -- the directory in
the FS ends in
// '-SPLITTING'. Each splitting server should have a corresponding SCP.
Log if not.
- splittingServersFromWALDir.stream().filter(s ->
!deadServersFromPE.contains(s))
+ splittingServersFromWALDir.stream().filter(s ->
!deadServersFromPE.containsKey(s))
Review Comment:
Why we change the parameter from Set to a Map but we only use its
containsKey method?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java:
##########
@@ -232,12 +236,15 @@ public class AssignmentManager {
private final int forceRegionRetainmentRetries;
+ private final RegionInTransitionTracker regionInTransitionTracker;
Review Comment:
Why not put this into RegionStates? In this way we can reduce the code
change i think.
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java:
##########
@@ -75,6 +75,10 @@ synchronized void putIfAbsent(ServerName sn) {
this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
}
+ synchronized void putIfAbsent(ServerName sn, Long crashedTime) {
Review Comment:
Long -> long
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java:
##########
@@ -2114,8 +2115,8 @@ public BalanceResponse balance(BalanceRequest request)
throws IOException {
if (!request.isIgnoreRegionsInTransition() || metaInTransition) {
LOG.info("Not running balancer (ignoreRIT=false" + ", metaRIT=" +
metaInTransition
- + ") because " + regionsInTransition.size() + " region(s) in
transition: " + toPrint
- + (truncated ? "(truncated list)" : ""));
+ + ") because " + assignmentManager.getRegionTransitScheduledCount()
Review Comment:
Why not using regionsInTransition.size()?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java:
##########
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentSkipListMap;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableState;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.TableStateManager;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
[email protected]
+public class RegionInTransitionTracker {
+ private static final Logger LOG =
LoggerFactory.getLogger(RegionInTransitionTracker.class);
+
+ private final List<RegionState.State> DISABLE_TABLE_REGION_STATE =
+ List.of(RegionState.State.OFFLINE, RegionState.State.CLOSED);
+
+ private final List<RegionState.State> ENABLE_TABLE_REGION_STATE =
List.of(RegionState.State.OPEN);
+
+ private final ConcurrentSkipListMap<RegionInfo, RegionStateNode>
regionInTransition =
+ new ConcurrentSkipListMap<>(RegionInfo.COMPARATOR);
+
+ private final TableStateManager tableStateManager;
+
+ public RegionInTransitionTracker(TableStateManager tableStateManager) {
+ this.tableStateManager = tableStateManager;
+ }
+
+ public boolean isRegionInTransition(final RegionInfo regionInfo) {
+ return regionInTransition.containsKey(regionInfo);
+ }
+
+ public void regionCrashed(RegionStateNode regionStateNode) {
Review Comment:
So this is only for regions on crashed region server?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java:
##########
@@ -382,7 +391,7 @@ public void setupRIT(List<TransitRegionStateProcedure>
procs) {
return;
}
}
- LOG.info("Attach {} to {} to restore RIT", proc, regionNode);
+ LOG.info("Attach {} to {} to restore", proc, regionNode);
Review Comment:
Maybe just remove 'to restore' too?
```
LOG.info("Attach {} to {}", proc, regionNode);
```
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java:
##########
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentSkipListMap;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableState;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.TableStateManager;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
[email protected]
+public class RegionInTransitionTracker {
Review Comment:
better add some javadocs to descrive the usage of this class.
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java:
##########
@@ -246,13 +246,14 @@ protected static void waitMetaRegions(final
MasterProcedureEnv env) throws IOExc
protected static void waitRegionInTransition(final MasterProcedureEnv env,
final List<RegionInfo> regions) throws IOException {
- final RegionStates states = env.getAssignmentManager().getRegionStates();
+ final AssignmentManager assignmentManager = env.getAssignmentManager();
Review Comment:
Why this change?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java:
##########
@@ -96,17 +96,17 @@ public AssignmentProcedureEvent(final RegionInfo
regionInfo) {
/**
* Updated whenever a call to {@link #setRegionLocation(ServerName)} or
- * {@link #setState(RegionState.State, RegionState.State...)}.
+ * {@link #setState(RegionState.State, RegionState.State...)} or {@link
#crashed(long)}.
*/
private volatile long lastUpdate = 0;
private volatile long openSeqNum = HConstants.NO_SEQNUM;
- RegionStateNode(RegionInfo regionInfo, ConcurrentMap<RegionInfo,
RegionStateNode> ritMap) {
+ RegionStateNode(RegionInfo regionInfo, AtomicInteger trspCounter) {
Review Comment:
Or maybe we could pass the RegionInTransitionTracker in?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java:
##########
@@ -68,6 +68,7 @@
public class RegionStateNode implements Comparable<RegionStateNode> {
private static final Logger LOG =
LoggerFactory.getLogger(RegionStateNode.class);
+ private final AtomicInteger trspCounter;
Review Comment:
Better give it a more clear name. At the first place I though it is the
count for this region in transition state...
After checking the code I noticed that this is total regions in transition...
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java:
##########
@@ -208,6 +208,7 @@ protected Flow executeFromState(MasterProcedureEnv env,
ServerCrashState state)
if (LOG.isTraceEnabled()) {
this.regionsOnCrashedServer.stream().forEach(ri ->
LOG.trace(ri.getShortNameToLog()));
}
+
env.getAssignmentManager().markRegionsAsCrashed(regionsOnCrashedServer, this);
Review Comment:
This is only for normal regions, when do we call this method for meta region?
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java:
##########
@@ -1873,6 +1881,20 @@ public void visitRegionState(Result result, final
RegionInfo regionInfo, final S
if (regionNode.getProcedure() != null) {
regionNode.getProcedure().stateLoaded(AssignmentManager.this,
regionNode);
}
+ // add regions to RIT while visiting the meta
+ regionInTransitionTracker.handleRegionStateNodeOperation(regionNode);
+ // If region location of region belongs to a dead server mark the region
crashed
+ if (
+ regionNode.getRegionLocation() != null
+ &&
master.getServerManager().isServerDead(regionNode.getRegionLocation())
+ ) {
+ Date timeOfCrash =
Review Comment:
Not your fault but we should not use Date any more...
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java:
##########
@@ -440,7 +440,7 @@ boolean checkAndRecordNewServer(final ServerName
serverName, final ServerMetrics
* @param deadServersFromPE the region servers which already have a SCP
associated.
* @param liveServersFromWALDir the live region servers from wal directory.
*/
- void findDeadServersAndProcess(Set<ServerName> deadServersFromPE,
+ void findDeadServersAndProcess(Map<ServerName, Long> deadServersFromPE,
Review Comment:
I think the value is the crash time? We should use it when setting up dead
servers?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]