Squashed commit of the following: commit f55a17f71ec97513a6968b1ea3c359bc6238cc5e Author: Yakov Zhdanov <yzhda...@gridgain.com> Date: Fri Jul 31 13:32:32 2015 +0300
review commit 58ca345f622dbadfba7ef2d3dce850c4baa1f319 Merge: 5f921f6 7ed4d15 Author: Yakov Zhdanov <yzhda...@gridgain.com> Date: Fri Jul 31 13:24:51 2015 +0300 Merge branches 'ignite-752-2' and 'master' of https://git-wip-us.apache.org/repos/asf/incubator-ignite into ignite-752-2 commit 5f921f62dd6563a88b2ecdde92a2b2ee8218ec95 Author: Denis Magda <dma...@gridgain.com> Date: Wed Jul 29 10:40:44 2015 +0300 ignite-752-2: added info on the lowest failure detection timeout to the documentation commit 55f0eb56967d2cc9bdf62c3fb665521a59ddaf33 Author: Denis Magda <dma...@gridgain.com> Date: Wed Jul 29 09:15:29 2015 +0300 ignite-752-2: supported connection check frequency even for cases when failure timeout is ignored; performance optimizations Project: http://git-wip-us.apache.org/repos/asf/incubator-ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-ignite/commit/44072f80 Tree: http://git-wip-us.apache.org/repos/asf/incubator-ignite/tree/44072f80 Diff: http://git-wip-us.apache.org/repos/asf/incubator-ignite/diff/44072f80 Branch: refs/heads/ignite-946 Commit: 44072f806d8d14d716475a3665d0afdf004c6db2 Parents: 7ed4d15 Author: Denis Magda <dma...@gridgain.com> Authored: Fri Jul 31 13:35:46 2015 +0300 Committer: Yakov Zhdanov <yzhda...@gridgain.com> Committed: Fri Jul 31 13:35:46 2015 +0300 ---------------------------------------------------------------------- .../ignite/spi/discovery/tcp/ServerImpl.java | 42 +++++++++++--------- .../spi/discovery/tcp/TcpDiscoverySpi.java | 2 +- 2 files changed, 24 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-ignite/blob/44072f80/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java b/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java index 547347c..47ba8e6 100644 --- a/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java +++ b/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java @@ -1787,6 +1787,9 @@ class ServerImpl extends TcpDiscoveryImpl { /** Connection check frequency. */ private long connCheckFreq; + /** Connection check threshold. */ + private long connCheckThreshold; + /** */ protected RingMessageWorker() { @@ -1799,19 +1802,22 @@ class ServerImpl extends TcpDiscoveryImpl { * Initializes connection check frequency. Used only when failure detection timeout is enabled. */ private void initConnectionCheckFrequency() { - if (spi.failureDetectionTimeoutEnabled()) { - for (int i = 3; i > 0; i--) { - connCheckFreq = spi.failureDetectionTimeout() / i; - - if (connCheckFreq > 0) - break; - } + if (spi.failureDetectionTimeoutEnabled()) + connCheckThreshold = spi.failureDetectionTimeout(); + else + connCheckThreshold = Math.min(spi.getSocketTimeout(), spi.getHeartbeatFrequency()); - assert connCheckFreq > 0; + for (int i = 3; i > 0; i--) { + connCheckFreq = connCheckThreshold / i; - if (log.isDebugEnabled()) - log.debug("Connection check frequency is calculated: " + connCheckFreq); + if (connCheckFreq > 10) + break; } + + assert connCheckFreq > 0; + + if (log.isDebugEnabled()) + log.debug("Connection check frequency is calculated: " + connCheckFreq); } /** @@ -2306,9 +2312,9 @@ class ServerImpl extends TcpDiscoveryImpl { // If node existed on connection initialization we should check // whether it has not gone yet. - if (nextNodeExists && pingNode(next)) - U.error(log, "Failed to send message to next node [msg=" + msg + - ", next=" + next + ']', err); + if (nextNodeExists) + U.warn(log, "Failed to send message to next node [msg=" + msg + ", next=" + next + + ", errMsg=" + (err != null ? err.getMessage() : "N/A") + ']'); else if (log.isDebugEnabled()) log.debug("Failed to send message to next node [msg=" + msg + ", next=" + next + ", errMsg=" + (err != null ? err.getMessage() : "N/A") + ']'); @@ -4025,7 +4031,7 @@ class ServerImpl extends TcpDiscoveryImpl { /** * Check the last time a heartbeat message received. If the time is bigger than {@code hbCheckTimeout} than - * {@link TcpDiscoveryStatusCheckMessage} is sent accros the ring. + * {@link TcpDiscoveryStatusCheckMessage} is sent across the ring. */ private void checkHeartbeatsReceiving() { if (lastTimeStatusMsgSent < locNode.lastUpdateTime()) @@ -4045,11 +4051,9 @@ class ServerImpl extends TcpDiscoveryImpl { * Check connection aliveness status. */ private void checkConnection() { - if (!spi.failureDetectionTimeoutEnabled()) - return; - - if (!failureThresholdReached && U.currentTimeMillis() - locNode.lastDataReceivedTime() - >= spi.failureDetectionTimeout() && ring.hasRemoteNodes() && spiStateCopy() == CONNECTED) { + if (spi.failureDetectionTimeoutEnabled() && !failureThresholdReached && + U.currentTimeMillis() - locNode.lastDataReceivedTime() >= connCheckThreshold && + ring.hasRemoteNodes() && spiStateCopy() == CONNECTED) { log.info("Local node seems to be disconnected from topology (failure detection timeout " + "is reached): [failureDetectionTimeout=" + spi.failureDetectionTimeout() + http://git-wip-us.apache.org/repos/asf/incubator-ignite/blob/44072f80/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoverySpi.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoverySpi.java b/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoverySpi.java index 09690dc..3216166 100644 --- a/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoverySpi.java +++ b/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoverySpi.java @@ -74,7 +74,7 @@ import java.util.concurrent.atomic.*; * {@link IgniteConfiguration#setFailureDetectionTimeout(long)}. This failure timeout automatically controls the * following parameters: {@link #getSocketTimeout()}, {@link #getAckTimeout()}, {@link #getMaxAckTimeout()}, * {@link #getReconnectCount()}. If any of those parameters is set explicitly, then the failure timeout setting will be - * ignored. + * ignored. As an example, for stable low-latency networks the failure detection timeout may be set to ~120 ms. * <p> * If it's required to perform advanced settings of failure detection and * {@link IgniteConfiguration#getFailureDetectionTimeout()} is unsuitable then various {@code TcpDiscoverySpi}