Greg Padgett has uploaded a new change for review.

Change subject: agent: temporarily drop host score upon unexpected VM shutdown
......................................................................

agent: temporarily drop host score upon unexpected VM shutdown

When the agent detects that the engine VM is shut down unexpectedly,
it will lower the score to encourage the VM to start on a different
host.

This is an optimization to assist the user in putting a host
into maintenance mode more quickly.  If it is used in this manner, the
user should remember to enable host maintenance mode, else the score
will eventually revert to its usual number and the host may re-acquire
the engine VM.

Change-Id: If11d561a39cc9723ffdd55836db693fc2aad0575
Bug-Url: https://bugzilla.redhat.com/1015724
Signed-off-by: Greg Padgett <gpadg...@redhat.com>
---
M ovirt_hosted_engine_ha/agent/constants.py.in
M ovirt_hosted_engine_ha/agent/hosted_engine.py
M ovirt_hosted_engine_ha/lib/util.py
3 files changed, 39 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha 
refs/changes/79/20279/1

diff --git a/ovirt_hosted_engine_ha/agent/constants.py.in 
b/ovirt_hosted_engine_ha/agent/constants.py.in
index 20ce115..85a13d9 100644
--- a/ovirt_hosted_engine_ha/agent/constants.py.in
+++ b/ovirt_hosted_engine_ha/agent/constants.py.in
@@ -38,6 +38,7 @@
 ENGINE_RETRY_COUNT = 3
 ENGINE_BAD_HEALTH_TIMEOUT_SECS = 300
 ENGINE_BAD_HEALTH_EXPIRATION_SECS = 600
+VM_UNEXPECTED_SHUTDOWN_EXPIRATION_SECS = 600
 INTERMITTENT_LOG_INTERVAL_SECS = 900
 MAX_VDSM_WAIT_SECS = 15
 MAX_DOMAIN_MONITOR_WAIT_SECS = 90
diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py 
b/ovirt_hosted_engine_ha/agent/hosted_engine.py
index 20ba538..702f67f 100644
--- a/ovirt_hosted_engine_ha/agent/hosted_engine.py
+++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py
@@ -201,6 +201,9 @@
         # Local timestamp when health status caused vm shutdown
         self._rinfo['bad-health-failure-time'] = None
 
+        # Local timestamp when vm was unexpectedly shut down
+        self._rinfo['unexpected-shutdown-time'] = None
+
         # Host id of local host
         self._rinfo['host-id'] = int(self._config.get(config.ENGINE,
                                                       config.HOST_ID))
@@ -507,9 +510,8 @@
 
         # re-initialize retry status variables if the retry window
         # has expired.
-        if (self._rinfo['engine-vm-retry-time'] is not None
-            and self._rinfo['engine-vm-retry-time']
-                < time.time() - constants.ENGINE_RETRY_EXPIRATION_SECS):
+        if util.has_elapsed(self._rinfo['engine-vm-retry-time'],
+                            constants.ENGINE_RETRY_EXPIRATION_SECS):
             self._rinfo['engine-vm-retry-time'] = None
             self._rinfo['engine-vm-retry-count'] = 0
             self._log.debug("Cleared retry status")
@@ -517,11 +519,16 @@
         # reset health status variable after expiration
         # FIXME it would be better to time this based on # of hosts available
         # to run the vm, not just a one-size-fits-all timeout
-        if (self._rinfo['bad-health-failure-time'] is not None
-                and self._rinfo['bad-health-failure-time']
-                < time.time() - constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS):
+        if util.has_elapsed(self._rinfo['bad-health-failure-time'],
+                            constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS):
             self._rinfo['bad-health-failure-time'] = None
             self._log.debug("Cleared bad health status")
+
+        # reset unexpected shutdown time after a specified delay
+        if util.has_elapsed(self._rinfo['unexpected-shutdown-time'],
+                            constants.VM_UNEXPECTED_SHUTDOWN_EXPIRATION_SECS):
+            self._rinfo['unexpected-shutdown-time'] = None
+            self._log.debug("Cleared unexpected shutdown status")
 
     def _generate_local_blocks(self):
         """
@@ -595,6 +602,13 @@
 
         # If engine has bad health status, let another host try
         if self._rinfo['bad-health-failure-time']:
+            score = 0
+
+        # If the VM shut down unexpectedly (user command, died, etc.), drop the
+        # score to effectively move it to another host.  This also serves as a
+        # shortcut for the user to start host maintenance mode, though it still
+        # should be set manually lest the score recover after a timeout.
+        if self._rinfo['unexpected-shutdown-time']:
             score = 0
 
         # Hosts in local maintenance mode should not run the vm
@@ -971,9 +985,12 @@
         local_host_id = self._rinfo['host-id']
         if self._rinfo['best-engine-status'][:5] != 'vm-up':
             self._log.error("Engine vm died unexpectedly")
-            return self.States.OFF, False
+            self._rinfo['unexpected-shutdown-time'] = time.time()
+            # Switch to OFF after yielding so score can adjust to 0
+            return self.States.OFF, True
         elif self._rinfo['best-engine-status-host-id'] != local_host_id:
             self._log.error("Engine vm unexpectedly running on other host")
+            self._rinfo['unexpected-shutdown-time'] = time.time()
             return self.States.OFF, True
 
         if self._rinfo['maintenance'] == self.MaintenanceMode.GLOBAL:
diff --git a/ovirt_hosted_engine_ha/lib/util.py 
b/ovirt_hosted_engine_ha/lib/util.py
index 99b28e5..b5d5e0c 100644
--- a/ovirt_hosted_engine_ha/lib/util.py
+++ b/ovirt_hosted_engine_ha/lib/util.py
@@ -24,10 +24,24 @@
 import errno
 import os
 import socket
+import time
 
 from .exceptions import DisconnectionError
 
 
+def has_elapsed(start, count, end=None):
+    """
+    Returns true if 'count' seconds have elapsed between timestamps 'start'
+    and 'end'.  If 'end' is not specified, defaults to time.time().  A
+    starting time of None results in False.
+    """
+    if start is None:
+        return False
+    if end is None:
+        end = time.time()
+    return (end - start >= count)
+
+
 def mkdir_recursive(path):
     try:
         os.makedirs(path)


-- 
To view, visit http://gerrit.ovirt.org/20279
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If11d561a39cc9723ffdd55836db693fc2aad0575
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: master
Gerrit-Owner: Greg Padgett <gpadg...@redhat.com>
_______________________________________________
Engine-patches mailing list
Engine-patches@ovirt.org
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to