Greg Padgett has uploaded a new change for review. Change subject: agent: temporarily drop host score upon unexpected VM shutdown ......................................................................
agent: temporarily drop host score upon enexpected VM shutdown When the agent detects that the engine VM is shut down unexpectedly, it will lower the score to encourage the VM to start on a different host. This is an optimization to assist the user in putting a host into maintenance mode more quickly. If it is used in this manner, the user should remember to enable host maintenance mode, else the score will eventually revert to its usual number and the host may re-acquire the engine VM. Change-Id: If11d561a39cc9723ffdd55836db693fc2aad0575 Bug-Url: https://bugzilla.redhat.com/1015724 Signed-off-by: Greg Padgett <gpadg...@redhat.com> --- M ovirt_hosted_engine_ha/agent/constants.py.in M ovirt_hosted_engine_ha/agent/hosted_engine.py M ovirt_hosted_engine_ha/lib/util.py 3 files changed, 39 insertions(+), 7 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha refs/changes/79/20279/1 diff --git a/ovirt_hosted_engine_ha/agent/constants.py.in b/ovirt_hosted_engine_ha/agent/constants.py.in index 20ce115..85a13d9 100644 --- a/ovirt_hosted_engine_ha/agent/constants.py.in +++ b/ovirt_hosted_engine_ha/agent/constants.py.in @@ -38,6 +38,7 @@ ENGINE_RETRY_COUNT = 3 ENGINE_BAD_HEALTH_TIMEOUT_SECS = 300 ENGINE_BAD_HEALTH_EXPIRATION_SECS = 600 +VM_UNEXPECTED_SHUTDOWN_EXPIRATION_SECS = 600 INTERMITTENT_LOG_INTERVAL_SECS = 900 MAX_VDSM_WAIT_SECS = 15 MAX_DOMAIN_MONITOR_WAIT_SECS = 90 diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py b/ovirt_hosted_engine_ha/agent/hosted_engine.py index 20ba538..702f67f 100644 --- a/ovirt_hosted_engine_ha/agent/hosted_engine.py +++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py @@ -201,6 +201,9 @@ # Local timestamp when health status caused vm shutdown self._rinfo['bad-health-failure-time'] = None + # Local timestamp when vm was unexpectedly shut down + self._rinfo['unexpected-shutdown-time'] = None + # Host id of local host self._rinfo['host-id'] = int(self._config.get(config.ENGINE, config.HOST_ID)) @@ -507,9 +510,8 @@ # 
re-initialize retry status variables if the retry window # has expired. - if (self._rinfo['engine-vm-retry-time'] is not None - and self._rinfo['engine-vm-retry-time'] - < time.time() - constants.ENGINE_RETRY_EXPIRATION_SECS): + if util.has_elapsed(self._rinfo['engine-vm-retry-time'], + constants.ENGINE_RETRY_EXPIRATION_SECS): self._rinfo['engine-vm-retry-time'] = None self._rinfo['engine-vm-retry-count'] = 0 self._log.debug("Cleared retry status") @@ -517,11 +519,16 @@ # reset health status variable after expiration # FIXME it would be better to time this based on # of hosts available # to run the vm, not just a one-size-fits-all timeout - if (self._rinfo['bad-health-failure-time'] is not None - and self._rinfo['bad-health-failure-time'] - < time.time() - constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS): + if util.has_elapsed(self._rinfo['bad-health-failure-time'], + constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS): self._rinfo['bad-health-failure-time'] = None self._log.debug("Cleared bad health status") + + # reset unexpected shutdown time after a specified delay + if util.has_elapsed(self._rinfo['unexpected-shutdown-time'], + constants.VM_UNEXPECTED_SHUTDOWN_EXPIRATION_SECS): + self._rinfo['unexpected-shutdown-time'] = None + self._log.debug("Cleared unexpected shutdown status") def _generate_local_blocks(self): """ @@ -595,6 +602,13 @@ # If engine has bad health status, let another host try if self._rinfo['bad-health-failure-time']: + score = 0 + + # If the VM shut down unexpectedly (user command, died, etc.), drop the + # score to effectively move it to another host. This also serves as a + # shortcut for the user to start host maintenance mode, though it still + # should be set manually lest the score recover after a timeout. 
+ if self._rinfo['unexpected-shutdown-time']: score = 0 # Hosts in local maintenance mode should not run the vm @@ -971,9 +985,12 @@ local_host_id = self._rinfo['host-id'] if self._rinfo['best-engine-status'][:5] != 'vm-up': self._log.error("Engine vm died unexpectedly") - return self.States.OFF, False + self._rinfo['unexpected-shutdown-time'] = time.time() + # Switch to OFF after yielding so score can adjust to 0 + return self.States.OFF, True elif self._rinfo['best-engine-status-host-id'] != local_host_id: self._log.error("Engine vm unexpectedly running on other host") + self._rinfo['unexpected-shutdown-time'] = time.time() return self.States.OFF, True if self._rinfo['maintenance'] == self.MaintenanceMode.GLOBAL: diff --git a/ovirt_hosted_engine_ha/lib/util.py b/ovirt_hosted_engine_ha/lib/util.py index 99b28e5..b5d5e0c 100644 --- a/ovirt_hosted_engine_ha/lib/util.py +++ b/ovirt_hosted_engine_ha/lib/util.py @@ -24,10 +24,24 @@ import errno import os import socket +import time from .exceptions import DisconnectionError +def has_elapsed(start, count, end=None): + """ + Returns true if 'count' seconds have elapsed between timestamps 'start' + and 'end'. If 'end' is not specified, defaults to time.time(). A + starting time of None results in False. + """ + if start is None: + return False + if end is None: + end = time.time() + return (end - start >= count) + + def mkdir_recursive(path): try: os.makedirs(path) -- To view, visit http://gerrit.ovirt.org/20279 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: If11d561a39cc9723ffdd55836db693fc2aad0575 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-hosted-engine-ha Gerrit-Branch: master Gerrit-Owner: Greg Padgett <gpadg...@redhat.com> _______________________________________________ Engine-patches mailing list Engine-patches@ovirt.org http://lists.ovirt.org/mailman/listinfo/engine-patches