Greg Padgett has uploaded a new change for review.

Change subject: agent, broker: additional vm control including migration
......................................................................

agent, broker: additional vm control including migration

Add logic to handle migration and fix up other transition logic.
Broker files were changed only to move run_vds_client_cmd() to lib.

Change-Id: I842f30e6d1533ae4be227508999cd96b3e4963c3
Signed-off-by: Greg Padgett <gpadg...@redhat.com>
---
M .gitignore
M ovirt_hosted_engine_ha/agent/constants.py.in
M ovirt_hosted_engine_ha/agent/hosted_engine.py
M ovirt_hosted_engine_ha/broker/Makefile.am
M ovirt_hosted_engine_ha/broker/constants.py.in
M ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
M ovirt_hosted_engine_ha/broker/submonitors/mem_free.py
M ovirt_hosted_engine_ha/broker/submonitors/mem_load.py
M ovirt_hosted_engine_ha/broker/submonitors/mgmt_bridge.py
M ovirt_hosted_engine_ha/lib/Makefile.am
A ovirt_hosted_engine_ha/lib/constants.py.in
R ovirt_hosted_engine_ha/lib/vds_client.py
12 files changed, 332 insertions(+), 81 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha 
refs/changes/31/18231/1

diff --git a/.gitignore b/.gitignore
index 2eb2b81..f4eb52e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@
 /ovirt_hosted_engine_ha/agent/constants.py
 /ovirt_hosted_engine_ha/broker/config.log
 /ovirt_hosted_engine_ha/broker/constants.py
+/ovirt_hosted_engine_ha/lib/config.log
+/ovirt_hosted_engine_ha/lib/constants.py
 /tmp.repos/
 
 # Autotools
diff --git a/ovirt_hosted_engine_ha/agent/constants.py.in 
b/ovirt_hosted_engine_ha/agent/constants.py.in
index b1721af..a759512 100644
--- a/ovirt_hosted_engine_ha/agent/constants.py.in
+++ b/ovirt_hosted_engine_ha/agent/constants.py.in
@@ -36,6 +36,8 @@
 HOST_ALIVE_TIMEOUT_SECS = 60
 ENGINE_RETRY_EXPIRATION_SECS = 600
 ENGINE_RETRY_COUNT = 3
+ENGINE_BAD_HEALTH_TIMEOUT_SECS = 300
+ENGINE_BAD_HEALTH_EXPIRATION_SECS = 600
 INTERMITTENT_LOG_INTERVAL_SECS = 900
 MAX_VDSM_WAIT_SECS = 15
 
diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py 
b/ovirt_hosted_engine_ha/agent/hosted_engine.py
index 9595a21..62e5276 100644
--- a/ovirt_hosted_engine_ha/agent/hosted_engine.py
+++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py
@@ -31,6 +31,7 @@
 from . import constants
 from ..lib import exceptions as ex
 from ..lib import log_filter
+from ..lib import vds_client as vdsc
 
 
 class HostedEngine(object):
@@ -38,6 +39,8 @@
     LF_HOST_UPDATE = 'LF_HOST_UPDATE'
     LF_HOST_UPDATE_DETAIL = 'LF_HOST_UPDATE_DETAIL'
     LF_ENGINE_HEALTH = 'LF_ENGINE_HEALTH'
+
+    MIGRATION_THRESHOLD_SCORE = 800
 
     engine_status_score_lookup = {
         'None': 0,
@@ -54,6 +57,13 @@
         STOP = 'STOP'
         MIGRATE = 'MIGRATE'
 
+    class MigrationStatus(object):
+        PENDING = 'PENDING'
+        STARTED = 'STARTED'
+        IN_PROGRESS = 'IN_PROGRESS'
+        DONE = 'DONE'
+        FAILURE = 'FAILURE'
+
     def __init__(self, shutdown_requested_callback):
         """
         Initialize hosted engine monitoring logic.  shutdown_requested_callback
@@ -69,8 +79,8 @@
         self._broker = None
         self._required_monitors = self._get_required_monitors()
         self._local_monitors = {}
-        self._local_state = {}
-        self._init_local_state()
+        self._rinfo = {}
+        self._init_runtime_info()
         self._all_host_stats = {}
 
         self._sd_path = None
@@ -144,22 +154,50 @@
         })
         return req
 
-    def _init_local_state(self):
+    def _init_runtime_info(self):
         """
-        Initialize self._local_state dict (and document the entries).
+        Initialize self._rinfo dict (and document the entries).
         """
         # Local timestamp of most recent engine vm startup attempt
-        self._local_state['last-engine-retry'] = 0
+        self._rinfo['engine-vm-retry-time'] = None
 
         # Count of recent engine vm startup attempts
-        self._local_state['engine-retries'] = 0
+        self._rinfo['engine-vm-retry-count'] = 0
+
+        # Local timestamp when health status caused vm shutdown
+        self._rinfo['bad-health-failure-time'] = None
 
         # Host id of local host
-        self._local_state['host-id'] = int(self._config.get(config.ENGINE,
-                                                            config.HOST_ID))
+        self._rinfo['host-id'] = int(self._config.get(config.ENGINE,
+                                                      config.HOST_ID))
 
         # Initial state to track engine vm status in state machine
-        self._local_state['current-state'] = self.States.ENTRY
+        self._rinfo['current-state'] = self.States.ENTRY
+
+        # The following are initialized when needed to process engine actions
+
+        # Used to denote best-ranked engine status of all live hosts
+        # 'best-engine-status'
+        # 'best-engine-status-host-id'
+
+        # Highest score of all hosts, and host-id with that score
+        # 'best-score'
+        # 'best-score-host-id'
+
+        # Current state machine state, member of self.States
+        # 'current-state'
+
+        # State of maintenance bit, True/False
+        # 'maintenance'
+
+        # Used by ON state; tracks time when bad status first seen, cleared
+        # on either state change due to healthy state or timeout
+        # 'first-bad-status-time'
+
+        # Used by ON and MIGRATE state, tracks status of migration (element of
+        # self.MigrationStatus) and host id to which migration is occurring
+        # 'migration-host-id'
+        # 'migration-status'
 
     def _get_lf_args(self, lf_class):
         return {'lf_class': lf_class,
@@ -295,7 +333,7 @@
     def _initialize_sanlock(self):
         self._cond_start_service('sanlock')
 
-        host_id = self._local_state['host-id']
+        host_id = self._rinfo['host-id']
         self._metadata_dir = os.path.join(self._sd_path,
                                           constants.SD_METADATA_DIR)
         lease_file = os.path.join(self._metadata_dir,
@@ -353,13 +391,21 @@
 
         # re-initialize retry status variables if the retry window
         # has expired.
-        if ((self._local_state['last-engine-retry'] != 0
-             or self._local_state['engine-retries'] != 0)
-            and self._local_state['last-engine-retry'] + time.time()
-                < constants.ENGINE_RETRY_EXPIRATION_SECS):
-            self._local_state['last-engine-retry'] = 0
-            self._local_state['engine-retries'] = 0
+        if (self._rinfo['engine-vm-retry-time'] is not None
+            and self._rinfo['engine-vm-retry-time']
+                < time.time() - constants.ENGINE_RETRY_EXPIRATION_SECS):
+            self._rinfo['engine-vm-retry-time'] = None
+            self._rinfo['engine-vm-retry-count'] = 0
             self._log.debug("Cleared retry status")
+
+        # reset health status variable after expiration
+        # FIXME it would be better to time this based on # of hosts available
+        # to run the vm, not just a one-size-fits-all timeout
+        if (self._rinfo['bad-health-failure-time'] is not None
+                and self._rinfo['bad-health-failure-time']
+                < time.time() - constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS):
+            self._rinfo['bad-health-failure-time'] = None
+            self._log.debug("Cleared bad health status")
 
     def _generate_local_blocks(self):
         """
@@ -423,10 +469,16 @@
 
         # Subtracting a small amount each time causes round-robin attempts
         # between hosts that are otherwise equally suited to run the engine
-        score -= 50 * self._local_state['engine-retries']
+        score -= 50 * self._rinfo['engine-vm-retry-count']
+        score = max(0, score)
 
         # If too many retries occur, give a less-suited host a chance
-        if self._local_state['engine-retries'] > constants.ENGINE_RETRY_COUNT:
+        if (self._rinfo['engine-vm-retry-count']
+                > constants.ENGINE_RETRY_COUNT):
+            score = 0
+
+        # If engine has bad health status, let another host try
+        if self._rinfo['bad-health-failure-time']:
             score = 0
 
         ts = int(time.time())
@@ -435,7 +487,7 @@
                 .format(md_parse_vers=constants.METADATA_PARSE_VERSION,
                         md_feature_vers=constants.METADATA_FEATURE_VERSION,
                         ts_int=ts,
-                        host_id=self._local_state['host-id'],
+                        host_id=self._rinfo['host-id'],
                         score=score,
                         engine_status=lm['engine-health']['status'],
                         name=socket.gethostname()))
@@ -452,7 +504,7 @@
                         md_feature_vers=constants.METADATA_FEATURE_VERSION,
                         ts_int=ts,
                         ts_str=time.ctime(ts),
-                        host_id=self._local_state['host-id'],
+                        host_id=self._rinfo['host-id'],
                         score=score))
         for (k, v) in sorted(lm.iteritems()):
             info += "{0}={1}\n".format(k, str(v['status']))
@@ -575,13 +627,13 @@
         """
         Start or stop engine on current host based on hosts' statistics.
         """
-        local_host_id = self._local_state['host-id']
+        local_host_id = self._rinfo['host-id']
 
         if self._all_host_stats[local_host_id]['engine-status'] == 'None':
             self._log.info("Unknown local engine vm status, no actions taken")
             return
 
-        cur_stats = {
+        rinfo = {
             'best-engine-status':
             self._all_host_stats[local_host_id]['engine-status'],
             'best-engine-status-host-id': local_host_id,
@@ -601,28 +653,30 @@
 
             if self._get_engine_status_score(stats['engine-status']) \
                     > self._get_engine_status_score(
-                        cur_stats['best-engine-status']):
-                cur_stats['best-engine-status'] = stats['engine-status']
-                cur_stats['best-engine-status-host-id'] = host_id
+                        rinfo['best-engine-status']):
+                rinfo['best-engine-status'] = stats['engine-status']
+                rinfo['best-engine-status-host-id'] = host_id
             # Prefer local score if equal to remote score
-            if stats['score'] > cur_stats['best-score']:
-                cur_stats['best-score'] = stats['score']
-                cur_stats['best-score-host-id'] = host_id
+            if stats['score'] > rinfo['best-score']:
+                rinfo['best-score'] = stats['score']
+                rinfo['best-score-host-id'] = host_id
 
         # FIXME set maintenance flag
-        cur_stats['maintenance'] = False
+        rinfo['maintenance'] = False
 
-        self._cur_stats = cur_stats
+        self._rinfo.update(rinfo)
 
-        state = self._local_state['current-state']
         yield_ = False
         # Process the states until it's time to sleep, indicated by the
         # state handler returning yield_ as True.
         while not yield_:
-            self._log.debug("Processing engine state %s", state)
-            state, yield_ = self._vm_state_actions[state]()
-        self._log.debug("Next engine state %s", state)
-        self._local_state['current-state'] = state
+            self._log.debug("Processing engine state %s",
+                            self._rinfo['current-state'])
+            self._rinfo['current-state'], yield_ \
+                = self._vm_state_actions[self._rinfo['current-state']]()
+
+        self._log.debug("Next engine state %s",
+                        self._rinfo['current-state'])
 
     def _get_engine_status_score(self, status):
         """
@@ -635,11 +689,24 @@
             self._log.error("Invalid engine status: %s", status, exc_info=True)
             return 0
 
+    def handler_cleanup(f):
+        """
+        Call a cleanup function when transitioning out of a state
+        (i.e. when the handler returns a state other than its own)
+        """
+        def cleanup_wrapper(self):
+            ret = f(self)
+            if ret[0] != self._rinfo['current-state']:
+                cleanup_fn = f.__name__ + '_cleanup'
+                getattr(self, cleanup_fn)()
+            return ret
+        return cleanup_wrapper
+
     def _handle_entry(self):
         """
         ENTRY state.  Determine current vm state and switch appropriately.
         """
-        local_host_id = self._local_state['host-id']
+        local_host_id = self._rinfo['host-id']
         if self._all_host_stats[local_host_id]['engine-status'][:5] == 'vm-up':
             return self.States.ON, False
         else:
@@ -650,17 +717,13 @@
         OFF state.  Check if any conditions warrant starting the vm, and
         check if it was started externally.
         """
-        local_host_id = self._local_state['host-id']
+        local_host_id = self._rinfo['host-id']
 
-        if self._cur_stats['best-engine-status'][:5] == 'vm-up':
-            # FIXME timeout for bad-host-status: if up and no engine, try to
-            # migrate; if can't migrate, reduce local score and shut down
-            engine_host_id = self._cur_stats['best-engine-status-host-id']
+        if self._rinfo['best-engine-status'][:5] == 'vm-up':
+            engine_host_id = self._rinfo['best-engine-status-host-id']
             if engine_host_id == local_host_id:
                 self._log.info("Engine vm unexpectedly running locally,"
                                " monitoring vm")
-                # FIXME maintenance bit should prevent this transition; in
-                # fact, it should trigger STOP and then IDLE or similar
                 return self.States.ON, False
             else:
                 self._log.info(
@@ -675,14 +738,14 @@
 
         # FIXME cluster-wide engine maintenance bit
 
-        if self._cur_stats['best-score-host-id'] != local_host_id:
+        if self._rinfo['best-score-host-id'] != local_host_id:
             self._log.info("Engine down, local host does not have best score",
                            extra=self._get_lf_args(self.LF_ENGINE_HEALTH))
             return self.States.OFF, True
 
         self._log.error("Engine down and local host has best score (%d),"
                         " attempting to start engine VM",
-                        self._cur_stats['best-score'],
+                        self._rinfo['best-score'],
                         extra=self._get_lf_args(self.LF_ENGINE_HEALTH))
         return self.States.START, False
 
@@ -694,16 +757,16 @@
             self._start_engine_vm()
         except Exception as e:
             self._log.error("Failed to start engine VM: %s", str(e))
-            self._local_state['last-engine-retry'] = time.time()
-            self._local_state['engine-retries'] += 1
+            # FIXME these sorts of tracking vars could be put in an audit log
+            self._rinfo['engine-vm-retry-time'] = int(time.time())
+            self._rinfo['engine-vm-retry-count'] += 1
             # TODO mail for error (each time, or after n retries?)
             # OFF handler will retry based on host score
             return self.States.OFF, True
         else:
-            self._local_state['last-engine-retry'] = 0
-            self._local_state['engine-retries'] = 0
+            self._rinfo['engine-vm-retry-time'] = None
+            self._rinfo['engine-vm-retry-count'] = 0
             return self.States.ON, True
-            # TODO should we stay in START until success/timeout?
 
     def _start_engine_vm(self):
         self._log.info("Starting vm using `%s --vm-start`",
@@ -727,36 +790,185 @@
             raise Exception(output[1])
 
+        self._log.info("Engine VM started on localhost")
-        # FIXME record start time in order to track bad-health-status timeout
         return
 
+    @handler_cleanup
     def _handle_on(self):
         """
         ON state.  See if the VM was stopped or needs to be stopped.
         """
-        local_host_id = self._local_state['host-id']
-        if self._cur_stats['best-engine-status'][:5] != 'vm-up':
+        local_host_id = self._rinfo['host-id']
+        if self._rinfo['best-engine-status'][:5] != 'vm-up':
             self._log.error("Engine vm died unexpectedly")
             return self.States.OFF, False
-        elif self._cur_stats['best-engine-status-host-id'] != local_host_id:
+        elif self._rinfo['best-engine-status-host-id'] != local_host_id:
             self._log.error("Engine vm unexpectedly running on other host")
             return self.States.OFF, True
 
-        # FIXME migration if other hosts are found to be significantly better
-        # TODO check for health, just just liveness
+        # FIXME maintenance bit should cause transition to STOP
+
+        best_host_id = self._rinfo['best-score-host-id']
+        if (best_host_id != local_host_id
+                and self._rinfo['best-score']
+                >= self._all_host_stats[local_host_id]['score']
+                + self.MIGRATION_THRESHOLD_SCORE):
+            self._log.error("Host %s (id %d) score is significantly better"
+                            " than local score, migrating vm",
+                            self._all_host_stats[best_host_id]['hostname'],
+                            best_host_id)
+            self._rinfo['migration-host-id'] = best_host_id
+            self._rinfo['migration-status'] = self.MigrationStatus.PENDING
+            return self.States.MIGRATE, False
+
+        if self._rinfo['best-engine-status'] == 'vm-up bad-health-status':
+            now = int(time.time())
+            if 'first-bad-status-time' not in self._rinfo:
+                self._rinfo['first-bad-status-time'] = now
+            timeout = (constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS
+                       - (now - self._rinfo['first-bad-status-time']))
+            if timeout > 0:
+                self._log.error("Engine VM has bad health status,"
+                                " timeout in %d seconds", timeout)
+                return self.States.ON, True
+            else:
+                self._log.error("Engine VM timed out with bad health status"
+                                " after %d seconds, restarting",
+                                constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS)
+                self._rinfo['bad-health-failure-time'] = now
+                # FIXME how do we avoid this for cases like vm running fsck?
+                return self.States.STOP, False
+
         self._log.info("Engine vm running on localhost")
         return self.States.ON, True
 
+    def _handle_on_cleanup(self):
+        if 'first-bad-status-time' in self._rinfo:
+            del self._rinfo['first-bad-status-time']
+
+    @handler_cleanup
     def _handle_stop(self):
         """
         STOP state.  Shut down the locally-running vm.
         """
-        # FIXME currently unused
-        return self.States.STOP, True
+        local_host_id = self._rinfo['host-id']
+        if (self._rinfo['best-engine-status'][:5] != 'vm-up'
+                or self._rinfo['best-engine-status-host-id'] != local_host_id):
+            self._log.info("Engine vm not running on local host")
+            return self.States.OFF, True
 
+        force = False
+        if self._rinfo.get('engine-vm-shutdown-time'):
+            elapsed = int(time.time()) - self._rinfo['engine-vm-shutdown-time']
+            if elapsed > constants.ENGINE_BAD_HEALTH_TIMEOUT_SECS:
+                force = True
+
+        try:
+            self._stop_engine_vm(force)
+        except Exception as e:
+            self._log.error("Failed to stop engine VM: %s", str(e))
+            # Allow rediscovery of vm state.  Yield in case the state
+            # machine ends up immediately in the STOP state again.
+            return self.States.ENTRY, True
+
+        if force:
+            return self.States.OFF, True
+        else:
+            if 'engine-vm-shutdown-time' not in self._rinfo:
+                self._rinfo['engine-vm-shutdown-time'] = int(time.time())
+            return self.States.STOP, True
+
+    def _handle_stop_cleanup(self):
+        if 'engine-vm-shutdown-time' in self._rinfo:
+            del self._rinfo['engine-vm-shutdown-time']
+
+    def _stop_engine_vm(self, force):
+        cmd = '--vm-poweroff' if force else '--vm-shutdown'
+        self._log.info("Shutting down vm using `%s %s`",
+                       constants.HOSTED_ENGINE_BINARY, cmd)
+        p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY, cmd],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        output = p.communicate()
+        self._log.info("stdout: %s", output[0])
+        self._log.info("stderr: %s", output[1])
+        if (p.returncode != 0
+                and not output[0].startswith(
+                "Virtual machine does not exist")):
+            self._log.error("Failed to stop engine vm with %s %s: %s",
+                            constants.HOSTED_ENGINE_BINARY, cmd, output[1])
+            raise Exception(output[1])
+
+        self._log.info("Engine VM stopped on localhost")
+        return
+
+    @handler_cleanup
     def _handle_migrate(self):
         """
         MIGRATE state.  Move the VM to the destination host.
         """
-        # FIXME currently unused
-        return self.States.MIGRATE, True
+        vm_id = self._config.get(config.VM, config.VM_UUID)
+        best_host_id = self._rinfo['migration-host-id']
+        if self._rinfo['migration-status'] == self.MigrationStatus.PENDING:
+            try:
+                vdsc.run_vds_client_cmd(
+                    '0',
+                    self._config.get(config.ENGINE, config.VDSM_SSL),
+                    'migrate',
+                    vmId=vm_id,
+                    method='online',
+                    src='localhost',
+                    dst=best_host_id,
+                )
+            except Exception:
+                self._log.error("Failed to start migration to host id %d",
+                                best_host_id, exc_info=True)
+                self._rinfo['migration-status'] = self.MigrationStatus.FAILURE
+            else:
+                self._log.info("Started migration to host %s (id %d)",
+                               self._all_host_stats[best_host_id]['hostname'],
+                               best_host_id)
+                self._rinfo['migration-status'] \
+                    = self.MigrationStatus.IN_PROGRESS
+
+        else:
+            res = vdsc.run_vds_client_cmd(
+                '0',
+                self._config.get(config.ENGINE, config.VDSM_SSL),
+                'migrateStatus',
+                vmId=vm_id
+            )
+            self._log.info("Migration status: %s", res['status']['message'])
+
+            if res['status']['message'].startswith('Migration in progress'):
+                self._rinfo['migration-status'] \
+                    = self.MigrationStatus.IN_PROGRESS
+            elif res['status']['message'].startswith('Migration done'):
+                self._rinfo['migration-status'] \
+                    = self.MigrationStatus.DONE
+            else:
+                self._rinfo['migration-status'] \
+                    = self.MigrationStatus.FAILURE
+
+        self._log.debug("Symbolic migration status is %s",
+                        self._rinfo['migration-status'])
+
+        if self._rinfo['migration-status'] == self.MigrationStatus.IN_PROGRESS:
+            self._log.info("Continuing to monitor migration")
+            return self.States.MIGRATE, True
+        elif self._rinfo['migration-status'] == self.MigrationStatus.DONE:
+            self._log.info("Migration to host %s (id %d) complete,"
+                           " no longer monitoring vm",
+                           self._all_host_stats[best_host_id]['hostname'],
+                           best_host_id)
+            return self.States.OFF, True
+        elif self._rinfo['migration-status'] == self.MigrationStatus.FAILURE:
+            self._log.error("Migration to host %s (id %d) failed",
+                            self._all_host_stats[best_host_id]['hostname'],
+                            best_host_id)
+            return self.States.STOP, False
+        else:
+            self._log.error("Unexpected migration state, migration failed")
+            return self.States.STOP, False
+
+    def _handle_migrate_cleanup(self):
+        if 'migration-host-id' in self._rinfo:
+            del self._rinfo['migration-host-id']
+        if 'migration-status' in self._rinfo:
+            del self._rinfo['migration-status']
diff --git a/ovirt_hosted_engine_ha/broker/Makefile.am 
b/ovirt_hosted_engine_ha/broker/Makefile.am
index 9762890..47227ec 100644
--- a/ovirt_hosted_engine_ha/broker/Makefile.am
+++ b/ovirt_hosted_engine_ha/broker/Makefile.am
@@ -44,7 +44,6 @@
        monitor.py \
        storage_broker.py \
        submonitor_base.py \
-       submonitor_util.py \
        $(NULL)
 
 broker_PYTHON = \
diff --git a/ovirt_hosted_engine_ha/broker/constants.py.in 
b/ovirt_hosted_engine_ha/broker/constants.py.in
index 5de6b0b..7a04b48 100644
--- a/ovirt_hosted_engine_ha/broker/constants.py.in
+++ b/ovirt_hosted_engine_ha/broker/constants.py.in
@@ -35,6 +35,3 @@
 VDSM_GROUP = '@VDSM_GROUP@'
 
 HOSTED_ENGINE_BINARY = '@ENGINE_SETUP_BINDIR@/hosted-engine'
-VDS_CLIENT_DIR = '/usr/share/vdsm'
-VDS_CLIENT_SSL = True
-VDS_CLIENT_MAX_RETRY = 3
diff --git a/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py 
b/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
index da0cae8..3578c0b 100644
--- a/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
+++ b/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
@@ -22,9 +22,9 @@
 
 from ovirt_hosted_engine_ha.broker import constants
 from ovirt_hosted_engine_ha.broker import submonitor_base
-from ovirt_hosted_engine_ha.broker import submonitor_util as sm_util
-from ovirt_hosted_engine_ha.lib import util as util
 from ovirt_hosted_engine_ha.lib import exceptions as exceptions
+from ovirt_hosted_engine_ha.lib import util as util
+from ovirt_hosted_engine_ha.lib import vds_client as vdsc
 
 
 def register():
@@ -49,8 +49,8 @@
     def action(self, options):
         # First, see if vdsm tells us it's up
         try:
-            stats = sm_util.run_vds_client_cmd(self._address, self._use_ssl,
-                                               'getVmStats', self._vm_uuid)
+            stats = vdsc.run_vds_client_cmd(self._address, self._use_ssl,
+                                            'getVmStats', self._vm_uuid)
         except Exception as e:
             if isinstance(e, exceptions.DetailedError) \
                     and e.detail == "Virtual machine does not exist":
@@ -63,6 +63,10 @@
                 self.update_result(None)
                 return
         vm_status = stats['statsList'][0]['status']
+        if vm_status.lower() == 'powering up':
+            self._log.info("VM powering up")
+            self.update_result('vm-up bad-health-status')
+            return
         if vm_status.lower() != 'up':
             self._log.info("VM not running on this host, status %s", vm_status)
             self.update_result('vm-down')
diff --git a/ovirt_hosted_engine_ha/broker/submonitors/mem_free.py 
b/ovirt_hosted_engine_ha/broker/submonitors/mem_free.py
index 1b82e4b..34b0bcc 100644
--- a/ovirt_hosted_engine_ha/broker/submonitors/mem_free.py
+++ b/ovirt_hosted_engine_ha/broker/submonitors/mem_free.py
@@ -20,8 +20,8 @@
 import logging
 
 from ovirt_hosted_engine_ha.broker import submonitor_base
-from ovirt_hosted_engine_ha.broker import submonitor_util as sm_util
 from ovirt_hosted_engine_ha.lib import util as util
+from ovirt_hosted_engine_ha.lib import vds_client as vdsc
 
 
 def register():
@@ -39,8 +39,8 @@
 
     def action(self, options):
         try:
-            response = sm_util.run_vds_client_cmd(self._address, self._use_ssl,
-                                                  'getVdsStats')
+            response = vdsc.run_vds_client_cmd(self._address, self._use_ssl,
+                                               'getVdsStats')
         except Exception as e:
             self._log.error("Failed to getVdsStats: %s", str(e))
             self.update_result(None)
diff --git a/ovirt_hosted_engine_ha/broker/submonitors/mem_load.py 
b/ovirt_hosted_engine_ha/broker/submonitors/mem_load.py
index b9f91d3..dcc769a 100644
--- a/ovirt_hosted_engine_ha/broker/submonitors/mem_load.py
+++ b/ovirt_hosted_engine_ha/broker/submonitors/mem_load.py
@@ -20,8 +20,8 @@
 import logging
 
 from ovirt_hosted_engine_ha.broker import submonitor_base
-from ovirt_hosted_engine_ha.broker import submonitor_util as sm_util
 from ovirt_hosted_engine_ha.lib import util as util
+from ovirt_hosted_engine_ha.lib import vds_client as vdsc
 
 
 def register():
@@ -39,10 +39,10 @@
 
     def action(self, options):
         try:
-            caps = sm_util.run_vds_client_cmd(self._address, self._use_ssl,
-                                              'getVdsCapabilities')
-            stats = sm_util.run_vds_client_cmd(self._address, self._use_ssl,
-                                               'getVdsStats')
+            caps = vdsc.run_vds_client_cmd(self._address, self._use_ssl,
+                                           'getVdsCapabilities')
+            stats = vdsc.run_vds_client_cmd(self._address, self._use_ssl,
+                                            'getVdsStats')
         except Exception as e:
             self._log.error("Failed to getVdsStats/Caps: %s", str(e))
             self.update_result(None)
diff --git a/ovirt_hosted_engine_ha/broker/submonitors/mgmt_bridge.py 
b/ovirt_hosted_engine_ha/broker/submonitors/mgmt_bridge.py
index a14e124..7490dc2 100644
--- a/ovirt_hosted_engine_ha/broker/submonitors/mgmt_bridge.py
+++ b/ovirt_hosted_engine_ha/broker/submonitors/mgmt_bridge.py
@@ -20,8 +20,8 @@
 import logging
 
 from ovirt_hosted_engine_ha.broker import submonitor_base
-from ovirt_hosted_engine_ha.broker import submonitor_util as sm_util
 from ovirt_hosted_engine_ha.lib import util as util
+from ovirt_hosted_engine_ha.lib import vds_client as vdsc
 
 
 def register():
@@ -43,8 +43,8 @@
 
     def action(self, options):
         try:
-            response = sm_util.run_vds_client_cmd(self._address, self._use_ssl,
-                                                  'getVdsCapabilities')
+            response = vdsc.run_vds_client_cmd(self._address, self._use_ssl,
+                                               'getVdsCapabilities')
         except Exception as e:
             self._log.error("Failed to getVdsCapabilities: %s", str(e))
             self.update_result(None)
diff --git a/ovirt_hosted_engine_ha/lib/Makefile.am 
b/ovirt_hosted_engine_ha/lib/Makefile.am
index 3b52aac..8ebc2fc 100644
--- a/ovirt_hosted_engine_ha/lib/Makefile.am
+++ b/ovirt_hosted_engine_ha/lib/Makefile.am
@@ -18,11 +18,13 @@
 #
 
 include $(top_srcdir)/build/python.inc
+include $(top_srcdir)/build/var_subst.inc
 
 MAINTAINERCLEANFILES = \
        $(srcdir)/Makefile.in \
        $(NULL)
 CLEANFILES = \
+       constants.py \
        $(NULL)
 
 halibdir = $(engine_ha_libdir)/lib
@@ -32,6 +34,15 @@
        exceptions.py \
        log_filter.py \
        util.py \
+       vds_client.py \
+       $(NULL)
+
+halib_PYTHON = \
+       constants.py \
+       $(NULL)
+
+EXTRA_DIST = \
+       constants.py.in \
        $(NULL)
 
 clean-local: \
diff --git a/ovirt_hosted_engine_ha/lib/constants.py.in 
b/ovirt_hosted_engine_ha/lib/constants.py.in
new file mode 100644
index 0000000..c7e2301
--- /dev/null
+++ b/ovirt_hosted_engine_ha/lib/constants.py.in
@@ -0,0 +1,24 @@
+#
+# ovirt-hosted-engine-ha -- ovirt hosted engine high availability
+# Copyright (C) 2013 Red Hat, Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+"""Constants."""
+
+VDS_CLIENT_DIR = '/usr/share/vdsm'
+VDS_CLIENT_SSL = True
+VDS_CLIENT_MAX_RETRY = 3
diff --git a/ovirt_hosted_engine_ha/broker/submonitor_util.py 
b/ovirt_hosted_engine_ha/lib/vds_client.py
similarity index 96%
rename from ovirt_hosted_engine_ha/broker/submonitor_util.py
rename to ovirt_hosted_engine_ha/lib/vds_client.py
index f77a6fc..d3d5ac7 100644
--- a/ovirt_hosted_engine_ha/broker/submonitor_util.py
+++ b/ovirt_hosted_engine_ha/lib/vds_client.py
@@ -24,8 +24,8 @@
 from otopi import util
 from vdsm import vdscli
 
-from . import constants
-from ..lib.exceptions import DetailedError
+import constants
+from exceptions import DetailedError
 
 
 def run_vds_client_cmd(address, use_ssl, command, *args):


-- 
To view, visit http://gerrit.ovirt.org/18231
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I842f30e6d1533ae4be227508999cd96b3e4963c3
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: master
Gerrit-Owner: Greg Padgett <gpadg...@redhat.com>
_______________________________________________
Engine-patches mailing list
Engine-patches@ovirt.org
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to