Martin Sivák has uploaded a new change for review.

Change subject: Add clean shutdown flag and checksum to the published metadata
......................................................................

Add clean shutdown flag and checksum to the published metadata

The shutdown flag is published only when the agent shuts down
cleanly and the checksum (crc32) is used to protect the metadata
section from corruption.

Both features will then be used together with the sanlock lease file
to make sure a host can be decommissioned.

Change-Id: Idbcc3a345a08a03e95edcbdf9f3c6f7e6fa1a5fd
Signed-off-by: Martin Sivak <[email protected]>
---
M ovirt_hosted_engine_ha/agent/hosted_engine.py
M ovirt_hosted_engine_ha/agent/states.py
M ovirt_hosted_engine_ha/lib/metadata.py
3 files changed, 76 insertions(+), 15 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha 
refs/changes/76/38276/1

diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py 
b/ovirt_hosted_engine_ha/agent/hosted_engine.py
index 82c2b61..a96ceb0 100644
--- a/ovirt_hosted_engine_ha/agent/hosted_engine.py
+++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py
@@ -26,6 +26,7 @@
 import socket
 import subprocess
 import time
+import binascii
 
 import sanlock
 
@@ -40,6 +41,7 @@
 from ..lib import vds_client as vdsc
 from ..lib.storage_backends import StorageBackendTypes, VdsmBackend
 from .state_machine import EngineStateMachine
+from .states import AgentStopped
 
 
 class MetadataTooNewError(Exception):
@@ -290,6 +292,11 @@
     def host_id(self):
         return int(self._config.get(config.ENGINE, config.HOST_ID))
 
+    def publish(self, state):
+        blocks = self._generate_local_blocks(state)
+        self._push_to_storage(blocks)
+        self.update_hosts_state(state)
+
     def start_monitoring(self):
         error_count = 0
 
@@ -332,9 +339,7 @@
                                    state.data.best_score_host["score"])
 
                 # publish the current state
-                blocks = self._generate_local_blocks(state)
-                self._push_to_storage(blocks)
-                self.update_hosts_state(state)
+                self.publish(state)
             except Exception as e:
                 self._log.warning("Error while monitoring engine: %s", str(e))
                 if not (isinstance(e, ex.DisconnectionError) or
@@ -357,6 +362,10 @@
 
             self._log.log(log_level, "Sleeping %d seconds", delay)
             time.sleep(delay)
+
+        # Publish stopped status
+        stopped = AgentStopped(self.fsm.state.data)
+        self.publish(stopped)
 
         self._log.debug("Disconnecting from ha-broker")
         if self._broker and self._broker.is_connected():
@@ -677,17 +686,34 @@
         score = state.score(self.fsm.logger)
         lm = state.data.stats.local
         md = state.metadata()
-        data = ("{md_parse_vers}|{md_feature_vers}|{ts_int}"
-                "|{host_id}|{score}|{engine_status}|{name}|{maintenance}"
-                .format(md_parse_vers=constants.METADATA_PARSE_VERSION,
-                        md_feature_vers=constants.METADATA_FEATURE_VERSION,
-                        # system timestamp
-                        ts_int=state.data.stats.collect_start,
-                        host_id=state.data.stats.host_id,
-                        score=score,
-                        engine_status=json.dumps(lm['engine-health']),
-                        name=self._hostname,
-                        maintenance=1 if md["maintenance"] else 0))
+
+        tokens = []
+        # Metadata lowest compatible version
+        tokens.append(constants.METADATA_PARSE_VERSION)
+        # Metadata highest compatible version
+        tokens.append(constants.METADATA_FEATURE_VERSION)
+        # System timestamp
+        tokens.append(state.data.stats.collect_start)
+        # Host ID
+        tokens.append(state.data.stats.host_id)
+        # Host score
+        tokens.append(score)
+        # Engine status
+        tokens.append(json.dumps(lm['engine-health']))
+        # System hostname
+        tokens.append(self._hostname)
+        # Local maintenance flag
+        tokens.append(1 if md["maintenance"] else 0)
+        # Agent stopped cleanly flag
+        tokens.append(1 if "stopped" in md and md["stopped"] else 0)
+        # CRC32 in hex (use 0 for computing the crc)
+        tokens.append(metadata.EMPTY_CRC32)
+
+        data = "|".join(str(t) for t in tokens)
+        crc32 = metadata.CRC32_FORMAT % (binascii.crc32(data) & 0xffffffff)
+        tokens[9] = crc32
+        data = "|".join(str(t) for t in tokens)
+
         if len(data) > constants.METADATA_BLOCK_BYTES:
             raise Exception("Output metadata too long ({0} bytes)"
                             .format(data))
diff --git a/ovirt_hosted_engine_ha/agent/states.py 
b/ovirt_hosted_engine_ha/agent/states.py
index 90446a9..047c19f 100644
--- a/ovirt_hosted_engine_ha/agent/states.py
+++ b/ovirt_hosted_engine_ha/agent/states.py
@@ -188,10 +188,21 @@
 
     def metadata(self):
         data = {"state": self.__class__.__name__,
-                "maintenance": False}
+                "maintenance": False,
+                "stopped": False}
         return data
 
 
+class AgentStopped(EngineState):
+    def score(self, logger):
+        return 0
+
+    def metadata(self):
+        md = super(AgentStopped, self).metadata()
+        md["stopped"] = True
+        return md
+
+
 class LocalMaintenance(EngineState):
     """
     This state is entered any time the host gets to local maintenance state.
diff --git a/ovirt_hosted_engine_ha/lib/metadata.py 
b/ovirt_hosted_engine_ha/lib/metadata.py
index 43fdff3..16a974e 100644
--- a/ovirt_hosted_engine_ha/lib/metadata.py
+++ b/ovirt_hosted_engine_ha/lib/metadata.py
@@ -18,12 +18,15 @@
 #
 
 import re
+import binascii
 
 from ..env import constants
 from ..lib import util
 from exceptions import FatalMetadataError
 from exceptions import MetadataError
 
+EMPTY_CRC32 = '00000000'
+CRC32_FORMAT = "%08x"
 
 def to_bool_rep(value):
     """
@@ -104,6 +107,11 @@
          name - hostname of described host
          maintenance - 0 or 1 representing host is operational or in
             local maintenance
+         stopped - 0 or 1 representing the agent is running or
+            stopped (cleanly).
+         crc32 - eight hex characters representing the crc32 of the
+            whole 512B block (crc32 is computed with the field set
+            to '00000000').
 
      - Next 3584 bytes (for a total of 4096): human-readable description of
        data to aid in debugging, including factors considered in the host score
@@ -181,6 +189,22 @@
     if len(tokens) >= 8:
         ret['maintenance'] = int(tokens[7]) > 0
 
+    # support stopped cleanly flag if present, but ignore if it isn't
+    if len(tokens) >= 9:
+        ret['stopped'] = int(tokens[8]) > 0
+
+    # support crc32 field if present, but ignore if it isn't
+    if len(tokens) >= 10:
+        ret['crc32'] = tokens[9]
+        tokens[9] = EMPTY_CRC32
+        data = "|".join(tokens)
+        crc32 = CRC32_FORMAT % (binascii.crc32(data) & 0xffffffff)
+        if ret['crc32'] != crc32:
+            raise MetadataError("Malformed metadata for host {0}:"
+                                " provided checksum {1} does not match"
+                                " the data {2}."
+                                .format(host_id, ret[9], crc32))
+
     # Add human-readable summary from bytes 512+
     extra = data[512:].rstrip('\0')
     if len(extra):


-- 
To view, visit https://gerrit.ovirt.org/38276
To unsubscribe, visit https://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idbcc3a345a08a03e95edcbdf9f3c6f7e6fa1a5fd
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: master
Gerrit-Owner: Martin Sivák <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to