Greg Padgett has uploaded a new change for review.

Change subject: broker, agent: more robust service startup/shutdown
......................................................................

broker, agent: more robust service startup/shutdown

The agent and broker had no ordering, and there was no way for an
external program to reliably tell when the broker was ready to
serve requests.

With this change:
- The agent and broker services gain an ordering dependency between
  them in systemd.
- The broker attempts to clean its socket file upon exit, so that the
  existence of the file generally indicates that the broker is ready
  to serve requests.
- The agent will be more patient in attempting to connect to the broker.

Change-Id: I392ddf54503d7b9ef3448d22385b547222c0cc86
Bug-Url: https://bugzilla.redhat.com/1018829
Signed-off-by: Greg Padgett <[email protected]>
---
M initscripts/ovirt-ha-agent.service
M ovirt_hosted_engine_ha/agent/constants.py.in
M ovirt_hosted_engine_ha/agent/hosted_engine.py
M ovirt_hosted_engine_ha/broker/broker.py
M ovirt_hosted_engine_ha/broker/listener.py
M ovirt_hosted_engine_ha/lib/brokerlink.py
6 files changed, 46 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha 
refs/changes/61/20561/1

diff --git a/initscripts/ovirt-ha-agent.service 
b/initscripts/ovirt-ha-agent.service
index 76344ff..8282bff 100644
--- a/initscripts/ovirt-ha-agent.service
+++ b/initscripts/ovirt-ha-agent.service
@@ -1,6 +1,7 @@
 [Unit]
 Description=oVirt Hosted Engine High Availability Monitoring Agent
-Requires=ovirt-ha-broker.service
+Wants=ovirt-ha-broker.service
+After=ovirt-ha-broker.service
 
 [Service]
 Type=forking
@@ -10,3 +11,4 @@
 
 [Install]
 WantedBy=multi-user.target
+Also=ovirt-ha-broker.service
diff --git a/ovirt_hosted_engine_ha/agent/constants.py.in 
b/ovirt_hosted_engine_ha/agent/constants.py.in
index 20ce115..f05e74e 100644
--- a/ovirt_hosted_engine_ha/agent/constants.py.in
+++ b/ovirt_hosted_engine_ha/agent/constants.py.in
@@ -33,6 +33,7 @@
 
 METADATA_BLOCK_BYTES = 512
 SERVICE_TYPE = 'hosted-engine'
+BROKER_CONNECTION_RETRIES = 10
 HOST_ALIVE_TIMEOUT_SECS = 60
 ENGINE_RETRY_EXPIRATION_SECS = 600
 ENGINE_RETRY_COUNT = 3
diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py 
b/ovirt_hosted_engine_ha/agent/hosted_engine.py
index 89d3d53..c8cf5ad 100644
--- a/ovirt_hosted_engine_ha/agent/hosted_engine.py
+++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py
@@ -278,7 +278,7 @@
         if not self._broker:
             self._broker = brokerlink.BrokerLink()
         try:
-            self._broker.connect()
+            self._broker.connect(constants.BROKER_CONNECTION_RETRIES)
         except Exception as e:
             self._log.error("Failed to connect to ha-broker: %s", str(e))
             raise
diff --git a/ovirt_hosted_engine_ha/broker/broker.py 
b/ovirt_hosted_engine_ha/broker/broker.py
index 0cf06c4..60eddf0 100644
--- a/ovirt_hosted_engine_ha/broker/broker.py
+++ b/ovirt_hosted_engine_ha/broker/broker.py
@@ -112,6 +112,7 @@
 
         # Server shutdown...
         self._log.info("Server shutting down")
+        self._listener.clean_up()
         os.unlink(constants.PID_FILE)
         sys.exit(0)
 
diff --git a/ovirt_hosted_engine_ha/broker/listener.py 
b/ovirt_hosted_engine_ha/broker/listener.py
index 44085f9..80e888e 100644
--- a/ovirt_hosted_engine_ha/broker/listener.py
+++ b/ovirt_hosted_engine_ha/broker/listener.py
@@ -47,8 +47,7 @@
         self._storage_broker_instance_access_lock = threading.Lock()
         self._need_exit = False
 
-        if os.path.exists(constants.BROKER_SOCKET_FILE):
-            os.unlink(constants.BROKER_SOCKET_FILE)
+        self._remove_socket_file()
         self._server = ThreadedStreamServer(
             constants.BROKER_SOCKET_FILE, ConnectionHandler, True, self)
 
@@ -98,6 +97,18 @@
         """
         self._need_exit = True
 
+    def clean_up(self):
+        """
+        Perform final listener cleanup.  This is effectively the complement
+        of __init__(), to be called after the final call to listen() and
+        close_connections() have been made.
+        """
+        self._remove_socket_file()
+
+    def _remove_socket_file(self):
+        if os.path.exists(constants.BROKER_SOCKET_FILE):
+            os.unlink(constants.BROKER_SOCKET_FILE)
+
 
 class ThreadedStreamServer(SocketServer.ThreadingMixIn,
                            SocketServer.UnixStreamServer):
diff --git a/ovirt_hosted_engine_ha/lib/brokerlink.py 
b/ovirt_hosted_engine_ha/lib/brokerlink.py
index 3d23e6c..f53371b 100644
--- a/ovirt_hosted_engine_ha/lib/brokerlink.py
+++ b/ovirt_hosted_engine_ha/lib/brokerlink.py
@@ -21,6 +21,7 @@
 import contextlib
 import logging
 import socket
+import time
 
 from ..env import constants
 from ..lib.exceptions import DisconnectionError
@@ -33,13 +34,19 @@
         self._log = logging.getLogger("BrokerLink")
         self._socket = None
 
-    def connect(self):
+    def connect(self, retries=0):
+        """
+        Connect to the HA Broker.  Upon failure, reconnection attempts will
+        be made approximately once per second until the specified number of
+        retries have been made.  An exception will be raised if a connection
+        cannot be established.
+        """
         if self.is_connected():
             return
         self._log.info("Connecting to ha-broker")
+
         try:
             self._socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
-            self._socket.connect(constants.BROKER_SOCKET_FILE)
         except socket.error as e:
             self._log.error("Failed to connect to broker: %s", str(e))
             if self._socket:
@@ -47,6 +54,24 @@
             self._socket = None
             raise
 
+        attempt = 0
+        while True:
+            try:
+                self._socket.connect(constants.BROKER_SOCKET_FILE)
+            except socket.error as e:
+                if attempt < retries:
+                    self._log.info("Failed to connect to broker: %s", str(e))
+                    self._log.info("Retrying broker connection...")
+                    time.sleep(1)
+                    continue
+                else:
+                    self._log.error("Failed to connect to broker: %s", str(e))
+                    self._socket.close()
+                    self._socket = None
+                    raise
+            self._log.debug("Successfully connected")
+            break
+
     def is_connected(self):
         return self._socket is not None
 


-- 
To view, visit http://gerrit.ovirt.org/20561
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I392ddf54503d7b9ef3448d22385b547222c0cc86
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: master
Gerrit-Owner: Greg Padgett <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to