Liron Ar has uploaded a new change for review. Change subject: core: deadlock and related changes in domain monitoring/InitVdsOnUp ......................................................................
core: deadlock and related changes in domain monitoring/InitVdsOnUp The following patches fixes few issues related to the given bug: 1. Deadlock between domain failover process and InitVdsOnUp flows - Thread 1 (domain failover process) - runs within the event queue (a), attempts to call SetVdsStatus command that waits for lock on VdsManager.getLockObj() (b) Thread 2 - InitVdsOnUp - runs through the VdsManager.OnTimer with lock on VdsManager.getLockObj() (b) and waits for event submitted to the event queue to be executed (a) The implemented solution is to call SetVdsStatus executed from thread 1 on a new thread which will also reduce the wait time for the mutex. 2. Domain monitoring information should be processed from hosts that are "Up", if the host is not UP it means that something is wrong with that domain regardless (for example, if the host was moved to non-op status because of problematic report of domain, there's no reason to analyze each time its domain report). 3. Reduce calls to SetVdsStatus thus reducing acquired locks and wait time in case that the host isn't UP (if the host is already not up because of some other reason, there's no need to change it's status). Change-Id: I334b450fcf29ed57675ec1f2e4bf3bc43f31b4aa Bug-Url: https://bugzilla.redhat.com/965972 Signed-off-by: Liron Aravot <[email protected]> --- M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java 2 files changed, 31 insertions(+), 21 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/95/20895/1 diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java index c14b395..e076896 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java @@ -212,13 +212,11 @@ if (LockManagerFactory.getLockManager().acquireLock(monitoringLock).getFirst()) { try { setIsSetNonOperationalExecuted(false); - Guid vdsId = null; Guid storagePoolId = null; - String vdsName = null; ArrayList<VDSDomainsData> domainsList = null; - + VDS tmpVds; synchronized (getLockObj()) { - _vds = DbFacade.getInstance().getVdsDao().get(getVdsId()); + tmpVds = _vds = DbFacade.getInstance().getVdsDao().get(getVdsId()); if (_vds == null) { log.errorFormat("VdsManager::refreshVdsRunTimeInfo - OnTimer is NULL for {0}", getVdsId()); @@ -261,8 +259,6 @@ // to // the storage anymore (so there is no sense in updating the domains list in that case). if (_vds != null && _vds.getStatus() != VDSStatus.Maintenance) { - vdsId = _vds.getId(); - vdsName = _vds.getName(); storagePoolId = _vds.getStoragePoolId(); domainsList = _vds.getDomains(); } @@ -285,7 +281,7 @@ // Now update the status of domains, this code should not be in // synchronized part of code if (domainsList != null) { - IrsBrokerCommand.UpdateVdsDomainsData(vdsId, vdsName, storagePoolId, domainsList); + IrsBrokerCommand.UpdateVdsDomainsData(tmpVds, storagePoolId, domainsList); } } catch (Exception e) { log.error("Timer update runtimeinfo failed. Exception:", e); diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java index 2a94e49..6466382 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java @@ -88,11 +88,19 @@ public static final long BYTES_TO_GB = 1024 * 1024 * 1024; private static Map<Guid, IrsProxyData> _irsProxyData = new ConcurrentHashMap<Guid, IrsProxyData>(); - public static void UpdateVdsDomainsData(Guid vdsId, String vdsName, Guid storagePoolId, - java.util.ArrayList<VDSDomainsData> vdsDomainData) { - IrsProxyData proxy = _irsProxyData.get(storagePoolId); - if (proxy != null) { - proxy.UpdateVdsDomainsData(vdsId, vdsName, vdsDomainData); + /** + * process received domain monitoring information from a given vds if necessary (according to it's status). + * @param vds + * @param storagePoolId + * @param vdsDomainData + */ + public static void UpdateVdsDomainsData(VDS vds, Guid storagePoolId, + ArrayList<VDSDomainsData> vdsDomainData) { + if (vds.getStatus() == VDSStatus.Up) { + IrsProxyData proxy = _irsProxyData.get(storagePoolId); + if (proxy != null) { + proxy.UpdateVdsDomainsData(vds.getId(), vds.getName(), vdsDomainData); + } } } @@ -1262,24 +1270,30 @@ // Moving all the hosts which reported on // this domain as in problem to non // operational. - for (Guid vdsId : _domainsInProblem.get(domainId)) { + for (final Guid vdsId : _domainsInProblem.get(domainId)) { VDS vds = vdsMap.get(vdsId); if (vds == null) { log.warnFormat( "vds {0} reported domain {1} - as in problem but cannot find vds in db!!", vdsId, domainIdTuple); - } else if (vds.getStatus() != VDSStatus.Maintenance - && vds.getStatus() != VDSStatus.NonOperational) { + } else if (vds.getStatus() == VDSStatus.Up) { log.warnFormat( - "vds {0} reported domain {1} as in problem, moving the vds to status NonOperational", + "vds {0} reported domain {1} as in problem, attempting to move the vds to status NonOperational", vds.getName(), domainIdTuple); - ResourceManager - .getInstance() - .getEventListener() - .vdsNonOperational(vdsId, NonOperationalReason.STORAGE_DOMAIN_UNREACHABLE, - true, true, domainId); + + ThreadPoolUtil.execute(new Runnable() { + @Override + public void run() { + ResourceManager + .getInstance() + .getEventListener() + .vdsNonOperational(vdsId, NonOperationalReason.STORAGE_DOMAIN_UNREACHABLE, + true, true, domainId); + } + }); + nonOpVdss.add(vdsId); } else { log.warnFormat( -- To view, visit http://gerrit.ovirt.org/20895 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I334b450fcf29ed57675ec1f2e4bf3bc43f31b4aa Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: ovirt-engine-3.3 Gerrit-Owner: Liron Ar <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
