Liron Aravot has uploaded a new change for review. Change subject: core: WIP: prevent simultaneous reconstructs on the same pool ......................................................................
core: WIP: prevent simultaneous reconstructs on the same pool ReconsturctMasterDomain command can be triggered from the GUI (by executing Reinitialize DC) and by IrsProxyData timer which may cause to two different ReconstructMasterDomain commands run on the same pool simultanously, this patch prevents it by acquiring lock on the IrsProxyData object which represents the pool while executing the ReconstructMasterDomain command. Known issues : 1.locks on IrsBrokerCommand might be redundant. 2.need to inspect the use of lock in case of ForceRemove Change-Id: I68783e92c6aca5dc047611f8a5dcd68faffb5893 Signed-off-by: Liron Aravot <lara...@redhat.com> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ForceRemoveStorageDomainCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ReconstructMasterDomainCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/RecoveryStoragePoolCommand.java M backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/VdcBllMessages.java M backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java M frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java M frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties M frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties 9 files changed, 139 insertions(+), 34 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/11/8911/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ForceRemoveStorageDomainCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ForceRemoveStorageDomainCommand.java index cedf064..b2b4d80 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ForceRemoveStorageDomainCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ForceRemoveStorageDomainCommand.java @@ -37,6 +37,7 @@ ReconstructMasterParameters tempVar = new ReconstructMasterParameters(getStoragePool().getId(), getStorageDomain().getId(), false); tempVar.setTransactionScopeOption(TransactionScopeOption.RequiresNew); + tempVar.setParentCommand(VdcActionType.ForceRemoveStorageDomain); Backend.getInstance().runInternalAction(VdcActionType.ReconstructMasterDomain, tempVar); } // try to force detach first diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ReconstructMasterDomainCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ReconstructMasterDomainCommand.java index 66db360..5cf09cf 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ReconstructMasterDomainCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/ReconstructMasterDomainCommand.java @@ -8,6 +8,7 @@ import org.ovirt.engine.core.bll.VmCommand; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.ReconstructMasterParameters; +import org.ovirt.engine.core.common.action.VdcActionType; import org.ovirt.engine.core.common.businessentities.StorageDomainStatus; import org.ovirt.engine.core.common.businessentities.VDS; import org.ovirt.engine.core.common.businessentities.VDSStatus; @@ -33,6 +34,7 @@ import org.ovirt.engine.core.dal.VdcBllMessages; import org.ovirt.engine.core.dal.dbbroker.DbFacade; import org.ovirt.engine.core.utils.transaction.TransactionMethod; +import org.ovirt.engine.core.vdsbroker.irsbroker.IrsBrokerCommand; @SuppressWarnings("serial") @NonTransactiveCommandAttribute(forceCompensation = true) @@ -55,6 +57,28 @@ @Override protected boolean canDoAction() { + if (this.getActionType() == VdcActionType.RecoveryStoragePool || VdcActionType.ForceRemoveStorageDomain.equals(getParameters().getParentCommand())) { + if (!IrsBrokerCommand.lockIrsProxyData(getStoragePool().getId())) { + return failCanDoAction(VdcBllMessages.ACTION_TYPE_FAILED_STORAGE_POOL_NOT_EXIST); + } + } else { + if (!IrsBrokerCommand.tryLockIrsProxyData(getStoragePool().getId())){ + return failCanDoAction(VdcBllMessages.ACTION_TYPE_FAILED_STORAGE_POOL_DURING_RECONSTRUCT); + } + } + + boolean toReturn = false; + try { + toReturn = performCanDoActionChecks(); + } finally { + if (!toReturn) { + IrsBrokerCommand.unLockIrsProxyData(getStoragePool().getId()); + } + } + return toReturn; + } + + protected boolean performCanDoActionChecks(){ List<storage_pool_iso_map> poolDomains = DbFacade.getInstance() .getStoragePoolIsoMapDao().getAllForStoragePool(getStoragePool().getId()); for (storage_pool_iso_map poolDomain : poolDomains) { @@ -63,7 +87,6 @@ return false; } } - return InitializeVds(); } @@ -155,7 +178,20 @@ } setSucceeded(reconstructOpSucceeded); + // we should run compensation under the lock in order to avoid + // race conditions. } finally { + if (!reconstructOpSucceeded) { + try { + executeInNewTransaction(new TransactionMethod<Void>() { + public Void runInTransaction() { + performCleanup(); + return null; + }}); + } catch (Exception e){ + log.error("error while performing cleanup in " + getActionType().toString(),e); + } + } // reset cache and mark reconstruct for pool as finished Backend.getInstance() .getResourceManager() @@ -163,9 +199,16 @@ VDSCommandType.MarkPoolInReconstructMode, new MarkPoolInReconstructModeVDSCommandParameters(getStoragePoolId() .getValue(), ReconstructMarkAction.ClearCache)); + // if we got here - it means that the IrsProxyData lock was acquired by + // canDoAction() method and that it's still held by this thread. + IrsBrokerCommand.unLockIrsProxyData(getStoragePool().getId()); } } + protected void performCleanup() { + compensate(); + } + protected boolean stopSpm() { boolean commandSucceeded = true; if (getStoragePool().getspm_vds_id() != null) { diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/RecoveryStoragePoolCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/RecoveryStoragePoolCommand.java index 1f3c6ac..2fcf261 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/RecoveryStoragePoolCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/RecoveryStoragePoolCommand.java @@ -48,8 +48,8 @@ } @Override - protected boolean canDoAction() { - boolean returnValue = super.canDoAction() && checkStoragePool(); + protected boolean performCanDoActionChecks() { + boolean returnValue = super.performCanDoActionChecks() && checkStoragePool(); getReturnValue().getCanDoActionMessages().remove(VdcBllMessages.VAR__ACTION__RECONSTRUCT_MASTER.toString()); addCanDoActionMessage(VdcBllMessages.VAR__ACTION__RECOVER_POOL); @@ -82,6 +82,13 @@ } @Override + protected void performCleanup() { + getStoragePoolIsoMapDAO().remove(new StoragePoolIsoMapId(getRecoveryStoragePoolParametersData() + .getNewMasterDomainId(), getRecoveryStoragePoolParametersData().getStoragePoolId())); + super.performCleanup(); + } + + @Override protected void executeCommand() { storage_pool_iso_map domainPoolMap = TransactionSupport.executeInNewTransaction( @@ -98,20 +105,13 @@ } }); getStoragePool().setstatus(StoragePoolStatus.Problematic); - try { - if (StorageHelperDirector.getInstance().getItem(getStorageDomain().getstorage_type()) - .ConnectStorageToDomainByVdsId(getNewMaster(false), getVds().getId())) { - super.executeCommand(); - } else { - getReturnValue().setFault(new VdcFault(new VdcBLLException(VdcBllErrors.StorageServerConnectionError, - "Failed to connect storage"), - VdcBllErrors.StorageServerConnectionError)); - } - } finally { - if (!reconstructOpSucceeded) { - getStoragePoolIsoMapDAO().remove(new StoragePoolIsoMapId(getRecoveryStoragePoolParametersData() - .getNewMasterDomainId(), getRecoveryStoragePoolParametersData().getStoragePoolId())); - } + if (StorageHelperDirector.getInstance().getItem(getStorageDomain().getstorage_type()) + .ConnectStorageToDomainByVdsId(getNewMaster(false), getVds().getId())) { + super.executeCommand(); + } else { + getReturnValue().setFault(new VdcFault(new VdcBLLException(VdcBllErrors.StorageServerConnectionError, + "Failed to connect storage"), + VdcBllErrors.StorageServerConnectionError)); } } } diff --git a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/VdcBllMessages.java b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/VdcBllMessages.java index daed268..39004c0 100644 --- a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/VdcBllMessages.java +++ b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/VdcBllMessages.java @@ -421,6 +421,7 @@ ACTION_TYPE_FAILED_STORAGE_CONNECTION_WRONG_PARAMETERS_FOR_STORAGE_TYPE, ACTION_TYPE_FAILED_STORAGE_DOMAIN_STATUS_ILLEGAL, ACTION_TYPE_FAILED_STORAGE_DOMAIN_STATUS_ILLEGAL2, + ACTION_TYPE_FAILED_STORAGE_POOL_DURING_RECONSTRUCT, STORAGE_POOL_REINITIALIZE_WITH_MORE_THAN_ONE_DATA_DOMAIN, ACTION_TYPE_FAILED_STORAGE_POOL_STATUS_ILLEGAL, ACTION_TYPE_FAILED_NO_VDS_IN_POOL, diff --git a/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties b/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties index 3c30aa5..e5ef02c 100644 --- a/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties +++ b/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties @@ -337,6 +337,7 @@ -Please handle Storage Domain issues and retry the operation. ACTION_TYPE_FAILED_STORAGE_DOMAIN_STATUS_ILLEGAL2=Cannot ${action} ${type}. Storage status is ${status}.\n\ -Please check Storage Domain and retry the operation. +ACTION_TYPE_FAILED_STORAGE_POOL_DURING_RECONSTRUCT=Cannot ${action}. Storage Pool is currently being reconstructed. STORAGE_POOL_REINITIALIZE_WITH_MORE_THAN_ONE_DATA_DOMAIN = Cannot ${action} ${type}.\n\ The ${action} action can be performed on a Data Center that has only one Storage Domain in Active/Unknown state. ACTION_TYPE_FAILED_STORAGE_POOL_NOT_EXIST= Data Center doesn't exist. diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java index ddd990e..d474288 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java @@ -62,6 +62,7 @@ import org.ovirt.engine.core.utils.log.Logged; import org.ovirt.engine.core.utils.log.Logged.LogLevel; import org.ovirt.engine.core.utils.log.LoggedUtils; +import org.ovirt.engine.core.utils.threadpool.ThreadPoolUtil; import org.ovirt.engine.core.utils.timer.OnTimerMethodAnnotation; import org.ovirt.engine.core.utils.timer.SchedulerUtilQuartzImpl; import org.ovirt.engine.core.utils.transaction.TransactionMethod; @@ -468,7 +469,7 @@ Guid masterDomainId, String exceptionMessage, String logMessage) { - if (duringReconstructMaster.compareAndSet(false, true)) { + if (tryLockIrsProxyLockObject()) { try { log.warnFormat(logMessage); @@ -480,7 +481,7 @@ .getEventListener() .masterDomainNotOperational(masterDomainId, storagePoolId); } finally { - duringReconstructMaster.set(false); + unlockIrsProxyLockObject(); } throw new IRSNoMasterDomainException(exceptionMessage); } else { @@ -1038,8 +1039,23 @@ private final Map<Guid, HashSet<Guid>> _domainsInProblem = new ConcurrentHashMap<Guid, HashSet<Guid>>(); private final Map<Guid, String> _timers = new HashMap<Guid, String>(); private AtomicBoolean duringReconstructMaster = new AtomicBoolean(false); + private final ReentrantLock duringReconstructMasterLock = new ReentrantLock(); private final Object _lockObject = new Object(); private final ReentrantLock _lockObjForDbSave = new ReentrantLock(); + + public void lockIrsProxyLockObject() { + duringReconstructMasterLock.lock(); + } + + protected boolean tryLockIrsProxyLockObject() { + return duringReconstructMasterLock.tryLock(); + } + + protected void unlockIrsProxyLockObject() { + if (duringReconstructMasterLock.isHeldByCurrentThread()) { + duringReconstructMasterLock.unlock(); + } + } public void lockDbSave() { _lockObjForDbSave.lock(); @@ -1131,7 +1147,7 @@ synchronized (_lockObject) { // during reconstruct master we do not want to update // domains status - if (duringReconstructMaster.get()) { + if (duringReconstructMasterLock.isLocked()) { log.debug("The pool is during reconstruct master, skipping handling problematic domains " + _storagePoolId); return; @@ -1368,17 +1384,26 @@ domainIdTuple); ResourceManager.getInstance() .getEventListener().storageDomainNotOperational(domainId, _storagePoolId); - } else if (duringReconstructMaster.compareAndSet(false, true)) { - try { - ResourceManager.getInstance() - .getEventListener().masterDomainNotOperational(domainId, _storagePoolId); - } finally { - duringReconstructMaster.set(false); - } } else { - log.warnFormat("domain {0} was reported by all hosts in status UP as problematic. But not moving the Domain to NonOperational. Because of is reconstract now", - domainIdTuple); - return; + if (tryLockIrsProxyLockObject()) { + try { + // run in new thread in order to perform the operation out of _lockObject. + ThreadPoolUtil.execute(new Runnable() { + @Override + public void run() { + ResourceManager.getInstance() + .getEventListener().masterDomainNotOperational(domainId, _storagePoolId); + } + }); + } finally { + unlockIrsProxyLockObject(); + } + } else { + log.warnFormat("domain {0} was reported by all hosts in status UP as problematic" + + ". But not moving the Domain to NonOperational. Because of is reconstract now", + domainId); + return; + } } } @@ -1429,14 +1454,12 @@ _timers.clear(); _domainsInProblem.clear(); _vdssInProblem.clear(); - duringReconstructMaster.set(false); } } public void clearPoolTimers() { synchronized (_lockObject) { log.info("clear domain error-timers for pool " + _storagePoolId); - duringReconstructMaster.set(true); for (String jobId : _timers.values()) { try { SchedulerUtilQuartzImpl.getInstance().deleteJob(jobId); @@ -1502,6 +1525,37 @@ } } + } + + + public static boolean tryLockIrsProxyData(Guid storagePoolId) { + IrsProxyData proxyData = _irsProxyData.get(storagePoolId); + if (proxyData != null) { + try { + return _irsProxyData.get(storagePoolId).tryLockIrsProxyLockObject(); + } catch (Exception e) { + return false; + } + } + } + + public static boolean lockIrsProxyData(Guid storagePoolId) { + IrsProxyData proxyData = _irsProxyData.get(storagePoolId); + try { + _irsProxyData.get(storagePoolId).lockIrsProxyLockObject(); + return true; + } catch (Exception e) { + return false; + } + } + + public static void unLockIrsProxyData(Guid storagePoolId) { + IrsProxyData proxyData = _irsProxyData.get(storagePoolId); + if (proxyData != null) { + try { + _irsProxyData.get(storagePoolId).unlockIrsProxyLockObject(); + } catch (Exception e) {} + } } /** @@ -1708,14 +1762,14 @@ } if (masterDomain != null) { - if (getCurrentIrsProxyData().duringReconstructMaster.compareAndSet(false, true)) { + if (getCurrentIrsProxyData().tryLockIrsProxyLockObject()) { try { ResourceManager.getInstance() .getEventListener() .masterDomainNotOperational(masterDomain.getId(), getParameters().getStoragePoolId()); } finally { - getCurrentIrsProxyData().duringReconstructMaster.set(false); + getCurrentIrsProxyData().unlockIrsProxyLockObject(); } } else { log.debug("The pool is during reconstruct master, skipping handling problematic domains " diff --git a/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java b/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java index ae5b4a6..edaa9ce 100644 --- a/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java +++ b/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java @@ -919,6 +919,9 @@ @DefaultStringValue("Cannot ${action} ${type}. Storage status is ${status}.\n-Please check Storage Domain and retry the operation.") String ACTION_TYPE_FAILED_STORAGE_DOMAIN_STATUS_ILLEGAL2(); + @DefaultStringValue("Cannot ${action}. Storage Pool is currently being reconstructed.") + String ACTION_TYPE_FAILED_STORAGE_POOL_DURING_RECONSTRUCT(); + @DefaultStringValue("Cannot ${action} ${type}.\nThe ${action} action can be performed on a Data Center that has only one Storage Domain in Active/Unknown state.") String STORAGE_POOL_REINITIALIZE_WITH_MORE_THAN_ONE_DATA_DOMAIN(); diff --git a/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties b/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties index a653c12..8f5ce66 100644 --- a/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties +++ b/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties @@ -335,6 +335,7 @@ -Please handle Storage Domain issues and retry the operation. ACTION_TYPE_FAILED_STORAGE_DOMAIN_STATUS_ILLEGAL2=Cannot ${action} ${type}. Storage status is ${status}.\n\ -Please check Storage Domain and retry the operation. +ACTION_TYPE_FAILED_STORAGE_POOL_DURING_RECONSTRUCT=Cannot ${action}. Storage Pool is currently being reconstructed. STORAGE_POOL_REINITIALIZE_WITH_MORE_THAN_ONE_DATA_DOMAIN = Cannot ${action} ${type}.\n\ The ${action} action can be performed on a Data Center that has only one Storage Domain in Active/Unknown state. ACTION_TYPE_FAILED_STORAGE_POOL_NOT_EXIST= Data Center doesn't exist. diff --git a/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties b/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties index 8604db7..56f34a8 100644 --- a/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties +++ b/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties @@ -333,6 +333,7 @@ -Please handle Storage Domain issues and retry the operation. ACTION_TYPE_FAILED_STORAGE_DOMAIN_STATUS_ILLEGAL2=Cannot ${action} ${type}. Storage status is ${status}.\n\ -Please check Storage Domain and retry the operation. +ACTION_TYPE_FAILED_STORAGE_POOL_DURING_RECONSTRUCT=Cannot ${action}. Storage Pool is currently being reconstructed. STORAGE_POOL_REINITIALIZE_WITH_MORE_THAN_ONE_DATA_DOMAIN = Cannot ${action} ${type}.\n\ The ${action} action can be performed on a Data Center that has only one Storage Domain in Active/Unknown state. ACTION_TYPE_FAILED_STORAGE_POOL_NOT_EXIST= Data Center doesn't exist. -- To view, visit http://gerrit.ovirt.org/8911 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I68783e92c6aca5dc047611f8a5dcd68faffb5893 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Liron Aravot <lara...@redhat.com> _______________________________________________ Engine-patches mailing list Engine-patches@ovirt.org http://lists.ovirt.org/mailman/listinfo/engine-patches