Eli Mesika has uploaded a new change for review. Change subject: core: handle fence agent power wait param on stop ......................................................................
core: handle fence agent power wait param on stop. When a host restart is done manually or as a result of non-responsive host treatment, and the power wait parameter is used, the host may stay in 'off' state and even release its lock on HA VMs before the host is really down. This is the scenario: 1) A restart command is issued and actually performed as stop -> wait for 'off' status -> start -> wait for 'on' status. 2) The power wait parameter is added to the command implicitly or explicitly, giving a delay of X seconds before the operation is actually performed. 3) The fence agent script returns immediately with 'off' status. 4) An 'on' command is sent to the fence agent by the start operation. 5) X seconds pass and the host is actually shut down. This patch handles this by adding a new configuration value that maps fence agents to the name of the parameter for power wait. Upon a stop operation, we will wait a fixed delay (5 seconds, hard-coded) before starting to sample the host status; if the power wait parameter is used, its value is extracted and we will wait for 5 + X seconds.
Change-Id: I310e076ecf84988cacd0b179954d2460d7988b91 Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1114618 Signed-off-by: Eli Mesika <emes...@redhat.com> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java M backend/manager/modules/utils/src/main/java/org/ovirt/engine/core/utils/pm/VdsFenceOptions.java M packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql 4 files changed, 69 insertions(+), 1 deletion(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/26/29426/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java index 7195328..866a64f 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java @@ -2,12 +2,14 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorCompletionService; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; import org.ovirt.engine.core.bll.job.ExecutionHandler; @@ -21,6 +23,7 @@ import org.ovirt.engine.core.common.businessentities.VDSStatus; import org.ovirt.engine.core.common.businessentities.VM; import org.ovirt.engine.core.common.businessentities.VMStatus; +import org.ovirt.engine.core.common.businessentities.VdsStatic; import org.ovirt.engine.core.common.businessentities.VmExitStatus; import org.ovirt.engine.core.common.config.Config; import org.ovirt.engine.core.common.config.ConfigValues; @@ -39,6 +42,7 @@ import 
org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase; import org.ovirt.engine.core.utils.ThreadUtils; +import org.ovirt.engine.core.utils.pm.VdsFenceOptions; import org.ovirt.engine.core.utils.threadpool.ThreadPoolUtil; public abstract class FenceVdsBaseCommand<T extends FenceVdsActionParameters> extends VdsCommand<T> { @@ -499,6 +503,36 @@ setVmName(null); } + private int getSleep(FenceActionType actionType, FenceAgentOrder order) { + if (actionType != FenceActionType.Stop) { + return SLEEP_BEFORE_FIRST_ATTEMPT; + } + // We have to find out if power off delay was used and add this to the wait time + // since otherwise the command will return immediately with 'off' status and + // subsequent 'on' command issued during this delay will be overridden by the actual shutdown + String agent = (order == FenceAgentOrder.Primary) ? getVds().getPmType() : getVds().getPmSecondaryType(); + String options = (order == FenceAgentOrder.Primary) ? 
getVds().getPmOptions() : getVds().getPmSecondaryOptions(); + options = VdsFenceOptions.getDefaultAgentOptions(agent, options); + HashMap<String, String> optionsMap = VdsStatic.pmOptionsStringToMap(options); + String powerWaitParamSettings = Config.getValue(ConfigValues.FencePowerWaitParam); + String powerWaitParam = VdsFenceOptions.getAgentPowerWaitParam(agent, powerWaitParamSettings); + if (powerWaitParam == null) { + // no power wait for this agent + return SLEEP_BEFORE_FIRST_ATTEMPT; + } + if (optionsMap.containsKey(powerWaitParam)) { + try { + Integer powerWaitValueInSec = Integer.parseInt(optionsMap.get(powerWaitParam)); + return SLEEP_BEFORE_FIRST_ATTEMPT + (int) TimeUnit.SECONDS.toMillis(powerWaitValueInSec); + } + catch(NumberFormatException nfe) { + // illegal value + return SLEEP_BEFORE_FIRST_ATTEMPT; + } + } + return SLEEP_BEFORE_FIRST_ATTEMPT; + } + protected void setStatus() { Backend.getInstance() .getResourceManager() @@ -524,11 +558,12 @@ int i = 1; boolean statusReached = false; log.infoFormat("Waiting for vds {0} to {1}", vdsName, ACTION_NAME); + // Waiting before first attempt to check the host status. // This is done because if we will attempt to get host status immediately // in most cases it will not turn from on/off to off/on and we will need // to wait a full cycle for it. 
- ThreadUtils.sleep(SLEEP_BEFORE_FIRST_ATTEMPT); + ThreadUtils.sleep(getSleep(actionType, order)); while (!statusReached && i <= getRerties()) { log.infoFormat("Attempt {0} to get vds {1} status", i, vdsName); if (executor.findProxyHost()) { diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java index bbdccee..720af50 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java @@ -1651,5 +1651,12 @@ @DefaultValueAttribute("true") IscsiMultipathingSupported, + /** + * Defines the parameter name used by the agent script to delay host on/off + */ + @TypeConverterAttribute(String.class) + @DefaultValueAttribute("apc=power_wait,apc_snmp=power_wait,bladecenter=power_wait,cisco_ucs=power_wait,drac5=power_wait,drac7=power_wait,eps=delay,hpblade=power_wait,ilo=power_wait,ilo2=power_wait,ilo3=power_wait,ilo4=power_wait,ipmilan=power_wait,rsa=power_wait,rsb=power_wait,wti=power_wait") + FencePowerWaitParam, + Invalid; } diff --git a/backend/manager/modules/utils/src/main/java/org/ovirt/engine/core/utils/pm/VdsFenceOptions.java b/backend/manager/modules/utils/src/main/java/org/ovirt/engine/core/utils/pm/VdsFenceOptions.java index b21f91f..b84f643 100644 --- a/backend/manager/modules/utils/src/main/java/org/ovirt/engine/core/utils/pm/VdsFenceOptions.java +++ b/backend/manager/modules/utils/src/main/java/org/ovirt/engine/core/utils/pm/VdsFenceOptions.java @@ -289,6 +289,31 @@ return realAgent; } + + /** + * handles agent power wait parameter mapping + * @param agent + * @param powerWait + * @return + */ + public static String getAgentPowerWaitParam(String agent, String powerWait) { + String param = null; + // result has the format [<agent>=<power wait param name>[,]]* + 
String[] settings = powerWait.split(Pattern.quote(COMMA), -1); + if (settings.length > 0) { + for (String setting : settings) { + String[] pair = setting.split(Pattern.quote(EQUAL), -1); + if (pair.length == 2) { + if (agent.equalsIgnoreCase(pair[0])) { + param = pair[1]; + break; + } + } + } + } + return param; + } + /** * handles agent default options * diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql index 159d395..6d22d7a 100644 --- a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql +++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql @@ -111,6 +111,7 @@ select fn_db_add_config_value('FenceStartStatusRetries','3','general'); select fn_db_add_config_value('FenceStopStatusDelayBetweenRetriesInSec','60','general'); select fn_db_add_config_value('FenceStopStatusRetries','3','general'); +select fn_db_add_config_value('FencePowerWaitParam','apc=power_wait,apc_snmp=power_wait,bladecenter=power_wait,cisco_ucs=power_wait,drac5=power_wait,drac7=power_wait,eps=delay,hpblade=power_wait,ilo=power_wait,ilo2=power_wait,ilo3=power_wait,ilo4=power_wait,ipmilan=power_wait,rsa=power_wait,rsb=power_wait,wti=power_wait','general'); select fn_db_add_config_value('FilteringLUNsEnabled','true','3.0'); select fn_db_add_config_value('FindFenceProxyDelayBetweenRetriesInSec','30','general'); select fn_db_add_config_value('FindFenceProxyRetries','3','general'); -- To view, visit http://gerrit.ovirt.org/29426 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I310e076ecf84988cacd0b179954d2460d7988b91 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: ovirt-engine-3.4 Gerrit-Owner: Eli Mesika <emes...@redhat.com> _______________________________________________ Engine-patches mailing list Engine-patches@ovirt.org http://lists.ovirt.org/mailman/listinfo/engine-patches