mooli tayer has uploaded a new change for review. Change subject: backend: bugfix in VdsManager on failedToRunVm ......................................................................
backend: bugfix in VdsManager on failedToRunVm Bug description: onVdsDuringFailureTimer assumes that the number of attempts is always >= 0, but this assumption breaks. The reason is that pausing a Quartz job does not ensure it will not run anymore; it simply pauses the trigger, so previously scheduled jobs might still run. This is resolved by using simpler one-time scheduling. This patch introduces a behavioral change: previously, onVdsDuringFailureTimer would reduce the failed-attempt count by 1 (which is problematic, since the count no longer represents failed attempts when it is decremented by 1 at every timer interval). Previously there would also be a competition between onVdsDuringFailureTimer calls and failedToRunVm calls. Now, whenever the attempt limit is reached, the number of attempts is set to 0. The effect is that if we have a host failing to run VMs, those VMs were able to run on other hosts, and we detect no other problem on it (its status is up), it will keep trying to recover every 30 minutes (default) regardless. The new behavior is consistent and easy to understand.
Change-Id: Ia4dd140ceecf4954e65ea3f6174a41acea82f6a6 Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1066693 Signed-off-by: Mooli Tayer <mta...@redhat.com> --- M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java 1 file changed, 10 insertions(+), 16 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/39/26139/1 diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java index b693972..cdec892 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java @@ -100,7 +100,6 @@ private int VDS_DURING_FAILURE_TIMEOUT_IN_MINUTES = Config .<Integer> getValue(ConfigValues.TimeToReduceFailedRunOnVdsInMinutes); - private String duringFailureJobId; private boolean privateInitialized; public boolean getInitialized() { @@ -172,10 +171,6 @@ public void schedulJobs() { SchedulerUtil sched = SchedulerUtilQuartzImpl.getInstance(); - duringFailureJobId = sched.scheduleAFixedDelayJob(this, "onVdsDuringFailureTimer", new Class[0], - new Object[0], VDS_DURING_FAILURE_TIMEOUT_IN_MINUTES, VDS_DURING_FAILURE_TIMEOUT_IN_MINUTES, - TimeUnit.MINUTES); - sched.pauseJob(duringFailureJobId); // start with refresh statistics _refreshIteration = _numberRefreshesBeforeSave - 1; @@ -482,22 +477,18 @@ public void onVdsDuringFailureTimer() { synchronized (getLockObj()) { VDS vds = DbFacade.getInstance().getVdsDao().get(getVdsId()); - /** - * Disable timer if vds returns from suspicious mode - */ - if (mFailedToRunVmAttempts.decrementAndGet() == 0) { - SchedulerUtilQuartzImpl.getInstance().pauseJob(duringFailureJobId); - } + /** * Move vds to Up status from error */ - if (mFailedToRunVmAttempts.get() < Config.<Integer> 
getValue(ConfigValues.NumberOfFailedRunsOnVds) - && vds.getStatus() == VDSStatus.Error) { + if ( vds.getStatus() == VDSStatus.Error) { + mFailedToRunVmAttempts.set(0); setStatus(VDSStatus.Up, vds); DbFacade.getInstance().getVdsDynamicDao().updateStatus(getVdsId(), VDSStatus.Up); + log.infoFormat("onVdsDuringFailureTimer of Host {0} entered after {1} attempts to run a VM", + vds.getName(), + mFailedToRunVmAttempts); } - log.infoFormat("onVdsDuringFailureTimer of Host {0} entered after {1} attempts to run a VM", vds.getName(), - mFailedToRunVmAttempts); } } @@ -516,7 +507,10 @@ ResourceManager.getInstance().runVdsCommand(VDSCommandType.SetVdsStatus, new SetVdsStatusVDSCommandParameters(vds.getId(), VDSStatus.Error)); - SchedulerUtilQuartzImpl.getInstance().resumeJob(duringFailureJobId); + SchedulerUtil sched = SchedulerUtilQuartzImpl.getInstance(); + sched.scheduleAOneTimeJob(this, "onVdsDuringFailureTimer", new Class[0], + new Object[0], VDS_DURING_FAILURE_TIMEOUT_IN_MINUTES, + TimeUnit.MINUTES); AuditLogableBase logable = new AuditLogableBase(vds.getId()); logable.addCustomValue("Time", Config.<Integer> getValue(ConfigValues.TimeToReduceFailedRunOnVdsInMinutes) .toString()); -- To view, visit http://gerrit.ovirt.org/26139 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia4dd140ceecf4954e65ea3f6174a41acea82f6a6 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: mooli tayer <mta...@redhat.com> _______________________________________________ Engine-patches mailing list Engine-patches@ovirt.org http://lists.ovirt.org/mailman/listinfo/engine-patches