Eli Mesika has uploaded a new change for review. Change subject: [RFE] Add periodic power management health check.. ......................................................................
[RFE] Add periodic power management health check.. Add periodic power management health check to detect/warn about link-down detection of power management Feature pages: http://www.ovirt.org/Features/PMHealthCheck http://www.ovirt.org/index.php?title=Features/Design/DetailedPMHealthCheck Change-Id: Ib1dfeab92a35793ebb421db8e3be94587dfe85e2 Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1090800 Signed-off-by: Eli Mesika <emes...@redhat.com> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java A backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java M backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java M backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties M packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql M packaging/etc/engine-config/engine-config.properties 8 files changed, 147 insertions(+), 0 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/67/27367/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java index 8a0c2af..a68c624 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java @@ -29,6 +29,7 @@ import org.ovirt.engine.core.bll.job.ExecutionHandler; import org.ovirt.engine.core.bll.job.JobRepositoryCleanupManager; import org.ovirt.engine.core.bll.job.JobRepositoryFactory; +import org.ovirt.engine.core.bll.pm.PmHealthCheckManager; import org.ovirt.engine.core.bll.quota.QuotaManager; import org.ovirt.engine.core.bll.session.SessionDataContainer; import org.ovirt.engine.core.common.EngineWorkingMode; @@ -243,6 +244,8 @@ 1, quotaCacheIntervalInMinutes, TimeUnit.MINUTES); //initializes attestation initAttestation(); + // Initialize Power Management Health Check + PmHealthCheckManager.getInstance().initialize(); } private void initAttestation() { diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java new file mode 100644 index 0000000..7d1ae02 --- /dev/null +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java @@ -0,0 +1,106 @@ +package org.ovirt.engine.core.bll.pm; + +import org.ovirt.engine.core.bll.FenceExecutor; +import org.ovirt.engine.core.common.AuditLogType; +import org.ovirt.engine.core.common.businessentities.FenceActionType; +import org.ovirt.engine.core.common.businessentities.FenceAgentOrder; +import org.ovirt.engine.core.common.businessentities.VDS; +import org.ovirt.engine.core.common.config.Config; +import org.ovirt.engine.core.common.config.ConfigValues; +import org.ovirt.engine.core.compat.Guid; +import org.ovirt.engine.core.dal.dbbroker.DbFacade; +import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AlertDirector; +import org.ovirt.engine.core.utils.log.Log; +import org.ovirt.engine.core.utils.log.LogFactory; +import org.ovirt.engine.core.utils.timer.OnTimerMethodAnnotation; +import org.ovirt.engine.core.utils.timer.SchedulerUtilQuartzImpl; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * Responsible for checking PM enabled hosts by sending a status command to each host configured PM agent cards and + * raise alerts for failed operations. . + */ +public class PmHealthCheckManager { + + private static final Log log = LogFactory.getLog(PmHealthCheckManager.class); + private static PmHealthCheckManager instance = new PmHealthCheckManager(); + private static boolean inWork=false; + + private PmHealthCheckManager() { + // intentionally empty + } + + /** + * Initializes the PM Health Check Manager + */ + public void initialize() { + log.info("Start initializing " + getClass().getSimpleName()); + if(Config.<Boolean>getValue(ConfigValues.PMHealthCheckEnabled)) { + Integer pmHealthCheckInterval = Config.<Integer> getValue(ConfigValues.PMHealthCheckIntervalInSec); + SchedulerUtilQuartzImpl.getInstance().scheduleAFixedDelayJob(this, + "pmHealthCheck", + new Class[] {}, + new Object[] {}, + pmHealthCheckInterval, + pmHealthCheckInterval, + TimeUnit.SECONDS); + } + log.info("Finished initializing " + getClass().getSimpleName()); + } + + @OnTimerMethodAnnotation("pmHealthCheck") + public void pmHealthCheck() { + // skip PM health check if previous operation is not complete yet + if (!inWork) { + synchronized (instance) { + List<VDS> hosts = DbFacade.getInstance().getVdsDao().getAll(); + for (VDS host : hosts) { + if (host.getpm_enabled()) { + boolean hasSecondary = (host.getPmSecondaryIp() != null && !host.getPmSecondaryIp().isEmpty()); + boolean isConcurrent = host.isPmSecondaryConcurrent(); + FenceExecutor executor = new FenceExecutor(host, FenceActionType.Status); + if (executor.findProxyHost() && executor.fence(FenceAgentOrder.Primary).getSucceeded()) { + removeAlarm(host.getId(), isConcurrent, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT); + if (hasSecondary) { + if (executor.findProxyHost() && executor.fence(FenceAgentOrder.Secondary).getSucceeded()) { + removeAlarm(host.getId(), isConcurrent, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT); + } + else{ + addAlarm(host.getId(), isConcurrent, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT); + } + } + } + else { + addAlarm(host.getId(), isConcurrent, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT); + } + } + } + inWork=false; + } + } + } + + private void addAlarm(Guid hostId, boolean isConcurrent, AuditLogType conAlert, AuditLogType seqAlert) { + if (isConcurrent) { + AlertDirector.AddVdsAlert(hostId, conAlert); + } + else { + AlertDirector.AddVdsAlert(hostId, seqAlert); + } + } + + private void removeAlarm(Guid hostId, boolean isConcurrent, AuditLogType conAlert, AuditLogType seqAlert) { + if (isConcurrent) { + AlertDirector.RemoveVdsAlert(hostId, conAlert); + } + else { + AlertDirector.RemoveVdsAlert(hostId, seqAlert); + } + } + + public static PmHealthCheckManager getInstance() { + return instance; + } +} diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java index 06db020..eb0c30c 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java @@ -866,6 +866,10 @@ VDS_ALERT_FENCE_STATUS_VERIFICATION_FAILED(9005, AuditLogSeverity.ALERT), CANNOT_HIBERNATE_RUNNING_VMS_AFTER_CLUSTER_CPU_UPGRADE(9006, AuditLogSeverity.WARNING), VDS_ALERT_SECONDARY_AGENT_USED_FOR_FENCE_OPERATION(9007, AuditLogSeverity.ALERT), + VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT(9008, AuditLogSeverity.ALERT), + VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT(9009, AuditLogSeverity.ALERT), + VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT(9010, AuditLogSeverity.ALERT), + VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT(9011, AuditLogSeverity.ALERT), TASK_STOPPING_ASYNC_TASK(9500, AuditLogTimeInterval.MINUTE.getValue()), TASK_CLEARING_ASYNC_TASK(9501, AuditLogTimeInterval.MINUTE.getValue()), diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java index 96bc571..2097d1b 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java @@ -1715,5 +1715,13 @@ @DefaultValueAttribute("0") UserSessionHardLimit, + @TypeConverterAttribute(Boolean.class) + @DefaultValueAttribute("true") + PMHealthCheckEnabled, + + @TypeConverterAttribute(Integer.class) + @DefaultValueAttribute("3600") + PMHealthCheckIntervalInSec, + Invalid } diff --git a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java index bce7d8e..c03ea39 100644 --- a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java +++ b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java @@ -25,6 +25,16 @@ } /** + * Adds an alert + * @param vdsId + * @param type + */ + public static void AddVdsAlert(Guid vdsId, AuditLogType type) { + AuditLogableBase alert = new AuditLogableBase(); + alert.setVdsId(vdsId); + AlertDirector.Alert(alert, type); + } + /** * Removes the alert. * * @param vdsId diff --git a/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties b/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties index 9d48046..b635214 100644 --- a/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties +++ b/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties @@ -602,6 +602,10 @@ VDS_ALERT_FENCE_NO_PROXY_HOST=There is no other host in the data center that can be used to test the power management settings. VDS_ALERT_FENCE_STATUS_VERIFICATION_FAILED=Failed to verify Host ${Host} ${Status} status, Please ${Status} Host ${Host} manually. VDS_ALERT_SECONDARY_AGENT_USED_FOR_FENCE_OPERATION=Secondary fence agent was used to ${Operation} Host ${VdsName} +VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT=Health check failed on Host ${VdsName} primary concurrent agent, future fence operations may fail on this Host. +VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT=Health check failed on Host ${VdsName} primary sequential agent, future fence operations may fail is secondary agent if not defined properly. +VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT=Health check failed on Host ${VdsName} secondary concurrent agent, future fence operations may fail on this Host. +VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT=Health check failed on Host ${VdsName} secondary sequential agent, future fence operations may fail is primary agent if not defined properly. TASK_STOPPING_ASYNC_TASK=Stopping async task ${CommandName} that started at ${Date} REFRESH_REPOSITORY_IMAGE_LIST_FAILED=Refresh image list failed for domain(s): ${imageDomains}. Please check domain activity. REFRESH_REPOSITORY_IMAGE_LIST_SUCCEEDED=Refresh image list succeeded for domain(s): ${imageDomains} diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql index 155f22f..60e603b 100644 --- a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql +++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql @@ -441,6 +441,13 @@ select fn_db_add_config_value('OvfUpdateIntervalInMinutes','60','general'); select fn_db_add_config_value('OvfItemsCountPerUpdate','100','general'); select fn_db_add_config_value('PayloadSize','8192','general'); +-- Power management health check +select fn_db_add_config_value('PMHealthCheckEnabled','false','3.0'); +select fn_db_add_config_value('PMHealthCheckEnabled','false','3.1'); +select fn_db_add_config_value('PMHealthCheckEnabled','false','3.2'); +select fn_db_add_config_value('PMHealthCheckEnabled','false','3.3'); +select fn_db_add_config_value('PMHealthCheckEnabled','false','3.4'); +select fn_db_add_config_value('PMHealthCheckIntervalInSec','3600','general'); select fn_db_add_config_value('PosixStorageEnabled','false','3.0'); select fn_db_add_config_value('PostgresI18NPrefix','','general'); select fn_db_add_config_value('PostgresLikeSyntax','ILIKE','general'); diff --git a/packaging/etc/engine-config/engine-config.properties b/packaging/etc/engine-config/engine-config.properties index d9e5872..c712341 100644 --- a/packaging/etc/engine-config/engine-config.properties +++ b/packaging/etc/engine-config/engine-config.properties @@ -390,3 +390,8 @@ MaxNumOfTriesToRunFailedAutoStartVm.type=Integer RetryToRunAutoStartVmIntervalInSeconds.description="How often to try to restart highly available VM that went down unexpectedly (in seconds)" RetryToRunAutoStartVmIntervalInSeconds.type=Integer +# PM Health Check +PMHealthCheckEnabled.type=Boolean +PMHealthCheckEnabled.description="Enable/Disable Power Management Health Check feature." +PMHealthCheckIntervalInSec.type=Integer +PMHealthCheckIntervalInSec.description="The interval in which the Power Management Health Check is running." -- To view, visit http://gerrit.ovirt.org/27367 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ib1dfeab92a35793ebb421db8e3be94587dfe85e2 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Eli Mesika <emes...@redhat.com> _______________________________________________ Engine-patches mailing list Engine-patches@ovirt.org http://lists.ovirt.org/mailman/listinfo/engine-patches