Eli Mesika has uploaded a new change for review.

Change subject: [RFE] Add periodic power management health check
......................................................................

[RFE] Add periodic power management health check

Add a periodic power management health check that detects link-down
conditions of the power management agents and warns about them.

Feature pages:
http://www.ovirt.org/Features/PMHealthCheck
http://www.ovirt.org/index.php?title=Features/Design/DetailedPMHealthCheck

Change-Id: Ib1dfeab92a35793ebb421db8e3be94587dfe85e2
Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1090800
Signed-off-by: Eli Mesika <emes...@redhat.com>
---
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java
A 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
M 
backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java
M 
backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties
M packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
M packaging/etc/engine-config/engine-config.properties
8 files changed, 147 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/67/27367/1

diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java
index 8a0c2af..a68c624 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/Backend.java
@@ -29,6 +29,7 @@
 import org.ovirt.engine.core.bll.job.ExecutionHandler;
 import org.ovirt.engine.core.bll.job.JobRepositoryCleanupManager;
 import org.ovirt.engine.core.bll.job.JobRepositoryFactory;
+import org.ovirt.engine.core.bll.pm.PmHealthCheckManager;
 import org.ovirt.engine.core.bll.quota.QuotaManager;
 import org.ovirt.engine.core.bll.session.SessionDataContainer;
 import org.ovirt.engine.core.common.EngineWorkingMode;
@@ -243,6 +244,8 @@
                 1, quotaCacheIntervalInMinutes, TimeUnit.MINUTES);
         //initializes attestation
         initAttestation();
+        // Initialize Power Management Health Check
+        PmHealthCheckManager.getInstance().initialize();
     }
 
     private void initAttestation() {
diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java
new file mode 100644
index 0000000..7d1ae02
--- /dev/null
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java
@@ -0,0 +1,125 @@
+package org.ovirt.engine.core.bll.pm;
+
+import org.ovirt.engine.core.bll.FenceExecutor;
+import org.ovirt.engine.core.common.AuditLogType;
+import org.ovirt.engine.core.common.businessentities.FenceActionType;
+import org.ovirt.engine.core.common.businessentities.FenceAgentOrder;
+import org.ovirt.engine.core.common.businessentities.VDS;
+import org.ovirt.engine.core.common.config.Config;
+import org.ovirt.engine.core.common.config.ConfigValues;
+import org.ovirt.engine.core.compat.Guid;
+import org.ovirt.engine.core.dal.dbbroker.DbFacade;
+import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AlertDirector;
+import org.ovirt.engine.core.utils.log.Log;
+import org.ovirt.engine.core.utils.log.LogFactory;
+import org.ovirt.engine.core.utils.timer.OnTimerMethodAnnotation;
+import org.ovirt.engine.core.utils.timer.SchedulerUtilQuartzImpl;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Responsible for checking PM enabled hosts by sending a status command to the PM agents configured on
+ * each host, and for raising alerts for failed operations (and removing them once the check succeeds).
+ */
+public class PmHealthCheckManager {
+
+    private static final Log log = LogFactory.getLog(PmHealthCheckManager.class);
+    private static final PmHealthCheckManager instance = new PmHealthCheckManager();
+    // true while a check pass is running; only read or written while holding the monitor of 'instance'
+    private static boolean inWork = false;
+
+    private PmHealthCheckManager() {
+        // intentionally empty
+    }
+
+    /**
+     * Initializes the PM Health Check Manager. When PMHealthCheckEnabled is set, pmHealthCheck() is scheduled
+     * as a fixed delay job; PMHealthCheckIntervalInSec is handed to the scheduler for both of its delay
+     * arguments (presumably the initial delay and the delay between runs).
+     */
+    public void initialize() {
+        log.info("Start initializing " + getClass().getSimpleName());
+        if (Config.<Boolean> getValue(ConfigValues.PMHealthCheckEnabled)) {
+            Integer pmHealthCheckInterval = Config.<Integer> getValue(ConfigValues.PMHealthCheckIntervalInSec);
+            SchedulerUtilQuartzImpl.getInstance().scheduleAFixedDelayJob(this,
+                    "pmHealthCheck",
+                    new Class[] {},
+                    new Object[] {},
+                    pmHealthCheckInterval,
+                    pmHealthCheckInterval,
+                    TimeUnit.SECONDS);
+        }
+        log.info("Finished initializing " + getClass().getSimpleName());
+    }
+
+    /**
+     * Timer callback: checks the PM agents of every PM enabled host and adds or removes the matching alerts.
+     * A call that starts while a previous pass is still running returns without doing anything.
+     */
+    @OnTimerMethodAnnotation("pmHealthCheck")
+    public void pmHealthCheck() {
+        synchronized (instance) {
+            // skip PM health check if previous operation is not complete yet
+            if (inWork) {
+                return;
+            }
+            inWork = true;
+        }
+        try {
+            List<VDS> hosts = DbFacade.getInstance().getVdsDao().getAll();
+            for (VDS host : hosts) {
+                if (host.getpm_enabled()) {
+                    checkHost(host);
+                }
+            }
+        } finally {
+            // release the flag even if a check threw, so that later passes are not skipped forever
+            synchronized (instance) {
+                inWork = false;
+            }
+        }
+    }
+
+    /**
+     * Checks the primary agent of the given host and, only if that check succeeded and a secondary PM address
+     * is defined, the secondary agent. A failed primary check leaves the secondary agent alert untouched.
+     */
+    private void checkHost(VDS host) {
+        boolean hasSecondary = (host.getPmSecondaryIp() != null && !host.getPmSecondaryIp().isEmpty());
+        boolean isConcurrent = host.isPmSecondaryConcurrent();
+        FenceExecutor executor = new FenceExecutor(host, FenceActionType.Status);
+        boolean primaryHealthy = isAgentHealthy(executor, FenceAgentOrder.Primary);
+        updateAlarm(primaryHealthy, host.getId(), isConcurrent,
+                AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT,
+                AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT);
+        if (primaryHealthy && hasSecondary) {
+            updateAlarm(isAgentHealthy(executor, FenceAgentOrder.Secondary), host.getId(), isConcurrent,
+                    AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT,
+                    AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT);
+        }
+    }
+
+    /** Returns true when findProxyHost() succeeds and the fence call for the given agent reports success. */
+    private boolean isAgentHealthy(FenceExecutor executor, FenceAgentOrder order) {
+        return executor.findProxyHost() && executor.fence(order).getSucceeded();
+    }
+
+    /**
+     * Removes the alert when the agent is healthy and adds it otherwise; conAlert is used when isConcurrent
+     * is true, seqAlert when it is false.
+     */
+    private void updateAlarm(boolean healthy, Guid hostId, boolean isConcurrent, AuditLogType conAlert,
+            AuditLogType seqAlert) {
+        AuditLogType alert = isConcurrent ? conAlert : seqAlert;
+        if (healthy) {
+            AlertDirector.RemoveVdsAlert(hostId, alert);
+        } else {
+            AlertDirector.AddVdsAlert(hostId, alert);
+        }
+    }
+
+    public static PmHealthCheckManager getInstance() {
+        return instance;
+    }
+}
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java
index 06db020..eb0c30c 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java
@@ -866,6 +866,10 @@
     VDS_ALERT_FENCE_STATUS_VERIFICATION_FAILED(9005, AuditLogSeverity.ALERT),
     CANNOT_HIBERNATE_RUNNING_VMS_AFTER_CLUSTER_CPU_UPGRADE(9006, 
AuditLogSeverity.WARNING),
     VDS_ALERT_SECONDARY_AGENT_USED_FOR_FENCE_OPERATION(9007, 
AuditLogSeverity.ALERT),
+    VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT(9008, 
AuditLogSeverity.ALERT),
+    VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT(9009, 
AuditLogSeverity.ALERT),
+    VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT(9010, 
AuditLogSeverity.ALERT),
+    VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT(9011, 
AuditLogSeverity.ALERT),
 
     TASK_STOPPING_ASYNC_TASK(9500, AuditLogTimeInterval.MINUTE.getValue()),
     TASK_CLEARING_ASYNC_TASK(9501, AuditLogTimeInterval.MINUTE.getValue()),
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
index 96bc571..2097d1b 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
@@ -1715,5 +1715,13 @@
     @DefaultValueAttribute("0")
     UserSessionHardLimit,
 
+    @TypeConverterAttribute(Boolean.class)
+    @DefaultValueAttribute("true")
+    PMHealthCheckEnabled,
+
+    @TypeConverterAttribute(Integer.class)
+    @DefaultValueAttribute("3600")
+    PMHealthCheckIntervalInSec,
+
     Invalid
 }
diff --git 
a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java
 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java
index bce7d8e..c03ea39 100644
--- 
a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java
+++ 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AlertDirector.java
@@ -25,6 +25,16 @@
     }
 
     /**
+     * Creates an audit log entry for the given host and raises it as an alert of the given type.
+     * @param vdsId id of the host, stored on the audit log entry that carries the alert
+     * @param type audit log type that is passed on to Alert together with the entry
+     */
+    public static void AddVdsAlert(Guid vdsId, AuditLogType type) {
+        AuditLogableBase logable = new AuditLogableBase();
+        logable.setVdsId(vdsId);
+        Alert(logable, type);
+    }
+    /**
      * Removes the alert.
      *
      * @param vdsId
diff --git 
a/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties
 
b/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties
index 9d48046..b635214 100644
--- 
a/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties
+++ 
b/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties
@@ -602,6 +602,10 @@
 VDS_ALERT_FENCE_NO_PROXY_HOST=There is no other host in the data center that 
can be used to test the power management settings.
 VDS_ALERT_FENCE_STATUS_VERIFICATION_FAILED=Failed to verify Host ${Host} 
${Status} status, Please ${Status} Host ${Host} manually.
 VDS_ALERT_SECONDARY_AGENT_USED_FOR_FENCE_OPERATION=Secondary fence agent was 
used to ${Operation} Host ${VdsName}
+VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_PRIMARY_AGENT=Health check failed on 
Host ${VdsName} primary concurrent agent, future fence operations may fail on 
this Host.
+VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_PRIMARY_AGENT=Health check failed on 
Host ${VdsName} primary sequential agent, future fence operations may fail if 
secondary agent is not defined properly.
+VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_CON_SECONDARY_AGENT=Health check failed 
on Host ${VdsName} secondary concurrent agent, future fence operations may fail 
on this Host.
+VDS_ALERT_PM_HEALTH_CHECK_FAILED_FOR_SEQ_SECONDARY_AGENT=Health check failed 
on Host ${VdsName} secondary sequential agent, future fence operations may fail 
if primary agent is not defined properly.
 TASK_STOPPING_ASYNC_TASK=Stopping async task ${CommandName} that started at 
${Date}
 REFRESH_REPOSITORY_IMAGE_LIST_FAILED=Refresh image list failed for domain(s): 
${imageDomains}. Please check domain activity.
 REFRESH_REPOSITORY_IMAGE_LIST_SUCCEEDED=Refresh image list succeeded for 
domain(s): ${imageDomains}
diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql 
b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
index 155f22f..60e603b 100644
--- a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
+++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
@@ -441,6 +441,13 @@
 select fn_db_add_config_value('OvfUpdateIntervalInMinutes','60','general');
 select fn_db_add_config_value('OvfItemsCountPerUpdate','100','general');
 select fn_db_add_config_value('PayloadSize','8192','general');
+-- Power management health check
+select fn_db_add_config_value('PMHealthCheckEnabled','false','3.0');
+select fn_db_add_config_value('PMHealthCheckEnabled','false','3.1');
+select fn_db_add_config_value('PMHealthCheckEnabled','false','3.2');
+select fn_db_add_config_value('PMHealthCheckEnabled','false','3.3');
+select fn_db_add_config_value('PMHealthCheckEnabled','false','3.4');
+select fn_db_add_config_value('PMHealthCheckIntervalInSec','3600','general');
 select fn_db_add_config_value('PosixStorageEnabled','false','3.0');
 select fn_db_add_config_value('PostgresI18NPrefix','','general');
 select fn_db_add_config_value('PostgresLikeSyntax','ILIKE','general');
diff --git a/packaging/etc/engine-config/engine-config.properties 
b/packaging/etc/engine-config/engine-config.properties
index d9e5872..c712341 100644
--- a/packaging/etc/engine-config/engine-config.properties
+++ b/packaging/etc/engine-config/engine-config.properties
@@ -390,3 +390,8 @@
 MaxNumOfTriesToRunFailedAutoStartVm.type=Integer
 RetryToRunAutoStartVmIntervalInSeconds.description="How often to try to 
restart highly available VM that went down unexpectedly (in seconds)"
 RetryToRunAutoStartVmIntervalInSeconds.type=Integer
+# PM Health Check
+PMHealthCheckEnabled.type=Boolean
+PMHealthCheckEnabled.description="Enable/Disable Power Management Health Check 
feature."
+PMHealthCheckIntervalInSec.type=Integer
+PMHealthCheckIntervalInSec.description="The interval in seconds between 
consecutive runs of the Power Management Health Check."


-- 
To view, visit http://gerrit.ovirt.org/27367
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib1dfeab92a35793ebb421db8e3be94587dfe85e2
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-engine
Gerrit-Branch: master
Gerrit-Owner: Eli Mesika <emes...@redhat.com>
_______________________________________________
Engine-patches mailing list
Engine-patches@ovirt.org
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to