Hi lists,
I'm posting two patches that implement the feature we have discussed.
One is for Pacemaker-dev (aba677595891),
and the other is for Heartbeat-dev (fc047640072c).
The specification is as follows.
(1) Add the following 4 settings.
"period-length" - Period in seconds to count monitor op's failures.
"max-failures-per-period" - Maximum times per period a monitor may fail.
"default-period-length" - default value of period-length for the cluster.
"default-max-failures-per-period" - default value of
max-failures-per-period for the cluster.
(2) lrmd counts the monitor op's failures of each resource within each period-length.
It ignores the resource's failures until their number
reaches the threshold (max-failures-per-period).
(3) If the value of period-length is 0, lrmd calculates a suitable length of
the period for the resource's operation.
NOTE:
"suitable" means "safe enough".
In this patch, the expression used to calculate the "suitable" value is
(monitor's interval + timeout) * max-failures-per-period.
If period-length is too short, so that the number of monitor operations
which can finish within one period is less than the threshold,
lrmd will never notify its client that the resource has failed.
To avoid this, period-length must be larger than
(monitor's interval + timeout) * (max-failures-per-period - 1), at the very least.
Allowing for the time of lrmd's internal processing, the margin of
error of the OS's timer and so on, I consider the first expression to be
suitable.
In addition, I added a function to lrmadmin which shows the following information.
i) the time when the current period of the specified resource started.
ii) the value of the failure counter of the specified resource.
This is the third patch.
Your comments and suggestions are really appreciated.
Best Regards,
Satomi Taniguchi
diff -urN org/include/lrm/lrm_api.h mod/include/lrm/lrm_api.h
--- org/include/lrm/lrm_api.h 2008-09-09 17:36:40.000000000 +0900
+++ mod/include/lrm/lrm_api.h 2008-09-09 17:38:00.000000000 +0900
@@ -136,6 +136,8 @@
int start_delay;
int copyparams; /* copy parameters to the rsc */
int target_rc;
+ int period_length;
+ int max_failures_per_period;
/*output fields*/
op_status_t op_status;
diff -urN org/include/lrm/lrm_msg.h mod/include/lrm/lrm_msg.h
--- org/include/lrm/lrm_msg.h 2008-09-09 17:36:40.000000000 +0900
+++ mod/include/lrm/lrm_msg.h 2008-09-09 17:38:48.000000000 +0900
@@ -76,6 +76,8 @@
#define F_LRM_QUEUE_TIME "lrm_queue_time"
#define F_LRM_FAIL_REASON "lrm_fail_reason"
#define F_LRM_ASYNCMON_RC "lrm_asyncmon_rc"
+#define F_LRM_PERIOD_LENGTH "lrm_period_length"
+#define F_LRM_MAX_FAILURES_PER_PERIOD "lrm_max_failures_per_period"
#define PRINT printf("file:%s,line:%d\n",__FILE__,__LINE__);
diff -urN org/lib/lrm/clientlib.c mod/lib/lrm/clientlib.c
--- org/lib/lrm/clientlib.c 2008-09-09 17:36:40.000000000 +0900
+++ mod/lib/lrm/clientlib.c 2008-09-09 17:40:08.000000000 +0900
@@ -1212,7 +1212,9 @@
|| HA_OK != ha_msg_value_int(msg,F_LRM_INTERVAL, &op->interval)
|| HA_OK != ha_msg_value_int(msg,F_LRM_TARGETRC, &op->target_rc)
|| HA_OK != ha_msg_value_int(msg,F_LRM_DELAY, &op->start_delay)
- || HA_OK != ha_msg_value_int(msg,F_LRM_CALLID, &op->call_id)) {
+ || HA_OK != ha_msg_value_int(msg,F_LRM_CALLID, &op->call_id)
+ || HA_OK != ha_msg_value_int(msg,F_LRM_PERIOD_LENGTH, &op->period_length)
+ || HA_OK != ha_msg_value_int(msg,F_LRM_MAX_FAILURES_PER_PERIOD, &op->max_failures_per_period)) {
LOG_BASIC_ERROR("ha_msg_value_int");
free_op(op);
return NULL;
@@ -1328,6 +1330,8 @@
|| HA_OK != ha_msg_add_ul(msg, F_LRM_EXEC_TIME, op->exec_time)
|| HA_OK != ha_msg_add_ul(msg, F_LRM_QUEUE_TIME, op->queue_time)
|| HA_OK != ha_msg_add_int(msg, F_LRM_TARGETRC, op->target_rc)
+ || HA_OK != ha_msg_add_int(msg, F_LRM_PERIOD_LENGTH, op->period_length)
+ || HA_OK != ha_msg_add_int(msg, F_LRM_MAX_FAILURES_PER_PERIOD, op->max_failures_per_period)
|| ( op->app_name && (HA_OK != ha_msg_add(msg, F_LRM_APP, op->app_name)))
|| ( op->user_data && (HA_OK != ha_msg_add(msg,F_LRM_USERDATA,op->user_data)))
|| ( op->params && (HA_OK != ha_msg_add_str_table(msg,F_LRM_PARAM,op->params)))) {
diff -urN org/lrm/lrmd/lrmd.c mod/lrm/lrmd/lrmd.c
--- org/lrm/lrmd/lrmd.c 2008-09-09 17:36:40.000000000 +0900
+++ mod/lrm/lrmd/lrmd.c 2008-09-09 17:48:15.000000000 +0900
@@ -598,6 +598,8 @@
return NULL;
}
rsc->delay_timeout = (guint)0;
+ rsc->t_failed = 0;
+ rsc->failcnt_per_period = 0;
if (id) {
rsc->id = strdup(id);
}
@@ -2013,6 +2015,8 @@
|| HA_OK != ha_msg_mod_int(msg,F_LRM_INTERVAL,0)
|| HA_OK != ha_msg_mod_int(msg,F_LRM_TARGETRC,EVERYTIME)
|| HA_OK != ha_msg_mod_int(msg,F_LRM_DELAY,0)
+ || HA_OK != ha_msg_mod_int(msg,F_LRM_PERIOD_LENGTH,0)
+ || HA_OK != ha_msg_mod_int(msg,F_LRM_MAX_FAILURES_PER_PERIOD,0)
) {
lrmd_log(LOG_ERR,"%s:%d: cannot add field to a message"
, __FUNCTION__, __LINE__);
@@ -2299,6 +2303,8 @@
int timeout = 0;
int interval = 0;
int delay = 0;
+ int period_length = 0;
+ int max_failures_per_period = 0;
LRMAUDIT();
CHECK_ALLOCATED(client, "client", HA_FAIL);
@@ -2308,6 +2314,8 @@
return_on_no_int_value(msg, F_LRM_INTERVAL, &interval);
return_on_no_int_value(msg, F_LRM_TIMEOUT, &timeout);
return_on_no_int_value(msg, F_LRM_DELAY, &delay);
+ return_on_no_int_value(msg, F_LRM_PERIOD_LENGTH, &period_length);
+ return_on_no_int_value(msg, F_LRM_MAX_FAILURES_PER_PERIOD, &max_failures_per_period);
rsc = lookup_rsc_by_msg(msg);
if (NULL == rsc) {
@@ -2710,6 +2718,11 @@
int target_rc, last_rc, op_rc;
int rc_changed;
op_status_t op_status;
+ const char* op_type = NULL;
+ int period_length;
+ int max_failures_per_period;
+ gboolean ignore_failure = FALSE;
+ time_t now = time(NULL);
LRMAUDIT();
CHECK_ALLOCATED(op, "op", HA_FAIL );
@@ -2730,6 +2743,138 @@
last_rc = op_rc = -1; /* set all rc to -1 */
ha_msg_value_int(op->msg,F_LRM_RC,&op_rc);
ha_msg_value_int(op->msg,F_LRM_LASTRC,&last_rc);
+
+ op_type = ha_msg_value(op->msg, F_LRM_OP);
+
+ if (STRNCMP_CONST(op_type, "start") == 0) {
+ /* initialize the counter of failures. */
+ rsc->t_failed = 0;
+ rsc->failcnt_per_period = 0;
+ }
+ if (STRNCMP_CONST(op_type, "monitor") == 0 && op->interval) {
+ int timeout;
+
+ if (HA_OK != ha_msg_value_int(op->msg,
+ F_LRM_PERIOD_LENGTH, &period_length)) {
+ period_length = 0;
+ lrmd_log(LOG_ERR
+ , "%s::%d: failed to get period_length for %s."
+ , __FUNCTION__, __LINE__, small_op_info(op));
+ }
+ if (HA_OK != ha_msg_value_int(op->msg,
+ F_LRM_MAX_FAILURES_PER_PERIOD, &max_failures_per_period)) {
+ max_failures_per_period = 1;
+ lrmd_log(LOG_ERR
+ , "%s::%d: failed to get max_failures_per_period for %s."
+ , __FUNCTION__, __LINE__, small_op_info(op));
+ }
+
+ if (HA_OK != ha_msg_value_int(op->msg, F_LRM_TIMEOUT, &timeout)) {
+ timeout = 0;
+ lrmd_log(LOG_ERR
+ , "%s::%d: failed to get timeout for %s."
+ , __FUNCTION__, __LINE__, small_op_info(op));
+ }
+
+ /* check validation */
+ if (period_length < 0 ) {
+ lrmd_log(LOG_ERR,"%s::%d:%s: period-length has invalid value (%d)."
+ , __FUNCTION__, __LINE__, rsc->id, period_length);
+ period_length = 0;
+ }
+ if (max_failures_per_period <= 0 ) {
+ lrmd_log(LOG_ERR
+ , "%s::%d:%s: max-failures-per-period has invalid value (%d)."
+ , __FUNCTION__, __LINE__, rsc->id, max_failures_per_period);
+ max_failures_per_period = 1;
+ }
+
+ if (period_length == 0) {
+ period_length = (op->interval + timeout) / 1000 * max_failures_per_period;
+ lrmd_log(LOG_DEBUG,
+ "substitute %d sec. for %s's period-length value.",
+ period_length, rsc->id);
+ }else {
+ int border;
+ border = (op->interval + timeout) / 1000 * (max_failures_per_period - 1);
+ if (period_length <= border) {
+ lrmd_log(LOG_WARNING,
+ "%s's period-length value is unsafe. Safe value is larger than %d sec.",
+ rsc->id, border);
+ }
+ }
+
+ if (rsc->t_failed != 0 && (now - rsc->t_failed) >= period_length) {
+ /*
+ * a period is over.
+ * initialize the counter of failures.
+ */
+ lrmd_log(LOG_NOTICE,
+ "clear the counter of failures per period for %s.", rsc->id);
+ rsc->t_failed = 0;
+ rsc->failcnt_per_period = 0;
+ }
+
+ if ((op_status == LRM_OP_DONE &&
+ op_rc != EXECRA_OK && op_rc != EXECRA_RUNNING_MASTER) ||
+ op_status == LRM_OP_TIMEOUT) {
+ /* detected resource's failure! */
+
+ /*
+ * check failures per period.
+ */
+ rsc->failcnt_per_period++;
+ if (rsc->t_failed == 0) {
+ /* the period has begun. */
+ rsc->t_failed = now;
+ }
+ if (rsc->failcnt_per_period < max_failures_per_period) {
+ ignore_failure = TRUE;
+ } else {
+ /*
+ * failures over the threshold.
+ * no more ignore failures, and reset the counter.
+ */
+ rsc->t_failed = 0;
+ rsc->failcnt_per_period = 0;
+ }
+
+ }
+ }
+
+ if (ignore_failure) {
+ /*
+ * consider that resource is not failed.
+ */
+ int actual_op_rc = op_rc;
+ op_rc = last_rc;
+ if (op_rc == -1) {
+ /* It's the first monitor operation. */
+ const char* last_op_type = NULL;
+ last_op_type = ha_msg_value(rsc->last_op_done->msg,
+ F_LRM_OP);
+ if (STRNCMP_CONST(last_op_type, "promote") == 0) {
+ /* It's master resource. */
+ op_rc = EXECRA_RUNNING_MASTER;
+ } else {
+ op_rc = EXECRA_OK;
+ }
+ }
+ lrmd_log(LOG_NOTICE, "%s can fail %d more times before being considered to be failed. Replace op's rc from %d to %d.",
+ rsc->id, (max_failures_per_period - rsc->failcnt_per_period), actual_op_rc, op_rc);
+ if (HA_OK != ha_msg_mod_int(op->msg, F_LRM_RC, op_rc)) {
+ lrmd_log(LOG_ERR,"%s: cannot save rc to msg",__FUNCTION__);
+ return HA_FAIL;
+ }
+ if (op_status != LRM_OP_DONE) {
+ op_status = LRM_OP_DONE;
+ if (HA_OK != ha_msg_mod_int(op->msg, F_LRM_OPSTATUS, LRM_OP_DONE)) {
+ lrmd_log(LOG_ERR,"%s: cannot save status to msg",__FUNCTION__);
+ return HA_FAIL;
+ }
+ }
+ }
+
rc_changed = (
op_status == LRM_OP_DONE
&& op_rc != -1
diff -urN org/lrm/lrmd/lrmd.h mod/lrm/lrmd/lrmd.h
--- org/lrm/lrmd/lrmd.h 2008-09-09 17:36:40.000000000 +0900
+++ mod/lrm/lrmd/lrmd.h 2008-09-09 17:37:30.000000000 +0900
@@ -178,6 +178,8 @@
guint delay_timeout; /* The delay value of op_list execution */
GList* requestors; /* a list of client pids to send replies to */
int state; /* status of the resource */
+ time_t t_failed; /* time stamp which failure occurs */
+ int failcnt_per_period; /* counter of failures per period */
};
struct lrmd_op
diff -r aba677595891 crmd/lrm.c
--- a/crmd/lrm.c Sun Sep 07 00:02:29 2008 +0200
+++ b/crmd/lrm.c Mon Sep 08 15:58:39 2008 +0900
@@ -1326,6 +1326,8 @@
const char *op_delay = NULL;
const char *op_timeout = NULL;
const char *op_interval = NULL;
+ const char *op_period_length = NULL;
+ const char *op_max_failures_per_period = NULL;
const char *transition = NULL;
CRM_DEV_ASSERT(rsc_id != NULL);
@@ -1340,6 +1342,8 @@
op->start_delay = 0;
op->copyparams = 0;
op->app_name = crm_strdup(CRM_SYSTEM_CRMD);
+ op->period_length = 0;
+ op->max_failures_per_period = 0;
if(rsc_op == NULL) {
CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation));
@@ -1370,6 +1374,10 @@
op_delay = g_hash_table_lookup(op->params, crm_meta_name("start_delay"));
op_timeout = g_hash_table_lookup(op->params, crm_meta_name("timeout"));
op_interval = g_hash_table_lookup(op->params, crm_meta_name("interval"));
+ op_period_length = g_hash_table_lookup(op->params,
+ crm_meta_name("period_length"));
+ op_max_failures_per_period = g_hash_table_lookup(op->params,
+ crm_meta_name("max_failures_per_period"));
#if CRM_DEPRECATED_SINCE_2_0_5
if(op_delay == NULL) {
op_delay = g_hash_table_lookup(op->params, "start_delay");
@@ -1380,11 +1388,21 @@
if(op_interval == NULL) {
op_interval = g_hash_table_lookup(op->params, "interval");
}
+ if(op_period_length == NULL) {
+ op_period_length = g_hash_table_lookup(op->params, "period_length");
+ }
+ if(op_max_failures_per_period == NULL) {
+ op_max_failures_per_period = g_hash_table_lookup(op->params,
+ "max_failures_per_period");
+ }
#endif
op->interval = crm_parse_int(op_interval, "0");
op->timeout = crm_parse_int(op_timeout, "0");
op->start_delay = crm_parse_int(op_delay, "0");
+ op->period_length = crm_parse_int(op_period_length, "0");
+ op->max_failures_per_period =
+ crm_parse_int(op_max_failures_per_period, "1");
/* sanity */
if(op->interval < 0) {
diff -r aba677595891 include/crm/msg_xml.h
--- a/include/crm/msg_xml.h Sun Sep 07 00:02:29 2008 +0200
+++ b/include/crm/msg_xml.h Mon Sep 08 15:58:39 2008 +0900
@@ -150,6 +150,8 @@
#define XML_RSC_ATTR_NOTIFY "notify"
#define XML_RSC_ATTR_STICKINESS "resource-stickiness"
#define XML_RSC_ATTR_FAIL_STICKINESS "migration-threshold"
+#define XML_RSC_ATTR_PERIOD_LENGTH "period-length"
+#define XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD "max-failures-per-period"
#define XML_RSC_ATTR_FAIL_TIMEOUT "failure-timeout"
#define XML_RSC_ATTR_MULTIPLE "multiple-active"
#define XML_RSC_ATTR_PRIORITY "priority"
diff -r aba677595891 include/crm/pengine/status.h
--- a/include/crm/pengine/status.h Sun Sep 07 00:02:29 2008 +0200
+++ b/include/crm/pengine/status.h Mon Sep 08 15:58:39 2008 +0900
@@ -73,6 +73,8 @@
int default_failure_timeout;
int default_migration_threshold;
+ int default_period_length;
+ int default_max_failures_per_period;
int default_resource_stickiness;
no_quorum_policy_t no_quorum_policy;
@@ -166,6 +168,8 @@
int failure_timeout;
int effective_priority;
int migration_threshold;
+ int period_length;
+ int max_failures_per_period;
unsigned long long flags;
diff -r aba677595891 lib/common/utils.c
--- a/lib/common/utils.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/common/utils.c Mon Sep 08 15:58:39 2008 +0900
@@ -1165,6 +1165,8 @@
XML_RSC_ATTR_MULTIPLE,
XML_RSC_ATTR_STICKINESS,
XML_RSC_ATTR_FAIL_STICKINESS,
+ XML_RSC_ATTR_PERIOD_LENGTH,
+ XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD,
XML_RSC_ATTR_TARGET_ROLE,
/* ignore clone fields */
diff -r aba677595891 lib/pengine/common.c
--- a/lib/pengine/common.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/pengine/common.c Mon Sep 08 15:58:39 2008 +0900
@@ -72,6 +72,8 @@
{ "default-failure-timeout", NULL, "time", NULL, "0", &check_timer, "Time in seconds after which a failure expires", "Set to zero to disable" },
{ "default-resource-stickiness", "default_resource_stickiness", "integer", NULL, "0", &check_number, "", NULL },
{ "default-migration-threshold", NULL, "integer", NULL, "0", &check_number, "Maximum times a resource can fail before it is moved. Zero means no limit.", NULL },
+ { "default-period-length", NULL, "time", NULL, "0", &check_timer, "Period in seconds to count monitor op's failures.", "Set to zero to use the value which is calculated automatically based on monitor op's timeout, interval, and max-failures-per-period." },
+ { "default-max-failures-per-period", NULL, "integer", NULL, "1", &check_number, "Maximum times per period a monitor may fail.", NULL },
{ "is-managed-default", "is_managed_default", "boolean", NULL, "true", &check_boolean,
"Should the cluster start/stop resources as required", NULL },
{ "cluster-delay", "transition_idle_timeout", "time", NULL, "60s", &check_time,
diff -r aba677595891 lib/pengine/complex.c
--- a/lib/pengine/complex.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/pengine/complex.c Mon Sep 08 15:58:39 2008 +0900
@@ -210,6 +210,8 @@
(*rsc)->recovery_type = recovery_stop_start;
(*rsc)->stickiness = data_set->default_resource_stickiness;
(*rsc)->migration_threshold = data_set->default_migration_threshold;
+ (*rsc)->period_length = data_set->default_period_length;
+ (*rsc)->max_failures_per_period = data_set->default_max_failures_per_period;
(*rsc)->failure_timeout = data_set->default_failure_timeout;
value = g_hash_table_lookup((*rsc)->meta, XML_CIB_ATTR_PRIORITY);
@@ -270,6 +272,17 @@
value = g_hash_table_lookup((*rsc)->meta, XML_RSC_ATTR_FAIL_STICKINESS);
if(value != NULL && safe_str_neq("default", value)) {
(*rsc)->migration_threshold = char2score(value);
+ }
+
+ value = g_hash_table_lookup((*rsc)->meta, XML_RSC_ATTR_PERIOD_LENGTH);
+ if(value != NULL) {
+ /* call crm_get_msec() and convert back to seconds */
+ (*rsc)->period_length = (crm_get_msec(value) / 1000);
+ }
+
+ value = g_hash_table_lookup((*rsc)->meta, XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD);
+ if(value != NULL && safe_str_neq("default", value)) {
+ (*rsc)->max_failures_per_period = char2score(value);
}
value = g_hash_table_lookup((*rsc)->meta, XML_RSC_ATTR_FAIL_TIMEOUT);
diff -r aba677595891 lib/pengine/status.c
--- a/lib/pengine/status.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/pengine/status.c Mon Sep 08 15:58:39 2008 +0900
@@ -236,6 +236,8 @@
data_set->default_failure_timeout = 0;
data_set->default_migration_threshold = 0;
data_set->default_resource_stickiness = 0;
+ data_set->default_period_length = 0;
+ data_set->default_max_failures_per_period = 1;
data_set->flags = 0x0ULL;
set_bit_inplace(data_set->flags, pe_flag_symmetric_cluster);
diff -r aba677595891 lib/pengine/unpack.c
--- a/lib/pengine/unpack.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/pengine/unpack.c Mon Sep 08 15:58:39 2008 +0900
@@ -80,6 +80,15 @@
data_set->default_migration_threshold = char2score(value);
crm_debug("Default migration threshold: %d",
data_set->default_migration_threshold);
+
+ value = pe_pref(data_set->config_hash, "default-period-length");
+ data_set->default_period_length = (crm_get_msec(value) / 1000);
+ crm_debug("Default period length: %d", data_set->default_period_length);
+
+ value = pe_pref(data_set->config_hash, "default-max-failures-per-period");
+ data_set->default_max_failures_per_period = char2score(value);
+ crm_debug("Default max failures per period: %d",
+ data_set->default_max_failures_per_period);
set_config_flag(data_set, "stonith-enabled", pe_flag_stonith_enabled);
crm_debug("STONITH of failed nodes is %s",
diff -r aba677595891 lib/pengine/utils.c
--- a/lib/pengine/utils.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/pengine/utils.c Mon Sep 08 15:58:39 2008 +0900
@@ -849,6 +849,20 @@
value_i += start_delay;
value_ms = crm_itoa(value_i);
g_hash_table_replace(action->meta, crm_strdup(field), value_ms);
+
+ field = XML_RSC_ATTR_PERIOD_LENGTH;
+ value = g_hash_table_lookup(action->meta, field);
+ if(value == NULL) {
+ g_hash_table_replace(action->meta, crm_strdup(field),
+ crm_itoa(action->rsc->period_length));
+ }
+
+ field = XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD;
+ value = g_hash_table_lookup(action->meta, field);
+ if(value == NULL) {
+ g_hash_table_replace(action->meta, crm_strdup(field),
+ crm_itoa(action->rsc->max_failures_per_period));
+ }
}
xmlNode *
diff -r aba677595891 tools/crm_mon.c
--- a/tools/crm_mon.c Sun Sep 07 00:02:29 2008 +0200
+++ b/tools/crm_mon.c Mon Sep 08 15:58:39 2008 +0900
@@ -574,6 +574,8 @@
printed = TRUE;
print_as(" %s: migration-threshold=%d",
rsc->id, rsc->migration_threshold);
+ print_as(" period-length=%d(s)", rsc->period_length);
+ print_as(" max-failures-per-period=%d", rsc->max_failures_per_period);
}
if(failcount > 0) {
diff -urN org/include/lrm/lrm_api.h mod/include/lrm/lrm_api.h
--- org/include/lrm/lrm_api.h 2008-09-09 17:38:00.000000000 +0900
+++ mod/include/lrm/lrm_api.h 2008-09-09 17:44:28.000000000 +0900
@@ -241,6 +241,20 @@
* client can release the op using lrm_free_op()
*/
lrm_op_t* (*get_last_result)(lrm_rsc_t*, const char *op_type);
+
+/*
+ *get_failure_count:
+ * return the current failure information of the resource.
+ * store the number of failures per period through the second argument,
+ * and return the time at which the resource failed
+ * as a character string.
+ *
+ *failcnt_per_period : the pointer through which the number of failures per period is stored
+ *
+ *return: the failed time of the resource (the start time of the "period")
+ * NULL means an error occurred; otherwise release the string with g_free().
+ */
+ char* (*get_failure_count)(lrm_rsc_t*, int* failcnt_per_period);
};
diff -urN org/include/lrm/lrm_msg.h mod/include/lrm/lrm_msg.h
--- org/include/lrm/lrm_msg.h 2008-09-09 17:38:48.000000000 +0900
+++ mod/include/lrm/lrm_msg.h 2008-09-09 17:44:28.000000000 +0900
@@ -78,6 +78,8 @@
#define F_LRM_ASYNCMON_RC "lrm_asyncmon_rc"
#define F_LRM_PERIOD_LENGTH "lrm_period_length"
#define F_LRM_MAX_FAILURES_PER_PERIOD "lrm_max_failures_per_period"
+#define F_LRM_FAILED_TIME "lrm_failed_time"
+#define F_LRM_FAILCNT_PER_PERIOD "lrm_failcnt_per_period"
#define PRINT printf("file:%s,line:%d\n",__FILE__,__LINE__);
@@ -92,6 +94,7 @@
#define GETRSC "getrsc"
#define GETLASTOP "getlastop"
#define GETRSCSTATE "getstate"
+#define GETRSCFAILCNT "getfailcnt"
#define SETMONITOR "setmon"
#define GETMONITORS "getmons"
#define FLUSHRSC "flush"
diff -urN org/lib/lrm/clientlib.c mod/lib/lrm/clientlib.c
--- org/lib/lrm/clientlib.c 2008-09-09 17:40:08.000000000 +0900
+++ mod/lib/lrm/clientlib.c 2008-09-09 17:44:28.000000000 +0900
@@ -124,6 +124,7 @@
static int rsc_flush_ops (lrm_rsc_t*);
static GList* rsc_get_cur_state (lrm_rsc_t*, state_flag_t* cur_state);
static lrm_op_t* rsc_get_last_result (lrm_rsc_t*, const char* op_type);
+static char* rsc_get_failure_count (lrm_rsc_t* rsc, int* failcnt_per_period);
static gint compare_call_id(gconstpointer a, gconstpointer b);
static struct rsc_ops rsc_ops_instance =
@@ -132,7 +133,8 @@
rsc_cancel_op,
rsc_flush_ops,
rsc_get_cur_state,
- rsc_get_last_result
+ rsc_get_last_result,
+ rsc_get_failure_count
};
@@ -1181,6 +1183,63 @@
ha_msg_del(ret);
return op;
}
+
+/* Fetch a resource's failures-per-period counter and period start from lrmd. */
+/* Returns a g_strdup()ed time string (caller must g_free() it), NULL on error. */
+static char *
+rsc_get_failure_count (lrm_rsc_t* rsc, int* failcnt_per_period)
+{
+	struct ha_msg* msg = NULL;
+	struct ha_msg* ret = NULL;
+	const char* tmp = NULL;
+	char* failed_time = NULL;
+
+	/* check whether the channel to lrmd is available */
+	if (NULL == ch_cmd) {
+		cl_log(LOG_ERR, "rsc_get_failure_count: ch_cmd is null.");
+		return NULL;
+	}
+	/* check parameters: both are dereferenced below */
+	if (NULL == rsc || NULL == failcnt_per_period) {
+		cl_log(LOG_ERR, "rsc_get_failure_count: a parameter is null.");
+		return NULL;
+	}
+	/* create the msg asking for the failure count of the resource */
+	msg = create_lrm_rsc_msg(rsc->id,GETRSCFAILCNT);
+	if ( NULL == msg) {
+		LOG_FAIL_create_lrm_rsc_msg(GETRSCFAILCNT);
+		return NULL;
+	}
+	/* send the msg to lrmd */
+	if (HA_OK != msg2ipcchan(msg,ch_cmd)) {
+		ha_msg_del(msg);
+		LOG_FAIL_SEND_MSG(GETRSCFAILCNT, "ch_cmd");
+		return NULL;
+	}
+	ha_msg_del(msg);	/* request is sent; free it so no later exit leaks it */
+	/* get the return msg */
+	ret = msgfromIPC(ch_cmd, MSG_ALLOWINTR);
+	if (NULL == ret) {
+		LOG_FAIL_receive_reply(GETRSCFAILCNT);
+		return NULL;
+	}
+	if (HA_OK != ha_msg_value_int(ret,
+		F_LRM_FAILCNT_PER_PERIOD, failcnt_per_period)) {
+		LOG_FAIL_GET_MSG_FIELD(LOG_ERR, F_LRM_FAILCNT_PER_PERIOD, ret);
+		ha_msg_del(ret);
+		return NULL;
+	}
+	tmp = ha_msg_value(ret, F_LRM_FAILED_TIME);
+	if (tmp == NULL) {
+		LOG_FAIL_GET_MSG_FIELD(LOG_ERR, F_LRM_FAILED_TIME, ret);
+		ha_msg_del(ret);
+		return NULL;
+	}
+	failed_time = g_strdup(tmp);
+	ha_msg_del(ret);
+	return failed_time;
+}
+
/*
* following are the implements of the utility functions
*/
diff -urN org/lrm/admin/lrmadmin.c mod/lrm/admin/lrmadmin.c
--- org/lrm/admin/lrmadmin.c 2008-09-09 17:41:52.000000000 +0900
+++ mod/lrm/admin/lrmadmin.c 2008-09-09 17:44:28.000000000 +0900
@@ -47,7 +47,7 @@
#include <clplumbing/GSource.h>
#include <clplumbing/Gmain_timeout.h>
-const char * optstring = "AD:X:dEF:d:sg:M:O:P:c:S:LI:CT:n:h";
+const char * optstring = "AD:X:dEF:d:sg:M:O:P:c:S:LI:CT:n:W:h";
#ifdef HAVE_GETOPT_H
static struct option long_options[] = {
@@ -66,6 +66,7 @@
{"all_type_metadata",1,0,'O'},
{"metadata",1,0,'M'},
{"provider",1,0,'P'},
+ {"showfail",1,0,'W'},
{"help",0,0,'h'},
{0,0,0,0}
};
@@ -94,6 +95,7 @@
RA_METADATA,
RA_PROVIDER,
ALL_RA_METADATA,
+ SHOW_FAIL,
HELP
} lrmadmin_cmd_t;
@@ -150,6 +152,7 @@
" {-O|--all metadata of this class} <raclass>\n"
" {-M|--metadata} <raclass> <ratype> <provider|NULL>\n"
" {-P|--provider} <raclass> <ratype>\n"
+" {-W|--showfail} <racid>\n"
" {-h|--help}\n";
#define OPTION_OBSCURE_CHECK \
@@ -327,6 +330,14 @@
}
break;
+ case 'W':
+ OPTION_OBSCURE_CHECK
+ lrmadmin_cmd = SHOW_FAIL;
+ if (optarg) {
+ strncpy(rscid_arg_tmp, optarg, RID_LEN-1);
+ }
+ break;
+
case 'h':
OPTION_OBSCURE_CHECK
printf("%s",simple_help_screen);
@@ -561,6 +572,29 @@
ASYN_OPS = FALSE;
break;
+ case SHOW_FAIL:
+ lrm_rsc = get_lrm_rsc(lrmd, rscid_arg_tmp);
+ if (!(lrm_rsc)) {
+ ret_value = -3;
+ } else {
+ int failcnt_per_period;
+ char *failed_time = NULL;
+
+ failed_time = lrm_rsc->ops->get_failure_count(lrm_rsc,
+ &failcnt_per_period);
+ if (failed_time != NULL) {
+ printf("failed time (start time of the period):%s\n",
+ strcmp(failed_time, "0") != 0?failed_time:"(N/A)");
+ printf("failures per period:%d\n", failcnt_per_period);
+ g_free(failed_time);
+ } else {
+ printf("failed to get failures' information");
+ return -3;
+ }
+ lrm_free_rsc(lrm_rsc);
+ }
+ ASYN_OPS = FALSE;
+ break;
default:
fprintf(stderr, "Option %c is not supported yet.\n",
diff -urN org/lrm/lrmd/lrmd.c mod/lrm/lrmd/lrmd.c
--- org/lrm/lrmd/lrmd.c 2008-09-09 17:48:15.000000000 +0900
+++ mod/lrm/lrmd/lrmd.c 2008-09-09 17:44:28.000000000 +0900
@@ -103,6 +103,7 @@
{CANCELOP, REPLY_NOW, on_msg_cancel_op},
{GETRSCSTATE, NO_MSG, on_msg_get_state},
{GETRSCMETA, NO_MSG, on_msg_get_metadata},
+ {GETRSCFAILCNT, NO_MSG, on_msg_get_failcnt},
};
#define MSG_NR sizeof(msg_maps)/sizeof(struct msg_map)
@@ -2533,6 +2534,84 @@
return HA_OK;
}
+/*
+ * GETRSCFAILCNT handler: send the client F_LRM_FAILED_TIME (start of the
+ * resource's failure period, "0" while t_failed is unset) and
+ * F_LRM_FAILCNT_PER_PERIOD.  Returns HA_OK once sent, HA_FAIL otherwise.
+ */
+int
+on_msg_get_failcnt(lrmd_client_t* client, struct ha_msg* msg)
+{
+	lrmd_rsc_t* rsc = NULL;
+	const char* id = NULL;
+	struct ha_msg* ret = NULL;
+	char t_failed_str[26];	/* asctime_r() needs at least 26 bytes */
+
+	CHECK_ALLOCATED(client, "client", HA_FAIL);
+	CHECK_ALLOCATED(msg, "message", HA_FAIL);
+
+	id = ha_msg_value(msg,F_LRM_RID);
+	lrmd_debug2(LOG_DEBUG
+	, "on_msg_get_failcnt: client [%d] want to get the failure count of resource %s"
+	, client->pid, lrmd_nullcheck(id));
+
+	rsc = lookup_rsc_by_msg(msg);
+	if (NULL == rsc) {
+		lrmd_log(LOG_ERR, "on_msg_get_failcnt: no resource with id %s."
+		, lrmd_nullcheck(id));
+		send_ret_msg(client->ch_cmd, HA_FAIL);
+		return HA_FAIL;
+	}
+
+	ret = ha_msg_new(1);
+	if (NULL == ret) {
+		lrmd_log(LOG_ERR, "on_msg_get_failcnt: can't create a ha_msg.");
+		return HA_FAIL;
+	}
+
+	/* render the period start time: "0" if unset, else asctime format */
+	if (rsc->t_failed == 0) {
+		strncpy(t_failed_str, "0", 1);
+		t_failed_str[1] = '\0';
+	} else {
+		struct tm tm_failed;	/* filled by the reentrant localtime_r() */
+
+		if (localtime_r(&(rsc->t_failed), &tm_failed) == NULL
+		||  asctime_r(&tm_failed, t_failed_str) == NULL) {
+			lrmd_log(LOG_ERR
+			, "on_msg_get_failcnt: failed to convert failed time into char string.");
+			ha_msg_del(ret);	/* do not leak the reply on this exit */
+			return HA_FAIL;
+		}
+		/* asctime_r() ends the string with '\n'; cut it off there */
+		t_failed_str[strcspn(t_failed_str, "\n")] = '\0';
+	}
+	if (HA_OK != ha_msg_add(ret, F_LRM_FAILED_TIME, t_failed_str)) {
+		LOG_FAILED_TO_ADD_FIELD("failed_time");
+		ha_msg_del(ret);
+		return HA_FAIL;
+	}
+	if (HA_OK != ha_msg_add_int(ret, F_LRM_FAILCNT_PER_PERIOD
+	, rsc->failcnt_per_period)) {
+		LOG_FAILED_TO_ADD_FIELD("failcnt_per_period");
+		ha_msg_del(ret);
+		return HA_FAIL;
+	}
+	lrmd_debug(LOG_DEBUG
+	, "on_msg_get_failcnt: rsc %s : failed_time=%s, failcnt_per_period=%d"
+	, lrmd_nullcheck(id), t_failed_str, rsc->failcnt_per_period);
+	/* send the message to client */
+	if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
+		lrmd_log(LOG_ERR,
+		"on_msg_get_failcnt: can not send the ret message.");
+		ha_msg_del(ret);
+		return HA_FAIL;
+	}
+	ha_msg_del(ret);
+
+	return HA_OK;
+}
+
#define safe_len(s) (s ? strlen(s) : 0)
static char *
diff -urN org/lrm/lrmd/lrmd_fdecl.h mod/lrm/lrmd/lrmd_fdecl.h
--- org/lrm/lrmd/lrmd_fdecl.h 2008-09-09 17:43:52.000000000 +0900
+++ mod/lrm/lrmd/lrmd_fdecl.h 2008-09-09 17:44:28.000000000 +0900
@@ -21,6 +21,7 @@
static int on_msg_get_rsc_types(lrmd_client_t* client, struct ha_msg* msg);
static int on_msg_get_rsc_providers(lrmd_client_t* client, struct ha_msg* msg);
static int on_msg_get_metadata(lrmd_client_t* client, struct ha_msg* msg);
+static int on_msg_get_failcnt(lrmd_client_t* client, struct ha_msg* msg);
static int on_msg_add_rsc(lrmd_client_t* client, struct ha_msg* msg);
static int on_msg_get_rsc(lrmd_client_t* client, struct ha_msg* msg);
static int on_msg_get_last_op(lrmd_client_t* client, struct ha_msg* msg);
_______________________________________________
Pacemaker mailing list
[email protected]
http://list.clusterlabs.org/mailman/listinfo/pacemaker