On Sep 9, 2008, at 11:37 AM, Satomi Taniguchi wrote:
Hi lists,
I'm posting two patches that implement the feature we have
discussed.
One is for Pacemaker-dev (aba677595891),
and the other is for Heartbeat-dev (fc047640072c).
The specifications are the following.
(1) Add the following four settings:
"period-length" - Period, in seconds, over which a monitor op's
failures are counted.
"max-failures-per-period" - Maximum number of times a monitor
may fail per period.
"default-period-length" - Default value of period-length for
the cluster.
"default-max-failures-per-period" - Default value of
max-failures-per-period for the cluster.
(2) lrmd counts each resource's monitor op failures per
period-length.
It ignores the resource's failures until the number of
failures within the period
exceeds the threshold (max-failures-per-period).
(3) If the value of period-length is 0, lrmd calculates a suitable
period length
for the resource's operation.
NOTE:
"suitable" means "safe enough".
In this patch, the expression used to calculate the "suitable" value is
(monitor's interval + timeout) * max-failures-per-period.
If the value of period-length is too short, so that the number of
monitor operations
that can finish within the period is less than the
threshold,
lrmd will never notify its client that the resource has failed.
To avoid this, period-length must be larger than
(monitor's interval + timeout) * (max-failures-per-period - 1),
at least.
Allowing for lrmd's internal processing time, the
margin of
error of the OS's timer, and so on, I consider the first expression
to be
suitable.
In addition, I added a function to lrmadmin that shows the following
information:
i) the time when the current period started for the specified resource.
ii) the value of the failure counter of the specified resource.
This is the third patch.
Your comments and suggestions are really appreciated.
Best Regards,
Satomi Taniguchi
[snip]
struct lrmd_op
diff -r aba677595891 crmd/lrm.c
--- a/crmd/lrm.c Sun Sep 07 00:02:29 2008 +0200
+++ b/crmd/lrm.c Mon Sep 08 15:58:39 2008 +0900
@@ -1326,6 +1326,8 @@
const char *op_delay = NULL;
const char *op_timeout = NULL;
const char *op_interval = NULL;
+ const char *op_period_length = NULL;
+ const char *op_max_failures_per_period = NULL;
const char *transition = NULL;
CRM_DEV_ASSERT(rsc_id != NULL);
@@ -1340,6 +1342,8 @@
op->start_delay = 0;
op->copyparams = 0;
op->app_name = crm_strdup(CRM_SYSTEM_CRMD);
+ op->period_length = 0;
+ op->max_failures_per_period = 0;
if(rsc_op == NULL) {
CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation));
@@ -1370,6 +1374,10 @@
op_delay = g_hash_table_lookup(op->params,
crm_meta_name("start_delay"));
op_timeout = g_hash_table_lookup(op->params,
crm_meta_name("timeout"));
op_interval = g_hash_table_lookup(op->params,
crm_meta_name("interval"));
+ op_period_length = g_hash_table_lookup(op->params,
+ crm_meta_name("period_length"));
+ op_max_failures_per_period = g_hash_table_lookup(op->params,
+ crm_meta_name("max_failures_per_period"));
#if CRM_DEPRECATED_SINCE_2_0_5
if(op_delay == NULL) {
op_delay = g_hash_table_lookup(op->params, "start_delay");
@@ -1380,11 +1388,21 @@
if(op_interval == NULL) {
op_interval = g_hash_table_lookup(op->params, "interval");
}
+ if(op_period_length == NULL) {
+ op_period_length = g_hash_table_lookup(op->params,
"period_length");
+ }
+ if(op_max_failures_per_period == NULL) {
+ op_max_failures_per_period = g_hash_table_lookup(op->params,
+ "max_failures_per_period");
+ }
please do not add code for deprecated releases.
#endif
op->interval = crm_parse_int(op_interval, "0");
op->timeout = crm_parse_int(op_timeout, "0");
op->start_delay = crm_parse_int(op_delay, "0");
+ op->period_length = crm_parse_int(op_period_length, "0");
+ op->max_failures_per_period =
+ crm_parse_int(op_max_failures_per_period, "1");
/* sanity */
if(op->interval < 0) {
diff -r aba677595891 include/crm/msg_xml.h
--- a/include/crm/msg_xml.h Sun Sep 07 00:02:29 2008 +0200
+++ b/include/crm/msg_xml.h Mon Sep 08 15:58:39 2008 +0900
@@ -150,6 +150,8 @@
#define XML_RSC_ATTR_NOTIFY "notify"
#define XML_RSC_ATTR_STICKINESS "resource-stickiness"
#define XML_RSC_ATTR_FAIL_STICKINESS "migration-threshold"
+#define XML_RSC_ATTR_PERIOD_LENGTH "period-length"
+#define XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD "max-failures-per-
period"
#define XML_RSC_ATTR_FAIL_TIMEOUT "failure-timeout"
#define XML_RSC_ATTR_MULTIPLE "multiple-active"
#define XML_RSC_ATTR_PRIORITY "priority"
diff -r aba677595891 include/crm/pengine/status.h
--- a/include/crm/pengine/status.h Sun Sep 07 00:02:29 2008 +0200
+++ b/include/crm/pengine/status.h Mon Sep 08 15:58:39 2008 +0900
@@ -73,6 +73,8 @@
int default_failure_timeout;
int default_migration_threshold;
+ int default_period_length;
+ int default_max_failures_per_period;
We don't use this model (cluster-wide default_* fields) anymore.
People should set resource attribute defaults in the rsc_defaults
section.
This is much more flexible and lets _everything_ have a default value.
int default_resource_stickiness;
no_quorum_policy_t no_quorum_policy;
@@ -166,6 +168,8 @@
int failure_timeout;
int effective_priority;
int migration_threshold;
+ int period_length;
+ int max_failures_per_period;
also here.
unsigned long long flags;
diff -r aba677595891 lib/common/utils.c
--- a/lib/common/utils.c Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/common/utils.c Mon Sep 08 15:58:39 2008 +0900
@@ -1165,6 +1165,8 @@
XML_RSC_ATTR_MULTIPLE,
XML_RSC_ATTR_STICKINESS,
XML_RSC_ATTR_FAIL_STICKINESS,
+ XML_RSC_ATTR_PERIOD_LENGTH,
+ XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD,
XML_RSC_ATTR_TARGET_ROLE,
[snip]
xmlNode *
diff -r aba677595891 tools/crm_mon.c
--- a/tools/crm_mon.c Sun Sep 07 00:02:29 2008 +0200
+++ b/tools/crm_mon.c Mon Sep 08 15:58:39 2008 +0900
@@ -574,6 +574,8 @@
printed = TRUE;
print_as(" %s: migration-threshold=%d",
rsc->id, rsc->migration_threshold);
+ print_as(" period-length=%d(s)", rsc->period_length);
+ print_as(" max-failures-per-period=%d", rsc-
>max_failures_per_period);
I don't want crm_mon displaying this information.
_______________________________________________
Pacemaker mailing list
[email protected]
http://list.clusterlabs.org/mailman/listinfo/pacemaker