On Sep 9, 2008, at 11:37 AM, Satomi Taniguchi wrote:

Hi lists,

I'm posting two patches to implement the feature which we have discussed.
One is for Pacemaker-dev(aba67759589),
and another one is for Heartbeat-dev(fc047640072c).

The specifications are the following.
(1) Add the following 4 settings.
    "period-length" - Period in seconds over which a monitor op's failures are counted.
    "max-failures-per-period" - Maximum number of times a monitor op may fail per period.
    "default-period-length" - Default value of period-length for the cluster.
    "default-max-failures-per-period" - Default value of max-failures-per-period for the cluster.

(2) lrmd counts the monitor op's failures of each resource per period-length.
    It ignores the resource's failures until their number exceeds the threshold
    (max-failures-per-period).

(3) If the value of period-length is 0, lrmd calculates a suitable length of
    the period for the resource's operation.

    NOTE:
    "suitable" means "safe enough".
    In this patch, the expression used to calculate the "suitable" value is
    (monitor's interval + timeout) * max-failures-per-period.
    If the value of period-length is too short, so that the number of monitor
    operations that can finish within the period is less than the threshold,
    lrmd will never notify its client that the resource has failed.
    To avoid this, period-length must be larger than
    (monitor's interval + timeout) * (max-failures-per-period - 1), at least.
    Allowing for lrmd's internal processing time, the margin of error of the
    OS's timer and so on, I consider the first expression to be suitable.

In addition, I added a function to lrmadmin to show the following information.
 i) the time when the period-length of the specified resource started.
ii) the value of the failure counter of the specified resource.
This is the third patch.

Your comments and suggestions are really appreciated.

Best Regards,
Satomi Taniguchi


[snip]


struct lrmd_op
diff -r aba677595891 crmd/lrm.c
--- a/crmd/lrm.c        Sun Sep 07 00:02:29 2008 +0200
+++ b/crmd/lrm.c        Mon Sep 08 15:58:39 2008 +0900
@@ -1326,6 +1326,8 @@
        const char *op_delay = NULL;
        const char *op_timeout = NULL;
        const char *op_interval = NULL;
+       const char *op_period_length = NULL;
+       const char *op_max_failures_per_period = NULL;
        
        const char *transition = NULL;
        CRM_DEV_ASSERT(rsc_id != NULL);
@@ -1340,6 +1342,8 @@
        op->start_delay = 0;
        op->copyparams = 0;
        op->app_name = crm_strdup(CRM_SYSTEM_CRMD);
+       op->period_length = 0;
+       op->max_failures_per_period = 0;

        if(rsc_op == NULL) {
                CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation));
@@ -1370,6 +1374,10 @@
op_delay = g_hash_table_lookup(op->params, crm_meta_name("start_delay")); op_timeout = g_hash_table_lookup(op->params, crm_meta_name("timeout")); op_interval = g_hash_table_lookup(op->params, crm_meta_name("interval"));
+       op_period_length = g_hash_table_lookup(op->params,
+               crm_meta_name("period_length"));
+       op_max_failures_per_period = g_hash_table_lookup(op->params,
+               crm_meta_name("max_failures_per_period"));
#if CRM_DEPRECATED_SINCE_2_0_5
        if(op_delay == NULL) {
                op_delay = g_hash_table_lookup(op->params, "start_delay");
@@ -1380,11 +1388,21 @@
        if(op_interval == NULL) {
                op_interval = g_hash_table_lookup(op->params, "interval");
        }
+       if(op_period_length == NULL) {
+ op_period_length = g_hash_table_lookup(op->params, "period_length");
+       }
+       if(op_max_failures_per_period == NULL) {
+               op_max_failures_per_period = g_hash_table_lookup(op->params,
+               "max_failures_per_period");
+       }

please do not add code for deprecated releases.


#endif
        
        op->interval = crm_parse_int(op_interval, "0");
        op->timeout  = crm_parse_int(op_timeout,  "0");
        op->start_delay = crm_parse_int(op_delay, "0");
+       op->period_length = crm_parse_int(op_period_length, "0");
+       op->max_failures_per_period =
+               crm_parse_int(op_max_failures_per_period, "1");

        /* sanity */
        if(op->interval < 0) {
diff -r aba677595891 include/crm/msg_xml.h
--- a/include/crm/msg_xml.h     Sun Sep 07 00:02:29 2008 +0200
+++ b/include/crm/msg_xml.h     Mon Sep 08 15:58:39 2008 +0900
@@ -150,6 +150,8 @@
#define XML_RSC_ATTR_NOTIFY             "notify"
#define XML_RSC_ATTR_STICKINESS         "resource-stickiness"
#define XML_RSC_ATTR_FAIL_STICKINESS    "migration-threshold"
+#define XML_RSC_ATTR_PERIOD_LENGTH     "period-length"
+#define XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD "max-failures-per-period"
#define XML_RSC_ATTR_FAIL_TIMEOUT       "failure-timeout"
#define XML_RSC_ATTR_MULTIPLE           "multiple-active"
#define XML_RSC_ATTR_PRIORITY           "priority"
diff -r aba677595891 include/crm/pengine/status.h
--- a/include/crm/pengine/status.h      Sun Sep 07 00:02:29 2008 +0200
+++ b/include/crm/pengine/status.h      Mon Sep 08 15:58:39 2008 +0900
@@ -73,6 +73,8 @@

                int default_failure_timeout;
                int default_migration_threshold;
+               int default_period_length;
+               int default_max_failures_per_period;

we don't use this model anymore.
people should set resource attribute defaults in the rsc_defaults section.

this is much more flexible and lets _everything_ have a default value.


                int default_resource_stickiness;
                no_quorum_policy_t no_quorum_policy;

@@ -166,6 +168,8 @@
                int      failure_timeout;
                int      effective_priority;
                int      migration_threshold;
+               int      period_length;
+               int      max_failures_per_period;

also here.



                unsigned long long flags;
        
diff -r aba677595891 lib/common/utils.c
--- a/lib/common/utils.c        Sun Sep 07 00:02:29 2008 +0200
+++ b/lib/common/utils.c        Mon Sep 08 15:58:39 2008 +0900
@@ -1165,6 +1165,8 @@
                XML_RSC_ATTR_MULTIPLE,
                XML_RSC_ATTR_STICKINESS,
                XML_RSC_ATTR_FAIL_STICKINESS,
+               XML_RSC_ATTR_PERIOD_LENGTH,
+               XML_RSC_ATTR_MAX_FAILURES_PER_PERIOD,
                XML_RSC_ATTR_TARGET_ROLE,


[snip]


xmlNode *
diff -r aba677595891 tools/crm_mon.c
--- a/tools/crm_mon.c   Sun Sep 07 00:02:29 2008 +0200
+++ b/tools/crm_mon.c   Mon Sep 08 15:58:39 2008 +0900
@@ -574,6 +574,8 @@
        printed = TRUE;
        print_as("   %s: migration-threshold=%d",
                 rsc->id, rsc->migration_threshold);
+       print_as(" period-length=%d(s)", rsc->period_length);
+       print_as(" max-failures-per-period=%d", rsc->max_failures_per_period);

I don't want crm_mon displaying this information.


_______________________________________________
Pacemaker mailing list
[email protected]
http://list.clusterlabs.org/mailman/listinfo/pacemaker

Reply via email to