Hi,
I'm sending a patch that does the following:
(1) Adds a timeout in stonithd on the DC node when it is about to broadcast
a STONITH request because all STONITH plugins on the DC failed.
(2) Removes that timeout when stonithd on the DC receives a "STONITH succeeded"
message from another node.
I don't know whether the current behavior is intentional or a bug.
If it is expected behavior, I'd like to understand its purpose.
If not, I hope this patch is helpful.
Best Regards,
Satomi TANIGUCHI
Satomi TANIGUCHI wrote:
> Hi Lists,
>
> I found an odd behavior about remote fencing.
>
> In the following case, stonithd on DC node adds the timeout for remote fencing
> operations.
> (1) the target node is DC.
> (2) the target node is not DC, but DC node has no STONITH plugin.
>
> But it doesn't add that timeout in the following case.
> (i) the target node is not DC, and all STONITH plugins fail on DC node.
>
> Is it an expected behavior or a bug?
>
> When all STONITH operations on all nodes in the cluster failed,
> maybe no action can be executed forever because transition timeout in tengine
> is going to be removed...
>
> Best Regards,
> Satomi TANIGUCHI
>
> _______________________________________________
> Pacemaker mailing list
> [email protected]
> http://list.clusterlabs.org/mailman/listinfo/pacemaker
diff -urN pacemaker-dev/fencing/stonithd/stonithd.c pacemaker-dev.mod/fencing/stonithd/stonithd.c
--- pacemaker-dev/fencing/stonithd/stonithd.c 2008-11-13 15:48:06.000000000 +0900
+++ pacemaker-dev.mod/fencing/stonithd/stonithd.c 2008-11-13 16:21:34.000000000 +0900
@@ -1351,6 +1351,7 @@
free_stonith_ops_t(st_op);
return;
}
+ st_op->rs_callid = st_op->call_id;
if (ST_OK == require_local_stonithop(st_op, srsc, from)) {
stonithd_log(LOG_INFO, "Node %s try to help node %s to "
@@ -1434,6 +1435,7 @@
{
const char * from = NULL;
int call_id;
+ int rs_callid;
int op_result;
int * orig_key = NULL;
common_op_t * op = NULL;
@@ -1443,6 +1445,7 @@
st_get_string(msg, F_ORIG, from);
st_get_int_value(msg, F_STONITHD_CALLID, &call_id);
st_get_int_value(msg, F_STONITHD_FRC, &op_result);
+ st_get_int_value(msg, F_STONITHD_RS_CALLID, &rs_callid);
if( rc != ST_OK ) { /* didn't get all fields */
return;
}
@@ -1453,10 +1456,15 @@
(gpointer *)&orig_key, (gpointer *)&op, &call_id);
if ( !op ||
(op->scenario != STONITH_INIT && op->scenario != STONITH_REQ)) {
- stonithd_log(LOG_DEBUG, "handle_msg_trstit: the stonith "
- "operation (call_id=%d) has finished before "
- "receiving this message", call_id);
- return;
+ my_hash_table_find(executing_queue, has_this_callid,
+ (gpointer *)&orig_key, (gpointer *)&op, &rs_callid);
+ if ( !op ||
+ (op->scenario != STONITH_INIT && op->scenario != STONITH_REQ)) {
+ stonithd_log(LOG_DEBUG, "handle_msg_trstit: the stonith "
+ "operation (call_id=%d) has finished before "
+ "receiving this message", call_id);
+ return;
+ }
}
op->op_union.st_op->op_result = (stonith_ret_t)op_result;
op->op_union.st_op->node_list =
@@ -2416,6 +2424,7 @@
"op->op_union.st_op == NULL");
return ST_FAIL;
}
+ op->op_union.st_op->call_id = negative_callid_counter;
if ( ST_OK != require_others_to_stonith(op->op_union.st_op) ) {
stonithd_log(LOG_ERR, "require_others_to_stonith failed.");
@@ -2427,12 +2436,8 @@
"optype=%s, key=%d",
stonith_op_strname[op->op_union.st_op->optype],
*original_key);
- *original_key = op->op_union.st_op->call_id;
- g_hash_table_insert(executing_queue, original_key, op);
- stonithd_log(LOG_DEBUG, "changeto_remote_stonithop: inserted "
- "optype=%s, key=%d",
- stonith_op_strname[op->op_union.st_op->optype],
- *original_key);
+ insert_into_executing_queue(op, op->op_union.st_op->call_id);
+ negative_callid_counter--;
return ST_OK;
}
@@ -2579,6 +2584,8 @@
if ((ha_msg_add_int(reply, F_STONITHD_FRC, st_op->op_result) != HA_OK)
||(ha_msg_add_int(reply, F_STONITHD_CALLID, st_op->call_id)
+ != HA_OK)
+ ||(ha_msg_add_int(reply, F_STONITHD_RS_CALLID, st_op->rs_callid)
!= HA_OK)) {
stonithd_log(LOG_ERR, "stonithop_result_to_other_node: "
"ha_msg_add: cannot add field.");
@@ -3710,6 +3717,7 @@
ret->node_uuid = g_strdup(st_op->node_uuid);
ret->timeout = st_op->timeout;
ret->call_id = st_op->call_id;
+ ret->rs_callid = st_op->rs_callid;
ret->op_result = st_op->op_result;
/* In stonith daemon ( this file ), node_list is only a GString */
ret->node_list = g_string_new( ((GString *)(st_op->node_list))->str );
diff -urN pacemaker-dev/include/fencing/stonithd_api.h pacemaker-dev.mod/include/fencing/stonithd_api.h
--- pacemaker-dev/include/fencing/stonithd_api.h 2008-11-13 15:48:06.000000000 +0900
+++ pacemaker-dev.mod/include/fencing/stonithd_api.h 2008-11-13 16:21:20.000000000 +0900
@@ -61,6 +61,7 @@
/* Only output fields */
int call_id;
+ int rs_callid;
stonith_ret_t op_result;
/*
* By now node_list is only a char * type.
diff -urN pacemaker-dev/include/fencing/stonithd_msg.h pacemaker-dev.mod/include/fencing/stonithd_msg.h
--- pacemaker-dev/include/fencing/stonithd_msg.h 2008-11-13 15:48:06.000000000 +0900
+++ pacemaker-dev.mod/include/fencing/stonithd_msg.h 2008-11-13 16:22:13.000000000 +0900
@@ -46,6 +46,7 @@
#define F_STONITHD_RAOPTYPE "raoptype" /* stonith RA op type */
#define F_STONITHD_PARAMS "params" /* parameters for stonith RA */
#define F_STONITHD_CALLID "callid" /* RA executing call_id==pid */
+#define F_STONITHD_RS_CALLID "rscallid" /* call_id from DC at remote stonith */
#define F_STONITHD_STTYPES "sttypes" /* stonith device types */
#define F_STONITHD_FRC "frc" /* final return code */
#define F_STONITHD_PDATA "pdata" /* private data for callback */
_______________________________________________
Pacemaker mailing list
[email protected]
http://list.clusterlabs.org/mailman/listinfo/pacemaker