Hi,

I'm sending a patch that does the following:

(1) Add a timeout in stonithd on the DC node when it is about to broadcast
    a STONITH request because all STONITH plugins on the DC failed.

(2) Remove that timeout when stonithd on the DC receives a "STONITH succeeded"
    message from another node.

I have no idea whether the current behavior is intentional or a bug.
If it is the expected behavior, I'd like to understand its purpose.
If not, I hope this patch is helpful to you.

Best Regards,
Satomi TANIGUCHI


Satomi TANIGUCHI wrote:
> Hi Lists,
> 
> I found some odd behavior in remote fencing.
> 
> In the following cases, stonithd on the DC node adds a timeout for remote fencing
> operations:
> (1) the target node is DC.
> (2) the target node is not DC, but DC node has no STONITH plugin.
> 
> But it doesn't add that timeout in the following case:
> (i) the target node is not the DC, and all STONITH plugins fail on the DC node.
> 
> Is it an expected behavior or a bug?
> 
> When all STONITH operations on all nodes in the cluster fail,
> it may be that no further action can ever be executed, because the transition
> timeout in tengine is going to be removed...
> 
> Best Regards,
> Satomi TANIGUCHI
> 
> _______________________________________________
> Pacemaker mailing list
> [email protected]
> http://list.clusterlabs.org/mailman/listinfo/pacemaker

diff -urN pacemaker-dev/fencing/stonithd/stonithd.c pacemaker-dev.mod/fencing/stonithd/stonithd.c
--- pacemaker-dev/fencing/stonithd/stonithd.c	2008-11-13 15:48:06.000000000 +0900
+++ pacemaker-dev.mod/fencing/stonithd/stonithd.c	2008-11-13 16:21:34.000000000 +0900
@@ -1351,6 +1351,7 @@
 		free_stonith_ops_t(st_op);
 		return;
 	}
+	st_op->rs_callid = st_op->call_id;
 
 	if (ST_OK == require_local_stonithop(st_op, srsc, from)) {
 		stonithd_log(LOG_INFO, "Node %s try to help node %s to "
@@ -1434,6 +1435,7 @@
 {
 	const char * from = NULL;
 	int call_id;
+	int rs_callid;
 	int op_result;
 	int * orig_key = NULL;
 	common_op_t * op = NULL;
@@ -1443,6 +1445,7 @@
 	st_get_string(msg, F_ORIG, from);
 	st_get_int_value(msg, F_STONITHD_CALLID, &call_id);
 	st_get_int_value(msg, F_STONITHD_FRC, &op_result);
+	st_get_int_value(msg, F_STONITHD_RS_CALLID, &rs_callid);
 	if( rc != ST_OK ) { /* didn't get all fields */
 		return;
 	}
@@ -1453,10 +1456,15 @@
 			(gpointer *)&orig_key, (gpointer *)&op, &call_id);
 	if ( !op || 
 	    (op->scenario != STONITH_INIT && op->scenario != STONITH_REQ)) {
-		stonithd_log(LOG_DEBUG, "handle_msg_trstit: the stonith "
-			"operation (call_id=%d) has finished before "
-			"receiving this message", call_id);
-		return;
+		my_hash_table_find(executing_queue, has_this_callid,
+				(gpointer *)&orig_key, (gpointer *)&op, &rs_callid);
+		if ( !op || 
+		    (op->scenario != STONITH_INIT && op->scenario != STONITH_REQ)) {
+			stonithd_log(LOG_DEBUG, "handle_msg_trstit: the stonith "
+				"operation (call_id=%d) has finished before "
+				"receiving this message", call_id);
+			return;
+		}
 	}
 	op->op_union.st_op->op_result = (stonith_ret_t)op_result;
 	op->op_union.st_op->node_list = 
@@ -2416,6 +2424,7 @@
 				"op->op_union.st_op == NULL");
 		return ST_FAIL;
 	}
+	op->op_union.st_op->call_id = negative_callid_counter;
 
 	if ( ST_OK != require_others_to_stonith(op->op_union.st_op) ) {
 		stonithd_log(LOG_ERR, "require_others_to_stonith failed.");
@@ -2427,12 +2436,8 @@
 		  "optype=%s, key=%d",
 		  stonith_op_strname[op->op_union.st_op->optype],
 		  *original_key);
-	*original_key = op->op_union.st_op->call_id;
-	g_hash_table_insert(executing_queue, original_key, op);
-	stonithd_log(LOG_DEBUG, "changeto_remote_stonithop: inserted "
-		  "optype=%s, key=%d",
-		  stonith_op_strname[op->op_union.st_op->optype],
-		  *original_key);
+	insert_into_executing_queue(op, op->op_union.st_op->call_id);
+	negative_callid_counter--;
 	return ST_OK;
 }
 
@@ -2579,6 +2584,8 @@
 
 	if ((ha_msg_add_int(reply, F_STONITHD_FRC, st_op->op_result) != HA_OK)
     	    ||(ha_msg_add_int(reply, F_STONITHD_CALLID, st_op->call_id) 
+		!= HA_OK)
+    	    ||(ha_msg_add_int(reply, F_STONITHD_RS_CALLID, st_op->rs_callid)
 		!= HA_OK)) {
 		stonithd_log(LOG_ERR, "stonithop_result_to_other_node: "
 			     "ha_msg_add: cannot add field.");
@@ -3710,6 +3717,7 @@
 	ret->node_uuid = g_strdup(st_op->node_uuid);
 	ret->timeout = st_op->timeout;
 	ret->call_id = st_op->call_id;
+	ret->rs_callid = st_op->rs_callid;
 	ret->op_result = st_op->op_result;
 	/* In stonith daemon ( this file ), node_list is only a GString */
 	ret->node_list = g_string_new( ((GString *)(st_op->node_list))->str );
diff -urN pacemaker-dev/include/fencing/stonithd_api.h pacemaker-dev.mod/include/fencing/stonithd_api.h
--- pacemaker-dev/include/fencing/stonithd_api.h	2008-11-13 15:48:06.000000000 +0900
+++ pacemaker-dev.mod/include/fencing/stonithd_api.h	2008-11-13 16:21:20.000000000 +0900
@@ -61,6 +61,7 @@
 
 /* Only output fields */
 	int		call_id;
+	int		rs_callid;
 	stonith_ret_t	op_result;	
 /*
  * By now node_list is only a char * type. 
diff -urN pacemaker-dev/include/fencing/stonithd_msg.h pacemaker-dev.mod/include/fencing/stonithd_msg.h
--- pacemaker-dev/include/fencing/stonithd_msg.h	2008-11-13 15:48:06.000000000 +0900
+++ pacemaker-dev.mod/include/fencing/stonithd_msg.h	2008-11-13 16:22:13.000000000 +0900
@@ -46,6 +46,7 @@
 #define F_STONITHD_RAOPTYPE "raoptype" 	/* stonith RA op type */
 #define F_STONITHD_PARAMS   "params" 	/* parameters for stonith RA  */
 #define F_STONITHD_CALLID   "callid" 	/* RA executing call_id==pid */
+#define F_STONITHD_RS_CALLID "rscallid" /* call_id from DC at remote stonith */
 #define F_STONITHD_STTYPES  "sttypes" 	/* stonith device types */
 #define F_STONITHD_FRC	    "frc" 	/* final return code */
 #define F_STONITHD_PDATA    "pdata" 	/* private data for callback */
_______________________________________________
Pacemaker mailing list
[email protected]
http://list.clusterlabs.org/mailman/listinfo/pacemaker

Reply via email to