Hi, Jan!

Here's an idea of the fix:

Let's always use the KILL mutex locking order, that is

  victim_thread->LOCK_thd_data -> lock_sys->mutex -> victim_trx->mutex

For this we need to fix wsrep_abort_transaction(), which is called from the
server, and wsrep_innobase_kill_one_trx(), which is called from BF
thread.

wsrep_abort_transaction() can be fixed by not invoking
wsrep_innobase_kill_one_trx() and always using KILL code path (that is
wsrep_thd_awake) and forcing rollback after the kill.

wsrep_innobase_kill_one_trx() can be fixed by not locking LOCK_thd_data
at all, just don't lock it. We know that the victim waits on a lock
inside InnoDB and we've locked trx mutex and lock_sys mutex. The victim
cannot go away, cannot modify its data, it cannot do anything. So,
LOCK_thd_data doesn't seem to be necessary at that point.

I've attached a demo patch. It compiles, but I didn't try to run it,
it's only to show the idea, not a working fix (I already suspect I
removed too much from wsrep_abort_transaction()). Note it's the patch
for 10.2 at the commit 29bbcac0ee8^ - that is one commit before my fix.

On Oct 12, Jan Lindström wrote:
> Hi Sergei,
> 
> Update on wsrep_close_connections problem. My suggestion to fix this issue
> is on
> https://github.com/MariaDB/server/commit/99cbe03a44cc95e6f548550df51e7201ebea3b9d
> 
> If you have a better solution, please advise.
> 
> R: Jan
 
Regards,
Sergei
VP of MariaDB Server Engineering
and [email protected]
diff --git a/include/mysql/service_wsrep.h b/include/mysql/service_wsrep.h
index 54b9eb9250c..3f397276498 100644
--- a/include/mysql/service_wsrep.h
+++ b/include/mysql/service_wsrep.h
@@ -91,7 +91,7 @@ extern struct wsrep_service_st {
   enum wsrep_trx_status       (*wsrep_run_wsrep_commit_func)(THD *thd, bool 
all);
   void                        (*wsrep_thd_LOCK_func)(THD *thd);
   void                        (*wsrep_thd_UNLOCK_func)(THD *thd);
-  void                        (*wsrep_thd_awake_func)(THD *thd, my_bool 
signal);
+  void                        (*wsrep_thd_awake_func)(THD *thd, my_bool 
signal, my_bool sync);
   enum wsrep_conflict_state   (*wsrep_thd_conflict_state_func)(MYSQL_THD, 
my_bool);
   const char *                (*wsrep_thd_conflict_state_str_func)(THD *thd);
   enum wsrep_exec_mode        (*wsrep_thd_exec_mode_func)(THD *thd);
@@ -139,7 +139,7 @@ extern struct wsrep_service_st {
 #define wsrep_run_wsrep_commit(T,A) 
wsrep_service->wsrep_run_wsrep_commit_func(T,A)
 #define wsrep_thd_LOCK(T) wsrep_service->wsrep_thd_LOCK_func(T)
 #define wsrep_thd_UNLOCK(T) wsrep_service->wsrep_thd_UNLOCK_func(T)
-#define wsrep_thd_awake(T,S) wsrep_service->wsrep_thd_awake_func(T,S)
+#define wsrep_thd_awake(T,S,L) wsrep_service->wsrep_thd_awake_func(T,S,L)
 #define wsrep_thd_conflict_state(T,S) 
wsrep_service->wsrep_thd_conflict_state_func(T,S)
 #define wsrep_thd_conflict_state_str(T) 
wsrep_service->wsrep_thd_conflict_state_str_func(T)
 #define wsrep_thd_exec_mode(T) wsrep_service->wsrep_thd_exec_mode_func(T)
@@ -220,7 +220,7 @@ void wsrep_lock_rollback();
 void wsrep_post_commit(THD* thd, bool all);
 void wsrep_thd_LOCK(THD *thd);
 void wsrep_thd_UNLOCK(THD *thd);
-void wsrep_thd_awake(THD *thd, my_bool signal);
+void wsrep_thd_awake(THD *thd, my_bool signal, my_bool lock);
 void wsrep_thd_set_conflict_state(THD *thd, enum wsrep_conflict_state state);
 bool wsrep_thd_ignore_table(THD *thd);
 void wsrep_unlock_rollback();
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 92736eacee2..e248f644a0a 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -1693,7 +1693,9 @@ void THD::awake(killed_state state_to_set)
   DBUG_PRINT("enter", ("this: %p current_thd: %p  state: %d",
                        this, current_thd, (int) state_to_set));
   THD_CHECK_SENTRY(this);
-  mysql_mutex_assert_owner(&LOCK_thd_data);
+
+  if (!WSREP_ON || !wsrep_thd_is_BF(current_thd, FALSE))
+    mysql_mutex_assert_owner(&LOCK_thd_data);
 
   print_aborted_warning(3, "KILLED");
 
diff --git a/sql/wsrep_dummy.cc b/sql/wsrep_dummy.cc
index 53941c06892..7973b325b68 100644
--- a/sql/wsrep_dummy.cc
+++ b/sql/wsrep_dummy.cc
@@ -83,7 +83,7 @@ void wsrep_thd_LOCK(THD *)
 void wsrep_thd_UNLOCK(THD *)
 { }
 
-void wsrep_thd_awake(THD *, my_bool)
+void wsrep_thd_awake(THD *, my_bool, my_bool)
 { }
 
 const char *wsrep_thd_conflict_state_str(THD *)
diff --git a/sql/wsrep_mysqld.cc b/sql/wsrep_mysqld.cc
index d392d1c2a61..ff503ddf82b 100644
--- a/sql/wsrep_mysqld.cc
+++ b/sql/wsrep_mysqld.cc
@@ -2758,13 +2758,15 @@ extern "C" void wsrep_thd_set_wsrep_last_query_id(THD 
*thd, query_id_t id)
 }
 
 
-extern "C" void wsrep_thd_awake(THD *thd, my_bool signal)
+extern "C" void wsrep_thd_awake(THD *thd, my_bool signal, my_bool sync)
 {
   if (signal)
   {
-    mysql_mutex_lock(&thd->LOCK_thd_data);
+    if (sync)
+      mysql_mutex_lock(&thd->LOCK_thd_data);
     thd->awake(KILL_QUERY);
-    mysql_mutex_unlock(&thd->LOCK_thd_data);
+    if (sync)
+      mysql_mutex_unlock(&thd->LOCK_thd_data);
   }
   else
   {
diff --git a/storage/innobase/handler/ha_innodb.cc 
b/storage/innobase/handler/ha_innodb.cc
index e475852cb90..fcc2949f9ce 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -19572,7 +19572,7 @@ wsrep_innobase_kill_one_trx(
                WSREP_DEBUG("victim " TRX_ID_FMT " in MUST ABORT state",
                            victim_trx->id);
                wsrep_thd_UNLOCK(thd);
-               wsrep_thd_awake(thd, signal);
+               wsrep_thd_awake(thd, signal, 0);
                DBUG_VOID_RETURN;
                break;
        case ABORTED:
@@ -19610,7 +19610,7 @@ wsrep_innobase_kill_one_trx(
                                            TRX_ID_FMT,
                                            victim_trx->id);
                                wsrep_thd_UNLOCK(thd);
-                               wsrep_thd_awake(thd, signal);
+                               wsrep_thd_awake(thd, signal, 0);
                                DBUG_VOID_RETURN;
                                break;
                        case WSREP_OK:
@@ -19629,7 +19629,7 @@ wsrep_innobase_kill_one_trx(
                        }
                }
                wsrep_thd_UNLOCK(thd);
-               wsrep_thd_awake(thd, signal);
+               wsrep_thd_awake(thd, signal, 0);
                break;
        case QUERY_EXEC:
                /* it is possible that victim trx is itself waiting for some
@@ -19652,7 +19652,7 @@ wsrep_innobase_kill_one_trx(
                        }
 
                        wsrep_thd_UNLOCK(thd);
-                       wsrep_thd_awake(thd, signal);
+                       wsrep_thd_awake(thd, signal, 0);
                } else {
                        /* abort currently executing query */
                        DBUG_PRINT("wsrep",("sending KILL_QUERY to: %lu",
@@ -19662,7 +19662,7 @@ wsrep_innobase_kill_one_trx(
                        /* Note that innobase_kill_query will take lock_mutex
                        and trx_mutex */
                        wsrep_thd_UNLOCK(thd);
-                       wsrep_thd_awake(thd, signal);
+                       wsrep_thd_awake(thd, signal, 0);
 
                        /* for BF thd, we need to prevent him from committing */
                        if (wsrep_thd_exec_mode(thd) == REPL_RECV) {
@@ -19727,29 +19727,13 @@ wsrep_abort_transaction(
 {
        DBUG_ENTER("wsrep_abort_transaction");
 
-       trx_t* victim_trx       = thd_to_trx(victim_thd);
-       trx_t* bf_trx           = (bf_thd) ? thd_to_trx(bf_thd) : NULL;
-
        WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d",
                        wsrep_thd_query(bf_thd),
                        wsrep_thd_query(victim_thd),
                        wsrep_thd_conflict_state(victim_thd, FALSE));
 
-       if (victim_trx) {
-               lock_mutex_enter();
-               trx_mutex_enter(victim_trx);
-               wsrep_innobase_kill_one_trx(bf_thd, bf_trx, victim_trx, signal);
-               lock_mutex_exit();
-               trx_mutex_exit(victim_trx);
-               wsrep_srv_conc_cancel_wait(victim_trx);
-               DBUG_VOID_RETURN;
-       } else {
-               WSREP_DEBUG("victim does not have transaction");
-               wsrep_thd_LOCK(victim_thd);
-               wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT);
-               wsrep_thd_UNLOCK(victim_thd);
-               wsrep_thd_awake(victim_thd, signal);
-       }
+       thd_mark_transaction_to_rollback(victim_thd, 1);
+       wsrep_thd_awake(victim_thd, signal, 1);
 
        DBUG_VOID_RETURN;
 }
_______________________________________________
Mailing list: https://launchpad.net/~maria-developers
Post to     : [email protected]
Unsubscribe : https://launchpad.net/~maria-developers
More help   : https://help.launchpad.net/ListHelp

Reply via email to