[ https://issues.apache.org/jira/browse/GEODE-8696?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17262878#comment-17262878 ]
Kirk Lund commented on GEODE-8696: ---------------------------------- I'm reverting and reopening GEODE-8696 because the fix causes a new 3-way Java-level deadlock: {noformat} Found one Java-level deadlock: ============================= "Pooled High Priority Message Processor 5": waiting to lock monitor 0x00007f3ac400d558 (object 0x00000000fd486c20, a java.util.HashMap), which is held by "vm_0_thr_0_locator_managing1_host1_13259" "vm_0_thr_0_locator_managing1_host1_13259": waiting for ownable synchronizer 0x00000000fd2f3fb0, (a java.util.concurrent.locks.ReentrantLock$NonfairSync), which is held by "DM-MemberEventInvoker" "DM-MemberEventInvoker": waiting to lock monitor 0x00007f3af8002d28 (object 0x00000000fd3d12a8, a org.apache.geode.management.internal.FederatingManager), which is held by "vm_0_thr_0_locator_managing1_host1_13259" {noformat}{noformat} Java stack information for the threads listed above: =================================================== {noformat}{noformat} "Pooled High Priority Message Processor 5": at org.apache.geode.management.internal.BaseManagementService.getExistingManagementService(BaseManagementService.java:106) - waiting to lock <0x00000000fd486c20> (a java.util.HashMap) at org.apache.geode.management.ManagementService.getExistingManagementService(ManagementService.java:52) at org.apache.geode.management.internal.JmxManagerAdvisee.fillInProfile(JmxManagerAdvisee.java:99) at org.apache.geode.distributed.internal.DistributionAdvisor.createProfile(DistributionAdvisor.java:1033) at org.apache.geode.management.internal.JmxManagerAdvisee.getProfile(JmxManagerAdvisee.java:65) at org.apache.geode.distributed.internal.DistributionAdvisor$Profile.handleDistributionAdvisee(DistributionAdvisor.java:1541) at org.apache.geode.management.internal.JmxManagerAdvisor$JmxManagerProfile.processIncoming(JmxManagerAdvisor.java:332) at 
org.apache.geode.internal.cache.UpdateAttributesProcessor$UpdateAttributesMessage.process(UpdateAttributesProcessor.java:291) at org.apache.geode.distributed.internal.DistributionMessage.scheduleAction(DistributionMessage.java:376) at org.apache.geode.distributed.internal.DistributionMessage$1.run(DistributionMessage.java:441) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at org.apache.geode.distributed.internal.ClusterOperationExecutors.runUntilShutdown(ClusterOperationExecutors.java:446) at org.apache.geode.distributed.internal.ClusterOperationExecutors.doHighPriorityThread(ClusterOperationExecutors.java:404) at org.apache.geode.distributed.internal.ClusterOperationExecutors$$Lambda$136/1646252585.invoke(Unknown Source) at org.apache.geode.logging.internal.executors.LoggingThreadFactory.lambda$newThread$0(LoggingThreadFactory.java:120) at org.apache.geode.logging.internal.executors.LoggingThreadFactory$$Lambda$134/1613570844.run(Unknown Source) at java.lang.Thread.run(Thread.java:748) {noformat}{noformat} "vm_0_thr_0_locator_managing1_host1_13259": at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000fd2f3fb0> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199) at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:209) at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285) at org.apache.geode.management.internal.FederatingManager.startManager(FederatingManager.java:134) - locked 
<0x00000000fd3d12a8> (a org.apache.geode.management.internal.FederatingManager) at org.apache.geode.management.internal.SystemManagementService.startManager(SystemManagementService.java:373) - locked <0x00000000fd486c20> (a java.util.HashMap) at org.apache.geode.management.internal.beans.ManagementAdapter.handleCacheCreation(ManagementAdapter.java:199) at org.apache.geode.management.internal.beans.ManagementListener.handleEvent(ManagementListener.java:127) at org.apache.geode.distributed.internal.InternalDistributedSystem.notifyResourceEventListeners(InternalDistributedSystem.java:2086) at org.apache.geode.distributed.internal.InternalDistributedSystem.handleResourceEvent(InternalDistributedSystem.java:643) at org.apache.geode.internal.cache.GemFireCacheImpl.initialize(GemFireCacheImpl.java:1437) at org.apache.geode.internal.cache.InternalCacheBuilder.create(InternalCacheBuilder.java:191) - locked <0x00000000f0a15060> (a java.lang.Class for org.apache.geode.internal.cache.GemFireCacheImpl) - locked <0x00000000f0a29790> (a java.lang.Class for org.apache.geode.internal.cache.InternalCacheBuilder) at org.apache.geode.internal.cache.InternalCacheBuilder.create(InternalCacheBuilder.java:158) - locked <0x00000000f0a29790> (a java.lang.Class for org.apache.geode.internal.cache.InternalCacheBuilder) at org.apache.geode.cache.CacheFactory.create(CacheFactory.java:142) at hydra.CacheVersionHelper.configureAndCreateCache(CacheVersionHelper.java:51) at hydra.CacheHelper.createCacheWithHttpService(CacheHelper.java:127) - locked <0x00000000fd1f1070> (a java.lang.Class for hydra.CacheHelper) at hydra.CacheHelper.createCache(CacheHelper.java:87) at management.test.federation.FederationTest.createCache(FederationTest.java:234) at management.test.federation.FederationTest.initialize(FederationTest.java:225) at management.test.federation.FederationTest.HydraInitTask_initialize(FederationTest.java:130) - locked <0x00000000fd1f1308> (a java.lang.Class for 
management.test.federation.FederationTest) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at hydra.MethExecutor.execute(MethExecutor.java:173) at hydra.MethExecutor.execute(MethExecutor.java:141) at hydra.TestTask.execute(TestTask.java:197) at hydra.RemoteTestModule$1.run(RemoteTestModule.java:213) {noformat}{noformat} "DM-MemberEventInvoker": at org.apache.geode.management.internal.FederatingManager.executeTask(FederatingManager.java:357) - waiting to lock <0x00000000fd3d12a8> (a org.apache.geode.management.internal.FederatingManager) at org.apache.geode.management.internal.FederatingManager.addMember(FederatingManager.java:199) at org.apache.geode.management.internal.ManagementMembershipListener.memberJoined(ManagementMembershipListener.java:73) at org.apache.geode.distributed.internal.ClusterDistributionManager$MemberJoinedEvent.handleEvent(ClusterDistributionManager.java:2478) at org.apache.geode.distributed.internal.ClusterDistributionManager$MemberEvent.handleEvent(ClusterDistributionManager.java:2431) at org.apache.geode.distributed.internal.ClusterDistributionManager$MemberEvent.handleEvent(ClusterDistributionManager.java:2420) at org.apache.geode.distributed.internal.ClusterDistributionManager.handleMemberEvent(ClusterDistributionManager.java:1404) at org.apache.geode.distributed.internal.ClusterDistributionManager.access$200(ClusterDistributionManager.java:108) at org.apache.geode.distributed.internal.ClusterDistributionManager$MemberEventInvoker.run(ClusterDistributionManager.java:1436) at java.lang.Thread.run(Thread.java:748) {noformat} > Startup of JMX Manager may hang during crash of other members > ------------------------------------------------------------- > > Key: GEODE-8696 > URL: 
https://issues.apache.org/jira/browse/GEODE-8696 > Project: Geode > Issue Type: Bug > Components: jmx, management > Reporter: Kirk Lund > Assignee: Kirk Lund > Priority: Major > Labels: GeodeOperationAPI, pull-request-available > Fix For: 1.14.0 > > > The fix for GEODE-7400 removed final from the executorService field to > introduce a Supplier<ExecutorService>. I think this hang was caused by adding > synchronized to FederatingManager.executeTask instead of making > executorService a volatile field. > _vm_3_thr_3_client2_host2_21145_ hung while synchronized on > *0x00000000f6316520*: > {noformat} > "vm_3_thr_3_client2_host2_21145" #56 daemon prio=5 os_prio=0 > tid=0x00007f1854002000 nid=0x5326 waiting on condition [0x00007f18520e2000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00000000f6364030> (a > java.util.concurrent.FutureTask) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at java.util.concurrent.FutureTask.awaitDone(FutureTask.java:429) > at java.util.concurrent.FutureTask.get(FutureTask.java:191) > at > java.util.concurrent.AbstractExecutorService.invokeAll(AbstractExecutorService.java:244) > at > org.apache.geode.management.internal.FederatingManager.startManagingActivity(FederatingManager.java:256) > at > org.apache.geode.management.internal.FederatingManager.startManager(FederatingManager.java:121) > - locked <0x00000000f6316520> (a > org.apache.geode.management.internal.FederatingManager) > at > org.apache.geode.management.internal.SystemManagementService.startManager(SystemManagementService.java:373) > - locked <0x00000000fed91a70> (a java.util.HashMap) > at > org.apache.geode.management.internal.beans.ManagementAdapter.handleCacheCreation(ManagementAdapter.java:197) > at > org.apache.geode.management.internal.beans.ManagementListener.handleEvent(ManagementListener.java:127) > at > 
org.apache.geode.distributed.internal.InternalDistributedSystem.notifyResourceEventListeners(InternalDistributedSystem.java:2089) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.handleResourceEvent(InternalDistributedSystem.java:643) > at > org.apache.geode.internal.cache.GemFireCacheImpl.initialize(GemFireCacheImpl.java:1363) > at > org.apache.geode.internal.cache.InternalCacheBuilder.create(InternalCacheBuilder.java:191) > - locked <0x00000000e11065f0> (a java.lang.Class for > org.apache.geode.internal.cache.GemFireCacheImpl) > - locked <0x00000000e1101220> (a java.lang.Class for > org.apache.geode.internal.cache.InternalCacheBuilder) > at > org.apache.geode.internal.cache.InternalCacheBuilder.create(InternalCacheBuilder.java:158) > - locked <0x00000000e1101220> (a java.lang.Class for > org.apache.geode.internal.cache.InternalCacheBuilder) > at org.apache.geode.cache.CacheFactory.create(CacheFactory.java:142) > at > hydra.CacheVersionHelper.configureAndCreateCache(CacheVersionHelper.java:51) > at hydra.CacheHelper.createCacheWithHttpService(CacheHelper.java:127) > - locked <0x00000000fecab060> (a java.lang.Class for > hydra.CacheHelper) > at hydra.CacheHelper.createCache(CacheHelper.java:87) > at > splitBrain.NetworkPartitionTest.initialize(NetworkPartitionTest.java:293) > at > splitBrain.NetworkPartitionTest.initializeInstance(NetworkPartitionTest.java:228) > - locked <0x00000000e14a2db0> (a java.lang.Class for > splitBrain.NetworkPartitionTest) > at > splitBrain.NetworkPartitionTest.HydraTask_initialize(NetworkPartitionTest.java:203) > - locked <0x00000000e14a2db0> (a java.lang.Class for > splitBrain.NetworkPartitionTest) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at 
hydra.MethExecutor.execute(MethExecutor.java:173) > at hydra.MethExecutor.execute(MethExecutor.java:141) > at hydra.TestTask.execute(TestTask.java:197) > at hydra.RemoteTestModule$1.run(RemoteTestModule.java:213) > {noformat} > _vm_3_thr_3_client2_host2_21145_ is waiting for _FederatingManager1_ to > complete: > {noformat} > "FederatingManager1" #66 daemon prio=5 os_prio=0 tid=0x00007f1874327800 > nid=0x5331 waiting on condition [0x00007f18515d9000] > java.lang.Thread.State: TIMED_WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00000000f63b2590> (a > java.util.concurrent.CountDownLatch$Sync) > at > java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedNanos(AbstractQueuedSynchronizer.java:1037) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1328) > at java.util.concurrent.CountDownLatch.await(CountDownLatch.java:277) > at > org.apache.geode.internal.util.concurrent.StoppableCountDownLatch.await(StoppableCountDownLatch.java:72) > at > org.apache.geode.distributed.internal.ReplyProcessor21.basicWait(ReplyProcessor21.java:731) > at > org.apache.geode.distributed.internal.ReplyProcessor21.waitForReplies(ReplyProcessor21.java:639) > at > org.apache.geode.distributed.internal.ReplyProcessor21.waitForReplies(ReplyProcessor21.java:620) > at > org.apache.geode.distributed.internal.ReplyProcessor21.waitForReplies(ReplyProcessor21.java:534) > at > org.apache.geode.internal.cache.StateFlushOperation.flush(StateFlushOperation.java:244) > at > org.apache.geode.internal.cache.InitialImageOperation.getFromOne(InitialImageOperation.java:432) > at > org.apache.geode.internal.cache.DistributedRegion.getInitialImageAndRecovery(DistributedRegion.java:1236) > at > org.apache.geode.internal.cache.DistributedRegion.initialize(DistributedRegion.java:1082) > at > 
org.apache.geode.internal.cache.GemFireCacheImpl.createVMRegion(GemFireCacheImpl.java:2971) > at > org.apache.geode.internal.cache.InternalCacheForClientAccess.createInternalRegion(InternalCacheForClientAccess.java:255) > at > org.apache.geode.management.internal.FederatingManager.addMemberArtifacts(FederatingManager.java:448) > - locked <0x00000000fec1a480> (a > org.apache.geode.distributed.internal.membership.InternalDistributedMember) > at > org.apache.geode.management.internal.FederatingManager$GIITask.call(FederatingManager.java:560) > at > org.apache.geode.management.internal.FederatingManager$GIITask.call(FederatingManager.java:550) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > {noformat} > _DM-MemberEventInvoker_ is preventing _FederatingManager1_ from completing > its call to {{ReplyProcessor21.basicWait}} while waiting to lock > *0x00000000f6316520* which is held by _vm_3_thr_3_client2_host2_21145_: > {noformat} > "DM-MemberEventInvoker" #28 daemon prio=5 os_prio=0 tid=0x00007f1860356000 > nid=0x5313 waiting for monitor entry [0x00007f18531f2000] > java.lang.Thread.State: BLOCKED (on object monitor) > at > org.apache.geode.management.internal.FederatingManager.executeTask(FederatingManager.java:230) > - waiting to lock <0x00000000f6316520> (a > org.apache.geode.management.internal.FederatingManager) > at > org.apache.geode.management.internal.FederatingManager.removeMember(FederatingManager.java:160) > at > org.apache.geode.management.internal.ManagementMembershipListener.memberDeparted(ManagementMembershipListener.java:57) > at > org.apache.geode.distributed.internal.ClusterDistributionManager$MemberCrashedEvent.handleEvent(ClusterDistributionManager.java:2519) > at > 
org.apache.geode.distributed.internal.ClusterDistributionManager$MemberEvent.handleEvent(ClusterDistributionManager.java:2424) > at > org.apache.geode.distributed.internal.ClusterDistributionManager$MemberEvent.handleEvent(ClusterDistributionManager.java:2413) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.handleMemberEvent(ClusterDistributionManager.java:1401) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.access$200(ClusterDistributionManager.java:108) > at > org.apache.geode.distributed.internal.ClusterDistributionManager$MemberEventInvoker.run(ClusterDistributionManager.java:1433) > at java.lang.Thread.run(Thread.java:748) > {noformat} -- This message was sent by Atlassian Jira (v8.3.4#803005)