This is an automated email from the ASF dual-hosted git repository. kturner pushed a commit to branch elasticity in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/elasticity by this push: new f6909cd706 fixes multiple upgrade bugs found when testing (#4717) f6909cd706 is described below commit f6909cd7067cad575e70893f3c1d5f4c99cc5676 Author: Keith Turner <ktur...@apache.org> AuthorDate: Tue Jul 2 10:10:48 2024 -0700 fixes multiple upgrade bugs found when testing (#4717) Fixed a bug where upgrade code was attempting to read the manager's service lock before it was set. Fixed a bug with detecting fate operations. The code to detect fate operations would hang trying to read the fate table which did not exist yet. Scans of the metadata table running as the !SYSTEM user would fail if no auths were specified. The !SYSTEM user has no auths set in zookeeper and an exception would happen when trying to find them. A new zookeeper node related to compaction changes needed to be created. Simplified code for setting tablet availability by using DataLevel. Co-authored-by: Dave Marion <dlmar...@apache.org> --- .../accumulo/server/AccumuloDataVersion.java | 6 +-- .../java/org/apache/accumulo/manager/Manager.java | 2 +- .../manager/upgrade/UpgradeCoordinator.java | 30 ++++++------ .../accumulo/manager/upgrade/Upgrader12to13.java | 54 +++++++++++----------- 4 files changed, 42 insertions(+), 50 deletions(-) diff --git a/server/base/src/main/java/org/apache/accumulo/server/AccumuloDataVersion.java b/server/base/src/main/java/org/apache/accumulo/server/AccumuloDataVersion.java index 598b8e48cf..e2dfa4d961 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/AccumuloDataVersion.java +++ b/server/base/src/main/java/org/apache/accumulo/server/AccumuloDataVersion.java @@ -82,10 +82,8 @@ public class AccumuloDataVersion { return CURRENT_VERSION; } - // ELASTICITY_TODO get upgrade working #4587 - // public static final Set<Integer> CAN_RUN = Set.of(ROOT_TABLET_META_CHANGES, - // REMOVE_DEPRECATIONS_FOR_VERSION_3, METADATA_FILE_JSON_ENCODING, CURRENT_VERSION); - public static final 
Set<Integer> CAN_RUN = Set.of(CURRENT_VERSION); + public static final Set<Integer> CAN_RUN = Set.of(CURRENT_VERSION, + REMOVE_DEPRECATIONS_FOR_VERSION_3, METADATA_FILE_JSON_ENCODING, ROOT_TABLET_META_CHANGES); /** * Get the stored, current working version. diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java index d5778a44c5..28be0b7ffb 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java @@ -1037,7 +1037,6 @@ public class Manager extends AbstractServer } catch (KeeperException | InterruptedException e) { throw new IllegalStateException("Exception getting manager lock", e); } - this.getContext().setServiceLock(getManagerLock()); // If UpgradeStatus is not at complete by this moment, then things are currently // upgrading. @@ -1463,6 +1462,7 @@ public class Manager extends AbstractServer sleepUninterruptibly(TIME_TO_WAIT_BETWEEN_LOCK_CHECKS, MILLISECONDS); } + this.getContext().setServiceLock(getManagerLock()); setManagerState(ManagerState.HAVE_LOCK); return sld; } diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/UpgradeCoordinator.java b/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/UpgradeCoordinator.java index 46d29516c2..2f4f7dd2ff 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/UpgradeCoordinator.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/UpgradeCoordinator.java @@ -31,7 +31,6 @@ import java.util.TreeMap; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Future; import java.util.concurrent.SynchronousQueue; -import java.util.stream.Stream; import org.apache.accumulo.core.Constants; import org.apache.accumulo.core.client.AccumuloException; @@ -39,9 +38,6 @@ import org.apache.accumulo.core.client.AccumuloSecurityException; 
import org.apache.accumulo.core.client.NamespaceNotFoundException; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.conf.ConfigCheckUtil; -import org.apache.accumulo.core.fate.MetaFateStore; -import org.apache.accumulo.core.fate.ReadOnlyFateStore; -import org.apache.accumulo.core.fate.user.UserFateStore; import org.apache.accumulo.core.metadata.schema.Ample; import org.apache.accumulo.core.util.threads.ThreadPools; import org.apache.accumulo.core.volume.Volume; @@ -309,19 +305,19 @@ public class UpgradeCoordinator { justification = "Want to immediately stop all manager threads on upgrade error") private void abortIfFateTransactions(ServerContext context) { try { - final ReadOnlyFateStore<UpgradeCoordinator> mfs = new MetaFateStore<>( - context.getZooKeeperRoot() + Constants.ZFATE, context.getZooReaderWriter()); - final ReadOnlyFateStore<UpgradeCoordinator> ufs = new UserFateStore<>(context); - try (var mfsList = mfs.list(); var ufsList = ufs.list(); - var idStream = Stream.concat(mfsList, ufsList)) { - if (idStream.findFirst().isPresent()) { - throw new AccumuloException("Aborting upgrade because there are" - + " outstanding FATE transactions from a previous Accumulo version." - + " You can start the tservers and then use the shell to delete completed " - + " transactions. If there are incomplete transactions, you will need to roll" - + " back and fix those issues. Please see the following page for more information: " - + " https://accumulo.apache.org/docs/2.x/troubleshooting/advanced#upgrade-issues"); - } + // The current version of the code creates the new accumulo.fate table on upgrade, so no + // attempt is made to read it here. Attempting to read it this point would likely cause a hang + // as tablets are not assigned when this is called. The Fate code is not used to read from + // zookeeper below because the serialization format changed in zookeeper, that is why a direct + // read is performed. 
+ if (!context.getZooReader().getChildren(context.getZooKeeperRoot() + Constants.ZFATE) + .isEmpty()) { + throw new AccumuloException("Aborting upgrade because there are" + + " outstanding FATE transactions from a previous Accumulo version." + + " You can start the tservers and then use the shell to delete completed " + + " transactions. If there are incomplete transactions, you will need to roll" + + " back and fix those issues. Please see the following page for more information: " + + " https://accumulo.apache.org/docs/2.x/troubleshooting/advanced#upgrade-issues"); } } catch (Exception exception) { log.error("Problem verifying Fate readiness", exception); diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/Upgrader12to13.java b/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/Upgrader12to13.java index 025b08ef79..4071d64180 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/Upgrader12to13.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/upgrade/Upgrader12to13.java @@ -68,11 +68,13 @@ public class Upgrader12to13 implements Upgrader { @Override public void upgradeZookeeper(ServerContext context) { LOG.info("setting root table stored hosting availability"); - addHostingGoalToRootTable(context); + addHostingGoals(context, TabletAvailability.HOSTED, DataLevel.ROOT); LOG.info("Removing nodes no longer used from ZooKeeper"); removeUnusedZKNodes(context); LOG.info("Removing compact columns from root tablet"); removeCompactColumnsFromRootTabletMetadata(context); + LOG.info("Adding compactions node to zookeeper"); + addCompactionsNode(context); } @Override @@ -82,7 +84,7 @@ public class Upgrader12to13 implements Upgrader { LOG.info("Looking for partial splits"); handlePartialSplits(context, AccumuloTable.ROOT.tableName()); LOG.info("setting metadata table hosting availability"); - addHostingGoalToMetadataTable(context); + addHostingGoals(context, TabletAvailability.HOSTED, 
DataLevel.METADATA); LOG.info("Removing MetadataBulkLoadFilter iterator from root table"); removeMetaDataBulkLoadFilter(context, AccumuloTable.ROOT.tableId()); LOG.info("Removing compact columns from metadata tablets"); @@ -94,7 +96,7 @@ public class Upgrader12to13 implements Upgrader { LOG.info("Looking for partial splits"); handlePartialSplits(context, AccumuloTable.METADATA.tableName()); LOG.info("setting hosting availability on user tables"); - addHostingGoalToUserTables(context); + addHostingGoals(context, TabletAvailability.ONDEMAND, DataLevel.USER); LOG.info("Deleting external compaction final states from user tables"); deleteExternalCompactionFinalStates(context); LOG.info("Deleting external compaction from user tables"); @@ -105,6 +107,16 @@ public class Upgrader12to13 implements Upgrader { removeCompactColumnsFromTable(context, AccumuloTable.METADATA.tableName()); } + private static void addCompactionsNode(ServerContext context) { + try { + context.getZooReaderWriter().putPersistentData( + ZooUtil.getRoot(context.getInstanceID()) + Constants.ZCOMPACTIONS, new byte[0], + ZooUtil.NodeExistsPolicy.SKIP); + } catch (KeeperException | InterruptedException e) { + throw new IllegalStateException(e); + } + } + private void createFateTable(ServerContext context) { try { TableManager.prepareNewTableState(context, AccumuloTable.FATE.tableId(), @@ -163,7 +175,7 @@ public class Upgrader12to13 implements Upgrader { private void removeCompactColumnsFromTable(ServerContext context, String tableName) { - try (var scanner = context.createScanner(tableName); + try (var scanner = context.createScanner(tableName, Authorizations.EMPTY); var writer = context.createBatchWriter(tableName)) { scanner.setRange(MetadataSchema.TabletsSection.getRange()); COMPACT_COL.fetch(scanner); @@ -220,7 +232,9 @@ public class Upgrader12to13 implements Upgrader { // Compactions are committed in a completely different way now, so delete these entries. 
Its // possible some completed compactions may need to be redone, but processing these entries would // not be easy to test so its better for correctness to delete them and redo the work. - try (var scanner = context.createScanner(AccumuloTable.METADATA.tableName()); + try ( + var scanner = + context.createScanner(AccumuloTable.METADATA.tableName(), Authorizations.EMPTY); var writer = context.createBatchWriter(AccumuloTable.METADATA.tableName())) { var section = new Section(RESERVED_PREFIX + "ecomp", true, RESERVED_PREFIX + "ecomq", false); scanner.setRange(section.getRange()); @@ -240,31 +254,13 @@ public class Upgrader12to13 implements Upgrader { } } - private void addHostingGoalToSystemTable(ServerContext context, TableId tableId) { + private void addHostingGoals(ServerContext context, TabletAvailability availability, + DataLevel level) { try ( TabletsMetadata tm = - context.getAmple().readTablets().forTable(tableId).fetch(ColumnType.PREV_ROW).build(); - TabletsMutator mut = context.getAmple().mutateTablets()) { - tm.forEach(t -> mut.mutateTablet(t.getExtent()) - .putTabletAvailability(TabletAvailability.HOSTED).mutate()); - } - } - - private void addHostingGoalToRootTable(ServerContext context) { - addHostingGoalToSystemTable(context, AccumuloTable.ROOT.tableId()); - } - - private void addHostingGoalToMetadataTable(ServerContext context) { - addHostingGoalToSystemTable(context, AccumuloTable.METADATA.tableId()); - } - - private void addHostingGoalToUserTables(ServerContext context) { - try ( - TabletsMetadata tm = context.getAmple().readTablets().forLevel(DataLevel.USER) - .fetch(ColumnType.PREV_ROW).build(); + context.getAmple().readTablets().forLevel(level).fetch(ColumnType.PREV_ROW).build(); TabletsMutator mut = context.getAmple().mutateTablets()) { - tm.forEach(t -> mut.mutateTablet(t.getExtent()) - .putTabletAvailability(TabletAvailability.ONDEMAND).mutate()); + tm.forEach(t -> mut.mutateTablet(t.getExtent()).putTabletAvailability(availability).mutate()); 
} } @@ -273,7 +269,9 @@ public class Upgrader12to13 implements Upgrader { // process the metadata table. The metadata related to an external compaction has changed so // delete any that exists. Not using Ample in case there are problems deserializing the old // external compaction metadata. - try (var scanner = context.createScanner(AccumuloTable.METADATA.tableName()); + try ( + var scanner = + context.createScanner(AccumuloTable.METADATA.tableName(), Authorizations.EMPTY); var writer = context.createBatchWriter(AccumuloTable.METADATA.tableName())) { scanner.setRange(TabletsSection.getRange()); scanner.fetchColumnFamily(ExternalCompactionColumnFamily.NAME);