Author: julianfoad
Date: Thu Mar  4 12:05:28 2021
New Revision: 1887170

URL: http://svn.apache.org/viewvc?rev=1887170&view=rev
Log:
Update 'fsfsfixer' to work with packed repositories.

* contrib/server-side/fsfsfixer/fixer/find_good_id.py
  (usage): Update to match long-ago changes.
  (Repository): New class, based on one found in
    'tools/dev/benchmarks/RepoPerf/copy_repo.py'.
  (rev_file_path): Return the path of the rev pack file instead of a single
    rev file if the rev is packed.  Adjust to be a method of Repository class.
  (rev_file_indexes): Adjust to be a method of Repository class.
  (find_good_id,
   find_good_rep_header,
   main script): Adjust to use Repository class.

* contrib/server-side/fsfsfixer/fixer/fixer_config.py
  (REVS_PER_SHARD): Remove, as it is now determined automatically.

* contrib/server-side/fsfsfixer/fixer/fix-rev.py
  Adjust everywhere to use Repository class.

* contrib/server-side/fsfsfixer/README
  Remove the poor suggestion to work around this limitation by unpacking the
  repo with another script. (That script does not support packed repositories
  either and was never declared ready for use.)

Modified:
    subversion/trunk/contrib/server-side/fsfsfixer/README
    subversion/trunk/contrib/server-side/fsfsfixer/fixer/find_good_id.py
    subversion/trunk/contrib/server-side/fsfsfixer/fixer/fix-rev.py
    subversion/trunk/contrib/server-side/fsfsfixer/fixer/fixer_config.py

Modified: subversion/trunk/contrib/server-side/fsfsfixer/README
URL: 
http://svn.apache.org/viewvc/subversion/trunk/contrib/server-side/fsfsfixer/README?rev=1887170&r1=1887169&r2=1887170&view=diff
==============================================================================
--- subversion/trunk/contrib/server-side/fsfsfixer/README (original)
+++ subversion/trunk/contrib/server-side/fsfsfixer/README Thu Mar  4 12:05:28 2021
@@ -25,6 +25,3 @@ Backup your repository before running th
 For more details, see the email from Julian Foad on 2010-10-06, subject
 "Fixing FSFS 'Corrupt node-revision' and 'Corrupt representation' errors",
 <http://svn.haxx.se/dev/archive-2010-10/0095.shtml>.
-
-This script does not support fixing revisions that had been packed.  Consider
-using ../../../tools/server-side/fsfs-reshard.py first.

Modified: subversion/trunk/contrib/server-side/fsfsfixer/fixer/find_good_id.py
URL: 
http://svn.apache.org/viewvc/subversion/trunk/contrib/server-side/fsfsfixer/fixer/find_good_id.py?rev=1887170&r1=1887169&r2=1887170&view=diff
==============================================================================
--- subversion/trunk/contrib/server-side/fsfsfixer/fixer/find_good_id.py (original)
+++ subversion/trunk/contrib/server-side/fsfsfixer/fixer/find_good_id.py Thu Mar  4 12:05:28 2021
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 
 usage = """
-Print the correct FSFS node-rev id, given one that is correct except for
-its byte-offset part.
-Usage: $0 REPO-DIR FSFS-ID-WITH-BAD-OFFSET
+Usage:
+  $0 REPO-DIR FSFS-ID-WITH-BAD-OFFSET
+    -- Find the correct FSFS node-rev id, given one that is correct except for
+       its byte-offset part.
+  $0 REPO-DIR REV SIZE
+    -- Find a rep header that matches REV and SIZE.
 Example:
   Result of running 'svnadmin verify':
     svnadmin: Corrupt node-revision '5-12302.1-12953.r12953/29475'
@@ -29,42 +32,107 @@ def parse_id(id):
   noderev = node_id + '.' + copy_id + '.r' + rev
   return noderev, rev, offset
 
-def rev_file_path(repo_dir, rev):
-  """Return the path to the revision file in the repository at REPO_DIR
-     (a path string) for revision number REV (int or string).
+class Repository:
+  """Encapsulates key information of a repository:
+     its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV.
      """
-  if REVS_PER_SHARD:
-    shard = int(rev) / REVS_PER_SHARD
-    path = os.path.join(repo_dir, 'db', 'revs', str(shard), str(rev))
-  else:
-    path = os.path.join(repo_dir, 'db', 'revs', str(rev))
-  return path
-
-def rev_file_indexes(repo_dir, rev):
-  """Return (ids, texts), where IDS is a dictionary of all node-rev ids
-     defined in revision REV of the repo at REPO_DIR, in the form
-     {noderev: full_id}, and TEXTS is an array of
-     (offset, size, expanded-size, csum [,sha1-csum, uniquifier]) tuples
-     taken from all the "text: REV ..." representation lines
-     in revision REV.
 
-     Here, NODEREV is the node-revision id minus the /offset part, and
-     FULL_ID is the full node-revision id (including the /offset part).
-     """
-  ids = {}
-  texts = []
-  for line in open(rev_file_path(repo_dir, rev)):
-    if line.startswith('id: '):
-      id = line.replace('id: ', '').rstrip()
-      id_noderev, id_rev, _ = parse_id(id)
-      assert id_rev == rev
-      ids[id_noderev] = id
-    if line.startswith('text: ' + rev + ' '):  # also 'props:' lines?
-      fields = line.split()
-      texts.append(tuple(fields[2:]))
-  return ids, texts
+  def _read_repo_file(self, filename):
+    """Read and return all lines from FILENAME in REPO.
+    """
+
+    f = open(os.path.join(self.path, filename), "rb")
+    lines = f.readlines()
+    f.close()
+    return lines
+
+  def _read_config(self, filename):
+    """ Read and return all lines from FILENAME.
+        This will be used to read 'format', 'current' etc. . """
+
+    if filename not in self.db_config:
+      f = open(os.path.join(self.path, 'db', filename), "rb")
+      self.db_config[filename] = f.readlines()
+      f.close()
+
+    return self.db_config[filename]
+
+  def __init__(self, path_or_parent, name=None):
+    """Constructor collecting everything we need to know about
+       the repository at path PATH_OR_PARENT or NAME within PARENT folder.
+    """
+
+    if name is None:
+      self.name = os.path.basename(path_or_parent)
+      self.path = path_or_parent
+    else:
+      self.name = name
+      self.path = os.path.join(path_or_parent, name)
+
+    self.db_config = {}
+    self.repo_format = int(self._read_repo_file('format')[0])
+    self.fs_type = self._read_config('fs-type')[0].rstrip()
+    self.db_format = int(self._read_config('format')[0])
+    try:
+      self.shard_size = int(self._read_config('format')[1].split(' ')[2])
+    except IndexError:
+      self.shard_size = 0
+    if self.db_format >= 4:
+      self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
+    else:
+      self.min_unpacked_rev = 0
+    self.head = int(self._read_config('current')[0])
+
+  def rev_file_path(self, rev):
+    """Return the path to the revision file in the repository at REPO_DIR
+       (a path string) for revision number REV (int or string).
+       """
+    if isinstance(rev, str):
+      rev = int(rev)
+    if self.shard_size > 0:
+      shard = int(rev) / self.shard_size
+      if rev < self.min_unpacked_rev:
+          path = os.path.join(self.path, 'db', 'revs', str(shard) + '.pack', 
'pack')
+      else:
+          path = os.path.join(self.path, 'db', 'revs', str(shard), str(rev))
+    else:
+      path = os.path.join(self.path, 'db', 'revs', str(rev))
+    return path
+
+  def rev_file_indexes(self, rev):
+    """Return (ids, texts), where IDS is a dictionary of all node-rev ids
+       defined in revision REV of the repo at REPO_DIR, in the form
+       {noderev: full_id}, and TEXTS is an array of
+       (offset, size, expanded-size, csum [,sha1-csum, uniquifier]) tuples
+       taken from all the "text: REV ..." representation lines
+       in revision REV.
+
+       Here, NODEREV is the node-revision id minus the /offset part, and
+       FULL_ID is the full node-revision id (including the /offset part).
+       """
+    if isinstance(rev, str):
+      rev = int(rev)
+    ids = {}
+    texts = []
+    for line in open(self.rev_file_path(rev)):
+      if line.startswith('id: '):
+        id = line.replace('id: ', '').rstrip()
+        id_noderev, id_rev, _ = parse_id(id)
+        id_rev = int(id_rev)
+        # all ids in an unpacked rev file should match its rev number
+        if rev >= self.min_unpacked_rev:
+          assert id_rev == rev
+        # in a pre-f7 pack, revs are ordered so after REV we can stop looking
+        if id_rev > rev:
+          break
+        if id_rev == rev:
+          ids[id_noderev] = id
+      elif line.startswith('text: ' + str(rev) + ' '):  # also 'props:' lines?
+        fields = line.split()
+        texts.append(tuple(fields[2:]))
+    return ids, texts
 
-def find_good_id(repo_dir, bad_id):
+def find_good_id(repo, bad_id):
   """Return the node-rev id that is like BAD_ID but has the byte-offset
      part corrected, by looking in the revision file in the repository
      at REPO_DIR.
@@ -74,17 +142,21 @@ def find_good_id(repo_dir, bad_id):
          possibility of a false match.
   """
 
+  if isinstance(repo, str):
+    repo = Repository(repo)
   noderev, rev, bad_offset = parse_id(bad_id)
-  ids, _ = rev_file_indexes(repo_dir, rev)
+  ids, _ = repo.rev_file_indexes(rev)
 
   if noderev not in ids:
     raise FixError("NodeRev Id '" + noderev + "' not found in r" + rev)
   return ids[noderev]
 
-def find_good_rep_header(repo_dir, rev, size):
+def find_good_rep_header(repo, rev, size):
   """Find a rep header that matches REV and SIZE.
      Return the correct offset."""
-  _, texts = rev_file_indexes(repo_dir, rev)
+  if isinstance(repo, str):
+    repo = Repository(repo)
+  _, texts = repo.rev_file_indexes(rev)
   n_matches = 0
   for fields in texts:
     if fields[1] == size:
@@ -101,7 +173,8 @@ if __name__ == '__main__':
     repo_dir = sys.argv[1]
     rev = sys.argv[2]
     size = sys.argv[3]
-    print("Good offset:", find_good_rep_header(repo_dir, rev, size))
+    repo = Repository(repo_dir)
+    print("Good rep header offset:", find_good_rep_header(repo, rev, size))
     sys.exit(0)
 
   if len(sys.argv) != 3:
@@ -111,7 +184,8 @@ if __name__ == '__main__':
   repo_dir = sys.argv[1]
   bad_id = sys.argv[2]
 
-  good_id = find_good_id(repo_dir, bad_id)
+  repo = Repository(repo_dir)
+  good_id = find_good_id(repo, bad_id)
 
   # Replacement ID must be the same length, otherwise I don't know how to
   # reconstruct the file so as to preserve all offsets.

Modified: subversion/trunk/contrib/server-side/fsfsfixer/fixer/fix-rev.py
URL: 
http://svn.apache.org/viewvc/subversion/trunk/contrib/server-side/fsfsfixer/fixer/fix-rev.py?rev=1887170&r1=1887169&r2=1887170&view=diff
==============================================================================
--- subversion/trunk/contrib/server-side/fsfsfixer/fixer/fix-rev.py (original)
+++ subversion/trunk/contrib/server-side/fsfsfixer/fixer/fix-rev.py Thu Mar  4 12:05:28 2021
@@ -12,7 +12,7 @@ $LastChangedRevision$
 import os, sys, re, subprocess
 from subprocess import Popen, PIPE
 
-from find_good_id import FixError, rev_file_path, find_good_id, 
find_good_rep_header
+from find_good_id import FixError, Repository, find_good_id, 
find_good_rep_header
 from fixer_config import *
 
 
@@ -70,11 +70,11 @@ def replace_in_file(filename, old, new):
                 "    with '" + new + "'")
   os.remove(filename + '.bak')
 
-def replace_in_rev_file(repo_dir, rev, old, new):
+def replace_in_rev_file(repo, rev, old, new):
   """Replace all occurrences of the string OLD with the string NEW in the
      revision file for revision REV in the repository at REPO_DIR.  Raise an
      error if nothing changes."""
-  rev_file = rev_file_path(repo_dir, rev)
+  rev_file = repo.rev_file_path(rev)
   replace_in_file(rev_file, old, new)
 
 # Fix a node-rev ID that has a bad byte-offset part.  Look up the correct
@@ -85,10 +85,10 @@ def replace_in_rev_file(repo_dir, rev, o
 #   since the error reported for <REV> might actually exist in an older
 #   revision that is referenced by <REV>.
 #
-def fix_id(repo_dir, rev, bad_id):
+def fix_id(repo, rev, bad_id):
 
   # Find the GOOD_ID to replace BAD_ID.
-  good_id = find_good_id(repo_dir, bad_id)
+  good_id = find_good_id(repo, bad_id)
 
   # Replacement ID must be the same length, otherwise I don't know how to
   # reconstruct the file so as to preserve all offsets.
@@ -100,37 +100,37 @@ def fix_id(repo_dir, rev, bad_id):
     raise FixError("The ID supplied is already correct: " +
                    "good id '" + good_id + "'")
 
-  replace_in_rev_file(repo_dir, rev, bad_id, good_id)
+  replace_in_rev_file(repo, rev, bad_id, good_id)
   print("Fixed id: " + bad_id + " -> " + good_id)
   fixed_ids[bad_id] = good_id
 
-def fix_checksum(repo_dir, rev, old_checksum, new_checksum):
+def fix_checksum(repo, rev, old_checksum, new_checksum):
   """Change all occurrences of OLD_CHECKSUM to NEW_CHECKSUM in the revision
      file for REV in REPO_DIR."""
 
   assert len(old_checksum) and len(new_checksum)
   assert old_checksum != new_checksum
 
-  replace_in_rev_file(repo_dir, rev, old_checksum, new_checksum)
+  replace_in_rev_file(repo, rev, old_checksum, new_checksum)
   print("Fixed checksum: " + old_checksum + " -> " + new_checksum)
   fixed_checksums[old_checksum] = new_checksum
 
-def fix_rep_ref(repo_dir, rev, prefix, rep_rev, bad_offset, rep_size):
+def fix_rep_ref(repo, rev, prefix, rep_rev, bad_offset, rep_size):
   """Fix a "DELTA <REP_REV> <BAD_OFFSET> <REP_SIZE>"
         or "text: <REP_REV> <BAD_OFFSET> <REP_SIZE> ..."
      line in the revision file for REV in REPO_DIR, where <BAD_OFFSET> is
      wrong.  PREFIX is 'DELTA' or 'text:'.
   """
-  good_offset = find_good_rep_header(repo_dir, rep_rev, rep_size)
+  good_offset = find_good_rep_header(repo, rep_rev, rep_size)
   old_line = ' '.join([prefix, rep_rev, bad_offset, rep_size])
   new_line = ' '.join([prefix, rep_rev, good_offset, rep_size])
   if good_offset == bad_offset:
     raise FixError("Attempting to fix a rep ref that appears to be correct: " 
+ old_line)
-  replace_in_rev_file(repo_dir, rev, old_line, new_line)
+  replace_in_rev_file(repo, rev, old_line, new_line)
   print("Fixed rep ref:", old_line, "->", new_line)
 
 
-def handle_one_error(repo_dir, rev, error_lines):
+def handle_one_error(repo, rev, error_lines):
   """If ERROR_LINES describes an error we know how to fix, then fix it.
      Return True if fixed, False if not fixed."""
 
@@ -151,7 +151,7 @@ def handle_one_error(repo_dir, rev, erro
     # Fix it.
     bad_id = match.group(1)
     verbose_print(error_lines[0])
-    fix_id(repo_dir, rev, bad_id)
+    fix_id(repo, rev, bad_id)
 
     # Verify again, and expect to discover a checksum mismatch.
     # verbose_print("Fixed an ID; now verifying to discover the checksum we 
need to update")
@@ -163,7 +163,7 @@ def handle_one_error(repo_dir, rev, erro
     #
     # expected = ...
     # actual   = ...
-    # fix_checksum(repo_dir, rev, expected, actual)
+    # fix_checksum(repo, rev, expected, actual)
 
     return True
 
@@ -171,7 +171,7 @@ def handle_one_error(repo_dir, rev, erro
   if match:
     expected = re.match(r' *expected: *([^ ]*)', error_lines[1]).group(1)
     actual   = re.match(r' *actual: *([^ ]*)',   error_lines[2]).group(1)
-    fix_checksum(repo_dir, rev, expected, actual)
+    fix_checksum(repo, rev, expected, actual)
     return True
 
   match = re.match(r"svn.*: Corrupt representation '([0-9]*) ([0-9]*) ([0-9]*) 
.*'", line1)
@@ -196,11 +196,11 @@ def handle_one_error(repo_dir, rev, erro
     # a knock-on effect, invalidating the checksum of the rep so that all
     # references to this rep will then need their checksums correcting.
     try:
-      fix_rep_ref(repo_dir, rev, 'DELTA', bad_rev, bad_offset, bad_size)
+      fix_rep_ref(repo, rev, 'DELTA', bad_rev, bad_offset, bad_size)
     except FixError:
       # In at least one case of corruption, every bad reference has been in a
       # 'text:' line.  Fixing this has no knock-on effect.
-      fix_rep_ref(repo_dir, rev, 'text:', bad_rev, bad_offset, bad_size)
+      fix_rep_ref(repo, rev, 'text:', bad_rev, bad_offset, bad_size)
 
     return True
 
@@ -213,18 +213,18 @@ def grab_stderr(child_argv):
   child_err = [line for line in stderr.splitlines() if '(apr_err=' not in line]
   return child_err
 
-def fix_one_error(repo_dir, rev):
+def fix_one_error(repo, rev):
   """Verify, and if there is an error we know how to fix, then fix it.
      Return False if no error, True if fixed, exception if can't fix."""
 
   # Capture the output of 'svnadmin verify' (ignoring any debug-build output)
-  svnadmin_err = grab_stderr([SVNADMIN, 'verify', '-q', '-r'+rev, repo_dir])
+  svnadmin_err = grab_stderr([SVNADMIN, 'verify', '-q', '-r'+rev, repo.path])
 
   if svnadmin_err == []:
     return False
 
   try:
-    if handle_one_error(repo_dir, rev, svnadmin_err):
+    if handle_one_error(repo, rev, svnadmin_err):
       return True
     else:
       verbose_print("Unrecognized error message; trying 'svnlook' instead.")
@@ -237,51 +237,48 @@ def fix_one_error(repo_dir, rev):
   # one that we *can* handle.
 
   # Capture the output of 'svnlook tree' (ignoring any debug-build output)
-  svnlook_err = grab_stderr([SVNLOOK, 'tree', '-r'+rev, repo_dir])
+  svnlook_err = grab_stderr([SVNLOOK, 'tree', '-r'+rev, repo.path])
 
   if svnlook_err == []:
     print('warning: svnlook did not find an error')
   else:
-    if handle_one_error(repo_dir, rev, svnlook_err):
+    if handle_one_error(repo, rev, svnlook_err):
       return True
     else:
       verbose_print("Unrecognized error message.")
 
   raise FixError("unable to fix r" + str(rev))
 
-def check_formats(repo_dir):
+def check_formats(repo):
   """Check that REPO_DIR isn't newer than we know how to handle."""
 
-  repos_format = int(open(os.path.join(repo_dir, 'format')).readline())
-  if repos_format not in [3,5]:
+  if repo.repo_format not in [3,5]:
     raise FixError("Repository '%s' too new (format %d); try the version at %s"
-                   % (repo_dir, repos_format, URL))
+                   % (repo.path, repo.repo_format, URL))
 
-  fs_type = open(os.path.join(repo_dir, 'db', 'fs-type')).read().rstrip()
-  if fs_type != 'fsfs':
+  if repo.fs_type != 'fsfs':
     raise FixError("Repository '%s' has wrong FS backend: "
-                   "found '%s', expected '%s'" % (repo_dir, fs_type, 'fsfs'))
+                   "found '%s', expected '%s'" % (repo.path, repo.fs_type, 
'fsfs'))
 
-  fsfs_format = int(open(os.path.join(repo_dir, 'db', 'format')).readline())
-  if fsfs_format > MAX_FSFS_FORMAT:
+  if repo.db_format > MAX_FSFS_FORMAT:
     raise FixError("Filesystem '%s' is too new (format %d); try the version at 
%s"
-                   % (os.path.join(repo_dir, 'db'), fsfs_format, URL))
+                   % (os.path.join(repo.path, 'db'), repo.db_format, URL))
 
 # ----------------------------------------------------------------------
 # Main program
 
-def fix_rev(repo_dir, rev):
+def fix_rev(repo, rev):
   """"""
 
-  check_formats(repo_dir)
+  check_formats(repo)
 
   # Back up the file
-  if not os.path.exists(rev_file_path(repo_dir, rev) + '.orig'):
+  if not os.path.exists(repo.rev_file_path(rev) + '.orig'):
     pass
     # cp -a "$FILE" "$FILE.orig"
 
   # Keep looking for verification errors in r$REV and fixing them while we can.
-  while fix_one_error(repo_dir, rev):
+  while fix_one_error(repo, rev):
     pass
   print("Revision " + rev + " verifies OK.")
 
@@ -295,8 +292,9 @@ if __name__ == '__main__':
   repo_dir = sys.argv[1]
   rev = sys.argv[2]
 
+  repo = Repository(repo_dir)
   try:
-    fix_rev(repo_dir, rev)
+    fix_rev(repo, rev)
   except FixError as e:
     print('error:', e)
     sys.exit(1)

Modified: subversion/trunk/contrib/server-side/fsfsfixer/fixer/fixer_config.py
URL: 
http://svn.apache.org/viewvc/subversion/trunk/contrib/server-side/fsfsfixer/fixer/fixer_config.py?rev=1887170&r1=1887169&r2=1887170&view=diff
==============================================================================
--- subversion/trunk/contrib/server-side/fsfsfixer/fixer/fixer_config.py (original)
+++ subversion/trunk/contrib/server-side/fsfsfixer/fixer/fixer_config.py Thu Mar  4 12:05:28 2021
@@ -9,16 +9,3 @@ SVNLOOK = 'svnlook'
 # Verbosity: True for verbose, or False for quiet
 VERBOSE = True
 
-# PER-REPOSITORY CONFIGURATION
-
-# The number of revs per shard of the repository being accessed, or 'None'
-# for a linear (that is, non-sharded) layout.  This is 1000 for almost all
-# repositories in practice.
-#
-# The correct value can be found in the 'db/format' file in the repository.
-# The second line of that file will say something like 'layout sharded 1000'
-# or 'layout linear'.
-#
-# TODO: Read this value automatically from the db/format file.
-REVS_PER_SHARD=1000
-


Reply via email to