Tackling full pdiff support for debmirror tonight, I took the last patch a bit further and implemented APT's RRED method, basically by translating from C++ to perl in a linear fashion.
This means that debmirror is now capable of using the pdiff files to patch already available, out-dated Packages or Sources files and thus saving some bandwidth by not having to download the full (albeit compressed) files. I have little hope that above feature will ever be integrated, as debmirror currently seems to be unmaintained, and the inclusion of such an invasive piece of code or rather an improved variation thereof to support a slow and maybe volatile patch method might be questionable. Nevertheless, for those who would like to play with it... Regards, Peter
--- /tmp/debmirror-20051209~/debmirror 2005-12-09 19:13:09.000000000 +0100 +++ /tmp/debmirror-20051209/debmirror 2006-09-06 22:50:54.000000000 +0200 @@ -355,6 +355,7 @@ use LockFile::Simple; use Compress::Zlib; use Digest::MD5; +use Digest::SHA1; use LWP::UserAgent; # Yeah, I use too many global variables in this program. @@ -634,6 +635,7 @@ add_bytes("dists/$dist/$section/binary-$arch/Packages.gz"); add_bytes("dists/$dist/$section/binary-$arch/Packages.bz2"); add_bytes("dists/$dist/$section/binary-$arch/Release"); + add_bytes("dists/$dist/$section/binary-$arch/Packages.diff/Index"); } # d-i has no sources over there, sources are in main next if ($section =~ /debian-installer/); @@ -642,6 +644,7 @@ add_bytes("dists/$dist/$section/source/Sources.gz"); add_bytes("dists/$dist/$section/source/Sources.bz2"); add_bytes("dists/$dist/$section/source/Release"); + add_bytes("dists/$dist/$section/source/Sources.diff/Index"); } } } @@ -650,10 +653,12 @@ add_bytes("$_/Packages.gz"); add_bytes("$_/Packages.bz2"); add_bytes("$_/Release"); + add_bytes("$_/Packages.diff/Index"); if ($do_source) { add_bytes("$_/Sources"); add_bytes("$_/Sources.gz"); add_bytes("$_/Sources.bz2"); + add_bytes("$_/Sources.diff/Index"); } } if ($getcontents) { @@ -1059,6 +1064,24 @@ return 0; } +# Check uncompressed pdiff content against sha1sum from Index file. +sub check_pdiff { + my ($filename, $size, $sha1) = @_; + my $digest = Digest::SHA1->new; + my $ret = 0; + + if (-f "$filename.gz") { + system_redirect_io("gunzip", "$filename.gz", "$filename"); + if ($size == -s $filename) { + open HANDLE, $filename or die "$filename: $!"; + $digest->addfile(*HANDLE); + $ret = ($sha1 eq $digest->hexdigest); + } + unlink ($filename); + } + return $ret; +} + # Check file against md5sum and size from the Release file. # It will return true if the md5sum matches. sub check_lists { @@ -1276,6 +1299,32 @@ make_dir($subdir); make_dir("$tempdir/$subdir"); + if (exists $file_lists_size{"$tempdir/$subdir/Packages.diff/Index"}) { + if (!check_lists ("$tempdir/$subdir/Packages.diff/Index")) { + make_dir("$subdir/Packages.diff"); + make_dir("$tempdir/$subdir/Packages.diff"); + say("$subdir/Packages.diff/Index needs fetch"); + remote_get("$subdir/Packages.diff/Index"); + if (!check_lists ("$tempdir/$subdir/Packages.diff/Index")) { + say("$subdir/Packages.diff/Index failed md5sum check, removing"); + push (@errlog,"$subdir/Packages.diff/Index failed md5sum check, removing\n"); + unlink "$tempdir/$subdir/Packages.diff/Index"; + } else { + fetch_and_apply_pdiffs($subdir, "Packages"); + if (check_lists ("$tempdir/$subdir/Packages")) { + system_redirect_io("gzip -9 -n", "$tempdir/$subdir/Packages", "$tempdir/$subdir/Packages.gz"); + system_redirect_io("bzip2", "$tempdir/$subdir/Packages", "$tempdir/$subdir/Packages.bz2"); + } + } + } else { + $bytes_gotten += $file_lists_size{"$tempdir/$subdir/Packages.diff/Index"}; + fetch_and_apply_pdiffs($subdir, "Packages"); + if (check_lists ("$tempdir/$subdir/Packages")) { + system_redirect_io("gzip -9 -n", "$tempdir/$subdir/Packages", "$tempdir/$subdir/Packages.gz"); + system_redirect_io("bzip2", "$tempdir/$subdir/Packages", "$tempdir/$subdir/Packages.bz2"); + } + } + } if (exists $file_lists_size{"$tempdir/$subdir/Packages.gz"}) { if (!check_lists ("$tempdir/$subdir/Packages.gz")) { say("$subdir/Packages.gz needs fetch"); @@ -1345,10 +1394,12 @@ $files{"$subdir/Packages.bz2"}=1; $files{"$subdir/Packages"}=1; $files{"$subdir/Release"}=1; + $files{"$subdir/Packages.diff/Index"}=1; $files{"$tempdir/$subdir/Packages.gz"}=1; $files{"$tempdir/$subdir/Packages.bz2"}=1; $files{"$tempdir/$subdir/Packages"}=1; $files{"$tempdir/$subdir/Release"}=1; + $files{"$tempdir/$subdir/Packages.diff/Index"}=1; } # Get Sources file @@ -1358,6 +1409,32 @@ if ($do_source) { make_dir($subdir); make_dir($tempdir."/".$subdir); + if (exists $file_lists_size{"$tempdir/$subdir/Sources.diff/Index"}) { + if (!check_lists ("$tempdir/$subdir/Sources.diff/Index")) { + make_dir("$subdir/Sources.diff"); + make_dir("$tempdir/$subdir/Sources.diff"); + say("$subdir/Sources.diff/Index needs fetch"); + remote_get("$subdir/Sources.diff/Index"); + if (!check_lists ("$tempdir/$subdir/Sources.diff/Index")) { + say("$subdir/Sources.diff/Index failed md5sum check, removing"); + push (@errlog,"$subdir/Sources.diff/Index failed md5sum check, removing\n"); + unlink "$tempdir/$subdir/Sources.diff/Index"; + } else { + fetch_and_apply_pdiffs($subdir, "Sources"); + if (check_lists ("$tempdir/$subdir/Sources")) { + system_redirect_io("gzip -9 -n", "$tempdir/$subdir/Sources", "$tempdir/$subdir/Sources.gz"); + system_redirect_io("bzip2", "$tempdir/$subdir/Sources", "$tempdir/$subdir/Sources.bz2"); + } + } + } else { + $bytes_gotten += $file_lists_size{"$tempdir/$subdir/Sources.diff/Index"}; + fetch_and_apply_pdiffs($subdir, "Sources"); + if (check_lists ("$tempdir/$subdir/Sources")) { + system_redirect_io("gzip -9 -n", "$tempdir/$subdir/Sources", "$tempdir/$subdir/Sources.gz"); + system_redirect_io("bzip2", "$tempdir/$subdir/Sources", "$tempdir/$subdir/Sources.bz2"); + } + } + } if (exists $file_lists_size{"$tempdir/$subdir/Sources.gz"}) { if (!check_lists ("$tempdir/$subdir/Sources.gz")) { say("$subdir/Sources.gz needs fetch"); @@ -1427,10 +1504,202 @@ $files{"$subdir/Sources"}=1; $files{"$subdir/Sources.bz2"}=1; $files{"$subdir/Release"}=1; + $files{"$subdir/Sources.diff/Index"}=1; $files{"$tempdir/$subdir/Sources.gz"}=1; $files{"$tempdir/$subdir/Sources"}=1; $files{"$tempdir/$subdir/Sources.bz2"}=1; $files{"$tempdir/$subdir/Release"}=1; + $files{"$tempdir/$subdir/Sources.diff/Index"}=1; + } +} + +sub fetch_and_apply_pdiffs { + my ($subdir, $list) = @_; + local (*INDEX, *LIST, *LIST_OLD, *PDIFF); + my (%history_sha1, %history_size, %pdiff_sha1, %pdiff_size); + my ($current_sha1, $current_size, $sha1, $size, $file, $digest, $ret); + + # Parse DiffIndex file + open(INDEX, "$tempdir/$subdir/$list.diff/Index") or die "$tempdir/$subdir/$list.diff/Index: $!"; + $_ = <INDEX>; + while (defined($_)) { + if (m/^SHA1-Current:/m) { + ($current_sha1, $current_size) = m/^SHA1-Current:\s+([A-Za-z0-9]+)\s+(\d+)/m; + $_ = <INDEX>; + } + elsif (m/^SHA1-History:/m) { + while (defined($_ = <INDEX>)) { + last if (!m/^\s/m); + ($sha1, $size, $file) = m/^\s+([A-Za-z0-9]+)\s+(\d+)\s+(.*)/m; + $history_sha1{$file} = $sha1; + $history_size{$file} = $size; + } + } + elsif (m/^SHA1-Patches:/m) { + while (defined($_ = <INDEX>)) { + last if (!m/^\s/m); + ($sha1, $size, $file) = m/^\s+([A-Za-z0-9]+)\s+(\d+)\s+(.*)/m; + $pdiff_sha1{$file} = $sha1; + $pdiff_size{$file} = $size; + } + } + } + close(INDEX); + + # Download pdiff files as necessary + $ret = 1; + foreach $file (sort keys %pdiff_sha1) { + if (!check_pdiff("$tempdir/$subdir/$list.diff/$file", $pdiff_size{$file}, $pdiff_sha1{$file})) { + say("$subdir/$list.diff/$file.gz needs fetch"); + remote_get("$subdir/$list.diff/$file.gz"); + $bytes_to_get += -s "$tempdir/$subdir/$list.diff/$file.gz"; + if (!check_pdiff("$tempdir/$subdir/$list.diff/$file", $pdiff_size{$file}, $pdiff_sha1{$file})) { + say("$subdir/$list.diff/$file.gz failed sha1sum check, removing"); + push (@errlog,"$subdir/$list.diff/$file.gz failed sha1sum check, removing\n"); + unlink "$tempdir/$subdir/$list.diff/$file.gz"; + $ret = 0; + } + } else { + $bytes_to_get += -s "$tempdir/$subdir/$list.diff/$file.gz"; + $bytes_gotten += -s "$tempdir/$subdir/$list.diff/$file.gz"; + } + $files{"$subdir/$list.diff/$file.gz"}=1; + $files{"$tempdir/$subdir/$list.diff/$file.gz"}=1; + } + return unless ($ret); + + # Apply pdiff files + open(LIST, "$subdir/$list") or return; + $digest = Digest::SHA1->new; + $digest->addfile(*LIST); + $sha1 = $digest->hexdigest; + $size = -s "$subdir/$list"; + system("cp $subdir/$list $tempdir/$subdir/$list"); + foreach $file (sort keys %history_sha1) { + next unless ($sha1 eq $history_sha1{$file} && $size eq $history_size{$file}); + system_redirect_io("gunzip", "$tempdir/$subdir/$list.diff/$file.gz", "$tempdir/$subdir/$list.diff/$file"); + system("mv $tempdir/$subdir/$list $tempdir/$subdir/$list.old"); + open(PDIFF, "$tempdir/$subdir/$list.diff/$file") or die "$tempdir/$subdir/$list.diff/$file: $!"; + open(LIST_OLD, "$tempdir/$subdir/$list.old") or die "$tempdir/$subdir/$list.old: $!"; + open(LIST, ">$tempdir/$subdir/$list") or die "$tempdir/$subdir/$list: $!"; + rred_ed_file(*PDIFF, *LIST_OLD, *LIST, $digest); + close(PDIFF); + close(LIST_OLD); + close(LIST); + $sha1 = $digest->hexdigest; + $size = -s "$tempdir/$subdir/$list"; + say("$subdir/$list patched with $subdir/$list.diff/$file.gz"); + } + unlink("$tempdir/$subdir/$list $tempdir/$subdir/$list.old"); + if (!($sha1 eq $current_sha1 && $size eq $current_size)) { + say("$subdir/$list failed sha1sum check, removing"); + push (@errlog,"$subdir/$list failed sha1sum check, removing\n"); + unlink "$tempdir/$subdir/$list"; + } +} + +# apt pdiff patch algorithm (see apt-0.6.45/methods/rred.cc) +{ + # this method implements a patch functionality similar to "patch --ed" that is + # used by the "tiffany" incremental packages download stuff. it differs from + # "ed" insofar that it is way more restricted (and therefore secure). in the + # moment only the "c", "a" and "d" commands of ed are implemented (diff + # doesn't output any other). additionally the records must be reverse sorted + # by line number and may not overlap (diff *seems* to produce this kind of + # output). + + # rred method ed commands + use constant RRED_MODE_CHANGED => 'c'; + use constant RRED_MODE_DELETED => 'd'; + use constant RRED_MODE_ADDED => 'a'; + # rred method return values + use constant RRED_ED_OK => 0; + use constant RRED_ED_ORDERING => 1; + use constant RRED_ED_PARSER => 2; + use constant RRED_ED_FAILURE => 3; + + sub rred_ed_file { + local (*ED_CMDS, *IN_FILE, *OUT_FILE) = splice @_, 0, 3; + my ($digest) = @_; + my $result; + + # we do a tail recursion to read the commands in the right order + $result = rred_ed_rec(*ED_CMDS, *IN_FILE, *OUT_FILE, 0, $digest); + + # read the rest from infile + if ($result > 0) { + while (<IN_FILE>) { + print OUT_FILE; + $digest->add($_); + } + } + else { + return RRED_ED_FAILURE; + } + return RRED_ED_OK; + } + + sub rred_ed_rec { + no warnings 'recursion'; + local (*ED_CMDS, *IN_FILE, *OUT_FILE) = splice @_, 0, 3; + my ($line, $digest) = @_; + my ($pos, $startline, $stopline, $mode); + + # get the current command and parse it + if (defined($_ = <ED_CMDS>)) { + ($startline, $stopline, $mode) = m/^(\d+)(?:,(\d+))?([cda])$/m; + + if (not defined($startline)) { + return RRED_ED_PARSER; + } + elsif ($startline < $line) { + return RRED_ED_ORDERING; + } + $stopline = $startline unless defined($stopline); + } + else { + return $line; + } + # get the current position + $pos = tell(ED_CMDS); + # if this is add or change then go to the next full stop + if (($mode eq RRED_MODE_CHANGED) || ($mode eq RRED_MODE_ADDED)) { + while (<ED_CMDS>) { + last if (m/^\./m); + } + } + # do the recursive call + $line = rred_ed_rec(*ED_CMDS, *IN_FILE, *OUT_FILE, $line, $digest); + # pass on errors + if ($line < 0) { + return $line; + } + # apply our hunk + seek(ED_CMDS, $pos, 0); + # first wind to the current position + $startline -= 1 unless ($mode eq RRED_MODE_ADDED); + while ($line < $startline) { + $_ = <IN_FILE>; + print OUT_FILE $_; + $digest->add($_); + $line++; + } + # include from ed script + if (($mode eq RRED_MODE_ADDED) || ($mode eq RRED_MODE_CHANGED)) { + while (<ED_CMDS>) { + last if (m/^\./m); + print OUT_FILE; + $digest->add($_); + } + } + # ignore the corresponding number of lines from input + if (($mode eq RRED_MODE_DELETED) || ($mode eq RRED_MODE_CHANGED)) { + while ($line < $stopline) { + <IN_FILE>; + $line++; + } + } + return $line; } }