On Wed, Oct 07, 2015 at 02:51:33AM +0200, Cyril Brulebois wrote: > Steven Chamberlain <ste...@pyro.eu.org> (2015-10-07): > > If we had multiple udeb sources (Bug#345419), it could mean that > > anna reads a Packages file having multiple versions for some udebs. > > It's very easy to make net-retriever generate such a file, combining > > multiple suites, e.g. stable + stable-proposed-updates; or maybe > > stable + stable-backports. > > TBH, I'm not sure whether this should be dealt with in anna or in > net-retriever. The latter is responsible for the addition in the first > place, so could be considered as the one responsible for avoiding > duplicate entries. Another way to look at it would be: net-retriever > knows this limitation in anna, and is fine with trusting anna to do the > "only pick the last occurrence" selection, provided n-r cats everything > in the right order. > > ISTR some shell dance in net-retriever from my old patchset; having some > C code in anna instead doesn't seem crazier. > > > Taking a step back, one could argue that the logic could be slightly > different, like picking the highest version. Even if it's a corner case, > we could imagine having bits merged in a point release that's higher > than what's in backports. More interestingly, that would mean anna > doesn't care about the order in which the files were cat'd together.
If it's helpful, here's most of the patch I wrote for Ubuntu's net-retriever a while back that implements basically what you suggest here. The remaining piece is that you'd need to actually call deduplicate at an appropriate point. It might be more robust to have this in anna though, so that it would work for other retrievers that might somehow end up in a similar situation. Beware of doing it in shell; an earlier version of my patch did that and it was prohibitively slow (~10 minutes) for large Packages files, which we ended up with as a result of lots of kernel ABIs. * Deduplicate Packages files before passing them to anna. diff -Nru net-retriever-1.39/Makefile net-retriever-1.39ubuntu1/Makefile --- net-retriever-1.39/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ net-retriever-1.39ubuntu1/Makefile 2013-07-16 12:24:57.000000000 +0100 @@ -0,0 +1,7 @@ +CFLAGS := -Os -fomit-frame-pointer -g -Wall +LDLIBS := -ldebian-installer + +all: deduplicate + +clean: + rm -f deduplicate diff -Nru net-retriever-1.39/debian/control net-retriever-1.39ubuntu1/debian/control --- net-retriever-1.39/debian/control 2014-03-03 10:26:08.000000000 +0000 +++ net-retriever-1.39ubuntu1/debian/control 2014-11-17 16:30:47.000000000 +0000 @@ -3,15 +3,15 @@ Priority: optional Maintainer: Debian Install System Team <debian-b...@lists.debian.org> Uploaders: Christian Perrier <bubu...@debian.org>, Cyril Brulebois <k...@debian.org> -Build-Depends: debhelper (>= 9) +Build-Depends: debhelper (>= 9), libdebian-installer4-dev Build-Depends-Indep: dpkg-dev (>= 1.7.0) Vcs-Browser: http://anonscm.debian.org/gitweb/?p=d-i/net-retriever.git Vcs-Git: git://anonscm.debian.org/d-i/net-retriever.git Package: net-retriever Package-Type: udeb -Architecture: all -Depends: ${misc:Depends}, choose-mirror, configured-network, di-utils (>= 1.58), gpgv-udeb, debian-archive-keyring-udeb +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends}, choose-mirror, configured-network, di-utils (>= 1.58), gpgv-udeb, ubuntu-keyring-udeb Provides: retriever Description: Fetch modules from the Internet This is a retriever that uses wget to fetch files over http or ftp. diff -Nru net-retriever-1.39/debian/net-retriever.install net-retriever-1.39ubuntu1/debian/net-retriever.install --- net-retriever-1.39/debian/net-retriever.install 2012-03-12 09:17:51.000000000 +0000 +++ net-retriever-1.39ubuntu1/debian/net-retriever.install 2013-07-16 12:42:20.000000000 +0100 @@ -1 +1,2 @@ net-retriever usr/lib/debian-installer/retriever +deduplicate usr/lib/net-retriever diff -Nru net-retriever-1.39/debian/rules net-retriever-1.39ubuntu1/debian/rules --- net-retriever-1.39/debian/rules 2012-03-12 09:17:51.000000000 +0000 +++ net-retriever-1.39ubuntu1/debian/rules 2012-12-06 18:02:14.000000000 +0000 @@ -1,3 +1,12 @@ #! /usr/bin/make -f + +DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE) +DEB_HOST_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE) + %: dh $@ + +ifneq ($(DEB_BUILD_GNU_TYPE),$(DEB_HOST_GNU_TYPE)) +override_dh_auto_build: + dh_auto_build -- CC=$(DEB_HOST_GNU_TYPE)-gcc +endif diff -Nru net-retriever-1.39/deduplicate.c net-retriever-1.39ubuntu1/deduplicate.c --- net-retriever-1.39/deduplicate.c 1970-01-01 01:00:00.000000000 +0100 +++ net-retriever-1.39ubuntu1/deduplicate.c 2013-07-16 12:41:13.000000000 +0100 @@ -0,0 +1,216 @@ +/* Remove duplicates in Packages file, pending libd-i doing it for us. */ + +#define _GNU_SOURCE + +#include <sys/types.h> +#include <ctype.h> +#include <regex.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include <debian-installer.h> + +di_hash_table *versions, *entries; +regex_t package_re, version_re; + +static void package_version_free (void *key) +{ + di_package_version *ver = key; + + di_free (ver->upstream); + di_free (ver->debian_revision); + di_free (ver); +} + +static void xregcomp (regex_t *preg, const char *regex, int cflags) +{ + int err; + + err = regcomp (preg, regex, cflags); + if (err) { + char *errbuf; + size_t errbuf_size; + + errbuf_size = regerror (err, preg, NULL, 0); + errbuf = di_malloc (errbuf_size); + regerror (err, preg, errbuf, errbuf_size); + fprintf (stderr, "Failed to compile /%s/: %s", regex, errbuf); + di_free (errbuf); + exit (1); + } +} + +static bool match_regex_capture_one (regex_t *preg, di_rstring *entry, + di_rstring *out) +{ + regmatch_t matches[2]; + + if (regexec (preg, entry->string, 2, matches, 0) != 0 || + matches[1].rm_so == -1 || matches[1].rm_eo == -1) + return false; + out->size = matches[1].rm_eo - matches[1].rm_so; + out->string = di_stradup (entry->string + matches[1].rm_so, out->size); + return true; +} + +static di_rstring *rstring_copy (di_rstring *orig) +{ + di_rstring *copy; + + copy = di_new (di_rstring, 1); + copy->string = strdup (orig->string); + copy->size = orig->size; + return copy; +} + +static void deduplicate_one (di_rstring *entry) +{ + di_rstring package, newver_str; + di_package dummynewver; + di_package_version *newver, *oldver; + + package.string = NULL; + newver_str.string = NULL; + + if (!match_regex_capture_one (&package_re, entry, &package)) + goto out; + if (!*package.string || + memchr (package.string, '/', package.size)) + goto out; + + if (!match_regex_capture_one (&version_re, entry, &newver_str)) + goto out; + + /* libdebian-installer has a crazy interface that won't let me parse + * raw strings, so I need this dance. + */ + dummynewver.version = newver_str.string; + newver = di_package_version_parse (&dummynewver); + if (!newver) { + fprintf (stderr, "failed to parse version %s\n", + newver_str.string); + exit (2); + } + oldver = di_hash_table_lookup (versions, &package); + if (oldver && di_package_version_compare (oldver, newver) >= 0) { + package_version_free (newver); + goto out; + } + di_hash_table_insert (versions, rstring_copy (&package), newver); + di_hash_table_insert (entries, rstring_copy (&package), + rstring_copy (entry)); + +out: + di_free (package.string); + di_free (newver_str.string); +} + +struct entry { + di_rstring key, value; +}; + +struct all_entries { + struct entry *entries; + di_ksize_t size, pos; +}; + +static void entry_append (void *key, void *value, void *user_data) +{ + struct all_entries *all_entries = user_data; + all_entries->entries[all_entries->pos].key = *(di_rstring *) key; + all_entries->entries[all_entries->pos].value = *(di_rstring *) value; + ++all_entries->pos; +} + +static int entry_compare (const void *va, const void *vb) +{ + const struct entry *a = va, *b = vb; + return strcmp (a->key.string, b->key.string); +} + +static void output (void) +{ + struct all_entries all_entries; + di_ksize_t i; + + all_entries.size = di_hash_table_size (entries); + all_entries.entries = di_new (struct entry, all_entries.size); + all_entries.pos = 0; + di_hash_table_foreach (entries, entry_append, &all_entries); + qsort (all_entries.entries, all_entries.size, sizeof (struct entry), + entry_compare); + + for (i = 0; i < all_entries.size; ++i) { + di_rstring *value = &all_entries.entries[i].value; + + fputs (value->string, stdout); + if (!value->size || value->string[value->size - 1] != '\n') + fputc ('\n', stdout); + fputc ('\n', stdout); + } + + di_free (all_entries.entries); +} + +int main (int argc, char **argv) +{ + int ret; + di_rstring line; + ssize_t line_size; + size_t line_alloc; + di_rstring entry; + size_t entry_alloc; + + versions = di_hash_table_new_full (di_rstring_hash, di_rstring_equal, + free, package_version_free); + entries = di_hash_table_new_full (di_rstring_hash, di_rstring_equal, + free, free); + xregcomp (&package_re, "^Package:[[:space:]]+(.*)", + REG_EXTENDED | REG_ICASE | REG_NEWLINE); + xregcomp (&version_re, "^Version:[[:space:]]+(.*)", + REG_EXTENDED | REG_ICASE | REG_NEWLINE); + line_alloc = 0; + line.string = NULL; + entry_alloc = 4096; + entry.string = di_malloc (entry_alloc); + entry.size = 0; + + while ((line_size = getline (&line.string, &line_alloc, stdin)) >= 0) { + line.size = (di_ksize_t) line_size; + if (*line.string && *line.string != '\n') { + di_ksize_t new_size; + + new_size = entry.size + line.size + 1; + if (new_size > entry_alloc) { + while (new_size > entry_alloc) + entry_alloc *= 2; + entry.string = di_realloc (entry.string, + entry_alloc); + } + memcpy (entry.string + entry.size, line.string, + line.size + 1); + entry.size += line.size; + } else { + deduplicate_one (&entry); + *entry.string = '\0'; + entry.size = 0; + } + } + if (ferror (stdin)) { + perror ("getline"); + ret = 1; + goto out; + } + if (entry.size) + deduplicate_one (&entry); + output (); + ret = 0; + +out: + di_hash_table_destroy (versions); + di_hash_table_destroy (entries); + regfree (&package_re); + regfree (&version_re); + return ret; +} diff -Nru net-retriever-1.39/net-retriever net-retriever-1.39ubuntu1/net-retriever --- net-retriever-1.39/net-retriever 2014-03-03 10:26:08.000000000 +0000 +++ net-retriever-1.39ubuntu1/net-retriever 2014-11-17 16:30:25.000000000 +0000 @@ -86,6 +86,12 @@ exit 1 } +# Nasty hack to remove duplicates in Packages file. +deduplicate () { + /usr/lib/net-retriever/deduplicate <"$1" >"$1.new" + mv "$1.new" "$1" +} + cmd="$1" shift -- Colin Watson [cjwat...@debian.org]