Hi,

On Thu, 30 May 2024 14:26:31 +0000 Holger Levsen <hol...@layer-acht.org> wrote:
> very "nice" find, josch!
With the help of Holger and osuosl4 I have dug into this a bit more and tried to get some hard data about this problem. My idea was the following: parse all Packages files for all suites, all architectures and all components for all timestamps stored on snapshot.d.o and find packages with the same name/arch/version tuple that have a different checksum. To this end, I slightly (less than 1000 lines of diff) patched the tooling at https://salsa.debian.org/metasnap-team/metasnap.git with the patch that I attached to this mail, on top of 1dadf2575160caf9467c4e21aa6c0a31ac10ffc2.

After running that script for 3 months and downloading 189 GB of data in 3.5 million requests (about 2 seconds per request), we had a database (actually a git repository) of 48 GB that we can use to find duplicates. It took another 2 months to go through that data. I attached a graph which shows the number of duplicate name/arch/version triplets per timestamp. Please note the logarithmic y-axis. The total number of duplicates from 2005 until 2024 is 334335.

Problem solved? Not so fast. Processing all Packages files will *not* find the original problem with bash. Why? Because according to the Packages files from snapshot.debian.org only one version of bash:arm64=5.2.15-2+b3 exists, namely:

MD5sum: 01ee4cfa3df78e7ff0dc156ff19e2c88
SHA1: 1a0b12419b69a983bf22ac1d3d9f8bec725487b1
SHA256: 828ce0b4445921fff5b6394e74cce8296f3038d559845a3e82435b55ca6fcaa8

The other version never ended up in a Packages file even though it was found in the /pool/main/b/bash directory in the snapshot of 2023-07-13 21:11:09, nearly one year before the other version popped up. How can a package be in the pool directory but not in a Packages file? No idea, but it shows that my method from above does not find a certain class of problems. We could find those by creating a fitting query against the snapshot.d.o database. Apparently lw07 is DD-accessible and has a snapshot-guest service, so this is on my TODO list, and Nicolas Dandrimont already offered to help with constructing an appropriate SQL query during MiniDebConf Hamburg this year.

Lastly, there is the problem of packages in incoming. Those packages will be used to build other packages that end up in the archive, but they might never end up in the archive themselves. Thus, we might never know whether one of these packages violated the idea that the packagename/architecture/version triplet uniquely identifies a Debian binary package in the archive...

Thanks!

cheers, josch
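For readers who want to reproduce the core idea without the full tooling, here is a rough sketch of the duplicate check described above. This is not the code from the attached patch (which parses the Packages files by hand because Deb822 is considerably slower); it assumes python-debian is installed and that you already have uncompressed Packages files on disk:

```python
# Sketch: remember which SHA256 was seen for each (name, arch, version)
# triplet and report any triplet that later shows up with a different
# checksum. Field names (Package, Architecture, Version, SHA256) are the
# standard Packages file fields.
from collections import defaultdict
from debian import deb822

def find_duplicates(packages_files):
    """packages_files: iterable of paths to uncompressed Packages files."""
    seen = {}                      # (name, arch, version) -> first sha256 seen
    duplicates = defaultdict(set)  # (name, arch, version) -> {sha256, ...}
    for path in packages_files:
        with open(path, "rb") as f:
            for pkg in deb822.Packages.iter_paragraphs(f):
                key = (pkg["Package"], pkg["Architecture"], pkg["Version"])
                sha = pkg.get("SHA256")
                if sha is None:
                    continue
                if key in seen and seen[key] != sha:
                    duplicates[key].update({seen[key], sha})
                else:
                    seen[key] = sha
    return duplicates

# usage (illustrative): find_duplicates(Path("...").rglob("Packages"))
```

The real run over all snapshot.d.o timestamps additionally has to track in which timestamp each checksum was seen, which is what the sqlite database in the attached run2.py changes is for.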
diff --git a/quirks.py b/quirks.py index 5ccf6b3..c9ee218 100644 --- a/quirks.py +++ b/quirks.py @@ -532,6 +532,12 @@ def validate_release_arches(archive, timestamp, suite, release_arches, release_f "sparc", ] ) + elif timestamp == "20101003T163248Z" and suite == "sid": + # the Release file stores different sizes for sha256 + for arch in ["s390", "kfreebsd-i386", "kfreebsd-amd64"]: + #assert arch not in release_arches, arch + #release_arches.append(arch) + release_arches.remove(arch) elif timestamp == "20110312T103459Z" and suite == "sid": # the Release file is missing entries for Packages files for arch in release_arches: @@ -849,20 +855,8 @@ def validate_Packages(archive, timestamp, suite, area, arch, bname): return False elif ( timestamp == "20101003T163248Z" - and algo == "SHA256" - and suite == "sid" - and area == "main" - and arch == "binary-s390" - and bname in ["Packages.gz", "Packages.bz2"] - ): - # the Release file stores different sizes for sha256 - return False - elif ( - timestamp == "20101003T163248Z" - and algo == "SHA256" and suite == "sid" - and area == "main" - and arch in ["binary-kfreebsd-i386", "binary-kfreebsd-amd64"] + and arch in ["binary-s390", "binary-kfreebsd-i386", "binary-kfreebsd-amd64"] ): # the Release file stores different sizes for sha256 return False @@ -999,6 +993,13 @@ def validate_Packages_link(archive, timestamp, suite, area, arch, release_arches and arch == "binary-m68k" ): return False + elif ( + timestamp == "20101003T163248Z" + and suite == "sid" + and arch in ["binary-s390", "binary-kfreebsd-i386", "binary-kfreebsd-amd64"] + ): + # the Release file stores different sizes for sha256 + return False elif ( "20161107T033615Z" <= timestamp # FIXME: find last and suite in ["stretch", "stretch-proposed-updates", "stretch-updates"] diff --git a/run.py b/run.py index 79e23ee..61e6de1 100755 --- a/run.py +++ b/run.py @@ -140,6 +140,8 @@ def download(url): c.URL, url, ) + c.setopt(pycurl.FOLLOWLOCATION, 1) + c.setopt(pycurl.MAXREDIRS, 5) # even 100 kB/s is too much sometimes c.setopt(c.MAX_RECV_SPEED_LARGE, 1000 * 1024) # bytes per second c.setopt(c.CONNECTTIMEOUT, 30) # the default is 300 @@ -204,7 +206,7 @@ def download(url): # callback was aborted due to timeout global num_timeoutexc num_timeoutexc += 1 - sleep_time = 4 ** (retrynum + 1) + sleep_time = 2 ** (retrynum + 1) print("retrying after %f s..." 
% sleep_time) global num_timeouts num_timeouts += 1 @@ -572,10 +574,6 @@ if not repopath.exists(): else: subprocess.check_call(["git", "-C", repopath, "checkout", "main"]) -for f in [Path("Release.%s" % archive), Path("Release.%s.gpg" % archive)]: - if f.exists(): - f.unlink() - if list(repopath.iterdir()) != [repopath / ".git"]: # remove files from the working tree and from the index subprocess.run(["git", "-C", repopath, "rm", "-r", "."]) @@ -596,8 +594,11 @@ if list(repopath.iterdir()) != [repopath / ".git"]: if not cpath.is_dir(): continue for e in cpath.iterdir(): - e.unlink() + (e / "Packages").unlink() + e.rmdir() cpath.rmdir() + (path / "Release").unlink(missing_ok=True) + (path / "Release.gpg").unlink(missing_ok=True) path.rmdir() assert list(repopath.iterdir()) == [repopath / ".git"] @@ -679,15 +680,13 @@ for i, timestamp in enumerate(timestamps): continue release_files = dict() + (repopath / suite ).mkdir(parents=True, exist_ok=True) # download Release and Release.gpg rel_data = download(get_baseurl(archive, timestamp, suite) + "/Release") - assert not os.path.exists("Release.%s" % archive) - with open("Release.%s" % archive, "wb") as f: - f.write(rel_data) - assert not os.path.exists("Release.%s.gpg" % archive) + (repopath / suite / "Release").write_bytes(rel_data) + rel_gpg = download(get_baseurl(archive, timestamp, suite) + "/Release.gpg") - with open("Release.%s.gpg" % archive, "wb") as f: - f.write(rel_gpg) + (repopath / suite / "Release.gpg").write_bytes(rel_gpg) # validate Release using Release.gpg with gpgv if archive in ["debian", "debian-debug", "debian-security"]: @@ -720,8 +719,8 @@ for i, timestamp in enumerate(timestamps): ] + keyrings + [ - "Release.%s.gpg" % archive, - "Release.%s" % archive, + (repopath / suite / "Release.gpg"), + (repopath / suite / "Release"), ], stderr=subprocess.DEVNULL, ) @@ -732,8 +731,6 @@ for i, timestamp in enumerate(timestamps): print("wrong signature of Release") else: raise - os.unlink("Release.%s" % archive) - os.unlink("Release.%s.gpg" % archive) rel = Release(BytesIO(rel_data)) byhash = False if rel.get("Acquire-By-Hash") == "yes": @@ -910,7 +907,8 @@ for i, timestamp in enumerate(timestamps): continue # no need to download anything if Packages file is of zero size if props["size"] == 0: - (repopath / suite / area / arch).touch() + (repopath / suite / area / f"binary-{arch}").mkdir(parents=True, exist_ok=True) + (repopath / suite / area / f"binary-{arch}" / "Packages").touch() print(archive, timestamp, suite, fname, "(zero size)") curr_packages[suite][fname] = b"" continue @@ -939,7 +937,7 @@ for i, timestamp in enumerate(timestamps): "reset", "--quiet", "HEAD", - "%s/%s/%s" % (suite, area, arch), + "%s/%s/binary-%s/Packages" % (suite, area, arch), ] ) subprocess.check_call( @@ -950,7 +948,7 @@ for i, timestamp in enumerate(timestamps): "checkout", "--quiet", "--", - "%s/%s/%s" % (suite, area, arch), + "%s/%s/binary-%s/Packages" % (suite, area, arch), ] ) print(archive, timestamp, suite, fname, "(no changes)") @@ -964,6 +962,7 @@ for i, timestamp in enumerate(timestamps): last_packages is not None and suite in last_packages and fname in last_packages[suite] + and fname in last_release_props[suite] and fname + ".diff/Index" in release_files ): # apply diff to last downloaded file @@ -1039,34 +1038,9 @@ for i, timestamp in enumerate(timestamps): print("wrong checksums2:", fname) else: raise + (repopath / suite / area / f"binary-{arch}").mkdir(parents=True, exist_ok=True) + (repopath / suite / area / f"binary-{arch}" / 
"Packages").write_bytes(data) curr_packages[suite][fname] = data - packages = [] - ## using Deb822 is 12 times slower - ## even with use_apt_pkg=True it's 4 times slower than manual parsing - # for pkg in Deb822.iter_paragraphs(data, use_apt_pkg=True): - # packages.append((pkg["Package"], pkg["Version"])) - pkgname = None - pkgver = None - data = quirks.fixup_data(archive, timestamp, suite, area, arch, data) - for line in data.splitlines(): - if line == b"": - assert pkgname is not None - assert pkgver is not None - packages.append((pkgname, pkgver)) - pkgname = None - pkgver = None - elif line.startswith(b"Package: "): - pkgname = line[9:].lstrip().decode("ascii") - elif line.startswith(b"Version: "): - pkgver = line[9:].lstrip().decode("ascii") - assert pkgname is None - assert pkgver is None - # Sort the list of packages to minimize the diff between timestamps - # in the git repo. Sometimes just the order of packages changes and - # we are not interested in the order. - (repopath / suite / area / arch).write_text( - "".join("%s %s\n" % (pkg, ver) for pkg, ver in sorted(packages)) - ) assert curr_release_props.keys() == curr_packages.keys() last_release_props = curr_release_props diff --git a/run2.py b/run2.py index e77eddc..1dd3476 100755 --- a/run2.py +++ b/run2.py @@ -22,6 +22,7 @@ from datetime import datetime from pathlib import Path import sqlite3 import time +from hashlib import blake2b import quirks @@ -55,15 +56,30 @@ Path("by-package").mkdir(exist_ok=True) db_existed = os.path.exists("by-package/%s.sqlite3" % archive) conn = sqlite3.connect("by-package/%s.sqlite3" % archive) conn.execute("PRAGMA journal_mode=WAL") - lastversions = dict() lasttimestamp = None lastarches = dict() +last_packages_hashes = set() timestamp2id = dict() suite2id = dict() arch2id = dict() pkg2id = dict() ver2id = dict() +hashes = { + "MD5sum": set(), + "SHA1": set(), + "SHA256": set(), +} +nav2hash = { + "MD5sum": dict(), + "SHA1": dict(), + "SHA256": dict(), +} +hash2nav = { + "MD5sum": dict(), + "SHA1": dict(), + "SHA256": dict(), +} if db_existed: timestamp2id = { @@ -97,6 +113,18 @@ if db_existed: ver2id = { name: i for name, i in conn.execute("select name, id from vers").fetchall() } + for algo in ["SHA256", "SHA1", "MD5sum"]: + hashes[algo] = set( + [ + (n, a, v, h) + for n, a, v, h in conn.execute( + f"select name, arch, version, hash from {algo}" + ).fetchall() + ] + ) + for n, a, v, h in hashes[algo]: + hash2nav[algo][h] = (n, a, v) + nav2hash[algo][(n, a, v)] = h rows = conn.execute( """ select suites.name, ranges.comp, arches.name, pkgs.name, vers.name, begin.name @@ -166,6 +194,9 @@ else: # creating a key on pkg,arch,ver,suite,comp would not be unique. 
conn.executescript( """ +create table MD5sum (name text, arch text, version text, hash text, primary key (name,arch,version,hash)); +create table SHA1 (name text, arch text, version text, hash text, primary key (name,arch,version,hash)); +create table SHA256 (name text, arch text, version text, hash text, primary key (name,arch,version,hash)); create table suites (id integer primary key, name text unique); create table arches (id integer primary key, name text unique); create table pkgs (id integer primary key, name text unique); @@ -191,6 +222,23 @@ starttime = time.time() for i, timestamp in enumerate(timestamps): print(timestamp) + for algo in ["SHA256", "SHA1", "MD5sum"]: + print(f"{algo:6}: num hashes: {len(hashes[algo])}") + # print(f"{algo}: num hash2nav: {len(hash2nav[algo])}") + # print(f"{algo}: num nav2hash: {len(nav2hash[algo])}") + ## sometimes, 'git checkout' would fail with: + ## Your local changes to the following files would be overwritten by checkout: + ## Maybe files on the disk get corrupted? Use 'git reset' to force update + #subprocess.check_call( + # [ + # "git", + # "-C", + # "./by-timestamp/%s.git" % archive, + # "reset", + # "--hard", + # timestamp, + # ] + #) subprocess.check_call( [ "git", @@ -201,7 +249,9 @@ for i, timestamp in enumerate(timestamps): timestamp, ] ) + ts_start_time = time.time() thisarches = defaultdict(set) + this_packages_hashes = set() thisversions = dict() cur = conn.execute("insert into timestamps(name) values(?)", (timestamp,)) timestamp2id[timestamp] = cur.lastrowid @@ -259,56 +309,55 @@ for i, timestamp in enumerate(timestamps): / f"binary-{arch}" / "Packages" ).read_bytes() # Packages files are not necessarily valid utf8 + hashed_data = blake2b(data).digest() + this_packages_hashes.add(hashed_data) + # do not parse a Packages file again that we have already seen + if hashed_data in last_packages_hashes: + continue data = quirks.fixup_data(archive, timestamp, suite, comp, arch, data) pkgname = None pkgver = None pkgarch = None - hashes = {b"SHA256": None, b"SHA1": None, b"MD5sum": None} + pkghashes = {b"SHA256": None, b"SHA1": None, b"MD5sum": None} + newhashes = { + "MD5sum": set(), + "SHA1": set(), + "SHA256": set(), + } for line in data.splitlines(): if line == b"": assert pkgname is not None assert pkgver is not None assert pkgarch is not None - assert any(hashes.values()) + assert any(pkghashes.values()) - for algo, h in hashes.items(): + for algo, h in pkghashes.items(): if h is None: continue - - hpath = ( - Path("hash-collisions") - / algo.decode("ascii") - / h[:2] - / h[:4] - ) - hpath.mkdir(parents=True, exist_ok=True) - hpath = hpath / h - content = None - if hpath.exists(): - content = hpath.read_text() - if content is None: - hpath.write_text(f"{pkgname}_{pkgver}_{pkgarch}\n") - elif content != f"{pkgname}_{pkgver}_{pkgarch}\n": - print(f"found packages with the same hash: {h}") - content = set(content.splitlines()) - content.add(f"{pkgname}_{pkgver}_{pkgarch}") - hpath.write_text("\n".join(sorted(content)) + "\n") - - ppath = Path("pkgverarch-collisions") / algo.decode("ascii") / pkgname / pkgarch - ppath.mkdir(parents=True, exist_ok=True) - ppath = ppath / pkgver - content = None - if ppath.exists(): - content = ppath.read_text() - if content is None: - ppath.write_text(h + "\n") - elif content != h + "\n": - content = set(content.splitlines()) - content.add(h) - print( - f"found {len(content)} hashes for the same package: {pkgname}:{pkgarch}={pkgver}" - ) - ppath.write_text("\n".join(sorted(content)) + "\n") + algo = 
algo.decode("ascii") + if h in hash2nav[algo]: + if hash2nav[algo][h] != (pkgname, pkgarch, pkgver): + print( + f"{timestamp}: different package with same hash {h.hex()}: {hash2nav[algo][h]} != {(pkgname, pkgarch, pkgver)}" + ) + # make sure that this file gets re-processed in the next + # timestamp even if it is identical so that we do not miss + # listing collisions + this_packages_hashes.discard(hashed_data) + if (pkgname, pkgarch, pkgver) in nav2hash[algo]: + if nav2hash[algo][(pkgname, pkgarch, pkgver)] != h: + print( + f"{timestamp}: different hash for the same package {(pkgname, pkgarch, pkgver)}: {nav2hash[algo][(pkgname, pkgarch, pkgver)].hex()} != {h.hex()}" + ) + # make sure that this file gets re-processed in the next + # timestamp even if it is identical so that we do not miss + # listing collisions + this_packages_hashes.discard(hashed_data) + if (pkgname, pkgarch, pkgver, h) not in hashes[algo]: + newhashes[algo].add((pkgname, pkgarch, pkgver, h)) + hashes[algo].add((pkgname, pkgarch, pkgver, h)) + hash2nav[algo][h] = (pkgname, pkgarch, pkgver) + nav2hash[algo][(pkgname, pkgarch, pkgver)] = h if pkgname not in pkg2id: cur = conn.execute( @@ -328,7 +377,7 @@ for i, timestamp in enumerate(timestamps): pkgname = None pkgver = None pkgarch = None - hashes = {b"SHA256": None, b"SHA1": None, b"MD5sum": None} + pkghashes = {b"SHA256": None, b"SHA1": None, b"MD5sum": None} elif line.startswith(b"Package: "): pkgname = line[9:].lstrip().decode("ascii") elif line.startswith(b"Version: "): @@ -336,217 +385,232 @@ for i, timestamp in enumerate(timestamps): elif line.startswith(b"Architecture: "): pkgarch = line[14:].lstrip().decode("ascii") - for algo in hashes.keys(): + for algo in pkghashes.keys(): if line.startswith(algo + b": "): - hashes[algo] = ( - line[len(algo) + 2 :].lstrip().decode("ascii") - ) - - for pkg in thisversions[key]: - if key in lastversions: - last_ver = lastversions[key].get(pkg) - else: - last_ver = None - if timestamp == timestamps[-1]: - rows = [] - # special handling for last timestamp - if last_ver is None: - for ver in thisversions[key][pkg]: - rows.append((ver, timestamp, timestamp)) - else: - if last_ver.keys() == thisversions[key][pkg]: - for ver in last_ver: - rows.append((ver, last_ver[ver], timestamp)) - else: - # there is a difference - for ver in sorted( - last_ver.keys() | thisversions[key][pkg] - ): - if ( - ver in last_ver - and ver in thisversions[key][pkg] - ): - # version is in both, the last and current timestamp - rows.append((ver, last_ver[ver], timestamp)) - elif ( - ver in last_ver - and ver not in thisversions[key][pkg] - ): - # version is in last but not current timestamp - rows.append((ver, last_ver[ver], lasttimestamp)) - elif ( - ver not in last_ver - and ver in thisversions[key][pkg] - ): - # version is new in this timestamp - rows.append((ver, timestamp, timestamp)) - else: - raise Exception("logic error") - conn.executemany( - """insert into ranges(suite, comp, arch, pkg, ver, begin, end) - values(?, ?, ?, ?, ?, ?, ?)""", - [ - ( - suite2id[suite], - compid, - arch2id[arch], - pkg2id[pkg], - ver2id[v], - timestamp2id[b], - timestamp2id[e], + # convert ascii hex to bytes to save some space in memory + try: + pkghashes[algo] = bytes.fromhex( + line[len(algo) + 2 :].lstrip().decode("ascii") ) - for v, b, e in rows - ], - ) - if last_ver is not None: - del lastversions[key][pkg] - if not lastversions[key]: - del lastversions[key] - continue - if last_ver is None: - # package didn't exist before, start new entry - if key not in 
lastversions: - lastversions[key] = dict() - lastversions[key][pkg] = { - v: timestamp for v in thisversions[key][pkg] - } - continue - if last_ver.keys() == thisversions[key][pkg]: - # same versions in this Packages file as in the last - continue - # there is a difference -- add a new line to the output for - # each version that was removed and add a new entry in the - # dict for each version that was added - rows = [] - for ver in last_ver.keys() - thisversions[key][pkg]: - # for all removed versions, add a new line - rows.append((ver, last_ver[ver], lasttimestamp)) - # and delete the old entry - del lastversions[key][pkg][ver] - if not lastversions[key][pkg]: - del lastversions[key][pkg] - if not lastversions[key]: - del lastversions[key] - conn.executemany( - """insert into ranges(suite, comp, arch, pkg, ver, begin, end) - values(?, ?, ?, ?, ?, ?, ?)""", - [ - ( - suite2id[suite], - compid, - arch2id[arch], - pkg2id[pkg], - ver2id[v], - timestamp2id[b], - timestamp2id[e], - ) - for v, b, e in rows - ], - ) - for ver in thisversions[key][pkg] - last_ver.keys(): - # for all added versions, add a new entry - if key not in lastversions: - lastversions[key] = dict() - if pkg not in lastversions[key]: - lastversions[key][pkg] = dict() - lastversions[key][pkg][ver] = timestamp + except ValueError as e: + print(f"{timestamp}: failed decoding {line} in {suite}/{comp}/binary-{arch}/Packages: {e}") - # go through all packages that were handled last timestamp but - # were missing this timestamp - if key in lastversions: - for pkg in lastversions[key].keys() - thisversions[key].keys(): - rows = [] - for ver in lastversions[key][pkg]: - rows.append( - (ver, lastversions[key][pkg][ver], lasttimestamp) - ) - conn.executemany( - """insert into ranges(suite, comp, arch, pkg, ver, begin, end) - values(?, ?, ?, ?, ?, ?, ?)""", - [ - ( - suite2id[suite], - compid, - arch2id[arch], - pkg2id[pkg], - ver2id[v], - timestamp2id[b], - timestamp2id[e], - ) - for v, b, e in rows - ], - ) - del lastversions[key][pkg] - if not lastversions[key]: - del lastversions[key] - # add entries for all suites that were present in the last timestamp - # but not anymore in this timestamp - for suite in lastarches.keys() - thisarches.keys(): - for arch in lastarches[suite]: - for compid, comp in enumerate(possible_areas): - if onlycomp is not None and onlycomp != comp: - continue - key = suite + " " + comp + " " + arch - if key not in lastversions: - continue - for pkg in lastversions[key]: - rows = [] - for ver in lastversions[key][pkg]: - rows.append((ver, lastversions[key][pkg][ver], lasttimestamp)) + for algo in newhashes.keys(): conn.executemany( - """insert into ranges(suite, comp, arch, pkg, ver, begin, end) - values(?, ?, ?, ?, ?, ?, ?)""", - [ - ( - suite2id[suite], - compid, - arch2id[arch], - pkg2id[pkg], - ver2id[v], - timestamp2id[b], - timestamp2id[e], - ) - for v, b, e in rows - ], + f"""insert into {algo}(name,arch,version,hash) + values(?,?,?,?)""", + newhashes[algo], ) - del lastversions[key] - # handle architectures that were present in the last timestamp but not - # anymore in this one - for suite in lastarches.keys() & thisarches.keys(): - for arch in lastarches[suite] - thisarches[suite]: - for compid, comp in enumerate(possible_areas): - if onlycomp is not None and onlycomp != comp: - continue - key = suite + " " + comp + " " + arch - if key not in lastversions: - continue - for pkg in lastversions[key]: - rows = [] - for ver in lastversions[key][pkg]: - rows.append((ver, lastversions[key][pkg][ver], 
lasttimestamp)) - conn.executemany( - """insert into ranges(suite, comp, arch, pkg, ver, begin, end) - values(?, ?, ?, ?, ?, ?, ?)""", - [ - ( - suite2id[suite], - compid, - arch2id[arch], - pkg2id[pkg], - ver2id[v], - timestamp2id[b], - timestamp2id[e], - ) - for v, b, e in rows - ], - ) - del lastversions[key] + + # for pkg in thisversions[key]: + # if key in lastversions: + # last_ver = lastversions[key].get(pkg) + # else: + # last_ver = None + # if timestamp == timestamps[-1]: + # rows = [] + # # special handling for last timestamp + # if last_ver is None: + # for ver in thisversions[key][pkg]: + # rows.append((ver, timestamp, timestamp)) + # else: + # if last_ver.keys() == thisversions[key][pkg]: + # for ver in last_ver: + # rows.append((ver, last_ver[ver], timestamp)) + # else: + # # there is a difference + # for ver in sorted( + # last_ver.keys() | thisversions[key][pkg] + # ): + # if ( + # ver in last_ver + # and ver in thisversions[key][pkg] + # ): + # # version is in both, the last and current timestamp + # rows.append((ver, last_ver[ver], timestamp)) + # elif ( + # ver in last_ver + # and ver not in thisversions[key][pkg] + # ): + # # version is in last but not current timestamp + # rows.append((ver, last_ver[ver], lasttimestamp)) + # elif ( + # ver not in last_ver + # and ver in thisversions[key][pkg] + # ): + # # version is new in this timestamp + # rows.append((ver, timestamp, timestamp)) + # else: + # raise Exception("logic error") + # conn.executemany( + # """insert into ranges(suite, comp, arch, pkg, ver, begin, end) + # values(?, ?, ?, ?, ?, ?, ?)""", + # [ + # ( + # suite2id[suite], + # compid, + # arch2id[arch], + # pkg2id[pkg], + # ver2id[v], + # timestamp2id[b], + # timestamp2id[e], + # ) + # for v, b, e in rows + # ], + # ) + # if last_ver is not None: + # del lastversions[key][pkg] + # if not lastversions[key]: + # del lastversions[key] + # continue + # if last_ver is None: + # # package didn't exist before, start new entry + # if key not in lastversions: + # lastversions[key] = dict() + # lastversions[key][pkg] = { + # v: timestamp for v in thisversions[key][pkg] + # } + # continue + # if last_ver.keys() == thisversions[key][pkg]: + # # same versions in this Packages file as in the last + # continue + # # there is a difference -- add a new line to the output for + # # each version that was removed and add a new entry in the + # # dict for each version that was added + # rows = [] + # for ver in last_ver.keys() - thisversions[key][pkg]: + # # for all removed versions, add a new line + # rows.append((ver, last_ver[ver], lasttimestamp)) + # # and delete the old entry + # del lastversions[key][pkg][ver] + # if not lastversions[key][pkg]: + # del lastversions[key][pkg] + # if not lastversions[key]: + # del lastversions[key] + # conn.executemany( + # """insert into ranges(suite, comp, arch, pkg, ver, begin, end) + # values(?, ?, ?, ?, ?, ?, ?)""", + # [ + # ( + # suite2id[suite], + # compid, + # arch2id[arch], + # pkg2id[pkg], + # ver2id[v], + # timestamp2id[b], + # timestamp2id[e], + # ) + # for v, b, e in rows + # ], + # ) + # for ver in thisversions[key][pkg] - last_ver.keys(): + # # for all added versions, add a new entry + # if key not in lastversions: + # lastversions[key] = dict() + # if pkg not in lastversions[key]: + # lastversions[key][pkg] = dict() + # lastversions[key][pkg][ver] = timestamp + + ## go through all packages that were handled last timestamp but + ## were missing this timestamp + # if key in lastversions: + # for pkg in lastversions[key].keys() - 
thisversions[key].keys(): + # rows = [] + # for ver in lastversions[key][pkg]: + # rows.append( + # (ver, lastversions[key][pkg][ver], lasttimestamp) + # ) + # conn.executemany( + # """insert into ranges(suite, comp, arch, pkg, ver, begin, end) + # values(?, ?, ?, ?, ?, ?, ?)""", + # [ + # ( + # suite2id[suite], + # compid, + # arch2id[arch], + # pkg2id[pkg], + # ver2id[v], + # timestamp2id[b], + # timestamp2id[e], + # ) + # for v, b, e in rows + # ], + # ) + # del lastversions[key][pkg] + # if not lastversions[key]: + # del lastversions[key] + ## add entries for all suites that were present in the last timestamp + ## but not anymore in this timestamp + # for suite in lastarches.keys() - thisarches.keys(): + # for arch in lastarches[suite]: + # for compid, comp in enumerate(possible_areas): + # if onlycomp is not None and onlycomp != comp: + # continue + # key = suite + " " + comp + " " + arch + # if key not in lastversions: + # continue + # for pkg in lastversions[key]: + # rows = [] + # for ver in lastversions[key][pkg]: + # rows.append((ver, lastversions[key][pkg][ver], lasttimestamp)) + # conn.executemany( + # """insert into ranges(suite, comp, arch, pkg, ver, begin, end) + # values(?, ?, ?, ?, ?, ?, ?)""", + # [ + # ( + # suite2id[suite], + # compid, + # arch2id[arch], + # pkg2id[pkg], + # ver2id[v], + # timestamp2id[b], + # timestamp2id[e], + # ) + # for v, b, e in rows + # ], + # ) + # del lastversions[key] + ## handle architectures that were present in the last timestamp but not + ## anymore in this one + # for suite in lastarches.keys() & thisarches.keys(): + # for arch in lastarches[suite] - thisarches[suite]: + # for compid, comp in enumerate(possible_areas): + # if onlycomp is not None and onlycomp != comp: + # continue + # key = suite + " " + comp + " " + arch + # if key not in lastversions: + # continue + # for pkg in lastversions[key]: + # rows = [] + # for ver in lastversions[key][pkg]: + # rows.append((ver, lastversions[key][pkg][ver], lasttimestamp)) + # conn.executemany( + # """insert into ranges(suite, comp, arch, pkg, ver, begin, end) + # values(?, ?, ?, ?, ?, ?, ?)""", + # [ + # ( + # suite2id[suite], + # compid, + # arch2id[arch], + # pkg2id[pkg], + # ver2id[v], + # timestamp2id[b], + # timestamp2id[e], + # ) + # for v, b, e in rows + # ], + # ) + # del lastversions[key] lastarches = thisarches + last_packages_hashes = this_packages_hashes lasttimestamp = timestamp total_hours = (time.time() - starttime) / (60 * 60) ts_per_h = (i + 1) / total_hours remaining = (len(timestamps) - i - 1) / ts_per_h - print("per hour: %f remaining: %f hours" % (ts_per_h, remaining)) + print("per hour all: %f remaining all: %f hours" % (ts_per_h, remaining)) + ts_per_h_current = (60 * 60 )/(time.time() - ts_start_time) + remaining_current = (len(timestamps) - i - 1) / ts_per_h_current + print("per hour curr: %f remaining curr: %f hours" % (ts_per_h_current, remaining_current)) print("committing changes to the database...")
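As a side note on the run2.py hunks above: the patch avoids re-parsing Packages files whose content is unchanged between consecutive timestamps by remembering a blake2b digest of every file seen in the previous timestamp. A stripped-down sketch of that shortcut (the helper names here are illustrative, not taken from the patch):

```python
# Sketch of the skip-unchanged-files shortcut: keep the digests of all
# Packages files seen in the previous timestamp and only hand a file to the
# parser again if its content actually changed.
from hashlib import blake2b

def iter_changed(timestamps, read_packages_files):
    """read_packages_files(ts) yields (path, bytes) pairs for one timestamp."""
    last_digests = set()
    for ts in timestamps:
        this_digests = set()
        for path, data in read_packages_files(ts):
            digest = blake2b(data).digest()
            this_digests.add(digest)
            if digest in last_digests:
                continue  # identical to a file already parsed last time
            yield ts, path, data
        last_digests = this_digests
```

In the actual patch a digest is discarded again whenever a collision is printed, so that the same file is re-examined in the following timestamp and the collision keeps being reported.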