While we are at it, let's convert HTML character entity references (which each use 6-8 characters, and as many bytes, in the HTML file) to the actual characters (which UTF-8 encodes as 2-3 bytes each). Since all diffoscope output files are peppered with large numbers of these references, this could reduce the file sizes by at least a few percent. In the Python file I wrote the characters as escape sequences inside the string literals (e.g. '\xa0', '\u200b') rather than as the actual characters, because 1) the non-breaking space and the zero-width space would be very hard to distinguish from an ordinary space and from missing string content, respectively, and 2) it is impossible to be sure that every piece of software that is ever going to be used to view or edit the file would handle non-ASCII characters correctly.
--- presenters/html.py.orig 2015-12-16 19:42:25.000000000 +0200
+++ presenters/html.py 2015-12-17 15:10:53.654467937 +0200
@@ -290,9 +290,9 @@
             n = TABSIZE-(i%TABSIZE)
             if n == 0:
                 n = TABSIZE
-            t.write('<span class="diffponct">&raquo;</span>'+'&nbsp;'*(n-1))
+            t.write('<span class="diffponct">\xbb</span>'+'\xa0'*(n-1))
         elif c == " " and ponct == 1:
-            t.write('<span class="diffponct">&middot;</span>')
+            t.write('<span class="diffponct">\xb7</span>')
         elif c == "\n" and ponct == 1:
             t.write('<br/><span class="diffponct">\</span>')
         elif ord(c) < 32:
@@ -304,11 +304,11 @@
             i += 1
 
         if WORDBREAK.count(c) == 1:
-            t.write('&#8203;')
+            t.write('\u200b')
             i = 0
         if i > LINESIZE:
             i = 0
-            t.write("&#8203;")
+            t.write('\u200b')
 
     return t.getvalue()
 
@@ -353,7 +353,7 @@
             print_func(u'</td>')
         else:
             s1 = ""
-            print_func(u'<td colspan="2">&nbsp;</td>')
+            print_func(u'<td colspan="2">\xa0</td>')
 
         if s2 is not None:
             print_func(u'<td class="diffline">%d </td>' % line2)
@@ -362,7 +362,7 @@
             print_func(u'</td>')
         else:
             s2 = ""
-            print_func(u'<td colspan="2">&nbsp;</td>')
+            print_func(u'<td colspan="2">\xa0</td>')
     finally:
         print_func(u"</tr>\n", force=True)
 
@@ -522,7 +522,7 @@
             print_func(u"<div><span class='source'>%s</span>" % escape(difference.source2))
         anchor = '/'.join(sources[1:])
-        print_func(u" <a class='anchor' href='#%s' name='%s'>&para;</a>" % (anchor, anchor))
+        print_func(u" <a class='anchor' href='#%s' name='%s'>\xb6</a>" % (anchor, anchor))
         print_func(u"</div>")
         if difference.comments:
             print_func(u"<div class='comment'>%s</div>"