Control: tags -1 patch

Hi,
I have rewritten Area.{C,h} to ensure proper UTF-8 support, which would close this bug and the related ones (#545695, #732702). html2text should now always generate valid UTF-8 when running in UTF-8 mode. I have prepared an NMU which fixes these bugs and some other minor problems. Please see the attached debdiff.

Best regards,
Dmitry Borisyuk
diff -Nru html2text-1.3.2a/debian/changelog html2text-1.3.2a/debian/changelog --- html2text-1.3.2a/debian/changelog 2014-09-07 21:08:35.000000000 +0300 +++ html2text-1.3.2a/debian/changelog 2014-12-13 09:33:08.000000000 +0200 @@ -1,3 +1,12 @@ +html2text (1.3.2a-18.1) experimental; urgency=low + + * Non-maintainer upload. + * Full UTF-8 support (rewritten Area.C, closes: #732702, #692861, #545695). + * Catch ENOMEM in {m,re}alloc_array in Area.C. + * Fix parsing of input args (Closes: #716064). + + -- Dmitry Borisyuk <q1we...@i.com.ua> Sat, 13 Dec 2014 09:31:59 +0200 + html2text (1.3.2a-18) unstable; urgency=medium * Complete the utf8 table. (Closes: #760588) - thanks to Marcos Marado for diff -Nru html2text-1.3.2a/debian/patches/catch-ENOMEM.patch html2text-1.3.2a/debian/patches/catch-ENOMEM.patch --- html2text-1.3.2a/debian/patches/catch-ENOMEM.patch 1970-01-01 03:00:00.000000000 +0300 +++ html2text-1.3.2a/debian/patches/catch-ENOMEM.patch 2014-12-13 09:40:56.000000000 +0200 @@ -0,0 +1,30 @@ +Description: Catch ENOMEM in {m,re}alloc_array (prevent segfaults). +Author: Dmitry Borisyuk <q1we...@i.com.ua> +--- a/Area.C ++++ b/Area.C +@@ -30,7 +30,7 @@ + + /***************************************************************************/ + +- ++#include <stdio.h> + #include <stdlib.h> + #include <string.h> + #include <iostream> +@@ -45,10 +45,14 @@ + + /* ------------------------------------------------------------------------- */ + ++static void* alloc_error() { ++ perror("html2text: error"); abort(); ++} ++ + #define malloc_array(type, size)\ +-((type *) malloc(sizeof(type) * (size))) ++((type *) (malloc(sizeof(type) * (size)) ? : alloc_error())) + #define realloc_array(array, type, size) \ +-((array) = (type *) realloc((array), sizeof(type) * (size))) ++((array) = (type *) (realloc((array), sizeof(type) * (size)) ? 
: alloc_error())) + #define copy_array(from, to, type, count) \ + ((void) memcpy((to), (from), (count) * sizeof(type))) + diff -Nru html2text-1.3.2a/debian/patches/fix-arg-parsing.patch html2text-1.3.2a/debian/patches/fix-arg-parsing.patch --- html2text-1.3.2a/debian/patches/fix-arg-parsing.patch 1970-01-01 03:00:00.000000000 +0300 +++ html2text-1.3.2a/debian/patches/fix-arg-parsing.patch 2014-12-13 09:39:35.000000000 +0200 @@ -0,0 +1,20 @@ +Description: Fix parsing of input args (prevent segfault). +Author: Dmitry Borisyuk <q1we...@i.com.ua> +Bug-Debian: https://bugs.debian.org/716064 +--- a/html2text.C ++++ b/html2text.C +@@ -295,10 +295,10 @@ + if (!strcmp(arg, "-check" )) { mode = MyParser::SYNTAX_CHECK; } else + if (!strcmp(arg, "-debug-scanner")) { debug_scanner = true; } else + if (!strcmp(arg, "-debug-parser" )) { debug_parser = true; } else +- if (!strcmp(arg, "-rcfile" )) { rcfile = argv[++i]; } else +- if (!strcmp(arg, "-style" )) { style = argv[++i]; } else +- if (!strcmp(arg, "-width" )) { if (atoi(argv[++i]) > 0) width = atoi(argv[i]); } else +- if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else ++ if (!strcmp(arg, "-rcfile" )) { if (++i < argc) rcfile = argv[i]; } else ++ if (!strcmp(arg, "-style" )) { if (++i < argc) style = argv[i]; } else ++ if (!strcmp(arg, "-width" )) { if (++i < argc) if (atoi(argv[i]) > 0) width = atoi(argv[i]); } else ++ if (!strcmp(arg, "-o" )) { if (++i < argc) output_file_name = argv[i]; } else + if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else + if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else + if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else diff -Nru html2text-1.3.2a/debian/patches/fix-utf8-support.patch html2text-1.3.2a/debian/patches/fix-utf8-support.patch --- html2text-1.3.2a/debian/patches/fix-utf8-support.patch 1970-01-01 03:00:00.000000000 +0300 +++ html2text-1.3.2a/debian/patches/fix-utf8-support.patch 2014-12-13 09:44:01.000000000 +0200 @@ -0,0 +1,453 @@ 
+Description: Full proper UTF-8 support. + Rewritten Area.{C,h}, introduced WideChar class which represents utf8 character. +Author: Dmitry Borisyuk <q1we...@i.com.ua> +Bug-Debian: https://bugs.debian.org/732702 +Bug-Debian: https://bugs.debian.org/692861 +Bug-Debian: https://bugs.debian.org/545695 +--- a/Area.C ++++ b/Area.C +@@ -43,6 +43,40 @@ + + extern int use_encoding; + ++int utf8_is_continuation(char ch) { ++ return (ch & 0xc0) == 0x80; ++} ++ ++// copy single (multibyte) character from s to dest, return ptr to the next one ++const char* ++utf_next(const char* s, char* dest) ++{ ++ if (!(s && *s)) return NULL; ++ const char *r = s+1; ++ if (USE_UTF8) { ++ while (utf8_is_continuation(*r)) r++; ++ }; ++ if (dest) { ++ size_t n = r - s; ++ if (n > MAX_UTF_LEN) n = MAX_UTF_LEN; ++ memcpy(dest, s, n); ++ dest[n] = '\0'; ++ }; ++ return r; ++} ++ ++size_t ++utf_length(const char* s) ++{ ++ size_t n = 0; ++ if (USE_UTF8) { ++ while (*s) ++ if (!utf8_is_continuation(*(s++))) n++; ++ return n; ++ } ++ else return strlen(s); ++} ++ + /* ------------------------------------------------------------------------- */ + + static void* alloc_error() { +@@ -58,27 +92,19 @@ + + /* ------------------------------------------------------------------------- */ + +-Line::Line(size_type l) : length_(l), cells_(malloc_array(Cell, l)) ++/*Line::Line(size_type l) : length_(l), cells_(malloc_array(Cell, l)) + { + Cell *p, *end = cells_ + l; + for (p = cells_; p != end; p++) p->clear(); +-} +- +-Line::Line(const char *p) : +- length_(strlen(p)), +- cells_(malloc_array(Cell, length_)) +-{ +- Cell *q = cells_, *end = q + length_; +- while (q != end) { q->character = *p++; q->attribute = Cell::NONE; q++; } +-} ++}*/ + + Line::Line(const string &s) : +- length_(s.length()), ++ length_(utf_length(s.c_str())), + cells_(malloc_array(Cell, length_)) + { + const char *p = s.c_str(); + Cell *q = cells_, *end = q + length_; +- while (q != end) { q->character = *p++; q->attribute = Cell::NONE; q++; 
} ++ while (q != end) { p = utf_next(p, q->character); q->attribute = Cell::NONE; q++; } + } + + Line::~Line() +@@ -86,61 +112,12 @@ + free(cells_); + } + +-/* ------------------------------------------------------------------------- */ +- +-/* utf_length() and utf_width() +- * +- * Very simplified algorithm of calculating length of UTF-8 +- * string. No check for errors. Counting only ASCII bytes and +- * leading bytes of UTF-8 multibyte sequences. All bytes like +- * 10xxxxxx are dropped. If USE_UTF8 is false then returns +- * usual length. --YS +- */ +- +-size_t utf8_aux_count(char ch) +-{ +- if((ch & 0xe0) == 0xc0) +- { +- return 1; +- } +- else if((ch & 0xf0) == 0xe0) +- { +- return 2; +- } +- else if ((ch & 0xf8) == 0xf0) +- { +- return 3; +- } +- else +- { +- return 0; +- } +-} +- +-unsigned int +-Line::utf_length(size_type f, size_type t) const +-{ +- size_type m = (t < length_ ? t : length_); +- size_type r = m - f; +- if(USE_UTF8) +- { +- for (int i = f; i < m; i++) +- { +- char& ch = cells_[i].character; +- size_type aux_count = utf8_aux_count(ch); +- r -= aux_count; +- i += aux_count; +- } +- } +- return r; +-} +- + void + Line::resize(size_type l) + { +- if (l == length()) return; ++ if (l == length_) return; + realloc_array(cells_, Cell, l); +- for (size_type x = length(); x < l; x++) cells_[x].clear(); ++ for (size_type x = length_; x < l; x++) cells_[x].clear(); + length_ = l; + } + +@@ -154,17 +131,9 @@ + } + + void +-Line::insert(const char *p, size_type x) +-{ +- enlarge(x + strlen(p)); +- Cell *q = cells_ + x; +- while (*p) q++->character = *p++; +-} +- +-void + Line::insert(const string &s, size_type x) + { +- insert(s.c_str(), x); ++ insert(Line(s), x); + } + + void +@@ -187,19 +156,15 @@ + } + + void +-Line::append(const char *p) ++Line::append(const string &s) + { +- size_type x = length_; +- enlarge(x + strlen(p)); +- Cell *q = cells_ + x; +- for (; *p; ++p, ++q) { q->character = *p; q->attribute = Cell::NONE; } ++ append(Line(s)); + } + + 
void + Line::add_attribute(char addition) + { + Cell *p = cells_, *end = cells_ + length_; +- + while(p != end) p++->attribute |= addition; + } + +@@ -232,30 +197,6 @@ + } + } + +-Area::Area(const char *p) : +- width_(strlen(p)), +- height_(1), +- cells_(malloc_array(Cell *, 1)) +-{ +- cells_[0] = malloc_array(Cell, width_); +- Cell *q = cells_[0], *end = q + width_; +- while (q != end) { q->character = *p++; q->attribute = Cell::NONE; q++; } +-} +- +-Area::Area(const string &s) : +- width_(s.length()), +- height_(1), +- cells_(malloc_array(Cell *, 1)) +-{ +- cells_[0] = malloc_array(Cell, width_); +- Cell *q = cells_[0]; +- for (string::size_type i = 0; i < s.length(); ++i) { +- q->character = s[i]; +- q->attribute = Cell::NONE; +- q++; +- } +-} +- + Area::Area(const Line &l) : + width_(l.length_), + height_(1), +@@ -281,37 +222,12 @@ + for (size_type y = 0; y < height_; y++) { + Cell *c = cells_[y]; + memmove(c + rs, c, (width_ - rs) * sizeof(Cell)); +- for (size_type x = 0; x < rs; x++) { +- c[x].character = ' '; +- c[x].attribute = Cell::NONE; +- } ++ for (size_type x = 0; x < rs; x++) c[x].clear(); + } + } + return *this; + } + +-unsigned int +-Area::utf_width() +-{ +- size_type r = width_; +- if(USE_UTF8) { r = 0; +- for (size_type yy = 0; yy < height_; yy++) { +- int i = width_ - 1; +- while((i >= 0) && isspace(cells_[yy][i].character)) +- { +- --i; +- } +- size_type aux_count_sum = 0; +- for (; i >= 0; i--) { +- aux_count_sum += utf8_aux_count(cells_[yy][i].character); +- } +- size_type r1 = width_ - aux_count_sum; +- if(r < r1) r = r1; +- } +- } +- return r; +-} +- + void + Area::resize(size_type w, size_type h) + { +@@ -416,25 +332,6 @@ + } + + void +-Area::insert(char c, size_type x, size_type y) +-{ +- enlarge(x + 1, y + 1); +- cells_[y][x].character = c; +-} +- +-void +-Area::insert(const string &s, size_type x, size_type y) +-{ +- enlarge(x + s.length(), y + 1); +- Cell *cell = &cells_[y][x]; +- for (string::size_type i = 0; i < s.length(); i++) { +- 
cell->character = s[i]; +- cell->attribute = Cell::NONE; +- cell++; +- } +-} +- +-void + Area::prepend(int n) + { + if (n <= 0) return; +@@ -512,7 +409,7 @@ + ) end--; + + for (const Cell *p = cell; p != end; p++) { +- char c = p->character; ++ WideChar c = p->character; + char a = p->attribute; + + if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' '; +@@ -540,7 +437,7 @@ + if ((a & Cell::BOLD ) && c != ' ') os << c << backspace; + os << c; + } else { +- os << (c == ' ' && (a & Cell::UNDERLINE) ? '_' : c); ++ os << (c == ' ' && (a & Cell::UNDERLINE) ? "_" : c); + } + } + } +--- a/Area.h ++++ b/Area.h +@@ -53,8 +53,21 @@ + + /* ------------------------------------------------------------------------- */ + ++#define MAX_UTF_LEN 4 ++ ++class WideChar { ++ char chars[MAX_UTF_LEN+1]; ++public: ++ WideChar() {}; ++ ~WideChar() {}; ++ void operator =(char c) { chars[0] = c; chars[1] = '\0'; } ++ bool operator ==(char c) const { return chars[0] == c; } ++ bool operator !=(char c) const { return chars[0] != c; } ++ operator char*() { return &chars[0]; } ++}; ++ + struct Cell { +- char character; ++ WideChar character; + char attribute; + + enum { NONE = 0, UNDERLINE = 1, BOLD = 2, STRIKETHROUGH = 4 }; +@@ -69,8 +82,7 @@ + public: + typedef size_t size_type; + +- Line(size_type l = 0); +- Line(const char *); ++// Line(size_type l = 0); + Line(const string &); + ~Line(); + +@@ -81,22 +93,18 @@ + Cell &operator[](size_type x) { return cells_[x]; } + const Cell *cells() const { return cells_; } + +- unsigned int utf_length(size_type f, size_type t) const; +- + void resize(size_type l); + void enlarge(size_type l) { if (l > length_) resize(l); } + + void insert(const Line &, size_type x); +- void insert(const char *, size_type x); + void insert(const string &, size_type x); + + void append(char c ); ++ void append(const string &); + void append(const Line &l); +- void append(const char *p); + + const Line &operator+=(char c ) { append(c); return *this; } + const Line 
&operator+=(const Line &l) { append(l); return *this; } +- const Line &operator+=(const char *p) { append(p); return *this; } + + void add_attribute(char addition); + +@@ -123,9 +131,7 @@ + }; + + Area(); +- Area(size_type w, size_type h = 0, char = ' ', char = Cell::NONE); +- Area(const char *); +- Area(const string &); ++ Area(size_type w, size_type h, char = ' ', char = Cell::NONE); + Area(const Line &); + ~Area(); + +@@ -136,8 +142,6 @@ + Cell *operator[](size_type y) { return cells_[y]; } + const Area &operator>>=(size_type rs); + +- unsigned int utf_width(); +- + void resize(size_type w, size_type h); + void enlarge(size_type w, size_type h); + +@@ -155,8 +159,6 @@ + ); + void insert(const Cell &, size_type x, size_type y); + void insert(const Cell *, size_type count, size_type x, size_type y); +- void insert(char, size_type x, size_type y); +- void insert(const string &, size_type x, size_type y); + void prepend(int n); // Prepend blank lines at top + void append(int n) // Append blank lines at bottom + { enlarge(width(), height() + n); } +--- a/format.C ++++ b/format.C +@@ -770,7 +770,7 @@ + if (!code.empty()) return new Area("[Java Applet " + code + ']'); + } + +- return new Area("[Java Applet]"); ++ return new Area(string("[Java Applet]")); + } + + Line * +@@ -1226,7 +1226,7 @@ + if (line[to].character == '\n') { + break; + } +- char c1 = line[to].character, c2 = line[to - 1].character; ++ WideChar c1 = line[to].character, c2 = line[to - 1].character; + if (c1 == ' ' || c1 == '('/*)*/ || c1 == '['/*]*/ || c1 == '{'/*}*/ || ( + ( + c2 == '-' || +@@ -1244,20 +1244,19 @@ + to++; + } + +- if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) ++ if (to - from > w && lbp != (Area::size_type) -1) + { to = lbp; break; } + } + +- to_from = line.utf_length(from,to); + /* + * Copy the "from...to" range from the "line" to the bottom of the "res" + * Area. 
+ */ + Area::size_type x = 0; + Area::size_type len = to - from; +- if (halign == Area::LEFT || to_from >= w) { ; } else +- if (halign == Area::CENTER) { x += (w - to_from) / 2; } else +- if (halign == Area::RIGHT) { x += w - to_from; } ++ if (halign == Area::LEFT || len >= w) { ; } else ++ if (halign == Area::CENTER) { x += (w - len) / 2; } else ++ if (halign == Area::RIGHT) { x += w - len; } + res->insert(line.cells() + from, len, x, res->height()); + + /* +--- a/table.C ++++ b/table.C +@@ -175,7 +175,7 @@ + - (*number_of_columns_return - 1) * (column_spacing + 0), + Area::LEFT // Yields better results than "p->halign"! + )); +- p->width = tmp.get() ? tmp->utf_width() : 0; ++ p->width = tmp.get() ? tmp->width() : 0; + } + p->minimized = false; + +@@ -308,7 +308,7 @@ + left_of_column + old_column_width - 1, + Area::LEFT // Yields better results than "lc.halign"! + )); +- w = tmp->utf_width(); ++ w = tmp->width(); + if (w >= left_of_column + old_column_width) lc.minimized = true; + } + if (w > left_of_column + new_column_width) { diff -Nru html2text-1.3.2a/debian/patches/series html2text-1.3.2a/debian/patches/series --- html2text-1.3.2a/debian/patches/series 2014-09-07 21:00:27.000000000 +0300 +++ html2text-1.3.2a/debian/patches/series 2014-12-13 09:23:53.000000000 +0200 @@ -12,3 +12,6 @@ 810-fix-deprecated-conversion-warnings.patch 900-complete-utf8-entities-table.patch 950-validate-width-parameter.patch +fix-arg-parsing.patch +catch-ENOMEM.patch +fix-utf8-support.patch