Control: tags -1 patch

Hi,

I have rewritten Area.{C,h} to ensure proper UTF-8 support,
which would close this bug and the related ones (#545695, #732702).
html2text should now always generate valid UTF-8 in UTF-8 mode.

I've prepared an NMU which fixes these bugs and some other minor problems.
Please see the attached debdiff.


Best regards,
Dmitry Borisyuk

diff -Nru html2text-1.3.2a/debian/changelog html2text-1.3.2a/debian/changelog
--- html2text-1.3.2a/debian/changelog	2014-09-07 21:08:35.000000000 +0300
+++ html2text-1.3.2a/debian/changelog	2014-12-13 09:33:08.000000000 +0200
@@ -1,3 +1,12 @@
+html2text (1.3.2a-18.1) experimental; urgency=low
+
+  * Non-maintainer upload.
+  * Full UTF-8 support (rewritten Area.C, closes: #732702, #692861, #545695).
+  * Catch ENOMEM in {m,re}alloc_array in Area.C.
+  * Fix parsing of input args (Closes: #716064).
+
+ -- Dmitry Borisyuk <q1we...@i.com.ua>  Sat, 13 Dec 2014 09:31:59 +0200
+
 html2text (1.3.2a-18) unstable; urgency=medium
 
   * Complete the utf8 table. (Closes: #760588) - thanks to Marcos Marado for
diff -Nru html2text-1.3.2a/debian/patches/catch-ENOMEM.patch html2text-1.3.2a/debian/patches/catch-ENOMEM.patch
--- html2text-1.3.2a/debian/patches/catch-ENOMEM.patch	1970-01-01 03:00:00.000000000 +0300
+++ html2text-1.3.2a/debian/patches/catch-ENOMEM.patch	2014-12-13 09:40:56.000000000 +0200
@@ -0,0 +1,30 @@
+Description: Catch ENOMEM in {m,re}alloc_array (prevent segfaults).
+Author: Dmitry Borisyuk <q1we...@i.com.ua>
+--- a/Area.C
++++ b/Area.C
+@@ -30,7 +30,7 @@
+ 
+  /***************************************************************************/
+ 
+-
++#include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <iostream>
+@@ -45,10 +45,14 @@
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++static void* alloc_error() {
++  perror("html2text: error"); abort();
++}
++
+ #define malloc_array(type, size)\
+-((type *) malloc(sizeof(type) * (size)))
++((type *) (malloc(sizeof(type) * (size)) ? : alloc_error()))
+ #define realloc_array(array, type, size) \
+-((array) = (type *) realloc((array), sizeof(type) * (size)))
++((array) = (type *) (realloc((array), sizeof(type) * (size)) ? : alloc_error()))
+ #define copy_array(from, to, type, count) \
+ ((void) memcpy((to), (from), (count) * sizeof(type)))
+ 
diff -Nru html2text-1.3.2a/debian/patches/fix-arg-parsing.patch html2text-1.3.2a/debian/patches/fix-arg-parsing.patch
--- html2text-1.3.2a/debian/patches/fix-arg-parsing.patch	1970-01-01 03:00:00.000000000 +0300
+++ html2text-1.3.2a/debian/patches/fix-arg-parsing.patch	2014-12-13 09:39:35.000000000 +0200
@@ -0,0 +1,20 @@
+Description: Fix parsing of input args (prevent segfault).
+Author: Dmitry Borisyuk <q1we...@i.com.ua>
+Bug-Debian: https://bugs.debian.org/716064
+--- a/html2text.C
++++ b/html2text.C
+@@ -295,10 +295,10 @@
+     if (!strcmp(arg, "-check"        )) { mode = MyParser::SYNTAX_CHECK;                  } else
+     if (!strcmp(arg, "-debug-scanner")) { debug_scanner = true;                           } else
+     if (!strcmp(arg, "-debug-parser" )) { debug_parser = true;                            } else
+-    if (!strcmp(arg, "-rcfile"       )) { rcfile = argv[++i];                             } else
+-    if (!strcmp(arg, "-style"        )) { style = argv[++i];                              } else
+-    if (!strcmp(arg, "-width"        )) { if (atoi(argv[++i]) > 0) width = atoi(argv[i]); } else
+-    if (!strcmp(arg, "-o"            )) { output_file_name = argv[++i];                   } else
++    if (!strcmp(arg, "-rcfile"       )) { if (++i < argc) rcfile = argv[i];               } else
++    if (!strcmp(arg, "-style"        )) { if (++i < argc) style = argv[i];                } else
++    if (!strcmp(arg, "-width"        )) { if (++i < argc) if (atoi(argv[i]) > 0) width = atoi(argv[i]); } else
++    if (!strcmp(arg, "-o"            )) { if (++i < argc) output_file_name = argv[i];     } else
+     if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;                         } else
+     if (!strcmp(arg, "-ascii"        )) { use_encoding = ASCII;                           } else
+     if (!strcmp(arg, "-utf8"         )) { use_encoding = UTF8;                            } else
diff -Nru html2text-1.3.2a/debian/patches/fix-utf8-support.patch html2text-1.3.2a/debian/patches/fix-utf8-support.patch
--- html2text-1.3.2a/debian/patches/fix-utf8-support.patch	1970-01-01 03:00:00.000000000 +0300
+++ html2text-1.3.2a/debian/patches/fix-utf8-support.patch	2014-12-13 09:44:01.000000000 +0200
@@ -0,0 +1,453 @@
+Description: Full proper UTF-8 support.
+ Rewritten Area.{C,h}, introduced WideChar class which represents utf8 character.
+Author: Dmitry Borisyuk <q1we...@i.com.ua>
+Bug-Debian: https://bugs.debian.org/732702
+Bug-Debian: https://bugs.debian.org/692861
+Bug-Debian: https://bugs.debian.org/545695
+--- a/Area.C
++++ b/Area.C
+@@ -43,6 +43,40 @@
+ 
+ extern int use_encoding;
+ 
++int utf8_is_continuation(char ch) {
++  return (ch & 0xc0) == 0x80;
++}
++
++// copy single (multibyte) character from s to dest, return ptr to the next one
++const char*
++utf_next(const char* s, char* dest)
++{
++  if (!(s && *s)) return NULL;
++  const char *r = s+1;
++  if (USE_UTF8) {
++    while (utf8_is_continuation(*r)) r++;
++  };
++  if (dest) {
++    size_t n = r - s;
++    if (n > MAX_UTF_LEN) n = MAX_UTF_LEN;
++    memcpy(dest, s, n);
++    dest[n] = '\0';
++  };
++  return r;
++}
++
++size_t
++utf_length(const char* s)
++{
++  size_t n = 0;
++  if (USE_UTF8) {
++    while (*s)
++      if (!utf8_is_continuation(*(s++))) n++;
++    return n;
++  }
++  else return strlen(s);
++}
++
+ /* ------------------------------------------------------------------------- */
+ 
+ static void* alloc_error() {
+@@ -58,27 +92,19 @@
+ 
+ /* ------------------------------------------------------------------------- */
+ 
+-Line::Line(size_type l) : length_(l), cells_(malloc_array(Cell, l))
++/*Line::Line(size_type l) : length_(l), cells_(malloc_array(Cell, l))
+ {
+   Cell *p, *end = cells_ + l;
+   for (p = cells_; p != end; p++) p->clear();
+-}
+-
+-Line::Line(const char *p) :
+-  length_(strlen(p)),
+-  cells_(malloc_array(Cell, length_))
+-{
+-  Cell *q = cells_, *end = q + length_;
+-  while (q != end) { q->character = *p++; q->attribute = Cell::NONE; q++; }
+-}
++}*/
+ 
+ Line::Line(const string &s) :
+-  length_(s.length()),
++  length_(utf_length(s.c_str())),
+   cells_(malloc_array(Cell, length_))
+ {
+   const char *p = s.c_str();
+   Cell *q = cells_, *end = q + length_;
+-  while (q != end) { q->character = *p++; q->attribute = Cell::NONE; q++; }
++  while (q != end) { p = utf_next(p, q->character); q->attribute = Cell::NONE; q++; }
+ }
+ 
+ Line::~Line()
+@@ -86,61 +112,12 @@
+   free(cells_);
+ }
+ 
+-/* ------------------------------------------------------------------------- */
+-
+-/*           utf_length() and utf_width()       
+- *
+- *     Very simplified algorithm of calculating length of UTF-8
+- *   string. No check for errors. Counting only ASCII bytes and
+- *   leading bytes of UTF-8 multibyte sequences. All bytes like
+- *   10xxxxxx are dropped. If USE_UTF8 is false then returns
+- *   usual length.               --YS
+- */
+-
+-size_t utf8_aux_count(char ch)
+-{
+-	if((ch & 0xe0) == 0xc0)
+-	{
+-		return 1;
+-	}
+-	else if((ch & 0xf0) == 0xe0)
+-	{
+-		return 2;
+-	}
+-	else if ((ch & 0xf8) == 0xf0)
+-	{
+-		return 3;
+-	}
+-	else
+-	{
+-		return 0;
+-	}
+-}
+-
+-unsigned int
+-Line::utf_length(size_type f, size_type t) const
+-{
+-	size_type m = (t < length_ ? t : length_);
+-	size_type r = m - f;
+-	if(USE_UTF8)
+-	{
+-		for (int i = f; i < m; i++)
+-		{
+-			char& ch = cells_[i].character;
+-			size_type aux_count = utf8_aux_count(ch);
+-			r -= aux_count;
+-			i += aux_count;
+-		}
+-	}
+-	return r;
+-}
+-
+ void
+ Line::resize(size_type l)
+ {
+-  if (l == length()) return;
++  if (l == length_) return;
+   realloc_array(cells_, Cell, l);
+-  for (size_type x = length(); x < l; x++) cells_[x].clear();
++  for (size_type x = length_; x < l; x++) cells_[x].clear();
+   length_ = l;
+ }
+ 
+@@ -154,17 +131,9 @@
+ }
+ 
+ void
+-Line::insert(const char *p, size_type x)
+-{
+-  enlarge(x + strlen(p));
+-  Cell *q = cells_ + x;
+-  while (*p) q++->character = *p++;
+-}
+-
+-void
+ Line::insert(const string &s, size_type x)
+ {
+-  insert(s.c_str(), x);
++  insert(Line(s), x);
+ }
+ 
+ void
+@@ -187,19 +156,15 @@
+ }
+ 
+ void
+-Line::append(const char *p)
++Line::append(const string &s)
+ {
+-  size_type x = length_;
+-  enlarge(x + strlen(p));
+-  Cell *q = cells_ + x;
+-  for (; *p; ++p, ++q) { q->character = *p; q->attribute = Cell::NONE; }
++  append(Line(s));
+ }
+ 
+ void
+ Line::add_attribute(char addition)
+ {
+   Cell *p = cells_, *end = cells_ + length_;
+-
+   while(p != end) p++->attribute |= addition;
+ }
+ 
+@@ -232,30 +197,6 @@
+   }
+ }
+ 
+-Area::Area(const char *p) :
+-  width_(strlen(p)),
+-  height_(1),
+-  cells_(malloc_array(Cell *, 1))
+-{
+-  cells_[0] = malloc_array(Cell, width_);
+-  Cell *q = cells_[0], *end = q + width_;
+-  while (q != end) { q->character = *p++; q->attribute = Cell::NONE; q++; }
+-}
+-
+-Area::Area(const string &s) :
+-  width_(s.length()),
+-  height_(1),
+-  cells_(malloc_array(Cell *, 1))
+-{
+-  cells_[0] = malloc_array(Cell, width_);
+-  Cell *q = cells_[0];
+-  for (string::size_type i = 0; i < s.length(); ++i) {
+-    q->character = s[i];
+-    q->attribute = Cell::NONE;
+-    q++;
+-  }
+-}
+-
+ Area::Area(const Line &l) :
+   width_(l.length_),
+   height_(1),
+@@ -281,37 +222,12 @@
+     for (size_type y = 0; y < height_; y++) {
+       Cell *c = cells_[y];
+       memmove(c + rs, c, (width_ - rs) * sizeof(Cell));
+-      for (size_type x = 0; x < rs; x++) {
+-        c[x].character = ' ';
+-        c[x].attribute = Cell::NONE;
+-      }
++      for (size_type x = 0; x < rs; x++) c[x].clear();
+     }
+   }
+   return *this;
+ }
+ 
+-unsigned int
+-Area::utf_width()
+-{
+-  size_type r = width_;
+-  if(USE_UTF8) { r = 0;
+-    for (size_type yy = 0; yy < height_; yy++) {
+-	  int i = width_ - 1;
+-      while((i >= 0) && isspace(cells_[yy][i].character))
+-	  {
+-		  --i;
+-	  }
+-      size_type aux_count_sum = 0;
+-      for (; i >= 0; i--) {
+-		aux_count_sum += utf8_aux_count(cells_[yy][i].character);
+-      }
+-	  size_type r1 = width_ - aux_count_sum;
+-      if(r < r1) r = r1;
+-    }
+-  }
+-  return r;
+-}
+-
+ void
+ Area::resize(size_type w, size_type h)
+ {
+@@ -416,25 +332,6 @@
+ }
+ 
+ void
+-Area::insert(char c, size_type x, size_type y)
+-{
+-  enlarge(x + 1, y + 1);
+-  cells_[y][x].character = c;
+-}
+-
+-void
+-Area::insert(const string &s, size_type x, size_type y)
+-{
+-  enlarge(x + s.length(), y + 1);
+-  Cell *cell = &cells_[y][x];
+-  for (string::size_type i = 0; i < s.length(); i++) {
+-    cell->character = s[i];
+-    cell->attribute = Cell::NONE;
+-    cell++;
+-  }
+-}
+-
+-void
+ Area::prepend(int n)
+ {
+   if (n <= 0) return;
+@@ -512,7 +409,7 @@
+     ) end--;
+ 
+     for (const Cell *p = cell; p != end; p++) {
+-      char c = p->character;
++      WideChar c = p->character;
+       char a = p->attribute;
+ 
+       if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';
+@@ -540,7 +437,7 @@
+           if ((a & Cell::BOLD     ) && c != ' ') os << c   << backspace;
+           os << c;
+         } else {
+-	  os << (c == ' ' && (a & Cell::UNDERLINE) ? '_' : c);
++	  os << (c == ' ' && (a & Cell::UNDERLINE) ? "_" : c);
+ 	}
+       }
+     }
+--- a/Area.h
++++ b/Area.h
+@@ -53,8 +53,21 @@
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++#define MAX_UTF_LEN 4
++
++class WideChar {
++  char chars[MAX_UTF_LEN+1];
++public:
++  WideChar() {};
++  ~WideChar() {};
++  void operator =(char c) { chars[0] = c; chars[1] = '\0'; }
++  bool operator ==(char c) const { return chars[0] == c; }
++  bool operator !=(char c) const { return chars[0] != c; }
++  operator char*() { return &chars[0]; }
++};
++
+ struct Cell {
+-  char character;
++  WideChar character;
+   char attribute;
+ 
+   enum { NONE = 0, UNDERLINE = 1, BOLD = 2, STRIKETHROUGH = 4 };
+@@ -69,8 +82,7 @@
+ public:
+   typedef size_t size_type;
+ 
+-  Line(size_type l = 0);
+-  Line(const char *);
++//  Line(size_type l = 0);
+   Line(const string &);
+   ~Line();
+ 
+@@ -81,22 +93,18 @@
+   Cell       &operator[](size_type x)       { return cells_[x]; }
+   const Cell *cells() const { return cells_; }
+ 
+-  unsigned int utf_length(size_type f, size_type t) const;
+-
+   void resize(size_type l);
+   void enlarge(size_type l) { if (l > length_) resize(l); }
+ 
+   void insert(const Line &, size_type x);
+-  void insert(const char *, size_type x);
+   void insert(const string &, size_type x);
+ 
+   void append(char       c );
++  void append(const string &);
+   void append(const Line &l);
+-  void append(const char *p);
+ 
+   const Line &operator+=(char       c ) { append(c); return *this; }
+   const Line &operator+=(const Line &l) { append(l); return *this; }
+-  const Line &operator+=(const char *p) { append(p); return *this; }
+ 
+   void add_attribute(char addition);
+ 
+@@ -123,9 +131,7 @@
+   };
+ 
+   Area();
+-  Area(size_type w, size_type h = 0, char = ' ', char = Cell::NONE);
+-  Area(const char *);
+-  Area(const string &);
++  Area(size_type w, size_type h, char = ' ', char = Cell::NONE);
+   Area(const Line &);
+   ~Area();
+ 
+@@ -136,8 +142,6 @@
+   Cell       *operator[](size_type y)       { return cells_[y]; }
+   const Area &operator>>=(size_type rs);
+ 
+-  unsigned int utf_width();
+-
+   void resize(size_type w, size_type h);
+   void enlarge(size_type w, size_type h);
+ 
+@@ -155,8 +159,6 @@
+   );
+   void insert(const Cell &, size_type x, size_type y);
+   void insert(const Cell *, size_type count, size_type x, size_type y);
+-  void insert(char, size_type x, size_type y);
+-  void insert(const string &, size_type x, size_type y);
+   void prepend(int n);    // Prepend blank lines at top
+   void append(int n)      // Append blank lines at bottom
+                 { enlarge(width(), height() + n); }
+--- a/format.C
++++ b/format.C
+@@ -770,7 +770,7 @@
+     if (!code.empty()) return new Area("[Java Applet " + code + ']');
+   }
+ 
+-  return new Area("[Java Applet]");
++  return new Area(string("[Java Applet]"));
+ }
+ 
+ Line *
+@@ -1226,7 +1226,7 @@
+       if (line[to].character == '\n') {
+         break;
+       }
+-      char c1 = line[to].character, c2 = line[to - 1].character;
++      WideChar c1 = line[to].character, c2 = line[to - 1].character;
+       if (c1 == ' ' || c1 == '('/*)*/ || c1 == '['/*]*/ || c1 == '{'/*}*/ || (
+         (
+           c2 == '-' ||
+@@ -1244,20 +1244,19 @@
+         to++;
+       }
+ 
+-      if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) 
++      if (to - from > w && lbp != (Area::size_type) -1)
+                     { to = lbp; break; }
+     }
+ 
+-    to_from = line.utf_length(from,to);
+     /*
+      * Copy the "from...to" range from the "line" to the bottom of the "res"
+      * Area.
+      */
+     Area::size_type x = 0;
+     Area::size_type len = to - from;
+-    if (halign == Area::LEFT || to_from >= w) { ;                   } else
+-    if (halign == Area::CENTER)           { x += (w - to_from) / 2; } else
+-    if (halign == Area::RIGHT)            { x += w - to_from;       }
++    if (halign == Area::LEFT || len >= w) { ;                   } else
++    if (halign == Area::CENTER)           { x += (w - len) / 2; } else
++    if (halign == Area::RIGHT)            { x += w - len;       }
+     res->insert(line.cells() + from, len, x, res->height());
+ 
+     /*
+--- a/table.C
++++ b/table.C
+@@ -175,7 +175,7 @@
+           - (*number_of_columns_return - 1) * (column_spacing + 0),
+           Area::LEFT // Yields better results than "p->halign"!
+         ));
+-	p->width = tmp.get() ? tmp->utf_width() : 0;
++        p->width = tmp.get() ? tmp->width() : 0;
+       }
+       p->minimized = false;
+ 
+@@ -308,7 +308,7 @@
+ 	left_of_column + old_column_width - 1,
+ 	Area::LEFT // Yields better results than "lc.halign"!
+       ));
+-      w = tmp->utf_width();
++      w = tmp->width();
+       if (w >= left_of_column + old_column_width) lc.minimized = true;
+     }
+     if (w > left_of_column + new_column_width) {
diff -Nru html2text-1.3.2a/debian/patches/series html2text-1.3.2a/debian/patches/series
--- html2text-1.3.2a/debian/patches/series	2014-09-07 21:00:27.000000000 +0300
+++ html2text-1.3.2a/debian/patches/series	2014-12-13 09:23:53.000000000 +0200
@@ -12,3 +12,6 @@
 810-fix-deprecated-conversion-warnings.patch
 900-complete-utf8-entities-table.patch
 950-validate-width-parameter.patch
+fix-arg-parsing.patch
+catch-ENOMEM.patch
+fix-utf8-support.patch

Reply via email to