tags 633704 patch
thanks
On Wed, 24 Aug 2011, Bill Poser wrote:

I'm working on this, but, curiously, on my system I don't get a segfault.
Instead, I get infinitely many copies of the error message.

It also manifests as an infinite loop for me. The issue is that sscanf("...%n",...) does not fully match the pattern, so the %n value is not set, and the value of the variable passed in is unchanged. I think in some circumstances it could be uninitialized memory (which would explain the segfaults), but has always started out as zero in my simple test cases.

Working around the lack of a %n value is fairly trivial, just setting a sentinel value and checking for it; however, the code attempts to explicitly check for such "partial matches", so this would be a feature regression (in some sense).

I attach my current patch (I think the snprintf return value checks may need to be >=; I didn't test that part). I worked on this bug at a bugsquashathon here at MIT this weekend, and kcr was going to help me get the change uploaded, but it is kind of a nasty patch to review ...

Annotations inline:
% diff -ruN uni2ascii-4.18.orig//ascii2uni.c uni2ascii-4.18/ascii2uni.c
% --- uni2ascii-4.18.orig//ascii2uni.c  2011-05-14 22:15:20.000000000 -0400
% +++ uni2ascii-4.18/ascii2uni.c        2011-08-23 20:07:29.000000000 -0400
% @@ -208,6 +208,7 @@
%    char aHfmt [8+2+1];
%    char aDfmt [8+2+1];
%    char cbuf[5];
% +  char fmt_itoa[12];

The scanf pattern %ul stores into an integral type, but we need to know the number of characters it occupies in the string. To do so, we'll need to printf it again, into this buffer. It's statically allocated because the value should fit in 32 bits (if I read the UTF32 type correctly), and the decimal representation therein will fit in this size, with NUL terminator.

%    FILE *infp;
% % UTF32 num;
% @@ -555,45 +556,64 @@
%         }
%         else if (FType == CHENT) {
%        if (AllHTMLP){
% +        NConsumed = -1;
%          if(sscanf(iptr,aHfmt,&num,&NConsumed) > 0) {
% -          if(*(iptr+NConsumed-1) != ';') {
% +          if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {

The sentinel value, and we check for a partial match in both ways.

%              MicrosoftStyle++;
% +            if (NConsumed == -1) {
% +              if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%x", num) > 
sizeof(fmt_itoa)-1) {
% +                fprintf(stderr, "UTF32 codepoint overflowed static 
buffer\n");
% +                exit(BADRECORD);
% +              }
% +              NConsumed = 3 /* "&#x" */ + strlen(fmt_itoa) + 1 /* ";" */;
% +            }

We need to know how many characters were used for the partial match; make sure to not overflow the static buffer.

%              fprintf(stderr,
%                      _("The HTML/HDML entity %1$s at token %2$lu of line %3$lu 
lacks the requisite final semicolon.\n"),
%                      
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
%              if(StrictP) {putchar(*iptr++); continue;}
% -            else {putu8(num);iptr+=NConsumed;}
% +            else {putu8(num);iptr+=NConsumed-1;}

The ';' was not matched, so the character in that position is really a part of something else.

%            }
%            else {putu8(num);iptr+=NConsumed;}
%            TokenNumber++;
%            continue;
%          }
% +        NConsumed = -1;
%          if(sscanf(iptr,aDfmt,&num,&NConsumed) > 0) {
% -          if(*(iptr+NConsumed-1) != ';') {
% +          if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
%              MicrosoftStyle++;
% +            if (NConsumed == -1) {
% +              if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%u", num) > 
sizeof(fmt_itoa)-1) {
% +                fprintf(stderr, "UTF32 codepoint overflowed static 
buffer\n");
% +                exit(BADRECORD);
% +              }
% +              NConsumed = 2 /* "&#" */ + strlen(fmt_itoa) + 1 /* ";" */;
% +            }

The same dance, but with a different-length prefix.

%              fprintf(stderr,
%                      _("The HTML/HDML entity %1$s at token %2$lu of line %3$lu 
lacks the requisite final semicolon.\n"),
%                      
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
%              if (StrictP) {putchar(*iptr++); continue;}
% -            else {putu8(num);iptr+=NConsumed;}
% +            else {putu8(num);iptr+=NConsumed-1;}
%            }
%            else {putu8(num);iptr+=NConsumed;}
%            TokenNumber++;
%            continue;
%          }
%        }
% +      NConsumed = -1;
%        if(sscanf(iptr,afmt,&enam,&NConsumed) > 0) {
% +        if (NConsumed == -1) NConsumed = 1 /* "&" */ + strlen(enam) + 1 /* 
";" */;

This case is slightly different, in that we accept any match but check if it is a valid entity. Update the number of characters consumed by the partial match early, to ease the check for MicrosoftStyle input below.

%          if( (num = LookupCodeForEntity(enam))) {
%            if(*(iptr+NConsumed-1) != ';') {
%              MicrosoftStyle++;
%              fprintf(stderr,_("The HTML/HDML entity %1$s at token %2$lu of line 
%3$lu lacks the requisite final 
semicolon.\n"),ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
%              if(StrictP) {putchar(*iptr++);continue;}
% -            else {putu8(num);iptr+=NConsumed;}
% +            else {putu8(num);iptr+=NConsumed-1;}
%            }
%            else {putu8(num);iptr+=NConsumed;}
%            TokenNumber++;
%          }
%          else {
% +          if(*(iptr+NConsumed-1) != ';') NConsumed--;

Again, don't eat the extra unmatched character.

-Ben Kaduk


%            fprintf(stderr,"ascii2uni: unknown HTML/HDML character entity \"&%s;\" 
at line %lu\n",
%                    enam,LineNo);
%            putu8(UNI_REPLACEMENT_CHAR);
% diff -ruN uni2ascii-4.18.orig//debian/changelog 
uni2ascii-4.18/debian/changelog
% --- uni2ascii-4.18.orig//debian/changelog     2011-05-17 01:30:08.000000000 
-0400
% +++ uni2ascii-4.18/debian/changelog   2011-08-23 19:24:25.000000000 -0400
% @@ -1,3 +1,11 @@
% +uni2ascii (4.18-1.1) unstable; urgency=low
% +
% +  * Non-maintainer upload.
% +  * Check for partially-matched sscanf() patterns and consume an appropriate
% +    number of characters (Closes: #633704)
% +
% + -- Benjamin Kaduk <ka...@mit.edu>  Tue, 23 Aug 2011 19:22:19 -0400
% +
%  uni2ascii (4.18-1) unstable; urgency=low
% % * New upstream release:
diff -ruN uni2ascii-4.18.orig//ascii2uni.c uni2ascii-4.18/ascii2uni.c
--- uni2ascii-4.18.orig//ascii2uni.c    2011-05-14 22:15:20.000000000 -0400
+++ uni2ascii-4.18/ascii2uni.c  2011-08-23 20:07:29.000000000 -0400
@@ -208,6 +208,7 @@
   char aHfmt [8+2+1];
   char aDfmt [8+2+1];
   char cbuf[5];
+  char fmt_itoa[12];
   FILE *infp;
 
   UTF32 num;
@@ -555,45 +556,64 @@
        }
        else if (FType == CHENT) {
         if (AllHTMLP){
+          NConsumed = -1;
           if(sscanf(iptr,aHfmt,&num,&NConsumed) > 0) {
-            if(*(iptr+NConsumed-1) != ';') {
+            if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
               MicrosoftStyle++;
+              if (NConsumed == -1) {
+                if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%x", num) > 
sizeof(fmt_itoa)-1) {
+                  fprintf(stderr, "UTF32 codepoint overflowed static 
buffer\n");
+                  exit(BADRECORD);
+                }
+                NConsumed = 3 /* "&#x" */ + strlen(fmt_itoa) + 1 /* ";" */;
+              }
               fprintf(stderr,
                       _("The HTML/HDML entity %1$s at token %2$lu of line 
%3$lu lacks the requisite final semicolon.\n"),
                       
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
               if(StrictP) {putchar(*iptr++); continue;}
-              else {putu8(num);iptr+=NConsumed;}
+              else {putu8(num);iptr+=NConsumed-1;}
             }
             else {putu8(num);iptr+=NConsumed;}
             TokenNumber++;
             continue;
           }
+          NConsumed = -1;
           if(sscanf(iptr,aDfmt,&num,&NConsumed) > 0) {
-            if(*(iptr+NConsumed-1) != ';') {
+            if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
               MicrosoftStyle++;
+              if (NConsumed == -1) {
+                if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%u", num) > 
sizeof(fmt_itoa)-1) {
+                  fprintf(stderr, "UTF32 codepoint overflowed static 
buffer\n");
+                  exit(BADRECORD);
+                }
+                NConsumed = 2 /* "&#" */ + strlen(fmt_itoa) + 1 /* ";" */;
+              }
               fprintf(stderr,
                       _("The HTML/HDML entity %1$s at token %2$lu of line 
%3$lu lacks the requisite final semicolon.\n"),
                       
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
               if (StrictP) {putchar(*iptr++); continue;}
-              else {putu8(num);iptr+=NConsumed;}
+              else {putu8(num);iptr+=NConsumed-1;}
             }
             else {putu8(num);iptr+=NConsumed;}
             TokenNumber++;
             continue;
           }
         }
+        NConsumed = -1;
         if(sscanf(iptr,afmt,&enam,&NConsumed) > 0) {
+          if (NConsumed == -1) NConsumed = 1 /* "&" */ + strlen(enam) + 1 /* 
";" */;
           if( (num = LookupCodeForEntity(enam))) {
             if(*(iptr+NConsumed-1) != ';') {
               MicrosoftStyle++;
               fprintf(stderr,_("The HTML/HDML entity %1$s at token %2$lu of 
line %3$lu lacks the requisite final 
semicolon.\n"),ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
               if(StrictP) {putchar(*iptr++);continue;}
-              else {putu8(num);iptr+=NConsumed;}
+              else {putu8(num);iptr+=NConsumed-1;}
             }
             else {putu8(num);iptr+=NConsumed;}
             TokenNumber++;
           }
           else {
+            if(*(iptr+NConsumed-1) != ';') NConsumed--;
             fprintf(stderr,"ascii2uni: unknown HTML/HDML character entity 
\"&%s;\" at line %lu\n",
                     enam,LineNo);
             putu8(UNI_REPLACEMENT_CHAR);
diff -ruN uni2ascii-4.18.orig//debian/changelog uni2ascii-4.18/debian/changelog
--- uni2ascii-4.18.orig//debian/changelog       2011-05-17 01:30:08.000000000 
-0400
+++ uni2ascii-4.18/debian/changelog     2011-08-23 19:24:25.000000000 -0400
@@ -1,3 +1,11 @@
+uni2ascii (4.18-1.1) unstable; urgency=low
+
+  * Non-maintainer upload.
+  * Check for partially-matched sscanf() patterns and consume an appropriate
+    number of characters (Closes: #633704)
+
+ -- Benjamin Kaduk <ka...@mit.edu>  Tue, 23 Aug 2011 19:22:19 -0400
+
 uni2ascii (4.18-1) unstable; urgency=low
 
   * New upstream release:

Reply via email to