tags 633704 patch
thanks
On Wed, 24 Aug 2011, Bill Poser wrote:
I'm working on this, but, curiously, on my system I don't get a segfault.
Instead, I get infinitely many copies of the error message.
It also manifests as an infinite loop for me. The issue is that
sscanf("...%n",...) does not fully match the pattern, so the %n value is
not set, and the value of the variable passed in is unchanged. I think in
some circumstances it could be uninitialized memory (which would explain
the segfaults), but has always started out as zero in my simple test
cases.
Working around the lack of a %n value is fairly trivial, just setting a
sentinel value and checking for it; however, the code attempts to
explicitly check for such "partial matches", so this would be a feature
regression (in some sense).
I attach my current patch (I think the snprintf return value checks may
need to be >=; I didn't test that part). I worked on this bug at a
bugsquashathon here at MIT this weekend, and kcr was going to help me get
the change uploaded, but it is kind of a nasty patch to review ...
Annotations inline:
% diff -ruN uni2ascii-4.18.orig//ascii2uni.c uni2ascii-4.18/ascii2uni.c
% --- uni2ascii-4.18.orig//ascii2uni.c 2011-05-14 22:15:20.000000000 -0400
% +++ uni2ascii-4.18/ascii2uni.c 2011-08-23 20:07:29.000000000 -0400
% @@ -208,6 +208,7 @@
% char aHfmt [8+2+1];
% char aDfmt [8+2+1];
% char cbuf[5];
% + char fmt_itoa[12];
The scanf pattern %ul stores into an integral type, but we need to know
the number of characters it occupies in the string. To do so, we'll need
to printf it again, into this buffer. It's statically allocated because
the value should fit in 32 bits (if I read the UTF32 type correctly), and
the decimal representation therein will fit in this size, with NUL
terminator.
% FILE *infp;
%
% UTF32 num;
% @@ -555,45 +556,64 @@
% }
% else if (FType == CHENT) {
% if (AllHTMLP){
% + NConsumed = -1;
% if(sscanf(iptr,aHfmt,&num,&NConsumed) > 0) {
% - if(*(iptr+NConsumed-1) != ';') {
% + if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
The sentinel value, and we check for a partial match in both ways.
% MicrosoftStyle++;
% + if (NConsumed == -1) {
% + if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%x", num) >
sizeof(fmt_itoa)-1) {
% + fprintf(stderr, "UTF32 codepoint overflowed static
buffer\n");
% + exit(BADRECORD);
% + }
% + NConsumed = 3 /* "&#x" */ + strlen(fmt_itoa) + 1 /* ";" */;
% + }
We need to know how many characters were used for the partial match; make
sure to not overflow the static buffer.
% fprintf(stderr,
% _("The HTML/HDML entity %1$s at token %2$lu of line %3$lu
lacks the requisite final semicolon.\n"),
%
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
% if(StrictP) {putchar(*iptr++); continue;}
% - else {putu8(num);iptr+=NConsumed;}
% + else {putu8(num);iptr+=NConsumed-1;}
The ';' was not matched, so the character in that position is really a
part of something else.
% }
% else {putu8(num);iptr+=NConsumed;}
% TokenNumber++;
% continue;
% }
% + NConsumed = -1;
% if(sscanf(iptr,aDfmt,&num,&NConsumed) > 0) {
% - if(*(iptr+NConsumed-1) != ';') {
% + if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
% MicrosoftStyle++;
% + if (NConsumed == -1) {
% + if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%u", num) >
sizeof(fmt_itoa)-1) {
% + fprintf(stderr, "UTF32 codepoint overflowed static
buffer\n");
% + exit(BADRECORD);
% + }
% + NConsumed = 2 /* "&#" */ + strlen(fmt_itoa) + 1 /* ";" */;
% + }
The same dance, but with a different-length prefix.
% fprintf(stderr,
% _("The HTML/HDML entity %1$s at token %2$lu of line %3$lu
lacks the requisite final semicolon.\n"),
%
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
% if (StrictP) {putchar(*iptr++); continue;}
% - else {putu8(num);iptr+=NConsumed;}
% + else {putu8(num);iptr+=NConsumed-1;}
% }
% else {putu8(num);iptr+=NConsumed;}
% TokenNumber++;
% continue;
% }
% }
% + NConsumed = -1;
% if(sscanf(iptr,afmt,&enam,&NConsumed) > 0) {
% + if (NConsumed == -1) NConsumed = 1 /* "&" */ + strlen(enam) + 1 /*
";" */;
This case is slightly different, in that we accept any match but check if
it is a valid entity. Update the number of characters consumed by the
partial match early, to ease the check for MicrosoftStyle input below.
% if( (num = LookupCodeForEntity(enam))) {
% if(*(iptr+NConsumed-1) != ';') {
% MicrosoftStyle++;
% fprintf(stderr,_("The HTML/HDML entity %1$s at token %2$lu of line
%3$lu lacks the requisite final
semicolon.\n"),ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
% if(StrictP) {putchar(*iptr++);continue;}
% - else {putu8(num);iptr+=NConsumed;}
% + else {putu8(num);iptr+=NConsumed-1;}
% }
% else {putu8(num);iptr+=NConsumed;}
% TokenNumber++;
% }
% else {
% + if(*(iptr+NConsumed-1) != ';') NConsumed--;
Again, don't eat the extra unmatched character.
-Ben Kaduk
% fprintf(stderr,"ascii2uni: unknown HTML/HDML character entity \"&%s;\"
at line %lu\n",
% enam,LineNo);
% putu8(UNI_REPLACEMENT_CHAR);
% diff -ruN uni2ascii-4.18.orig//debian/changelog
uni2ascii-4.18/debian/changelog
% --- uni2ascii-4.18.orig//debian/changelog 2011-05-17 01:30:08.000000000
-0400
% +++ uni2ascii-4.18/debian/changelog 2011-08-23 19:24:25.000000000 -0400
% @@ -1,3 +1,11 @@
% +uni2ascii (4.18-1.1) unstable; urgency=low
% +
% + * Non-maintainer upload.
% + * Check for partially-matched sscanf() patterns and consume an appropriate
% + number of characters (Closes: #633704)
% +
% + -- Benjamin Kaduk <ka...@mit.edu> Tue, 23 Aug 2011 19:22:19 -0400
% +
% uni2ascii (4.18-1) unstable; urgency=low
%
% * New upstream release:diff -ruN uni2ascii-4.18.orig//ascii2uni.c uni2ascii-4.18/ascii2uni.c
--- uni2ascii-4.18.orig//ascii2uni.c 2011-05-14 22:15:20.000000000 -0400
+++ uni2ascii-4.18/ascii2uni.c 2011-08-23 20:07:29.000000000 -0400
@@ -208,6 +208,7 @@
char aHfmt [8+2+1];
char aDfmt [8+2+1];
char cbuf[5];
+ char fmt_itoa[12];
FILE *infp;
UTF32 num;
@@ -555,45 +556,64 @@
}
else if (FType == CHENT) {
if (AllHTMLP){
+ NConsumed = -1;
if(sscanf(iptr,aHfmt,&num,&NConsumed) > 0) {
- if(*(iptr+NConsumed-1) != ';') {
+ if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
MicrosoftStyle++;
+ if (NConsumed == -1) {
+ if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%x", num) >
sizeof(fmt_itoa)-1) {
+ fprintf(stderr, "UTF32 codepoint overflowed static
buffer\n");
+ exit(BADRECORD);
+ }
+ NConsumed = 3 /* "&#x" */ + strlen(fmt_itoa) + 1 /* ";" */;
+ }
fprintf(stderr,
_("The HTML/HDML entity %1$s at token %2$lu of line
%3$lu lacks the requisite final semicolon.\n"),
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
if(StrictP) {putchar(*iptr++); continue;}
- else {putu8(num);iptr+=NConsumed;}
+ else {putu8(num);iptr+=NConsumed-1;}
}
else {putu8(num);iptr+=NConsumed;}
TokenNumber++;
continue;
}
+ NConsumed = -1;
if(sscanf(iptr,aDfmt,&num,&NConsumed) > 0) {
- if(*(iptr+NConsumed-1) != ';') {
+ if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') {
MicrosoftStyle++;
+ if (NConsumed == -1) {
+ if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%u", num) >
sizeof(fmt_itoa)-1) {
+ fprintf(stderr, "UTF32 codepoint overflowed static
buffer\n");
+ exit(BADRECORD);
+ }
+ NConsumed = 2 /* "&#" */ + strlen(fmt_itoa) + 1 /* ";" */;
+ }
fprintf(stderr,
_("The HTML/HDML entity %1$s at token %2$lu of line
%3$lu lacks the requisite final semicolon.\n"),
ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
if (StrictP) {putchar(*iptr++); continue;}
- else {putu8(num);iptr+=NConsumed;}
+ else {putu8(num);iptr+=NConsumed-1;}
}
else {putu8(num);iptr+=NConsumed;}
TokenNumber++;
continue;
}
}
+ NConsumed = -1;
if(sscanf(iptr,afmt,&enam,&NConsumed) > 0) {
+ if (NConsumed == -1) NConsumed = 1 /* "&" */ + strlen(enam) + 1 /*
";" */;
if( (num = LookupCodeForEntity(enam))) {
if(*(iptr+NConsumed-1) != ';') {
MicrosoftStyle++;
fprintf(stderr,_("The HTML/HDML entity %1$s at token %2$lu of
line %3$lu lacks the requisite final
semicolon.\n"),ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo);
if(StrictP) {putchar(*iptr++);continue;}
- else {putu8(num);iptr+=NConsumed;}
+ else {putu8(num);iptr+=NConsumed-1;}
}
else {putu8(num);iptr+=NConsumed;}
TokenNumber++;
}
else {
+ if(*(iptr+NConsumed-1) != ';') NConsumed--;
fprintf(stderr,"ascii2uni: unknown HTML/HDML character entity
\"&%s;\" at line %lu\n",
enam,LineNo);
putu8(UNI_REPLACEMENT_CHAR);
diff -ruN uni2ascii-4.18.orig//debian/changelog uni2ascii-4.18/debian/changelog
--- uni2ascii-4.18.orig//debian/changelog 2011-05-17 01:30:08.000000000
-0400
+++ uni2ascii-4.18/debian/changelog 2011-08-23 19:24:25.000000000 -0400
@@ -1,3 +1,11 @@
+uni2ascii (4.18-1.1) unstable; urgency=low
+
+ * Non-maintainer upload.
+ * Check for partially-matched sscanf() patterns and consume an appropriate
+ number of characters (Closes: #633704)
+
+ -- Benjamin Kaduk <ka...@mit.edu> Tue, 23 Aug 2011 19:22:19 -0400
+
uni2ascii (4.18-1) unstable; urgency=low
* New upstream release: