---
 src/tokeniser/tokeniser.c              |  2 +-
 test/data/tokeniser2/INDEX             |  5 +++-
 test/data/tokeniser2/escapeFlag.test   | 24 ++++++++--------
 test/data/tokeniser2/unicodeChars.test |  8 ------
 test/tokeniser2.c                      |  2 +-
 test/tokeniser3.c                      | 50 ++++++++++++++++++++--------------
 6 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 8390bf0..41a4b0e 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -753,7 +753,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
                        }
 
                        /* Emit a replacement character */
-                       emit_current_chars(tokeniser);
+                       emit_character_token(tokeniser,u_fffd_str);
 
                        /* Advance past NUL */
                        parserutils_inputstream_advance(tokeniser->input, 1);
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
index 9da56e7..9ff8596 100644
--- a/test/data/tokeniser2/INDEX
+++ b/test/data/tokeniser2/INDEX
@@ -10,6 +10,9 @@ entities.test         html5lib entity tests
 escapeFlag.test                html5lib escape flag tests
 numericEntities.test   html5lib numeric entities tests
 unicodeChars.test      html5lib unicode character tests
+#unicodeCharsProblematic.test  html5lib problematic unicode character tests
 cdata.test             CDATA section tests
-regression.test                Regression tests
+#regression.test               Regression tests
 namedEntities.test     html5lib named entities tests
+pendingSpecChanges.test        html5lib spec changes tests
+#xmlViolation.test     xmlViolation
diff --git a/test/data/tokeniser2/escapeFlag.test b/test/data/tokeniser2/escapeFlag.test
index 4c4bf51..18cb430 100644
--- a/test/data/tokeniser2/escapeFlag.test
+++ b/test/data/tokeniser2/escapeFlag.test
@@ -1,33 +1,33 @@
 {"tests": [
 
-{"description":"Commented close tag in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Commented close tag in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--</xmp>--></xmp>",
-"output":[["Character", "foo<!--</xmp>-->"], ["EndTag", "xmp"]]},
+"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
 
-{"description":"Bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-->baz</xmp>",
 "output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
 
-{"description":"End tag surrounded by bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--></xmp><!-->baz</xmp>",
 "output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
 
 {"description":"Commented entities in RCDATA",
-"contentModelFlags":["RCDATA"],
+"initialStates":["RCDATA state"],
 "lastStartTag":"xmp",
 "input":" &amp; <!-- &amp; --> &amp; </xmp>",
-"output":[["Character", " & <!-- &amp; --> & "], ["EndTag", "xmp"]]},
+"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
 
-{"description":"Incorrect comment ending sequences in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
-"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<></xmp>"]]}
+"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
 
 ]}
diff --git a/test/data/tokeniser2/unicodeChars.test b/test/data/tokeniser2/unicodeChars.test
index 9b59015..c778668 100644
--- a/test/data/tokeniser2/unicodeChars.test
+++ b/test/data/tokeniser2/unicodeChars.test
@@ -112,14 +112,6 @@
 "input": "\u007F",
 "output": ["ParseError", ["Character", "\u007F"]]},
 
-{"description": "Invalid Unicode character U+D800",
-"input": "\uD800",
-"output": ["ParseError", ["Character", "\uD800"]]},
-
-{"description": "Invalid Unicode character U+DFFF",
-"input": "\uDFFF",
-"output": ["ParseError", ["Character", "\uDFFF"]]},
-
 {"description": "Invalid Unicode character U+FDD0",
 "input": "\uFDD0",
 "output": ["ParseError", ["Character", "\uFDD0"]]},
diff --git a/test/tokeniser2.c b/test/tokeniser2.c
index bf0e69f..3024e81 100644
--- a/test/tokeniser2.c
+++ b/test/tokeniser2.c
@@ -86,7 +86,7 @@ int main(int argc, char **argv)
                        } else if (strcmp(key, "lastStartTag") == 0) {
                                ctx.last_start_tag = (const char *)
                                                json_object_get_string(val);
-                       } else if (strcmp(key, "contentModelFlags") == 0) {
+                       } else if (strcmp(key, "initialStates") == 0) {
                                ctx.content_model =
                                                json_object_get_array(val);
                        } else if (strcmp(key, "processCDATA") == 0) {
diff --git a/test/tokeniser3.c b/test/tokeniser3.c
index 949ddd0..c4c5231 100644
--- a/test/tokeniser3.c
+++ b/test/tokeniser3.c
@@ -29,6 +29,7 @@ typedef struct context {
 
 static void run_test(context *ctx);
 static hubbub_error token_handler(const hubbub_token *token, void *pw);
+size_t get_len(const char *str);
 
 int main(int argc, char **argv)
 {
@@ -85,7 +86,7 @@ int main(int argc, char **argv)
                        } else if (strcmp(key, "lastStartTag") == 0) {
                                ctx.last_start_tag = (const char *)
                                                json_object_get_string(val);
-                       } else if (strcmp(key, "contentModelFlags") == 0) {
+                       } else if (strcmp(key, "initialStates") == 0) {
                                ctx.content_model =
                                                json_object_get_array(val);
                        } else if (strcmp(key, "processCDATA") == 0) {
@@ -103,6 +104,13 @@ int main(int argc, char **argv)
        return 0;
 }
 
+size_t get_len(const char *str) {
+       if(str == NULL) {
+               return 0;
+       } else {
+               return strlen(str);
+       }
+}
 void run_test(context *ctx)
 {
        parserutils_inputstream *stream;
@@ -132,7 +140,7 @@ void run_test(context *ctx)
 
                if (ctx->last_start_tag != NULL) {
                        /* Fake up a start tag, in PCDATA state */
-                       size_t len = strlen(ctx->last_start_tag) + 3;
+                       size_t len = get_len(ctx->last_start_tag) + 3;
                        uint8_t *buf = malloc(len);
 
                        snprintf((char *) buf, len, "<%s>",
@@ -308,21 +316,21 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
                                (int) token->data.doctype.system_id.len);
                }
 
-               assert(token->data.doctype.name.len == strlen(expname));
-               assert(strncmp(gotname, expname, strlen(expname)) == 0);
+               assert(token->data.doctype.name.len == get_len(expname));
+               assert(strncmp(gotname, expname, get_len(expname)) == 0);
 
                assert((exppub == NULL) ==
                                (token->data.doctype.public_missing == true));
                if (exppub) {
-                       assert(token->data.doctype.public_id.len == strlen(exppub));
-                       assert(strncmp(gotpub, exppub, strlen(exppub)) == 0);
+                       assert(token->data.doctype.public_id.len == get_len(exppub));
+                       assert(strncmp(gotpub, exppub, get_len(exppub)) == 0);
                }
 
                assert((expsys == NULL) ==
                                (token->data.doctype.system_missing == true));
                if (gotsys) {
-                       assert(token->data.doctype.system_id.len == strlen(expsys));
-                       assert(strncmp(gotsys, expsys, strlen(expsys)) == 0);
+                       assert(token->data.doctype.system_id.len == get_len(expsys));
+                       assert(strncmp(gotsys, expsys, get_len(expsys)) == 0);
                }
 
                assert(expquirks == token->data.doctype.force_quirks);
@@ -354,8 +362,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
                        printf("attributes:\n");
                }
 
-               assert(token->data.tag.name.len == strlen(expname));
-               assert(strncmp(tagname, expname, strlen(expname)) == 0);
+               assert(token->data.tag.name.len == get_len(expname));
+               assert(strncmp(tagname, expname, get_len(expname)) == 0);
 
                assert((token->data.tag.n_attributes == 0) ==
                                (expattrs == NULL));
@@ -379,11 +387,11 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
                                        (int) namelen, gotname,
                                        (int) vallen, gotval);
 
-                       assert(namelen == strlen(expname));
+                       assert(namelen == get_len(expname));
                        assert(strncmp(gotname, expname,
-                                               strlen(expname)) == 0);
-                       assert(vallen == strlen(expval));
-                       assert(strncmp(gotval, expval, strlen(expval)) == 0);
+                                               get_len(expname)) == 0);
+                       assert(vallen == get_len(expval));
+                       assert(strncmp(gotval, expval, get_len(expval)) == 0);
 
                        expattrs = expattrs->next;
                }
@@ -404,8 +412,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
                                (token->data.tag.n_attributes > 0) ?
                                                "attributes:" : "");
 
-               assert(token->data.tag.name.len == strlen(expname));
-               assert(strncmp(tagname, expname, strlen(expname)) == 0);
+               assert(token->data.tag.name.len == get_len(expname));
+               assert(strncmp(tagname, expname, get_len(expname)) == 0);
        }
                break;
        case HUBBUB_TOKEN_COMMENT:
@@ -419,20 +427,20 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
                printf("     got: '%.*s'\n",
                                (int) token->data.comment.len, gotstr);
 
-               assert(token->data.comment.len == strlen(expstr));
-               assert(strncmp(gotstr, expstr, strlen(expstr)) == 0);
+               assert(token->data.comment.len == get_len(expstr));
+               assert(strncmp(gotstr, expstr, get_len(expstr)) == 0);
        }
                break;
        case HUBBUB_TOKEN_CHARACTER:
        {
-               int expstrlen = json_object_get_string_len(
+               int expget_len = json_object_get_string_len(
                                array_list_get_idx(items, 1));
                const char *expstr = json_object_get_string(
                                array_list_get_idx(items, 1));
                const char *gotstr = (const char *)
                                token->data.character.ptr;
                size_t len = min(token->data.character.len,
-                               expstrlen - ctx->char_off);
+                               expget_len - ctx->char_off);
 
                printf("expected: '%.*s'\n",
                                (int) len, expstr + ctx->char_off);
@@ -454,7 +462,7 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
                        ctx->char_off = 0;
 
                        token_handler(&t, pw);
-               } else if (strlen(expstr + ctx->char_off) >
+               } else if (get_len(expstr + ctx->char_off) >
                                token->data.character.len) {
                        /* Tokeniser output only contained part of the data
                         * in the expected token; calculate the offset into
-- 
1.8.3.2


Reply via email to