--- test/data/tokeniser2/README.md | 104 ++++++++++++++++++++++ test/data/tokeniser2/contentModelFlags.test | 75 ++++++++++++++++ test/data/tokeniser2/domjs.test | 91 +++++++++++++++++++ test/data/tokeniser2/pendingSpecChanges.test | 7 ++ test/data/tokeniser2/unicodeCharsProblematic.test | 27 ++++++ test/data/tokeniser2/xmlViolation.test | 22 +++++ 6 files changed, 326 insertions(+) create mode 100644 test/data/tokeniser2/README.md create mode 100644 test/data/tokeniser2/contentModelFlags.test create mode 100644 test/data/tokeniser2/domjs.test create mode 100644 test/data/tokeniser2/pendingSpecChanges.test create mode 100644 test/data/tokeniser2/unicodeCharsProblematic.test create mode 100644 test/data/tokeniser2/xmlViolation.test
diff --git a/test/data/tokeniser2/README.md b/test/data/tokeniser2/README.md new file mode 100644 index 0000000..4218c26 --- /dev/null +++ b/test/data/tokeniser2/README.md @@ -0,0 +1,104 @@ +Tokenizer tests +=============== + +The test format is [JSON](http://www.json.org/). This has the advantage +that the syntax allows backward-compatible extensions to the tests and +the disadvantage that it is relatively verbose. + +Basic Structure +--------------- + + {"tests": [ + {"description": "Test description", + "input": "input_string", + "output": [expected_output_tokens], + "initialStates": [initial_states], + "lastStartTag": last_start_tag, + "ignoreErrorOrder": ignore_error_order + } + ]} + +Multiple tests per file are allowed simply by adding more objects to the +"tests" list. + +`description`, `input` and `output` are always present. The other values +are optional. + +### Test set-up + +`test.input` is a string containing the characters to pass to the +tokenizer. Specifically, it represents the characters of the **input +stream**, and so implementations are expected to perform the processing +described in the spec's **Preprocessing the input stream** section +before feeding the result to the tokenizer. + +If `test.doubleEscaped` is present and `true`, then `test.input` is not +quite as described above. Instead, it must first be subjected to another +round of unescaping (i.e., in addition to any unescaping involved in the +JSON import), and the result of *that* represents the characters of the +input stream. Currently, the only unescaping required by this option is +to convert each sequence of the form \\uHHHH (where H is a hex digit) +into the corresponding Unicode code point. (Note that this option also +affects the interpretation of `test.output`.) + +`test.initialStates` is a list of strings, each being the name of a +tokenizer state. The test should be run once for each string, using it +to set the tokenizer's initial state for that run. 
If +`test.initialStates` is omitted, it defaults to `["data state"]`. + +`test.lastStartTag` is a lowercase string that should be used as "the +tag name of the last start tag to have been emitted from this +tokenizer", referenced in the spec's definition of **appropriate end tag +token**. If it is omitted, it is treated as if "no start tag has been +emitted from this tokenizer". + +### Test results + +`test.output` is a list of tokens, ordered with the first produced by +the tokenizer the first (leftmost) in the list. The list must match the +**complete** list of tokens that the tokenizer should produce. Valid +tokens are: + + ["DOCTYPE", name, public_id, system_id, correctness] + ["StartTag", name, {attributes}*, true*] + ["StartTag", name, {attributes}] + ["EndTag", name] + ["Comment", data] + ["Character", data] + "ParseError" + +`public_id` and `system_id` are either strings or `null`. `correctness` +is either `true` or `false`; `true` corresponds to the force-quirks flag +being false, and vice-versa. + +When the self-closing flag is set, the `StartTag` array has `true` as +its fourth entry. When the flag is not set, the array has only three +entries for backwards compatibility. + +All adjacent character tokens are coalesced into a single +`["Character", data]` token. + +If `test.doubleEscaped` is present and `true`, then every string within +`test.output` must be further unescaped (as described above) before +comparing with the tokenizer's output. + +`test.ignoreErrorOrder` is a boolean value indicating that the order of +`ParseError` tokens relative to other tokens in the output stream is +unimportant, and implementations should ignore such differences between +their output and `expected_output_tokens`. (This is used for errors +emitted by the input stream preprocessing stage, since it is useful to +test that code but it is undefined when the errors occur). If it is +omitted, it defaults to `false`. 
+ +xmlViolation tests +------------------ + +`tokenizer/xmlViolation.test` differs from the above in a couple of +ways: + +- The name of the single member of the top-level JSON object is + "xmlViolationTests" instead of "tests". +- Each test's expected output assumes that the implementation is applying + the tweaks given in the spec's "Coercing an HTML DOM into an + infoset" section. + diff --git a/test/data/tokeniser2/contentModelFlags.test b/test/data/tokeniser2/contentModelFlags.test new file mode 100644 index 0000000..a8b1695 --- /dev/null +++ b/test/data/tokeniser2/contentModelFlags.test @@ -0,0 +1,75 @@ +{"tests": [ + +{"description":"PLAINTEXT content model flag", +"initialStates":["PLAINTEXT state"], +"lastStartTag":"plaintext", +"input":"<head>&body;", +"output":[["Character", "<head>&body;"]]}, + +{"description":"End tag closing RCDATA or RAWTEXT", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xmp>", +"output":[["Character", "foo"], ["EndTag", "xmp"]]}, + +{"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xMp>", +"output":[["Character", "foo"], ["EndTag", "xmp"]]}, + +{"description":"End tag closing RCDATA or RAWTEXT (ending with space)", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xmp ", +"output":[["Character", "foo"], "ParseError"]}, + +{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xmp", +"output":[["Character", "foo</xmp"]]}, + +{"description":"End tag closing RCDATA or RAWTEXT (ending with slash)", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xmp/", +"output":[["Character", "foo"], "ParseError"]}, + +{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)", 
+"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xmp<", +"output":[["Character", "foo</xmp<"]]}, + +{"description":"End tag with incorrect name in RCDATA or RAWTEXT", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"</foo>bar</xmp>", +"output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]}, + +{"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"</foo>bar</xmpaar>", +"output":[["Character", "</foo>bar</xmpaar>"]]}, + +{"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA", +"initialStates":["RCDATA state", "RAWTEXT state"], +"lastStartTag":"xmp", +"input":"foo</xmp></baz>", +"output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]}, + +{"description":"RAWTEXT w/ something looking like an entity", +"initialStates":["RAWTEXT state"], +"lastStartTag":"xmp", +"input":"&foo;", +"output":[["Character", "&foo;"]]}, + +{"description":"RCDATA w/ an entity", +"initialStates":["RCDATA state"], +"lastStartTag":"textarea", +"input":"<", +"output":[["Character", "<"]]} + +]} diff --git a/test/data/tokeniser2/domjs.test b/test/data/tokeniser2/domjs.test new file mode 100644 index 0000000..b9e650c --- /dev/null +++ b/test/data/tokeniser2/domjs.test @@ -0,0 +1,91 @@ +{ + "tests": [ + { + "description":"CR in bogus comment state", + "input":"<?\u000d", + "output":["ParseError", ["Comment", "?\u000a"]] + }, + { + "description":"CRLF in bogus comment state", + "input":"<?\u000d\u000a", + "output":["ParseError", ["Comment", "?\u000a"]] + }, + { + "description":"NUL in RCDATA and RAWTEXT", + "doubleEscaped":true, + "initialStates":["RCDATA state", "RAWTEXT state"], + "input":"\\u0000", + "output":["ParseError", ["Character", "\\uFFFD"]] + }, + { + "description":"leading U+FEFF must pass through", + "doubleEscaped":true, + 
"input":"\\uFEFFfoo\\uFEFFbar", + "output":[["Character", "\\uFEFFfoo\\uFEFFbar"]] + }, + { + "description":"Non BMP-charref in in RCDATA", + "initialStates":["RCDATA state"], + "input":"≂̸", + "output":[["Character", "\u2242\u0338"]] + }, + { + "description":"Bad charref in in RCDATA", + "initialStates":["RCDATA state"], + "input":"&NotEqualTild;", + "output":["ParseError", ["Character", "&NotEqualTild;"]] + }, + { + "description":"lowercase endtags in RCDATA and RAWTEXT", + "initialStates":["RCDATA state", "RAWTEXT state"], + "lastStartTag":"xmp", + "input":"</XMP>", + "output":[["EndTag","xmp"]] + }, + { + "description":"bad endtag in RCDATA and RAWTEXT", + "initialStates":["RCDATA state", "RAWTEXT state"], + "lastStartTag":"xmp", + "input":"</ XMP>", + "output":[["Character","</ XMP>"]] + }, + { + "description":"bad endtag in RCDATA and RAWTEXT", + "initialStates":["RCDATA state", "RAWTEXT state"], + "lastStartTag":"xmp", + "input":"</xm>", + "output":[["Character","</xm>"]] + }, + { + "description":"bad endtag in RCDATA and RAWTEXT", + "initialStates":["RCDATA state", "RAWTEXT state"], + "lastStartTag":"xmp", + "input":"</xm ", + "output":[["Character","</xm "]] + }, + { + "description":"bad endtag in RCDATA and RAWTEXT", + "initialStates":["RCDATA state", "RAWTEXT state"], + "lastStartTag":"xmp", + "input":"</xm/", + "output":[["Character","</xm/"]] + }, + { + "description":"Non BMP-charref in attribute", + "input":"<p id=\"≂̸\">", + "output":[["StartTag", "p", {"id":"\u2242\u0338"}]] + }, + { + "description":"--!NUL in comment ", + "doubleEscaped":true, + "input":"<!----!\\u0000-->", + "output":["ParseError", "ParseError", ["Comment", "--!\\uFFFD"]] + }, + { + "description":"space EOF after doctype ", + "input":"<!DOCTYPE html ", + "output":["ParseError", ["DOCTYPE", "html", null, null , false]] + } + + ] +} diff --git a/test/data/tokeniser2/pendingSpecChanges.test b/test/data/tokeniser2/pendingSpecChanges.test new file mode 100644 index 0000000..1b7dc3c --- 
/dev/null +++ b/test/data/tokeniser2/pendingSpecChanges.test @@ -0,0 +1,7 @@ +{"tests": [ + +{"description":"<!---- >", +"input":"<!---- >", +"output":["ParseError", "ParseError", ["Comment","-- >"]]} + +]} diff --git a/test/data/tokeniser2/unicodeCharsProblematic.test b/test/data/tokeniser2/unicodeCharsProblematic.test new file mode 100644 index 0000000..5987845 --- /dev/null +++ b/test/data/tokeniser2/unicodeCharsProblematic.test @@ -0,0 +1,27 @@ +{"tests" : [ +{"description": "Invalid Unicode character U+DFFF", +"doubleEscaped":true, +"input": "\\uDFFF", +"output":["ParseError", ["Character", "\\uFFFD"]]}, + +{"description": "Invalid Unicode character U+D800", +"doubleEscaped":true, +"input": "\\uD800", +"output":["ParseError", ["Character", "\\uFFFD"]]}, + +{"description": "Invalid Unicode character U+DFFF with valid preceding character", +"doubleEscaped":true, +"input": "a\\uDFFF", +"output":[["Character", "a"], "ParseError", ["Character", "\\uFFFD"]]}, + +{"description": "Invalid Unicode character U+D800 with valid following character", +"doubleEscaped":true, +"input": "\\uD800a", +"output":["ParseError", ["Character", "\\uFFFDa"]]}, + +{"description":"CR followed by U+0000", +"input":"\r\u0000", +"output":[["Character", "\n"], "ParseError", ["Character", "\u0000"]], +"ignoreErrorOrder":true} +] +} \ No newline at end of file diff --git a/test/data/tokeniser2/xmlViolation.test b/test/data/tokeniser2/xmlViolation.test new file mode 100644 index 0000000..93c6351 --- /dev/null +++ b/test/data/tokeniser2/xmlViolation.test @@ -0,0 +1,22 @@ +{"tests": [ + +{"description":"Non-XML character", +"input":"a\uFFFFb", +"ignoreErrorOrder":true, +"output":["ParseError",["Character","a\uFFFDb"]]}, + +{"description":"Non-XML space", +"input":"a\u000Cb", +"ignoreErrorOrder":true, +"output":[["Character","a b"]]}, + +{"description":"Double hyphen in comment", +"input":"<!-- foo -- bar -->", +"output":["ParseError",["Comment"," foo - - bar "]]}, + +{"description":"FF between 
attributes", +"input":"<a b=''\u000Cc=''>", +"output":[["StartTag","a",{"b":"","c":""}]]} +]} + + -- 1.8.3.2
