Repository: commons-text Updated Branches: refs/heads/master 10b97cb5f -> 43ba72325
TEXT-27: Adding StringEscapeUtils from commons-lang:3.5 Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/d8f547e8 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/d8f547e8 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/d8f547e8 Branch: refs/heads/master Commit: d8f547e818d815c0439f4b3c317d077a0290a5f5 Parents: 10b97cb Author: Rob Tompkins <chtom...@gmail.com> Authored: Mon Nov 28 15:02:18 2016 -0500 Committer: Rob Tompkins <chtom...@gmail.com> Committed: Mon Nov 28 15:02:18 2016 -0500 ---------------------------------------------------------------------- pom.xml | 6 + src/changes/changes.xml | 1 + .../apache/commons/text/StringEscapeUtils.java | 811 +++++++++++++++++++ .../commons/text/StringEscapeUtilsTest.java | 621 ++++++++++++++ 4 files changed, 1439 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/d8f547e8/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 0c3a10a..9113d23 100644 --- a/pom.xml +++ b/pom.xml @@ -101,6 +101,12 @@ <version>1.4</version> <scope>test</scope> </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>2.5</version> + <scope>test</scope> + </dependency> </dependencies> <distributionManagement> http://git-wip-us.apache.org/repos/asf/commons-text/blob/d8f547e8/src/changes/changes.xml ---------------------------------------------------------------------- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index e60b364..1274672 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -22,6 +22,7 @@ <body> <release version="1.0" date="tba" description="tba"> + <action issue="TEXT-27" type="add" dev="chtompki">Move org.apache.commons.lang3.StringEscapeUtils.java into text</action> <action 
issue="TEXT-23" type="add" dev="chtompki">Moving from commons-lang, the package org.apache.commons.lang3.text</action> <action issue="TEXT-10" type="add" dev="kinow" due-to="Don Jeba">A more complex Levenshtein distance</action> <action issue="TEXT-24" type="add" dev="chtompki">Add coveralls and Travis.ci integration</action> http://git-wip-us.apache.org/repos/asf/commons-text/blob/d8f547e8/src/main/java/org/apache/commons/text/StringEscapeUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java new file mode 100644 index 0000000..6b88275 --- /dev/null +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -0,0 +1,811 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text; + +import java.io.IOException; +import java.io.Writer; + +import org.apache.commons.lang3.CharUtils; +import org.apache.commons.lang3.StringUtils; + +import org.apache.commons.text.translate.AggregateTranslator; +import org.apache.commons.text.translate.CharSequenceTranslator; +import org.apache.commons.text.translate.EntityArrays; +import org.apache.commons.text.translate.JavaUnicodeEscaper; +import org.apache.commons.text.translate.LookupTranslator; +import org.apache.commons.text.translate.NumericEntityEscaper; +import org.apache.commons.text.translate.NumericEntityUnescaper; +import org.apache.commons.text.translate.OctalUnescaper; +import org.apache.commons.text.translate.UnicodeUnescaper; +import org.apache.commons.text.translate.UnicodeUnpairedSurrogateRemover; + +/** + * <p>Escapes and unescapes {@code String}s for + * Java, Java Script, HTML and XML.</p> + * + * <p>#ThreadSafe#</p> + * + * + * <p> + * This code has been adapted from Apache Commons Lang 3.5. + * </p> + */ +public class StringEscapeUtils { + + /* ESCAPE TRANSLATORS */ + + /** + * Translator object for escaping Java. + * + * While {@link #escapeJava(String)} is the expected method of use, this + * object allows the Java escaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + public static final CharSequenceTranslator ESCAPE_JAVA = + new LookupTranslator( + new String[][] { + {"\"", "\\\""}, + {"\\", "\\\\"}, + }).with( + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()) + ).with( + JavaUnicodeEscaper.outsideOf(32, 0x7f) + ); + + /** + * Translator object for escaping EcmaScript/JavaScript. + * + * While {@link #escapeEcmaScript(String)} is the expected method of use, this + * object allows the EcmaScript escaping functionality to be used + * as the foundation for a custom translator. 
+ * + * @since 3.0 + */ + public static final CharSequenceTranslator ESCAPE_ECMASCRIPT = + new AggregateTranslator( + new LookupTranslator( + new String[][] { + {"'", "\\'"}, + {"\"", "\\\""}, + {"\\", "\\\\"}, + {"/", "\\/"} + }), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()), + JavaUnicodeEscaper.outsideOf(32, 0x7f) + ); + + /** + * Translator object for escaping Json. + * + * While {@link #escapeJson(String)} is the expected method of use, this + * object allows the Json escaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.2 + */ + public static final CharSequenceTranslator ESCAPE_JSON = + new AggregateTranslator( + new LookupTranslator( + new String[][] { + {"\"", "\\\""}, + {"\\", "\\\\"}, + {"/", "\\/"} + }), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()), + JavaUnicodeEscaper.outsideOf(32, 0x7f) + ); + + /** + * Translator object for escaping XML. + * + * While {@link #escapeXml(String)} is the expected method of use, this + * object allows the XML escaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + * @deprecated use {@link #ESCAPE_XML10} or {@link #ESCAPE_XML11} instead. + */ + @Deprecated + public static final CharSequenceTranslator ESCAPE_XML = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE()), + new LookupTranslator(EntityArrays.APOS_ESCAPE()) + ); + + /** + * Translator object for escaping XML 1.0. + * + * While {@link #escapeXml10(String)} is the expected method of use, this + * object allows the XML escaping functionality to be used + * as the foundation for a custom translator. 
+ * + * @since 3.3 + */ + public static final CharSequenceTranslator ESCAPE_XML10 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE()), + new LookupTranslator(EntityArrays.APOS_ESCAPE()), + new LookupTranslator( + new String[][] { + { "\u0000", StringUtils.EMPTY }, + { "\u0001", StringUtils.EMPTY }, + { "\u0002", StringUtils.EMPTY }, + { "\u0003", StringUtils.EMPTY }, + { "\u0004", StringUtils.EMPTY }, + { "\u0005", StringUtils.EMPTY }, + { "\u0006", StringUtils.EMPTY }, + { "\u0007", StringUtils.EMPTY }, + { "\u0008", StringUtils.EMPTY }, + { "\u000b", StringUtils.EMPTY }, + { "\u000c", StringUtils.EMPTY }, + { "\u000e", StringUtils.EMPTY }, + { "\u000f", StringUtils.EMPTY }, + { "\u0010", StringUtils.EMPTY }, + { "\u0011", StringUtils.EMPTY }, + { "\u0012", StringUtils.EMPTY }, + { "\u0013", StringUtils.EMPTY }, + { "\u0014", StringUtils.EMPTY }, + { "\u0015", StringUtils.EMPTY }, + { "\u0016", StringUtils.EMPTY }, + { "\u0017", StringUtils.EMPTY }, + { "\u0018", StringUtils.EMPTY }, + { "\u0019", StringUtils.EMPTY }, + { "\u001a", StringUtils.EMPTY }, + { "\u001b", StringUtils.EMPTY }, + { "\u001c", StringUtils.EMPTY }, + { "\u001d", StringUtils.EMPTY }, + { "\u001e", StringUtils.EMPTY }, + { "\u001f", StringUtils.EMPTY }, + { "\ufffe", StringUtils.EMPTY }, + { "\uffff", StringUtils.EMPTY } + }), + NumericEntityEscaper.between(0x7f, 0x84), + NumericEntityEscaper.between(0x86, 0x9f), + new UnicodeUnpairedSurrogateRemover() + ); + + /** + * Translator object for escaping XML 1.1. + * + * While {@link #escapeXml11(String)} is the expected method of use, this + * object allows the XML escaping functionality to be used + * as the foundation for a custom translator. 
+ * + * @since 3.3 + */ + public static final CharSequenceTranslator ESCAPE_XML11 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE()), + new LookupTranslator(EntityArrays.APOS_ESCAPE()), + new LookupTranslator( + new String[][] { + { "\u0000", StringUtils.EMPTY }, + { "\u000b", "" }, + { "\u000c", "" }, + { "\ufffe", StringUtils.EMPTY }, + { "\uffff", StringUtils.EMPTY } + }), + NumericEntityEscaper.between(0x1, 0x8), + NumericEntityEscaper.between(0xe, 0x1f), + NumericEntityEscaper.between(0x7f, 0x84), + NumericEntityEscaper.between(0x86, 0x9f), + new UnicodeUnpairedSurrogateRemover() + ); + + /** + * Translator object for escaping HTML version 3.0. + * + * While {@link #escapeHtml3(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + public static final CharSequenceTranslator ESCAPE_HTML3 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE()), + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()) + ); + + /** + * Translator object for escaping HTML version 4.0. + * + * While {@link #escapeHtml4(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + public static final CharSequenceTranslator ESCAPE_HTML4 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE()), + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()), + new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE()) + ); + + /** + * Translator object for escaping individual Comma Separated Values. + * + * While {@link #escapeCsv(String)} is the expected method of use, this + * object allows the CSV escaping functionality to be used + * as the foundation for a custom translator. 
+ * + * @since 3.0 + */ + public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper(); + + // TODO: Create a parent class - 'SinglePassTranslator' ? + // It would handle the index checking + length returning, + // and could also have an optimization check method. + static class CsvEscaper extends CharSequenceTranslator { + + private static final char CSV_DELIMITER = ','; + private static final char CSV_QUOTE = '"'; + private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE); + private static final char[] CSV_SEARCH_CHARS = + new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF}; + + @Override + public int translate(final CharSequence input, final int index, final Writer out) throws IOException { + + if(index != 0) { + throw new IllegalStateException("CsvEscaper should never reach the [1] index"); + } + + if (StringUtils.containsNone(input.toString(), CSV_SEARCH_CHARS)) { + out.write(input.toString()); + } else { + out.write(CSV_QUOTE); + out.write(StringUtils.replace(input.toString(), CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR)); + out.write(CSV_QUOTE); + } + return Character.codePointCount(input, 0, input.length()); + } + } + + /* UNESCAPE TRANSLATORS */ + + /** + * Translator object for unescaping escaped Java. + * + * While {@link #unescapeJava(String)} is the expected method of use, this + * object allows the Java unescaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)? 
+ public static final CharSequenceTranslator UNESCAPE_JAVA = + new AggregateTranslator( + new OctalUnescaper(), // .between('\1', '\377'), + new UnicodeUnescaper(), + new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()), + new LookupTranslator( + new String[][] { + {"\\\\", "\\"}, + {"\\\"", "\""}, + {"\\'", "'"}, + {"\\", ""} + }) + ); + + /** + * Translator object for unescaping escaped EcmaScript. + * + * While {@link #unescapeEcmaScript(String)} is the expected method of use, this + * object allows the EcmaScript unescaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA; + + /** + * Translator object for unescaping escaped Json. + * + * While {@link #unescapeJson(String)} is the expected method of use, this + * object allows the Json unescaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.2 + */ + public static final CharSequenceTranslator UNESCAPE_JSON = UNESCAPE_JAVA; + + /** + * Translator object for unescaping escaped HTML 3.0. + * + * While {@link #unescapeHtml3(String)} is the expected method of use, this + * object allows the HTML unescaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + public static final CharSequenceTranslator UNESCAPE_HTML3 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE()), + new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), + new NumericEntityUnescaper() + ); + + /** + * Translator object for unescaping escaped HTML 4.0. + * + * While {@link #unescapeHtml4(String)} is the expected method of use, this + * object allows the HTML unescaping functionality to be used + * as the foundation for a custom translator. 
+ * + * @since 3.0 + */ + public static final CharSequenceTranslator UNESCAPE_HTML4 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE()), + new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), + new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()), + new NumericEntityUnescaper() + ); + + /** + * Translator object for unescaping escaped XML. + * + * While {@link #unescapeXml(String)} is the expected method of use, this + * object allows the XML unescaping functionality to be used + * as the foundation for a custom translator. + * + * @since 3.0 + */ + public static final CharSequenceTranslator UNESCAPE_XML = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE()), + new LookupTranslator(EntityArrays.APOS_UNESCAPE()), + new NumericEntityUnescaper() + ); + + /** + * Translator object for unescaping escaped Comma Separated Value entries. + * + * While {@link #unescapeCsv(String)} is the expected method of use, this + * object allows the CSV unescaping functionality to be used + * as the foundation for a custom translator. 
+ * + * @since 3.0 + */ + public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper(); + + static class CsvUnescaper extends CharSequenceTranslator { + + private static final char CSV_DELIMITER = ','; + private static final char CSV_QUOTE = '"'; + private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE); + private static final char[] CSV_SEARCH_CHARS = + new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF}; + + @Override + public int translate(final CharSequence input, final int index, final Writer out) throws IOException { + + if(index != 0) { + throw new IllegalStateException("CsvUnescaper should never reach the [1] index"); + } + + if ( input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE ) { + out.write(input.toString()); + return Character.codePointCount(input, 0, input.length()); + } + + // strip quotes + final String quoteless = input.subSequence(1, input.length() - 1).toString(); + + if ( StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS) ) { + // deal with escaped quotes; ie) "" + out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR)); + } else { + out.write(input.toString()); + } + return Character.codePointCount(input, 0, input.length()); + } + } + + /* Helper functions */ + + /** + * <p>{@code StringEscapeUtils} instances should NOT be constructed in + * standard programming.</p> + * + * <p>Instead, the class should be used as:</p> + * <pre>StringEscapeUtils.escapeJava("foo");</pre> + * + * <p>This constructor is public to permit tools that require a JavaBean + * instance to operate.</p> + */ + public StringEscapeUtils() { + super(); + } + + // Java and JavaScript + //-------------------------------------------------------------------------- + /** + * <p>Escapes the characters in a {@code String} using Java String rules.</p> + * + * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) 
</p> + * + * <p>So a tab becomes the characters {@code '\\'} and + * {@code 't'}.</p> + * + * <p>The only difference between Java strings and JavaScript strings + * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" + * output string: He didn't say, \"Stop!\" + * </pre> + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + */ + public static final String escapeJava(final String input) { + return ESCAPE_JAVA.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p> + * <p>Escapes any values it finds into their EcmaScript String form. + * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> + * + * <p>So a tab becomes the characters {@code '\\'} and + * {@code 't'}.</p> + * + * <p>The only difference between Java strings and EcmaScript strings + * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p> + * + * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" + * output string: He didn\'t say, \"Stop!\" + * </pre> + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + * + * @since 3.0 + */ + public static final String escapeEcmaScript(final String input) { + return ESCAPE_ECMASCRIPT.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using Json String rules.</p> + * <p>Escapes any values it finds into their Json String form. + * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) 
</p> + * + * <p>So a tab becomes the characters {@code '\\'} and + * {@code 't'}.</p> + * + * <p>The only difference between Java strings and Json strings + * is that in Json, forward-slash (/) is escaped.</p> + * + * <p>See http://www.ietf.org/rfc/rfc4627.txt for further details. </p> + * + * <p>Example:</p> + * <pre> + * input string: He didn't say, "Stop!" + * output string: He didn't say, \"Stop!\" + * </pre> + * + * @param input String to escape values in, may be null + * @return String with escaped values, {@code null} if null string input + * + * @since 3.2 + */ + public static final String escapeJson(final String input) { + return ESCAPE_JSON.translate(input); + } + + /** + * <p>Unescapes any Java literals found in the {@code String}. + * For example, it will turn a sequence of {@code '\'} and + * {@code 'n'} into a newline character, unless the {@code '\'} + * is preceded by another {@code '\'}.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + */ + public static final String unescapeJava(final String input) { + return UNESCAPE_JAVA.translate(input); + } + + /** + * <p>Unescapes any EcmaScript literals found in the {@code String}.</p> + * + * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'} + * into a newline character, unless the {@code '\'} is preceded by another + * {@code '\'}.</p> + * + * @see #unescapeJava(String) + * @param input the {@code String} to unescape, may be null + * @return A new unescaped {@code String}, {@code null} if null string input + * + * @since 3.0 + */ + public static final String unescapeEcmaScript(final String input) { + return UNESCAPE_ECMASCRIPT.translate(input); + } + + /** + * <p>Unescapes any Json literals found in the {@code String}.</p> + * + * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'} + * into a newline character, unless the {@code '\'} is preceded by another + * {@code 
'\'}.</p> + * + * @see #unescapeJava(String) + * @param input the {@code String} to unescape, may be null + * @return A new unescaped {@code String}, {@code null} if null string input + * + * @since 3.2 + */ + public static final String unescapeJson(final String input) { + return UNESCAPE_JSON.translate(input); + } + + // HTML and XML + //-------------------------------------------------------------------------- + /** + * <p>Escapes the characters in a {@code String} using HTML entities.</p> + * + * <p> + * For example: + * </p> + * <p><code>"bread" & "butter"</code></p> + * becomes: + * <p> + * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>. + * </p> + * + * <p>Supports all known HTML 4.0 entities, including funky accents. + * Note that the commonly used apostrophe escape character (&apos;) + * is not a legal entity and so is not supported). </p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * + * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> + * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> + * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> + * + * @since 3.0 + */ + public static final String escapeHtml4(final String input) { + return ESCAPE_HTML4.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using HTML entities.</p> + * <p>Supports only the HTML 3.0 entities. 
</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * + * @since 3.0 + */ + public static final String escapeHtml3(final String input) { + return ESCAPE_HTML3.translate(input); + } + + //----------------------------------------------------------------------- + /** + * <p>Unescapes a string containing entity escapes to a string + * containing the actual Unicode characters corresponding to the + * escapes. Supports HTML 4.0 entities.</p> + * + * <p>For example, the string {@code "&lt;Fran&ccedil;ais&gt;"} + * will become {@code "<Français>"}</p> + * + * <p>If an entity is unrecognized, it is left alone, and inserted + * verbatim into the result string. e.g. {@code "&gt;&zzzz;x"} will + * become {@code ">&zzzz;x"}.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + * + * @since 3.0 + */ + public static final String unescapeHtml4(final String input) { + return UNESCAPE_HTML4.translate(input); + } + + /** + * <p>Unescapes a string containing entity escapes to a string + * containing the actual Unicode characters corresponding to the + * escapes. Supports only HTML 3.0 entities.</p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + * + * @since 3.0 + */ + public static final String unescapeHtml3(final String input) { + return UNESCAPE_HTML3.translate(input); + } + + //----------------------------------------------------------------------- + /** + * <p>Escapes the characters in a {@code String} using XML entities.</p> + * + * <p>For example: {@code "bread" & "butter"} => + * {@code &quot;bread&quot; &amp; &quot;butter&quot;}. + * </p> + * + * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
+ * Does not support DTDs or external entities.</p> + * + * <p>Note that Unicode characters greater than 0x7f are as of 3.0, no longer + * escaped. If you still wish this functionality, you can achieve it + * via the following: + * {@code StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );}</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * @see #unescapeXml(java.lang.String) + * @deprecated use {@link #escapeXml10(java.lang.String)} or {@link #escapeXml11(java.lang.String)} instead. + */ + @Deprecated + public static final String escapeXml(final String input) { + return ESCAPE_XML.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using XML entities.</p> + * + * <p>For example: {@code "bread" & "butter"} => + * {@code &quot;bread&quot; &amp; &quot;butter&quot;}. + * </p> + * + * <p>Note that XML 1.0 is a text-only format: it cannot represent control + * characters or unpaired Unicode surrogate codepoints, even after escaping. + * {@code escapeXml10} will remove characters that do not fit in the + * following ranges:</p> + * + * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p> + * + * <p>Though not strictly necessary, {@code escapeXml10} will escape + * characters in the following ranges:</p> + * + * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p> + * + * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1 + * document.
If you want to allow more non-text characters in an XML 1.1 + * document, use {@link #escapeXml11(String)}.</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * @see #unescapeXml(java.lang.String) + * @since 3.3 + */ + public static String escapeXml10(final String input) { + return ESCAPE_XML10.translate(input); + } + + /** + * <p>Escapes the characters in a {@code String} using XML entities.</p> + * + * <p>For example: {@code "bread" & "butter"} => + * {@code &quot;bread&quot; &amp; &quot;butter&quot;}. + * </p> + * + * <p>XML 1.1 can represent certain control characters, but it cannot represent + * the null byte or unpaired Unicode surrogate codepoints, even after escaping. + * {@code escapeXml11} will remove characters that do not fit in the following + * ranges:</p> + * + * <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p> + * + * <p>{@code escapeXml11} will escape characters in the following ranges:</p> + * + * <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p> + * + * <p>The returned string can be inserted into a valid XML 1.1 document. Do not + * use it for XML 1.0 documents.</p> + * + * @param input the {@code String} to escape, may be null + * @return a new escaped {@code String}, {@code null} if null string input + * @see #unescapeXml(java.lang.String) + * @since 3.3 + */ + public static String escapeXml11(final String input) { + return ESCAPE_XML11.translate(input); + } + + //----------------------------------------------------------------------- + /** + * <p>Unescapes a string containing XML entity escapes to a string + * containing the actual Unicode characters corresponding to the + * escapes.</p> + * + * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos). + * Does not support DTDs or external entities.</p> + * + * <p>Note that numerical \\u Unicode codes are unescaped to their respective + * Unicode characters.
This may change in future releases. </p> + * + * @param input the {@code String} to unescape, may be null + * @return a new unescaped {@code String}, {@code null} if null string input + * @see #escapeXml(String) + * @see #escapeXml10(String) + * @see #escapeXml11(String) + */ + public static final String unescapeXml(final String input) { + return UNESCAPE_XML.translate(input); + } + + //----------------------------------------------------------------------- + + /** + * <p>Returns a {@code String} value for a CSV column enclosed in double quotes, + * if required.</p> + * + * <p>If the value contains a comma, newline or double quote, then the + * String value is returned enclosed in double quotes.</p> + * + * <p>Any double quote characters in the value are escaped with another double quote.</p> + * + * <p>If the value does not contain a comma, newline or double quote, then the + * String value is returned unchanged.</p> + * + * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and + * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>. + * + * @param input the input CSV column String, may be null + * @return the input String, enclosed in double quotes if the value contains a comma, + * newline or double quote, {@code null} if null string input + * @since 2.4 + */ + public static final String escapeCsv(final String input) { + return ESCAPE_CSV.translate(input); + } + + /** + * <p>Returns a {@code String} value for an unescaped CSV column. </p> + * + * <p>If the value is enclosed in double quotes, and contains a comma, newline + * or double quote, then quotes are removed. + * </p> + * + * <p>Any double quote escaped characters (a pair of double quotes) are unescaped + * to just one double quote. 
</p> + * + * <p>If the value is not enclosed in double quotes, or is and does not contain a + * comma, newline or double quote, then the String value is returned unchanged.</p> + * + * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and + * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>. + * + * @param input the input CSV column String, may be null + * @return the input String, with enclosing double quotes removed and embedded double + * quotes unescaped, {@code null} if null string input + * @since 2.4 + */ + public static final String unescapeCsv(final String input) { + return UNESCAPE_CSV.translate(input); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/commons-text/blob/d8f547e8/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java new file mode 100644 index 0000000..c86f769 --- /dev/null +++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java @@ -0,0 +1,621 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.StringWriter; +import java.lang.reflect.Constructor; +import java.lang.reflect.Modifier; +import java.nio.charset.Charset; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.text.translate.CharSequenceTranslator; +import org.apache.commons.text.translate.NumericEntityEscaper; +import org.junit.Test; + +/** + * Unit tests for {@link StringEscapeUtils}. + * + * <p> + * This code has been adapted from Apache Commons Lang 3.5. + * </p> + * + */ +public class StringEscapeUtilsTest { + private final static String FOO = "foo"; + + @Test + public void testConstructor() { + assertNotNull(new StringEscapeUtils()); + final Constructor<?>[] cons = StringEscapeUtils.class.getDeclaredConstructors(); + assertEquals(1, cons.length); + assertTrue(Modifier.isPublic(cons[0].getModifiers())); + assertTrue(Modifier.isPublic(StringEscapeUtils.class.getModifiers())); + assertFalse(Modifier.isFinal(StringEscapeUtils.class.getModifiers())); + } + + @Test + public void testEscapeJava() throws IOException { + assertEquals(null, StringEscapeUtils.escapeJava(null)); + try { + StringEscapeUtils.ESCAPE_JAVA.translate(null, null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + try { + StringEscapeUtils.ESCAPE_JAVA.translate("", null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + + assertEscapeJava("empty string", "", ""); + assertEscapeJava(FOO, FOO); + assertEscapeJava("tab", "\\t", "\t"); + assertEscapeJava("backslash", "\\\\", "\\"); + assertEscapeJava("single quote should not be escaped", "'", "'"); + 
assertEscapeJava("\\\\\\b\\t\\r", "\\\b\t\r"); + assertEscapeJava("\\u1234", "\u1234"); + assertEscapeJava("\\u0234", "\u0234"); + assertEscapeJava("\\u00EF", "\u00ef"); + assertEscapeJava("\\u0001", "\u0001"); + assertEscapeJava("Should use capitalized Unicode hex", "\\uABCD", "\uabcd"); + + assertEscapeJava("He didn't say, \\\"stop!\\\"", + "He didn't say, \"stop!\""); + assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00A0", + "This space is non-breaking:\u00a0"); + assertEscapeJava("\\uABCD\\u1234\\u012C", + "\uABCD\u1234\u012C"); + } + + /** + * Tests https://issues.apache.org/jira/browse/LANG-421 + */ + @Test + public void testEscapeJavaWithSlash() { + final String input = "String with a slash (/) in it"; + + final String expected = input; + final String actual = StringEscapeUtils.escapeJava(input); + + /** + * In 2.4 StringEscapeUtils.escapeJava(String) escapes '/' characters, which are not a valid character to escape + * in a Java string. + */ + assertEquals(expected, actual); + } + + private void assertEscapeJava(final String escaped, final String original) throws IOException { + assertEscapeJava(null, escaped, original); + } + + private void assertEscapeJava(String message, final String expected, final String original) throws IOException { + final String converted = StringEscapeUtils.escapeJava(original); + message = "escapeJava(String) failed" + (message == null ? 
"" : (": " + message)); + assertEquals(message, expected, converted); + + final StringWriter writer = new StringWriter(); + StringEscapeUtils.ESCAPE_JAVA.translate(original, writer); + assertEquals(expected, writer.toString()); + } + + @Test + public void testUnescapeJava() throws IOException { + assertEquals(null, StringEscapeUtils.unescapeJava(null)); + try { + StringEscapeUtils.UNESCAPE_JAVA.translate(null, null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + try { + StringEscapeUtils.UNESCAPE_JAVA.translate("", null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + try { + StringEscapeUtils.unescapeJava("\\u02-3"); + fail(); + } catch (final RuntimeException ex) { + } + + assertUnescapeJava("", ""); + assertUnescapeJava("test", "test"); + assertUnescapeJava("\ntest\b", "\\ntest\\b"); + assertUnescapeJava("\u123425foo\ntest\b", "\\u123425foo\\ntest\\b"); + assertUnescapeJava("'\foo\teste\r", "\\'\\foo\\teste\\r"); + assertUnescapeJava("", "\\"); + //foo + assertUnescapeJava("lowercase Unicode", "\uABCDx", "\\uabcdx"); + assertUnescapeJava("uppercase Unicode", "\uABCDx", "\\uABCDx"); + assertUnescapeJava("Unicode as final character", "\uABCD", "\\uabcd"); + } + + private void assertUnescapeJava(final String unescaped, final String original) throws IOException { + assertUnescapeJava(null, unescaped, original); + } + + private void assertUnescapeJava(final String message, final String unescaped, final String original) throws IOException { + final String expected = unescaped; + final String actual = StringEscapeUtils.unescapeJava(original); + + assertEquals("unescape(String) failed" + + (message == null ? 
"" : (": " + message)) + + ": expected '" + StringEscapeUtils.escapeJava(expected) + + // we escape this so we can see it in the error message + "' actual '" + StringEscapeUtils.escapeJava(actual) + "'", + expected, actual); + + final StringWriter writer = new StringWriter(); + StringEscapeUtils.UNESCAPE_JAVA.translate(original, writer); + assertEquals(unescaped, writer.toString()); + + } + + @Test + public void testEscapeEcmaScript() { + assertEquals(null, StringEscapeUtils.escapeEcmaScript(null)); + try { + StringEscapeUtils.ESCAPE_ECMASCRIPT.translate(null, null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + try { + StringEscapeUtils.ESCAPE_ECMASCRIPT.translate("", null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + + assertEquals("He didn\\'t say, \\\"stop!\\\"", StringEscapeUtils.escapeEcmaScript("He didn't say, \"stop!\"")); + assertEquals("document.getElementById(\\\"test\\\").value = \\'<script>alert(\\'aaa\\');<\\/script>\\';", + StringEscapeUtils.escapeEcmaScript("document.getElementById(\"test\").value = '<script>alert('aaa');</script>';")); + } + + + // HTML and XML + //-------------------------------------------------------------- + + private static final String[][] HTML_ESCAPES = { + {"no escaping", "plain text", "plain text"}, + {"no escaping", "plain text", "plain text"}, + {"empty string", "", ""}, + {"null", null, null}, + {"ampersand", "bread &amp; butter", "bread & butter"}, + {"quotes", "&quot;bread&quot; &amp; butter", "\"bread\" & butter"}, + {"final character only", "greater than &gt;", "greater than >"}, + {"first character only", "&lt; less than", "< less than"}, + {"apostrophe", "Huntington's chorea", "Huntington's chorea"}, + {"languages", "English,Fran&ccedil;ais,\u65E5\u672C\u8A9E (nihongo)", "English,Fran\u00E7ais,\u65E5\u672C\u8A9E (nihongo)"}, + {"8-bit ascii shouldn't number-escape", "\u0080\u009F", "\u0080\u009F"}, + }; + + @Test + 
public void testEscapeHtml() { + for (final String[] element : HTML_ESCAPES) { + final String message = element[0]; + final String expected = element[1]; + final String original = element[2]; + assertEquals(message, expected, StringEscapeUtils.escapeHtml4(original)); + final StringWriter sw = new StringWriter(); + try { + StringEscapeUtils.ESCAPE_HTML4.translate(original, sw); + } catch (final IOException e) { + } + final String actual = original == null ? null : sw.toString(); + assertEquals(message, expected, actual); + } + } + + @Test + public void testUnescapeHtml4() { + for (final String[] element : HTML_ESCAPES) { + final String message = element[0]; + final String expected = element[2]; + final String original = element[1]; + assertEquals(message, expected, StringEscapeUtils.unescapeHtml4(original)); + + final StringWriter sw = new StringWriter(); + try { + StringEscapeUtils.UNESCAPE_HTML4.translate(original, sw); + } catch (final IOException e) { + } + final String actual = original == null ? 
null : sw.toString(); + assertEquals(message, expected, actual); + } + // \u00E7 is a cedilla (c with wiggle under) + // note that the test string must be 7-bit-clean (Unicode escaped) or else it will compile incorrectly + // on some locales + assertEquals("funny chars pass through OK", "Fran\u00E7ais", StringEscapeUtils.unescapeHtml4("Fran\u00E7ais")); + + assertEquals("Hello&;World", StringEscapeUtils.unescapeHtml4("Hello&;World")); + assertEquals("Hello&#;World", StringEscapeUtils.unescapeHtml4("Hello&#;World")); + assertEquals("Hello&# ;World", StringEscapeUtils.unescapeHtml4("Hello&# ;World")); + assertEquals("Hello&##;World", StringEscapeUtils.unescapeHtml4("Hello&##;World")); + } + + @Test + public void testUnescapeHexCharsHtml() { + // Simple easy to grok test + assertEquals("hex number unescape", "\u0080\u009F", StringEscapeUtils.unescapeHtml4("&#x80;&#x9F;")); + assertEquals("hex number unescape", "\u0080\u009F", StringEscapeUtils.unescapeHtml4("&#X80;&#X9F;")); + // Test all Character values: + for (char i = Character.MIN_VALUE; i < Character.MAX_VALUE; i++) { + final Character c1 = new Character(i); + final Character c2 = new Character((char)(i+1)); + final String expected = c1.toString() + c2.toString(); + final String escapedC1 = "&#x" + Integer.toHexString((c1.charValue())) + ";"; + final String escapedC2 = "&#x" + Integer.toHexString((c2.charValue())) + ";"; + assertEquals("hex number unescape index " + (int)i, expected, StringEscapeUtils.unescapeHtml4(escapedC1 + escapedC2)); + } + } + + @Test + public void testUnescapeUnknownEntity() throws Exception { + assertEquals("&zzzz;", StringEscapeUtils.unescapeHtml4("&zzzz;")); + } + + @Test + public void testEscapeHtmlVersions() throws Exception { + assertEquals("&Beta;", StringEscapeUtils.escapeHtml4("\u0392")); + assertEquals("\u0392", StringEscapeUtils.unescapeHtml4("&Beta;")); + + // TODO: refine API for escaping/unescaping specific HTML versions + } + + @Test + @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced 
by ESCAPE_XML10 and ESCAPE_XML11 in 3.3 + public void testEscapeXml() throws Exception { + assertEquals("&lt;abc&gt;", StringEscapeUtils.escapeXml("<abc>")); + assertEquals("<abc>", StringEscapeUtils.unescapeXml("&lt;abc&gt;")); + + assertEquals("XML should not escape >0x7f values", + "\u00A1", StringEscapeUtils.escapeXml("\u00A1")); + assertEquals("XML should be able to unescape >0x7f values", + "\u00A0", StringEscapeUtils.unescapeXml("&#160;")); + assertEquals("XML should be able to unescape >0x7f values with one leading 0", + "\u00A0", StringEscapeUtils.unescapeXml("&#0160;")); + assertEquals("XML should be able to unescape >0x7f values with two leading 0s", + "\u00A0", StringEscapeUtils.unescapeXml("&#00160;")); + assertEquals("XML should be able to unescape >0x7f values with three leading 0s", + "\u00A0", StringEscapeUtils.unescapeXml("&#000160;")); + + assertEquals("ain't", StringEscapeUtils.unescapeXml("ain&apos;t")); + assertEquals("ain&apos;t", StringEscapeUtils.escapeXml("ain't")); + assertEquals("", StringEscapeUtils.escapeXml("")); + assertEquals(null, StringEscapeUtils.escapeXml(null)); + assertEquals(null, StringEscapeUtils.unescapeXml(null)); + + StringWriter sw = new StringWriter(); + try { + StringEscapeUtils.ESCAPE_XML.translate("<abc>", sw); + } catch (final IOException e) { + } + assertEquals("XML was escaped incorrectly", "&lt;abc&gt;", sw.toString() ); + + sw = new StringWriter(); + try { + StringEscapeUtils.UNESCAPE_XML.translate("&lt;abc&gt;", sw); + } catch (final IOException e) { + } + assertEquals("XML was unescaped incorrectly", "<abc>", sw.toString() ); + } + + @Test + public void testEscapeXml10() throws Exception { + assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml10("a<b>c\"d'e&f")); + assertEquals("XML 1.0 should not escape \t \n \r", + "a\tb\rc\nd", StringEscapeUtils.escapeXml10("a\tb\rc\nd")); + assertEquals("XML 1.0 should omit most #x0-x8 | #xb | #xc | #xe-#x19", + "ab", StringEscapeUtils.escapeXml10("a\u0000\u0001\u0008\u000b\u000c\u000e\u001fb")); + assertEquals("XML 1.0 should 
omit #xd800-#xdfff", + "a\ud7ff  \ue000b", StringEscapeUtils.escapeXml10("a\ud7ff\ud800 \udfff \ue000b")); + assertEquals("XML 1.0 should omit #xfffe | #xffff", + "a\ufffdb", StringEscapeUtils.escapeXml10("a\ufffd\ufffe\uffffb")); + assertEquals("XML 1.0 should escape #x7f-#x84 | #x86 - #x9f, for XML 1.1 compatibility", + "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml10("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b")); + } + + @Test + public void testEscapeXml11() throws Exception { + assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml11("a<b>c\"d'e&f")); + assertEquals("XML 1.1 should not escape \t \n \r", + "a\tb\rc\nd", StringEscapeUtils.escapeXml11("a\tb\rc\nd")); + assertEquals("XML 1.1 should omit #x0", + "ab", StringEscapeUtils.escapeXml11("a\u0000b")); + assertEquals("XML 1.1 should escape #x1-x8 | #xb | #xc | #xe-#x19", + "a&#1;&#8;&#11;&#12;&#14;&#31;b", StringEscapeUtils.escapeXml11("a\u0001\u0008\u000b\u000c\u000e\u001fb")); + assertEquals("XML 1.1 should escape #x7F-#x84 | #x86-#x9F", + "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml11("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b")); + assertEquals("XML 1.1 should omit #xd800-#xdfff", + "a\ud7ff  \ue000b", StringEscapeUtils.escapeXml11("a\ud7ff\ud800 \udfff \ue000b")); + assertEquals("XML 1.1 should omit #xfffe | #xffff", + "a\ufffdb", StringEscapeUtils.escapeXml11("a\ufffd\ufffe\uffffb")); + } + + /** + * Tests Supplementary characters. + * <p> + * From http://www.w3.org/International/questions/qa-escapes + * </p> + * <blockquote> + * Supplementary characters are those Unicode characters that have code points higher than the characters in + * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate code points from the + * BMP. Because of this, some people think that supplementary characters need to be represented using two escapes, but this is incorrect + * - you must use the single, code point value for that character. 
For example, use &amp;#x233B4; rather than + * &amp;#xD84C;&amp;#xDFB4;. + * </blockquote> + * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a> + * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a> + */ + @Test + @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3 + public void testEscapeXmlSupplementaryCharacters() { + final CharSequenceTranslator escapeXml = + StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) ); + + assertEquals("Supplementary character must be represented using a single escape", "&#144308;", + escapeXml.translate("\uD84C\uDFB4")); + + assertEquals("Supplementary characters mixed with basic characters should be encoded correctly", "a b c &#144308;", + escapeXml.translate("a b c \uD84C\uDFB4")); + } + + @Test + @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3 + public void testEscapeXmlAllCharacters() { + // http://www.w3.org/TR/xml/#charsets says: + // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, + // excluding the surrogate blocks, FFFE, and FFFF. */ + final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML + .with(NumericEntityEscaper.below(9), NumericEntityEscaper.between(0xB, 0xC), NumericEntityEscaper.between(0xE, 0x19), + NumericEntityEscaper.between(0xD800, 0xDFFF), NumericEntityEscaper.between(0xFFFE, 0xFFFF), NumericEntityEscaper.above(0x110000)); + + assertEquals("&#0;&#1;&#2;&#3;&#4;&#5;&#6;&#7;&#8;", escapeXml.translate("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008")); + assertEquals("\t", escapeXml.translate("\t")); // 0x9 + assertEquals("\n", escapeXml.translate("\n")); // 0xA + assertEquals("&#11;&#12;", escapeXml.translate("\u000B\u000C")); + assertEquals("\r", escapeXml.translate("\r")); // 0xD + assertEquals("Hello World! Ain&apos;t this great?", escapeXml.translate("Hello World! 
Ain't this great?")); + assertEquals("&#14;&#15;&#24;&#25;", escapeXml.translate("\u000E\u000F\u0018\u0019")); + } + + /** + * Reverse of the above. + * + * @see <a href="https://issues.apache.org/jira/browse/LANG-729">LANG-729</a> + */ + @Test + public void testUnescapeXmlSupplementaryCharacters() { + assertEquals("Supplementary character must be represented using a single escape", "\uD84C\uDFB4", + StringEscapeUtils.unescapeXml("&#144308;") ); + + assertEquals("Supplementary characters mixed with basic characters should be decoded correctly", "a b c \uD84C\uDFB4", + StringEscapeUtils.unescapeXml("a b c &#144308;") ); + } + + // Tests issue #38569 + // http://issues.apache.org/bugzilla/show_bug.cgi?id=38569 + @Test + public void testStandaloneAmphersand() { + assertEquals("<P&O>", StringEscapeUtils.unescapeHtml4("&lt;P&O&gt;")); + assertEquals("test & <", StringEscapeUtils.unescapeHtml4("test & &lt;")); + assertEquals("<P&O>", StringEscapeUtils.unescapeXml("&lt;P&O&gt;")); + assertEquals("test & <", StringEscapeUtils.unescapeXml("test & &lt;")); + } + + @Test + public void testLang313() { + assertEquals("& &", StringEscapeUtils.unescapeHtml4("& &amp;")); + } + + @Test + public void testEscapeCsvString() throws Exception { + assertEquals("foo.bar", StringEscapeUtils.escapeCsv("foo.bar")); + assertEquals("\"foo,bar\"", StringEscapeUtils.escapeCsv("foo,bar")); + assertEquals("\"foo\nbar\"", StringEscapeUtils.escapeCsv("foo\nbar")); + assertEquals("\"foo\rbar\"", StringEscapeUtils.escapeCsv("foo\rbar")); + assertEquals("\"foo\"\"bar\"", StringEscapeUtils.escapeCsv("foo\"bar")); + assertEquals("foo\uD84C\uDFB4bar", StringEscapeUtils.escapeCsv("foo\uD84C\uDFB4bar")); + assertEquals("", StringEscapeUtils.escapeCsv("")); + assertEquals(null, StringEscapeUtils.escapeCsv(null)); + } + + @Test + public void testEscapeCsvWriter() throws Exception { + checkCsvEscapeWriter("foo.bar", "foo.bar"); + checkCsvEscapeWriter("\"foo,bar\"", "foo,bar"); + checkCsvEscapeWriter("\"foo\nbar\"", "foo\nbar"); + checkCsvEscapeWriter("\"foo\rbar\"", 
"foo\rbar"); + checkCsvEscapeWriter("\"foo\"\"bar\"", "foo\"bar"); + checkCsvEscapeWriter("foo\uD84C\uDFB4bar", "foo\uD84C\uDFB4bar"); + checkCsvEscapeWriter("", null); + checkCsvEscapeWriter("", ""); + } + + private void checkCsvEscapeWriter(final String expected, final String value) { + try { + final StringWriter writer = new StringWriter(); + StringEscapeUtils.ESCAPE_CSV.translate(value, writer); + assertEquals(expected, writer.toString()); + } catch (final IOException e) { + fail("Threw: " + e); + } + } + + @Test + public void testUnescapeCsvString() throws Exception { + assertEquals("foo.bar", StringEscapeUtils.unescapeCsv("foo.bar")); + assertEquals("foo,bar", StringEscapeUtils.unescapeCsv("\"foo,bar\"")); + assertEquals("foo\nbar", StringEscapeUtils.unescapeCsv("\"foo\nbar\"")); + assertEquals("foo\rbar", StringEscapeUtils.unescapeCsv("\"foo\rbar\"")); + assertEquals("foo\"bar", StringEscapeUtils.unescapeCsv("\"foo\"\"bar\"")); + assertEquals("foo\uD84C\uDFB4bar", StringEscapeUtils.unescapeCsv("foo\uD84C\uDFB4bar")); + assertEquals("", StringEscapeUtils.unescapeCsv("")); + assertEquals(null, StringEscapeUtils.unescapeCsv(null)); + + assertEquals("\"foo.bar\"", StringEscapeUtils.unescapeCsv("\"foo.bar\"")); + } + + @Test + public void testUnescapeCsvWriter() throws Exception { + checkCsvUnescapeWriter("foo.bar", "foo.bar"); + checkCsvUnescapeWriter("foo,bar", "\"foo,bar\""); + checkCsvUnescapeWriter("foo\nbar", "\"foo\nbar\""); + checkCsvUnescapeWriter("foo\rbar", "\"foo\rbar\""); + checkCsvUnescapeWriter("foo\"bar", "\"foo\"\"bar\""); + checkCsvUnescapeWriter("foo\uD84C\uDFB4bar", "foo\uD84C\uDFB4bar"); + checkCsvUnescapeWriter("", null); + checkCsvUnescapeWriter("", ""); + + checkCsvUnescapeWriter("\"foo.bar\"", "\"foo.bar\""); + } + + private void checkCsvUnescapeWriter(final String expected, final String value) { + try { + final StringWriter writer = new StringWriter(); + StringEscapeUtils.UNESCAPE_CSV.translate(value, writer); + assertEquals(expected, 
writer.toString()); + } catch (final IOException e) { + fail("Threw: " + e); + } + } + + /** + * Tests // https://issues.apache.org/jira/browse/LANG-480 + */ + @Test + public void testEscapeHtmlHighUnicode() { + // this is the utf8 representation of the character: + // COUNTING ROD UNIT DIGIT THREE + // in Unicode + // codepoint: U+1D362 + final byte[] data = new byte[] { (byte)0xF0, (byte)0x9D, (byte)0x8D, (byte)0xA2 }; + + final String original = new String(data, Charset.forName("UTF8")); + + final String escaped = StringEscapeUtils.escapeHtml4( original ); + assertEquals( "High Unicode should not have been escaped", original, escaped); + + final String unescaped = StringEscapeUtils.unescapeHtml4( escaped ); + assertEquals( "High Unicode should have been unchanged", original, unescaped); + + // TODO: I think this should hold, needs further investigation + // String unescapedFromEntity = StringEscapeUtils.unescapeHtml4( "&#119650;" ); + // assertEquals( "High Unicode should have been unescaped", original, unescapedFromEntity); + } + + /** + * Tests https://issues.apache.org/jira/browse/LANG-339 + */ + @Test + public void testEscapeHiragana() { + // Some random Japanese Unicode characters + final String original = "\u304B\u304C\u3068"; + final String escaped = StringEscapeUtils.escapeHtml4(original); + assertEquals( "Hiragana character Unicode behaviour should not be being escaped by escapeHtml4", + original, escaped); + + final String unescaped = StringEscapeUtils.unescapeHtml4( escaped ); + + assertEquals( "Hiragana character Unicode behaviour has changed - expected no unescaping", escaped, unescaped); + } + + /** + * Tests https://issues.apache.org/jira/browse/LANG-708 + * + * @throws IOException + * if an I/O error occurs + */ + @Test + public void testLang708() throws IOException { + final FileInputStream fis = new FileInputStream("src/test/resources/lang-708-input.txt"); + final String input = IOUtils.toString(fis, "UTF-8"); + final String escaped = 
StringEscapeUtils.escapeEcmaScript(input); + // just the end: + assertTrue(escaped, escaped.endsWith("}]")); + // a little more: + assertTrue(escaped, escaped.endsWith("\"valueCode\\\":\\\"\\\"}]")); + fis.close(); + } + + /** + * Tests https://issues.apache.org/jira/browse/LANG-720 + */ + @Test + @SuppressWarnings( "deprecation" ) // escapeXml(String) has been replaced by escapeXml10(String) and escapeXml11(String) in 3.3 + public void testLang720() { + final String input = "\ud842\udfb7" + "A"; + final String escaped = StringEscapeUtils.escapeXml(input); + assertEquals(input, escaped); + } + + /** + * Tests https://issues.apache.org/jira/browse/LANG-911 + */ + @Test + public void testLang911() { + final String bellsTest = "\ud83d\udc80\ud83d\udd14"; + final String value = StringEscapeUtils.escapeJava(bellsTest); + final String valueTest = StringEscapeUtils.unescapeJava(value); + assertEquals(bellsTest, valueTest); + } + + @Test + public void testEscapeJson() { + assertEquals(null, StringEscapeUtils.escapeJson(null)); + try { + StringEscapeUtils.ESCAPE_JSON.translate(null, null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + try { + StringEscapeUtils.ESCAPE_JSON.translate("", null); + fail(); + } catch (final IOException ex) { + fail(); + } catch (final IllegalArgumentException ex) { + } + + assertEquals("He didn't say, \\\"stop!\\\"", StringEscapeUtils.escapeJson("He didn't say, \"stop!\"")); + + final String expected = "\\\"foo\\\" isn't \\\"bar\\\". specials: \\b\\r\\n\\f\\t\\\\\\/"; + final String input ="\"foo\" isn't \"bar\". specials: \b\r\n\f\t\\/"; + + assertEquals(expected, StringEscapeUtils.escapeJson(input)); + } + +} \ No newline at end of file