This is an automated email from the ASF dual-hosted git repository. michaelo pushed a commit to branch DOXIA-700 in repository https://gitbox.apache.org/repos/asf/maven-doxia.git
commit 7c49be21a3abd8b9c57b19dddfa052c0513a0c9e Author: Michael Osipov <micha...@apache.org> AuthorDate: Sun Mar 5 15:52:38 2023 +0100 [DOXIA-700] Streamline, update and simplify Doxia ID generation This closes #155 --- .../maven/doxia/parser/Xhtml5BaseParser.java | 4 +- .../maven/doxia/sink/impl/Xhtml5BaseSink.java | 2 +- .../org/apache/maven/doxia/util/DoxiaUtils.java | 76 +++++++++------------- .../org/apache/maven/doxia/util/HtmlTools.java | 10 +-- .../apache/maven/doxia/util/DoxiaUtilsTest.java | 39 +++++------ .../org/apache/maven/doxia/util/HtmlToolsTest.java | 29 +++++---- .../apache/maven/doxia/module/apt/AptParser.java | 5 +- .../apache/maven/doxia/module/apt/AptUtils.java | 2 + .../maven/doxia/module/apt/AptParserTest.java | 4 +- .../apache/maven/doxia/module/fml/FmlParser.java | 4 +- 10 files changed, 83 insertions(+), 92 deletions(-) diff --git a/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java b/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java index 72dbd9a6..62a47b3d 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java @@ -743,7 +743,7 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { */ protected String validAnchor(String id) { if (!DoxiaUtils.isValidId(id)) { - String linkAnchor = DoxiaUtils.encodeId(id, true); + String linkAnchor = DoxiaUtils.encodeId(id); LOGGER.debug("Modified invalid link '{}' to '{}'", id, linkAnchor); @@ -785,7 +785,7 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { String hash = href.substring(hashIndex + 1); if (!DoxiaUtils.isValidId(hash)) { - href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash, true); + href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash); LOGGER.debug("Modified invalid link '{}' to '{}'", hash, href); } diff --git 
a/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java b/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java index 819fb32e..b9087bb5 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java @@ -1518,7 +1518,7 @@ public class Xhtml5BaseSink extends AbstractXmlSink implements HtmlMarkup { String id = name; if (!DoxiaUtils.isValidId(id)) { - id = DoxiaUtils.encodeId(name, true); + id = DoxiaUtils.encodeId(name); LOGGER.debug("Modified invalid anchor name '{}' to '{}'", name, id); } diff --git a/doxia-core/src/main/java/org/apache/maven/doxia/util/DoxiaUtils.java b/doxia-core/src/main/java/org/apache/maven/doxia/util/DoxiaUtils.java index 57bc5454..fdcefde6 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/util/DoxiaUtils.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/util/DoxiaUtils.java @@ -130,90 +130,76 @@ public class DoxiaUtils { * Construct a valid Doxia id. * * <p> - * A valid Doxia id obeys the same constraints as an HTML ID or NAME token. - * According to the <a href="http://www.w3.org/TR/html4/types.html#type-name"> - * HTML 4.01 specification section 6.2 SGML basic types</a>: - * </p> - * <p> - * <i>ID and NAME tokens must begin with a letter ([A-Za-z]) and may be - * followed by any number of letters, digits ([0-9]), hyphens ("-"), - * underscores ("_"), colons (":"), and periods (".").</i> - * </p> - * <p> - * According to <a href="http://www.w3.org/TR/xhtml1/#C_8">XHTML 1.0 - * section C.8. 
Fragment Identifiers</a>: - * </p> - * <p> - * <i>When defining fragment identifiers to be backward-compatible, only - * strings matching the pattern [A-Za-z][A-Za-z0-9:_.-]* should be used.</i> + * A valid Doxia id corresponds to an XML id which is a {@code NCName} which is in turn identical + * to a <a href="https://www.w3.org/TR/REC-xml/#NT-Name">{@code Name}</a>, but without a colon + * and without any character above {@code 0x7F}. * </p> * <p> * To achieve this we need to convert the <i>id</i> String. Two conversions * are necessary and one is done to get prettier ids: * </p> * <ol> - * <li>Remove whitespace at the start and end before starting to process</li> - * <li>If the first character is not a letter, prepend the id with the letter 'a'</li> - * <li>Any spaces are replaced with an underscore '_'</li> + * <li>Trim with {@link String#trim()} before starting to process,</li> + * <li>if the first character is not a {@code NameStartChar} prepend the letter 'a',</li> + * <li>any space character ({@code 0x20}) is replaced with an underscore,</li> * <li> - * Any characters not matching the above pattern are either dropped, - * or replaced according to the rules specified in the - * <a href="http://www.w3.org/TR/html4/appendix/notes.html#non-ascii-chars">HTML specs</a>. + * any character not matching the above pattern is either dropped, + * or replaced with its UTF-8 encoding where each byte is prepended with a dot. * </li> * </ol> - * <p> - * For letters, the case is preserved in the conversion.
- * </p> * * <p> * Here are some examples: * </p> * <pre> * DoxiaUtils.encodeId(null) = null - * DoxiaUtils.encodeId("") = "a" - * DoxiaUtils.encodeId(" ") = "a" - * DoxiaUtils.encodeId(" _ ") = "a_" + * DoxiaUtils.encodeId("") = null + * DoxiaUtils.encodeId(" ") = null + * DoxiaUtils.encodeId(" _ ") = "_" * DoxiaUtils.encodeId("1") = "a1" * DoxiaUtils.encodeId("1anchor") = "a1anchor" - * DoxiaUtils.encodeId("_anchor") = "a_anchor" + * DoxiaUtils.encodeId("_anchor") = "_anchor" * DoxiaUtils.encodeId("a b-c123 ") = "a_b-c123" * DoxiaUtils.encodeId(" anchor") = "anchor" * DoxiaUtils.encodeId("myAnchor") = "myAnchor" + * DoxiaUtils.encodeId("€") = "a.E2.82.AC" * </pre> * - * @param id The id to be encoded. - * May be null in which case null is returned. - * @param chop true if non-ASCII characters should be ignored. - * If false, any non-ASCII characters will be replaced as specified above. + * @param text The text to be encoded. + * May be null, empty or blank in which case null is returned. + * @param chop true if non-encodable characters should be ignored. + * If false, any non-encodable characters will be replaced as specified above. * @return The trimmed and encoded id, or null if id is null. * If id is not null, the return value is guaranteed to be a valid Doxia id. 
* @see #isValidId(java.lang.String) * @since 1.1.1 + * @deprecated Don't chop characters which might produce collisions in a document */ - public static String encodeId(final String id, final boolean chop) { - if (id == null) { + @Deprecated + public static String encodeId(final String text, final boolean chop) { + if (text == null) { return null; } - final String idd = id.trim(); - int length = idd.length(); + final String textt = text.trim(); + int length = textt.length(); if (length == 0) { - return "a"; + return null; } StringBuilder buffer = new StringBuilder(length); for (int i = 0; i < length; ++i) { - char c = idd.charAt(i); + char c = textt.charAt(i); - if ((i == 0) && (!isAsciiLetter(c))) { + if ((i == 0) && !(isAsciiLetter(c) || c == '_')) { buffer.append('a'); } if (c == ' ') { buffer.append('_'); - } else if (isAsciiLetter(c) || isAsciiDigit(c) || (c == '-') || (c == '_') || (c == ':') || (c == '.')) { + } else if (isAsciiLetter(c) || isAsciiDigit(c) || (c == '-') || (c == '_') || (c == '.')) { buffer.append(c); } else if (!chop) { @@ -233,8 +219,8 @@ public class DoxiaUtils { * Determines if the specified text is a valid id according to the rules * laid out in {@link #encodeId(String)}. * - * @param text The text to be tested. - * May be null in which case false is returned. + * @param text The id to be tested. + * May be null or empty in which case false is returned. * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>. 
* @see #encodeId(String) */ @@ -246,11 +232,11 @@ public class DoxiaUtils { for (int i = 0; i < text.length(); ++i) { char c = text.charAt(i); - if (isAsciiLetter(c)) { + if (isAsciiLetter(c) || c == '_') { continue; } - if ((i == 0) || (c == ' ') || (!isAsciiDigit(c) && c != '-' && c != '_' && c != ':' && c != '.')) { + if ((i == 0) || (!isAsciiDigit(c) && c != '-' && c != '.')) { return false; } } diff --git a/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java b/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java index 7014639c..8c4548cb 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java @@ -420,16 +420,18 @@ public class HtmlTools { * * <p> * <b>Note</b>: this method is identical to - * {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId(id, false)}, + * {@link DoxiaUtils#encodeId(String)}, * the rules to encode an id are laid out there. * </p> * * @param id The id to be encoded. - * @return The trimmed and encoded id, or null if id is null. - * @see DoxiaUtils#encodeId(java.lang.String,boolean) + * @return The trimmed and encoded id, or null if id is null, empty or blank. 
+ * @see DoxiaUtils#encodeId(java.lang.String) + * @deprecated use {@link DoxiaUtils#encodeId(String)} */ + @Deprecated public static String encodeId(String id) { - return DoxiaUtils.encodeId(id, false); + return DoxiaUtils.encodeId(id); } /** diff --git a/doxia-core/src/test/java/org/apache/maven/doxia/util/DoxiaUtilsTest.java b/doxia-core/src/test/java/org/apache/maven/doxia/util/DoxiaUtilsTest.java index 7c767078..586d435d 100644 --- a/doxia-core/src/test/java/org/apache/maven/doxia/util/DoxiaUtilsTest.java +++ b/doxia-core/src/test/java/org/apache/maven/doxia/util/DoxiaUtilsTest.java @@ -127,20 +127,20 @@ public class DoxiaUtilsTest { @Test public void testEncodeId() { assertNull(DoxiaUtils.encodeId(null)); - assertEquals(DoxiaUtils.encodeId(""), "a"); - assertEquals(DoxiaUtils.encodeId(" "), "a"); - assertEquals(DoxiaUtils.encodeId(" _ "), "a_"); - assertEquals(DoxiaUtils.encodeId("1"), "a1"); - assertEquals(DoxiaUtils.encodeId("1anchor"), "a1anchor"); - assertEquals(DoxiaUtils.encodeId("_anchor"), "a_anchor"); - assertEquals(DoxiaUtils.encodeId("a b-c123 "), "a_b-c123"); - assertEquals(DoxiaUtils.encodeId(" anchor"), "anchor"); - assertEquals(DoxiaUtils.encodeId("myAnchor"), "myAnchor"); - assertEquals(DoxiaUtils.encodeId("my&Anchor"), "my.26Anchor"); - assertEquals(DoxiaUtils.encodeId("H\u00E5kon"), "H.C3.A5kon"); - assertEquals(DoxiaUtils.encodeId("H\u00E5kon", true), "Hkon"); - assertEquals(DoxiaUtils.encodeId("Theu\u00DFl"), "Theu.C3.9Fl"); - assertEquals(DoxiaUtils.encodeId("Theu\u00DFl", true), "Theul"); + assertNull(DoxiaUtils.encodeId("")); + assertNull(DoxiaUtils.encodeId(" ")); + assertEquals("_", DoxiaUtils.encodeId(" _ ")); + assertEquals("a1", DoxiaUtils.encodeId("1")); + assertEquals("a1anchor", DoxiaUtils.encodeId("1anchor")); + assertEquals("_anchor", DoxiaUtils.encodeId("_anchor")); + assertEquals("a_b-c123", DoxiaUtils.encodeId("a b-c123 ")); + assertEquals("anchor", DoxiaUtils.encodeId(" anchor")); + assertEquals("myAnchor", 
DoxiaUtils.encodeId("myAnchor")); + assertEquals("my.26Anchor", DoxiaUtils.encodeId("my&Anchor")); + assertEquals("H.C3.A5kon", DoxiaUtils.encodeId("H\u00E5kon")); + assertEquals("Hkon", DoxiaUtils.encodeId("H\u00E5kon", true)); + assertEquals("Theu.C3.9Fl", DoxiaUtils.encodeId("Theu\u00DFl")); + assertEquals("Theul", DoxiaUtils.encodeId("Theu\u00DFl", true)); } /** @@ -151,22 +151,23 @@ public class DoxiaUtilsTest { assertFalse(DoxiaUtils.isValidId(null)); assertFalse(DoxiaUtils.isValidId("")); assertFalse(DoxiaUtils.isValidId(" ")); - assertFalse(DoxiaUtils.isValidId(" _ ")); assertFalse(DoxiaUtils.isValidId("1")); assertFalse(DoxiaUtils.isValidId("1anchor")); - assertFalse(DoxiaUtils.isValidId("_anchor")); assertFalse(DoxiaUtils.isValidId("a b-c123 ")); assertFalse(DoxiaUtils.isValidId(" anchor")); assertFalse(DoxiaUtils.isValidId("my&Anchor")); + assertFalse(DoxiaUtils.isValidId("Theu\u00DFl")); + assertFalse(DoxiaUtils.isValidId("a:")); + assertFalse(DoxiaUtils.isValidId("Theu%C3%9Fl")); + assertFalse(DoxiaUtils.isValidId(" _ ")); + assertTrue(DoxiaUtils.isValidId("_")); + assertTrue(DoxiaUtils.isValidId("_anchor")); assertTrue(DoxiaUtils.isValidId("myAnchor")); assertTrue(DoxiaUtils.isValidId("a_")); assertTrue(DoxiaUtils.isValidId("a-")); - assertTrue(DoxiaUtils.isValidId("a:")); assertTrue(DoxiaUtils.isValidId("a.")); assertTrue(DoxiaUtils.isValidId("index.html")); - assertFalse(DoxiaUtils.isValidId("Theu\u00DFl")); assertTrue(DoxiaUtils.isValidId("Theu.C3.9Fl")); - assertFalse(DoxiaUtils.isValidId("Theu%C3%9Fl")); } /** diff --git a/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java b/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java index 814ee0ad..230632f4 100644 --- a/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java +++ b/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java @@ -97,17 +97,18 @@ public class HtmlToolsTest { @Test public void testEncodeId() { 
assertNull(HtmlTools.encodeId(null)); - assertEquals(HtmlTools.encodeId(""), "a"); - assertEquals(HtmlTools.encodeId(" "), "a"); - assertEquals(HtmlTools.encodeId(" _ "), "a_"); - assertEquals(HtmlTools.encodeId("1"), "a1"); - assertEquals(HtmlTools.encodeId("1anchor"), "a1anchor"); - assertEquals(HtmlTools.encodeId("_anchor"), "a_anchor"); - assertEquals(HtmlTools.encodeId("a b-c123 "), "a_b-c123"); - assertEquals(HtmlTools.encodeId(" anchor"), "anchor"); - assertEquals(HtmlTools.encodeId("myAnchor"), "myAnchor"); - assertEquals(HtmlTools.encodeId("H\u00E5kon"), "H.C3.A5kon"); - assertEquals(HtmlTools.encodeId("Theu\u00DFl"), "Theu.C3.9Fl"); + assertNull(HtmlTools.encodeId("")); + assertNull(HtmlTools.encodeId(" ")); + assertEquals("_", HtmlTools.encodeId(" _ ")); + assertEquals("a1", HtmlTools.encodeId("1")); + assertEquals("a1anchor", HtmlTools.encodeId("1anchor")); + assertEquals("_anchor", HtmlTools.encodeId("_anchor")); + assertEquals("a_b-c123", HtmlTools.encodeId("a b-c123 ")); + assertEquals("anchor", HtmlTools.encodeId(" anchor")); + assertEquals("myAnchor", HtmlTools.encodeId("myAnchor")); + assertEquals("H.C3.A5kon", HtmlTools.encodeId("H\u00E5kon")); + assertEquals("Theu.C3.9Fl", HtmlTools.encodeId("Theu\u00DFl")); + assertEquals("a.E2.82.AC", HtmlTools.encodeId("\u20AC")); } /** @@ -142,15 +143,15 @@ public class HtmlToolsTest { assertFalse(HtmlTools.isId(" _ ")); assertFalse(HtmlTools.isId("1")); assertFalse(HtmlTools.isId("1anchor")); - assertFalse(HtmlTools.isId("_anchor")); assertFalse(HtmlTools.isId("a b-c123 ")); assertFalse(HtmlTools.isId(" anchor")); + assertFalse(HtmlTools.isId("a:")); + assertFalse(HtmlTools.isId("Theu\u00DFl")); + assertTrue(HtmlTools.isId("_anchor")); assertTrue(HtmlTools.isId("myAnchor")); assertTrue(HtmlTools.isId("a_")); assertTrue(HtmlTools.isId("a-")); - assertTrue(HtmlTools.isId("a:")); assertTrue(HtmlTools.isId("a.")); - assertFalse(HtmlTools.isId("Theu\u00DFl")); } /** diff --git 
a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java index c020a8a6..f3a7e85c 100644 --- a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java +++ b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java @@ -415,8 +415,7 @@ public class AptParser extends AbstractTextParser implements AptMarkup { if (hash.startsWith("#")) { linkAnchor = linkAnchor.substring(0, hashIndex) + hash; } else if (!DoxiaUtils.isValidId(hash)) { - linkAnchor = - linkAnchor.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash, true); + linkAnchor = linkAnchor.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash); LOGGER.debug("Modified invalid link '{}' to '{}'", hash, linkAnchor); } @@ -429,7 +428,7 @@ public class AptParser extends AbstractTextParser implements AptMarkup { String linkAnchor = getTraversedAnchor(text, i + 1, end); - linkAnchor = AptUtils.encodeAnchor(linkAnchor); + linkAnchor = DoxiaUtils.encodeId(linkAnchor); sink.anchor(linkAnchor); } diff --git a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptUtils.java b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptUtils.java index 67b886d4..fcf76826 100644 --- a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptUtils.java +++ b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptUtils.java @@ -89,7 +89,9 @@ public class AptUtils { * * @param id The id to be encoded. * @return The trimmed and encoded id, or null if id is null. 
+ * @deprecated use {@link DoxiaUtils#encodeId(String)} */ + @Deprecated public static String encodeAnchor(String id) { if (id == null) { return null; diff --git a/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java b/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java index afd833ae..178f2877 100644 --- a/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java +++ b/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java @@ -383,12 +383,12 @@ public class AptParserTest extends AbstractParserTest { Iterator<SinkEventElement> it = sink.getEventList().iterator(); assertSinkStartsWith(it, "head", "head_", "body", "paragraph"); - assertSinkEquals(it.next(), "anchor", "Anchor_with_spaces_and_brackets"); + assertSinkEquals(it.next(), "anchor", "Anchor_with_spaces_.28and_brackets.29"); assertSinkEquals(it.next(), "text", "Anchor with spaces (and brackets)"); assertSinkStartsWith(it, "anchor_", "text"); - assertSinkEquals(it.next(), "link", "#Anchor_with_spaces_and_brackets"); + assertSinkEquals(it.next(), "link", "#Anchor_with_spaces_.28and_brackets.29"); assertSinkEquals(it.next(), "text", "Anchor with spaces (and brackets)"); diff --git a/doxia-modules/doxia-module-fml/src/main/java/org/apache/maven/doxia/module/fml/FmlParser.java b/doxia-modules/doxia-module-fml/src/main/java/org/apache/maven/doxia/module/fml/FmlParser.java index ab2acfcd..46e6bd64 100644 --- a/doxia-modules/doxia-module-fml/src/main/java/org/apache/maven/doxia/module/fml/FmlParser.java +++ b/doxia-modules/doxia-module-fml/src/main/java/org/apache/maven/doxia/module/fml/FmlParser.java @@ -144,7 +144,7 @@ public class FmlParser extends AbstractXmlParser implements FmlMarkup { throw new XmlPullParserException("id attribute required for <part> at: (" + parser.getLineNumber() + ":" + parser.getColumnNumber() + ")"); } else if 
(!DoxiaUtils.isValidId(currentPart.getId())) { - String linkAnchor = DoxiaUtils.encodeId(currentPart.getId(), true); + String linkAnchor = DoxiaUtils.encodeId(currentPart.getId()); LOGGER.debug("Modified invalid link '{}' to '{}'", currentPart.getId(), linkAnchor); @@ -162,7 +162,7 @@ public class FmlParser extends AbstractXmlParser implements FmlMarkup { throw new XmlPullParserException("id attribute required for <faq> at: (" + parser.getLineNumber() + ":" + parser.getColumnNumber() + ")"); } else if (!DoxiaUtils.isValidId(currentFaq.getId())) { - String linkAnchor = DoxiaUtils.encodeId(currentFaq.getId(), true); + String linkAnchor = DoxiaUtils.encodeId(currentFaq.getId()); LOGGER.debug("Modified invalid link '{}' to '{}'", currentFaq.getId(), linkAnchor);