This is an automated email from the ASF dual-hosted git repository. desruisseaux pushed a commit to branch geoapi-4.0 in repository https://gitbox.apache.org/repos/asf/sis.git
The following commit(s) were added to refs/heads/geoapi-4.0 by this push: new cf568b124f Ignore zero-width spaces and other ignorable characters in CRS identifiers. cf568b124f is described below commit cf568b124f9499b2019ee4352ce9e1b86a8f850b Author: Martin Desruisseaux <martin.desruisse...@geomatys.com> AuthorDate: Sun Jan 21 21:57:38 2024 +0100 Ignore zero-width spaces and other ignorable characters in CRS identifiers. https://issues.apache.org/jira/browse/SIS-490 --- .../sis/metadata/iso/citation/Citations.java | 64 ++------------------- .../factory/GeodeticAuthorityFactory.java | 8 ++- .../factory/MultiAuthoritiesFactory.java | 2 +- .../sis/referencing/factory/package-info.java | 2 +- .../main/org/apache/sis/util/CharSequences.java | 66 ++++++++++++++++++++++ .../apache/sis/util/internal/DefinitionURI.java | 3 +- 6 files changed, 79 insertions(+), 66 deletions(-) diff --git a/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java b/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java index 8b6e802a88..956b3c2545 100644 --- a/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java +++ b/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java @@ -850,17 +850,8 @@ public final class Citations extends Static { * Those characters are illegal in XML identifiers, and should therefore be removed if the Unicode identifier * may also be used as XML identifier. * - * <p>If non-null, the result is suitable for use as a XML identifier except for a few uncommon characters.</p> - * - * <h4>Compatibility note</h4> - * the following characters are invalid in XML identifiers. However, since they are valid in Unicode identifiers, - * they could be included in the string returned by this method: - * <ul> - * <li>{@code µ}</li> - * <li>{@code ª} (feminine ordinal indicator)</li> - * <li>{@code º} (masculine ordinal indicator)</li> - * <li>{@code ⁔}</li> - * </ul> + * <p>If non-null, the result is suitable for use as a XML identifier except for a few uncommon characters. + * See {@link CharSequences#trimIgnorables(CharSequence)} for more information.</p> * * @param citation the citation for which to infer the code space, or {@code null}. * @return a non-empty code space for the given citation without leading or trailing whitespaces, @@ -872,55 +863,8 @@ public final class Citations extends Static { if (citation instanceof IdentifierSpace<?>) { return ((IdentifierSpace<?>) citation).getName(); } else { - return removeIgnorableCharacters(Identifiers.getIdentifier(citation, true)); - } - } - - /** - * Removes characters that are ignorable according Unicode specification. - * - * @param identifier the character sequence from which to remove ignorable characters, or {@code null}. - * @return a character sequence with ignorable character removed. May be the same instance as the given argument. - */ - private static String removeIgnorableCharacters(final String identifier) { - if (identifier != null) { - /* - * First perform a quick check to see if there is any ignorable characters. - * We make this check because those characters are valid according Unicode - * but not according XML. However, there is usually no such characters, so - * we will avoid the StringBuilder creation in the vast majority of times. - * - * Note that 'µ' and its friends are not ignorable, so we do not remove them. - * This method is aimed for "getUnicodeIdentifier", not "getXmlIdentifier". - */ - final int length = identifier.length(); - for (int i=0; i<length;) { - int c = identifier.codePointAt(i); - int n = Character.charCount(c); - if (Character.isIdentifierIgnorable(c)) { - /* - * Found an ignorable character. Create the buffer and copy non-ignorable characters. - * Following algorithm is inefficient, since we fill the buffer character-by-character - * (a more efficient approach would be to perform bulk appends). However, we presume - * that this block will be rarely executed, so it is not worth to optimize it. - */ - final StringBuilder buffer = new StringBuilder(length - n).append(identifier, 0, i); - while ((i += n) < length) { - c = identifier.codePointAt(i); - n = Character.charCount(c); - if (!Character.isIdentifierIgnorable(c)) { - buffer.appendCodePoint(c); - } - } - /* - * No need to verify if the buffer is empty, because ignorable - * characters are not legal Unicode identifier start. - */ - return buffer.toString(); - } - i += n; - } + CharSequence cs = CharSequences.trimIgnorables(Identifiers.getIdentifier(citation, true)); + return (cs != null) ? cs.toString() : null; } - return identifier; } } diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java index 810632fb1e..0474fa9551 100644 --- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java +++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java @@ -64,7 +64,7 @@ import org.apache.sis.util.resources.Errors; * * @author Martin Desruisseaux (IRD, Geomatys) * @author Johann Sorel (Geomatys) - * @version 1.4 + * @version 1.5 * @since 0.7 */ public abstract class GeodeticAuthorityFactory extends AbstractFactory implements AuthorityFactory { @@ -1265,7 +1265,8 @@ public abstract class GeodeticAuthorityFactory extends AbstractFactory implement /** * Trims the namespace, if present. For example if this factory is an EPSG authority factory * and the specified code start with the {@code "EPSG:"} prefix, then the prefix is removed. - * Otherwise, the string is returned unchanged (except for leading and trailing spaces). + * Otherwise, the string is returned unchanged except for leading and trailing spaces which + * are removed, together with {@link Character#isIdentifierIgnorable(int) ignorable characters}. * * @param code the code to trim. * @return the code with the namespace part removed if that part matched one of the values given by @@ -1273,7 +1274,8 @@ public abstract class GeodeticAuthorityFactory extends AbstractFactory implement * * @since 0.8 */ - protected final String trimNamespace(final String code) { + protected final String trimNamespace(String code) { + code = CharSequences.trimIgnorables(code).toString(); int s = code.indexOf(Constants.DEFAULT_SEPARATOR); if (s >= 0) { final int end = CharSequences.skipTrailingWhitespaces(code, 0, s); diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java index e908bd7d4d..9f82fd9fcd 100644 --- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java +++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java @@ -145,7 +145,7 @@ import org.apache.sis.util.collection.BackingStoreException; * do not need to be thread-safe. See constructor Javadoc for more information. * * @author Martin Desruisseaux (IRD, Geomatys) - * @version 1.4 + * @version 1.5 * * @see org.apache.sis.referencing.CRS#getAuthorityFactory(String) * diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java index 373041442d..55d4937b4e 100644 --- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java +++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java @@ -56,7 +56,7 @@ * </table> * * @author Martin Desruisseaux (IRD, Geomatys) - * @version 1.4 + * @version 1.5 * @since 0.6 */ package org.apache.sis.referencing.factory; diff --git a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java index 699dda468a..7a361609fc 100644 --- a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java +++ b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java @@ -992,6 +992,72 @@ search: for (; fromIndex <= toIndex; fromIndex++) { return text; } + /** + * Returns a text with ignorable characters in Unicode identifier removed. While valid in identifiers, + * those {@linkplain Character#isIdentifierIgnorable(int) ignorable characters} are often non-displayed. + * An example of ignorable character is the zero-width space. + * + * <h4>Relationship with XML</h4> + * Unlike Unicode identifiers, ignorable characters are invalid in XML identifiers. + * This restriction avoids, for example, homograph attacks in domain name. + * So this method can be used for converting an Unicode identifier to an XML identifier, + * except for the characters listed below. Those characters are non-ignorable + * (so not removed by this method), but nevertheless invalid in XML identifiers. + * <ul> + * <li>{@code µ} (U+00B5) — micro</li> + * <li>{@code ª} (U+00AA) — feminine ordinal indicator</li> + * <li>{@code º} (U+00BA) — masculine ordinal indicator</li> + * <li>{@code ⁔} (U+2054) — inverted undertie</li> + * </ul> + * + * @param text the text from which to remove ignorable characters, or {@code null}. + * @return text with ignorable characters removed, or {@code null} if the given text was null. + * + * @see Character#isIdentifierIgnorable(int) + * + * @since 1.5 + */ + public static CharSequence trimIgnorables(final CharSequence text) { + if (text != null) { + /* + * First perform a quick check to see if there is any ignorable characters. + * We make this check because there is usually no such characters, + * so we will avoid the StringBuilder creation in the vast majority of times. + * + * Note that 'µ' and its friends are not ignorable, so we do not remove them. + * This method is aimed for `getUnicodeIdentifier`, not `getXmlIdentifier`. + */ + final int length = text.length(); + for (int i=0; i<length;) { + int c = codePointAt(text, i); + int n = Character.charCount(c); + if (Character.isIdentifierIgnorable(c)) { + /* + * Found an ignorable character. Create the buffer and copy non-ignorable characters. + * Following algorithm is inefficient, since we fill the buffer character-by-character + * (a more efficient approach would be to perform bulk appends). However, we presume + * that this block will be rarely executed, so it is not worth to optimize it. + */ + final StringBuilder buffer = new StringBuilder(length - n).append(text, 0, i); + while ((i += n) < length) { + c = codePointAt(text, i); + n = Character.charCount(c); + if (!Character.isIdentifierIgnorable(c)) { + buffer.appendCodePoint(c); + } + } + /* + * No need to verify if the buffer is empty, because ignorable + * characters are not legal Unicode identifier start. + */ + return buffer.toString(); + } + i += n; + } + } + return text; + } + /** * Trims the fractional part of the given formatted number, provided that it doesn't change * the value. This method assumes that the number is formatted in the US locale, typically diff --git a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java index e57f627865..245dec83e6 100644 --- a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java +++ b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java @@ -271,7 +271,8 @@ public final class DefinitionURI { * @param uri the URI to parse. * @return the parse result, or {@code null} if the given URI is not recognized. */ - public static DefinitionURI parse(final String uri) { + public static DefinitionURI parse(String uri) { + uri = CharSequences.trimIgnorables(uri).toString(); return parse(uri, false, -1, uri.length()); }