(sis) branch geoapi-4.0 updated: Ignore zero-width spaces and other ignorable characters in CRS identifiers.

desruisseaux Sun, 21 Jan 2024 12:58:56 -0800

This is an automated email from the ASF dual-hosted git repository.

desruisseaux pushed a commit to branch geoapi-4.0
in repository https://gitbox.apache.org/repos/asf/sis.git



The following commit(s) were added to refs/heads/geoapi-4.0 by this push:
     new cf568b124f Ignore zero-width spaces and other ignorable characters in CRS identifiers.
cf568b124f is described below

commit cf568b124f9499b2019ee4352ce9e1b86a8f850b
Author: Martin Desruisseaux <martin.desruisse...@geomatys.com>
AuthorDate: Sun Jan 21 21:57:38 2024 +0100

    Ignore zero-width spaces and other ignorable characters in CRS identifiers.
    
    https://issues.apache.org/jira/browse/SIS-490
---
 .../sis/metadata/iso/citation/Citations.java       | 64 ++-------------------
 .../factory/GeodeticAuthorityFactory.java          |  8 ++-
 .../factory/MultiAuthoritiesFactory.java           |  2 +-
 .../sis/referencing/factory/package-info.java      |  2 +-
 .../main/org/apache/sis/util/CharSequences.java    | 66 ++++++++++++++++++++++
 .../apache/sis/util/internal/DefinitionURI.java    |  3 +-
 6 files changed, 79 insertions(+), 66 deletions(-)

diff --git a/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java b/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java
index 8b6e802a88..956b3c2545 100644
--- a/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java
+++ b/endorsed/src/org.apache.sis.metadata/main/org/apache/sis/metadata/iso/citation/Citations.java
@@ -850,17 +850,8 @@ public final class Citations extends Static {
      * Those characters are illegal in XML identifiers, and should therefore 
be removed if the Unicode identifier
      * may also be used as XML identifier.
      *
-     * <p>If non-null, the result is suitable for use as a XML identifier 
except for a few uncommon characters.</p>
-     *
-     * <h4>Compatibility note</h4>
-     * the following characters are invalid in XML identifiers. However, since 
they are valid in Unicode identifiers,
-     * they could be included in the string returned by this method:
-     * <ul>
-     *   <li>{@code µ}</li>
-     *   <li>{@code ª} (feminine ordinal indicator)</li>
-     *   <li>{@code º} (masculine ordinal indicator)</li>
-     *   <li>{@code ⁔}</li>
-     * </ul>
+     * <p>If non-null, the result is suitable for use as a XML identifier 
except for a few uncommon characters.
+     * See {@link CharSequences#trimIgnorables(CharSequence)} for more 
information.</p>
      *
      * @param  citation  the citation for which to infer the code space, or 
{@code null}.
      * @return a non-empty code space for the given citation without leading 
or trailing whitespaces,
@@ -872,55 +863,8 @@ public final class Citations extends Static {
         if (citation instanceof IdentifierSpace<?>) {
             return ((IdentifierSpace<?>) citation).getName();
         } else {
-            return 
removeIgnorableCharacters(Identifiers.getIdentifier(citation, true));
-        }
-    }
-
-    /**
-     * Removes characters that are ignorable according Unicode specification.
-     *
-     * @param  identifier  the character sequence from which to remove 
ignorable characters, or {@code null}.
-     * @return a character sequence with ignorable character removed. May be 
the same instance as the given argument.
-     */
-    private static String removeIgnorableCharacters(final String identifier) {
-        if (identifier != null) {
-            /*
-             * First perform a quick check to see if there is any ignorable 
characters.
-             * We make this check because those characters are valid according 
Unicode
-             * but not according XML. However, there is usually no such 
characters, so
-             * we will avoid the StringBuilder creation in the vast majority 
of times.
-             *
-             * Note that 'µ' and its friends are not ignorable, so we do not 
remove them.
-             * This method is aimed for "getUnicodeIdentifier", not 
"getXmlIdentifier".
-             */
-            final int length = identifier.length();
-            for (int i=0; i<length;) {
-                int c = identifier.codePointAt(i);
-                int n = Character.charCount(c);
-                if (Character.isIdentifierIgnorable(c)) {
-                    /*
-                     * Found an ignorable character. Create the buffer and 
copy non-ignorable characters.
-                     * Following algorithm is inefficient, since we fill the 
buffer character-by-character
-                     * (a more efficient approach would be to perform bulk 
appends). However, we presume
-                     * that this block will be rarely executed, so it is not 
worth to optimize it.
-                     */
-                    final StringBuilder buffer = new StringBuilder(length - 
n).append(identifier, 0, i);
-                    while ((i += n) < length) {
-                        c = identifier.codePointAt(i);
-                        n = Character.charCount(c);
-                        if (!Character.isIdentifierIgnorable(c)) {
-                            buffer.appendCodePoint(c);
-                        }
-                    }
-                    /*
-                     * No need to verify if the buffer is empty, because 
ignorable
-                     * characters are not legal Unicode identifier start.
-                     */
-                    return buffer.toString();
-                }
-                i += n;
-            }
+            CharSequence cs = 
CharSequences.trimIgnorables(Identifiers.getIdentifier(citation, true));
+            return (cs != null) ? cs.toString() : null;
         }
-        return identifier;
     }
 }
diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java
index 810632fb1e..0474fa9551 100644
--- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java
+++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/GeodeticAuthorityFactory.java
@@ -64,7 +64,7 @@ import org.apache.sis.util.resources.Errors;
  *
  * @author  Martin Desruisseaux (IRD, Geomatys)
  * @author  Johann Sorel (Geomatys)
- * @version 1.4
+ * @version 1.5
  * @since   0.7
  */
 public abstract class GeodeticAuthorityFactory extends AbstractFactory 
implements AuthorityFactory {
@@ -1265,7 +1265,8 @@ public abstract class GeodeticAuthorityFactory extends AbstractFactory implement
     /**
      * Trims the namespace, if present. For example if this factory is an EPSG 
authority factory
      * and the specified code start with the {@code "EPSG:"} prefix, then the 
prefix is removed.
-     * Otherwise, the string is returned unchanged (except for leading and 
trailing spaces).
+     * Otherwise, the string is returned unchanged except for leading and 
trailing spaces which
+     * are removed, together with {@link Character#isIdentifierIgnorable(int) 
ignorable characters}.
      *
      * @param  code  the code to trim.
      * @return the code with the namespace part removed if that part matched 
one of the values given by
@@ -1273,7 +1274,8 @@ public abstract class GeodeticAuthorityFactory extends AbstractFactory implement
      *
      * @since 0.8
      */
-    protected final String trimNamespace(final String code) {
+    protected final String trimNamespace(String code) {
+        code = CharSequences.trimIgnorables(code).toString();
         int s = code.indexOf(Constants.DEFAULT_SEPARATOR);
         if (s >= 0) {
             final int end   = CharSequences.skipTrailingWhitespaces(code, 0, 
s);
diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java
index e908bd7d4d..9f82fd9fcd 100644
--- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java
+++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/MultiAuthoritiesFactory.java
@@ -145,7 +145,7 @@ import org.apache.sis.util.collection.BackingStoreException;
  * do not need to be thread-safe. See constructor Javadoc for more information.
  *
  * @author  Martin Desruisseaux (IRD, Geomatys)
- * @version 1.4
+ * @version 1.5
  *
  * @see org.apache.sis.referencing.CRS#getAuthorityFactory(String)
  *
diff --git a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java
index 373041442d..55d4937b4e 100644
--- a/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java
+++ b/endorsed/src/org.apache.sis.referencing/main/org/apache/sis/referencing/factory/package-info.java
@@ -56,7 +56,7 @@
  * </table>
  *
  * @author  Martin Desruisseaux (IRD, Geomatys)
- * @version 1.4
+ * @version 1.5
  * @since   0.6
  */
 package org.apache.sis.referencing.factory;
diff --git a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java
index 699dda468a..7a361609fc 100644
--- a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java
+++ b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/CharSequences.java
@@ -992,6 +992,72 @@ search:     for (; fromIndex <= toIndex; fromIndex++) {
         return text;
     }
 
+    /**
+     * Returns a text with ignorable characters in Unicode identifier removed. 
While valid in identifiers,
+     * those {@linkplain Character#isIdentifierIgnorable(int) ignorable 
characters} are often non-displayed.
+     * An example of ignorable character is the zero-width space.
+     *
+     * <h4>Relationship with XML</h4>
+     * Unlike Unicode identifiers, ignorable characters are invalid in XML 
identifiers.
+     * This restriction avoids, for example, homograph attacks in domain name.
+     * So this method can be used for converting an Unicode identifier to an 
XML identifier,
+     * except for the characters listed below. Those characters are 
non-ignorable
+     * (so not removed by this method), but nevertheless invalid in XML 
identifiers.
+     * <ul>
+     *   <li>{@code µ} (U+00B5) — micro</li>
+     *   <li>{@code ª} (U+00AA) — feminine ordinal indicator</li>
+     *   <li>{@code º} (U+00BA) — masculine ordinal indicator</li>
+     *   <li>{@code ⁔} (U+2054) — inverted undertie</li>
+     * </ul>
+     *
+     * @param  text  the text from which to remove ignorable characters, or 
{@code null}.
+     * @return text with ignorable characters removed, or {@code null} if the 
given text was null.
+     *
+     * @see Character#isIdentifierIgnorable(int)
+     *
+     * @since 1.5
+     */
+    public static CharSequence trimIgnorables(final CharSequence text) {
+        if (text != null) {
+            /*
+             * First perform a quick check to see if there is any ignorable 
characters.
+             * We make this check because there is usually no such characters,
+             * so we will avoid the StringBuilder creation in the vast 
majority of times.
+             *
+             * Note that 'µ' and its friends are not ignorable, so we do not 
remove them.
+             * This method is aimed for `getUnicodeIdentifier`, not 
`getXmlIdentifier`.
+             */
+            final int length = text.length();
+            for (int i=0; i<length;) {
+                int c = codePointAt(text, i);
+                int n = Character.charCount(c);
+                if (Character.isIdentifierIgnorable(c)) {
+                    /*
+                     * Found an ignorable character. Create the buffer and 
copy non-ignorable characters.
+                     * Following algorithm is inefficient, since we fill the 
buffer character-by-character
+                     * (a more efficient approach would be to perform bulk 
appends). However, we presume
+                     * that this block will be rarely executed, so it is not 
worth to optimize it.
+                     */
+                    final StringBuilder buffer = new StringBuilder(length - 
n).append(text, 0, i);
+                    while ((i += n) < length) {
+                        c = codePointAt(text, i);
+                        n = Character.charCount(c);
+                        if (!Character.isIdentifierIgnorable(c)) {
+                            buffer.appendCodePoint(c);
+                        }
+                    }
+                    /*
+                     * No need to verify if the buffer is empty, because 
ignorable
+                     * characters are not legal Unicode identifier start.
+                     */
+                    return buffer.toString();
+                }
+                i += n;
+            }
+        }
+        return text;
+    }
+
     /**
      * Trims the fractional part of the given formatted number, provided that 
it doesn't change
      * the value. This method assumes that the number is formatted in the US 
locale, typically
diff --git a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java
index e57f627865..245dec83e6 100644
--- a/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java
+++ b/endorsed/src/org.apache.sis.util/main/org/apache/sis/util/internal/DefinitionURI.java
@@ -271,7 +271,8 @@ public final class DefinitionURI {
      * @param  uri  the URI to parse.
      * @return the parse result, or {@code null} if the given URI is not 
recognized.
      */
-    public static DefinitionURI parse(final String uri) {
+    public static DefinitionURI parse(String uri) {
+        uri = CharSequences.trimIgnorables(uri).toString();
         return parse(uri, false, -1, uri.length());
     }

(sis) branch geoapi-4.0 updated: Ignore zero-width spaces and other ignorable characters in CRS identifiers.

Reply via email to