This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 2bad561edee (Fix)[hive-writer] Fixed the issue when partition values 
contain spaces when writing to s3. (#35645)
2bad561edee is described below

commit 2bad561edee49948f573d4ee88d2c89c1b9cfeae
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Sat Jun 1 00:48:43 2024 +0800

    (Fix)[hive-writer] Fixed the issue when partition values contain spaces 
when writing to s3. (#35645)
    
    ## Proposed changes
    
    Issue Number: close #31442
    
    (Fix) [hive-writer] Fixed the issue when partition values contain spaces
    when writing to s3.
    
    ### Error msg
    ```
    org.apache.doris.common.UserException: errCode = 2, detailMessage = 
java.net.URISyntaxException: Illegal character in path at index 114: 
oss://xxxxxxxxxxx/hive/tpcds1000_partition_oss/call_center/cc_call_center_sk=1/cc_mkt_class=A
 bit narrow forms matter animals. Consist/cc_market_manager=Daniel 
Weller/cc_rec_end_date=2001-12-31/f6b5ff4253414b06-9fd365ef68e5ddc5_133f02fb-a7e0-4109-9100-fb748a28259e-0.zlib.orc
            at org.apache.doris.common.util.S3URI.validateUri(S3URI.java:134) 
~[doris-fe.jar:1.2-SNAPSHOT]
            at org.apache.doris.common.util.S3URI.parseUri(S3URI.java:120) 
~[doris-fe.jar:1.2-SNAPSHOT]
            at org.apache.doris.common.util.S3URI.<init>(S3URI.java:116) 
~[doris-fe.jar:1.2-SNAPSHOT]
            at org.apache.doris.common.util.S3URI.create(S3URI.java:108) 
~[doris-fe.jar:1.2-SNAPSHOT]
            at 
org.apache.doris.fs.obj.S3ObjStorage.deleteObject(S3ObjStorage.java:194) 
~[doris-fe.jar:1.2-SNAPSHOT]
            at 
org.apache.doris.fs.remote.ObjFileSystem.delete(ObjFileSystem.java:150) 
~[doris-fe.jar:1.2-SNAPSHOT]
            at 
org.apache.doris.fs.remote.SwitchingFileSystem.delete(SwitchingFileSystem.java:92)
 ~[doris-fe.jar:1.2-
    ```
    
    ### Root Cause
    Hadoop partition names will encode some special characters, but not
    space characters, which is different from URI encoding. Therefore, an
    error will be reported when constructing URI.
    
    ### Solution
    The solution is to use regular expressions to parse URI, and then pass
    in each part of URI to construct URI. This URI constructor will encode
    each part of URI.
---
 .../java/org/apache/doris/common/util/S3URI.java   | 39 +++++++++++++++++-----
 .../org/apache/doris/common/util/S3URITest.java    | 13 ++++++++
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
index 9c9a887aa7e..70263b4bf16 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
@@ -30,6 +30,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 /**
@@ -60,11 +62,12 @@ import java.util.stream.Collectors;
  * Virtual Host AWS Client (Hadoop S3) Mixed Style: <code>isPathStyle = false 
&& forceParsingByStandardUri = true</code>
  * Path AWS Client (Hadoop S3) Mixed Style: <code>isPathStyle = true && 
forceParsingByStandardUri = true</code>
  *
- * When the incoming location is url encoded, the encoded string will be 
returned.
- * For <code>getKey()</code>, <code>getQueryParams()</code> will return the 
encoding string
  */
 
 public class S3URI {
+
+    private static final Pattern URI_PATTERN =
+            
Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?");
     public static final String SCHEME_DELIM = "://";
     public static final String PATH_DELIM = "/";
     private static final Set<String> VALID_SCHEMES = ImmutableSet.of("http", 
"https", "s3", "s3a", "s3n",
@@ -117,7 +120,8 @@ public class S3URI {
     }
 
     private void parseUri(String location, boolean forceParsingStandardUri) 
throws UserException {
-        validateUri(location);
+        parseURILocation(location);
+        validateUri();
 
         if (!forceParsingStandardUri && 
OS_SCHEMES.contains(uri.getScheme().toLowerCase())) {
             parseAwsCliStyleUri();
@@ -127,12 +131,29 @@ public class S3URI {
         parseEndpointAndRegion();
     }
 
-    private void validateUri(String location) throws UserException {
+    /**
+     * parse uri location and encode to a URI.
+     * @param location
+     * @throws UserException
+     */
+    private void parseURILocation(String location) throws UserException {
+        Matcher matcher = URI_PATTERN.matcher(location);
+        if (!matcher.matches()) {
+            throw new UserException("Failed to parse uri: " + location);
+        }
+        String scheme = matcher.group(2);
+        String authority = matcher.group(4);
+        String path = matcher.group(5);
+        String query = matcher.group(7);
+        String fragment = matcher.group(9);
         try {
-            uri = new URI(location);
+            uri = new URI(scheme, authority, path, query, 
fragment).normalize();
         } catch (URISyntaxException e) {
             throw new UserException(e);
         }
+    }
+
+    private void validateUri() throws UserException {
         if (uri.getScheme() == null || 
!VALID_SCHEMES.contains(uri.getScheme().toLowerCase())) {
             throw new UserException("Invalid scheme: " + this.uri);
         }
@@ -143,7 +164,7 @@ public class S3URI {
         if (bucket == null) {
             throw new UserException("missing bucket: " + uri);
         }
-        String path = uri.getRawPath();
+        String path = uri.getPath();
         if (path.length() > 1) {
             key = path.substring(1);
         } else {
@@ -173,7 +194,7 @@ public class S3URI {
 
     private void addQueryParamsIfNeeded() {
         if (uri.getQuery() != null) {
-            queryParams = splitQueryString(uri.getRawQuery()).stream().map((s) 
-> s.split("="))
+            queryParams = splitQueryString(uri.getQuery()).stream().map((s) -> 
s.split("="))
                     .map((s) -> s.length == 1 ? new String[] {s[0], null} : 
s).collect(
                             Collectors.groupingBy((a) -> a[0],
                                     Collectors.mapping((a) -> a[1], 
Collectors.toList())));
@@ -201,7 +222,7 @@ public class S3URI {
     }
 
     private void parsePathStyleUri() throws UserException {
-        String path = uri.getRawPath();
+        String path = uri.getPath();
 
         if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
             int index = path.indexOf('/', 1);
@@ -226,7 +247,7 @@ public class S3URI {
     private void parseVirtualHostedStyleUri() throws UserException {
         bucket = uri.getHost().split("\\.")[0];
 
-        String path = uri.getRawPath();
+        String path = uri.getPath();
         if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
             key = path.substring(1);
         } else {
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java 
b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java
index 1d92158c9cf..87fcdae27ac 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java
@@ -170,6 +170,19 @@ public class S3URITest {
         
Assert.assertTrue(uri1.getQueryParams().get().get("partNumber").contains("88"));
     }
 
+    @Test
+    public void testHadoopEncodedString() throws UserException {
+        String p1 = "s3://bucket/path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz";
+        boolean isPathStyle = false;
+        boolean forceParsingStandardUri = false;
+        S3URI uri1 = S3URI.create(p1, isPathStyle, forceParsingStandardUri);
+
+        Assert.assertEquals("bucket", uri1.getBucket());
+        Assert.assertEquals("path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz", 
uri1.getKey());
+        Assert.assertEquals(Optional.empty(), uri1.getEndpoint());
+        Assert.assertEquals(Optional.empty(), uri1.getRegion());
+    }
+
     @Test(expected = UserException.class)
     public void missingBucket() throws UserException {
         S3URI.create("https:///";);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to