This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit d71a78e36ca0baa5f304ce827f1c9dab938893e8
Author: AlexYue <yj976240...@gmail.com>
AuthorDate: Tue Jul 9 09:46:24 2024 +0800

    [fix](Azure) Enhance the glob list's logic for Azure file system in FE (#37490)
    
    Previously, in FE, glob listing did not work for paths such as
    `s3://qa-build/regression/tpcds/sf100_split/catalog_sales.dat.*.gz`,
    because the whole glob pattern was passed as the list prefix.
---
 .../org/apache/doris/fs/obj/AzureObjStorage.java   | 62 +++++++++++++++++-----
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java 
b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
index 358b66b44b2..73d9444cc8a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
@@ -299,47 +299,81 @@ public class AzureObjStorage implements 
ObjStorage<BlobServiceClient> {
         return String.format("s3://%s/%s", bucket, fileName);
     }
 
+    public static String getLongestPrefix(String globPattern) {
+        int length = globPattern.length();
+        int earliestSpecialCharIndex = length;
+
+        char[] specialChars = {'*', '?', '[', '{', '\\'};
+
+        for (char specialChar : specialChars) {
+            int index = globPattern.indexOf(specialChar);
+            if (index != -1 && index < earliestSpecialCharIndex) {
+                earliestSpecialCharIndex = index;
+            }
+        }
+
+        return globPattern.substring(0, earliestSpecialCharIndex);
+    }
+
     public Status globList(String remotePath, List<RemoteFile> result, boolean 
fileNameOnly) {
+        long roundCnt = 0;
+        long elementCnt = 0;
+        long matchCnt = 0;
+        long startTime = System.nanoTime();
+        Status st = Status.OK;
         try {
             S3URI uri = S3URI.create(remotePath, isUsePathStyle, 
forceParsingByStandardUri);
             String globPath = uri.getKey();
+            String bucket = uri.getBucket();
             LOG.info("try to glob list for azure, remote path {}, orig {}", 
globPath, remotePath);
-            BlobContainerClient client = 
getClient().getBlobContainerClient(uri.getBucket());
+            BlobContainerClient client = 
getClient().getBlobContainerClient(bucket);
             java.nio.file.Path pathPattern = Paths.get(globPath);
             LOG.info("path pattern {}", pathPattern.toString());
             PathMatcher matcher = 
FileSystems.getDefault().getPathMatcher("glob:" + pathPattern.toString());
 
-            ListBlobsOptions options = new 
ListBlobsOptions().setPrefix(globPath);
+            String listPrefix = getLongestPrefix(globPath);
+            LOG.info("azure glob list prefix is {}", listPrefix);
+            ListBlobsOptions options = new 
ListBlobsOptions().setPrefix(listPrefix);
             String newContinuationToken = null;
             do {
+                roundCnt++;
                 PagedIterable<BlobItem> pagedBlobs = client.listBlobs(options, 
newContinuationToken, null);
                 PagedResponse<BlobItem> pagedResponse = 
pagedBlobs.iterableByPage().iterator().next();
 
                 for (BlobItem blobItem : pagedResponse.getElements()) {
+                    elementCnt++;
                     java.nio.file.Path blobPath = 
Paths.get(blobItem.getName());
 
-                    if (matcher.matches(blobPath)) {
-                        RemoteFile remoteFile = new RemoteFile(
-                                fileNameOnly ? 
blobPath.getFileName().toString() : constructS3Path(blobPath.toString(),
-                                        uri.getBucket()),
-                                !blobItem.isPrefix(),
-                                blobItem.isPrefix() ? -1 : 
blobItem.getProperties().getContentLength(),
-                                blobItem.getProperties().getContentLength(),
-                                
blobItem.getProperties().getLastModified().getSecond());
-                        result.add(remoteFile);
+                    if (!matcher.matches(blobPath)) {
+                        continue;
                     }
+                    matchCnt++;
+                    RemoteFile remoteFile = new RemoteFile(
+                            fileNameOnly ? blobPath.getFileName().toString() : 
constructS3Path(blobPath.toString(),
+                                    uri.getBucket()),
+                            !blobItem.isPrefix(),
+                            blobItem.isPrefix() ? -1 : 
blobItem.getProperties().getContentLength(),
+                            blobItem.getProperties().getContentLength(),
+                            
blobItem.getProperties().getLastModified().getSecond());
+                    result.add(remoteFile);
                 }
                 newContinuationToken = pagedResponse.getContinuationToken();
             } while (newContinuationToken != null);
 
         } catch (BlobStorageException e) {
             LOG.warn("glob file " + remotePath + " failed because azure error: 
" + e.getMessage());
-            return new Status(Status.ErrCode.COMMON_ERROR, "glob file " + 
remotePath
+            st = new Status(Status.ErrCode.COMMON_ERROR, "glob file " + 
remotePath
                     + " failed because azure error: " + e.getMessage());
         } catch (Exception e) {
             LOG.warn("errors while glob file " + remotePath, e);
-            return new Status(Status.ErrCode.COMMON_ERROR, "errors while glob 
file " + remotePath + e.getMessage());
+            st = new Status(Status.ErrCode.COMMON_ERROR, "errors while glob 
file " + remotePath + e.getMessage());
+        } finally {
+            long endTime = System.nanoTime();
+            long duration = endTime - startTime;
+            LOG.info("process {} elements under prefix {} for {} round, match 
{} elements, take {} micro second",
+                    remotePath, elementCnt, matchCnt, roundCnt,
+                    duration / 1000);
         }
-        return Status.OK;
+        return st;
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to