This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch tpc_preview4-external
in repository https://gitbox.apache.org/repos/asf/doris.git

commit c7572648fb955873e24cadc5cbbdcd2bb816b758
Author: Socrates <[email protected]>
AuthorDate: Sat Dec 20 22:35:38 2025 +0800

    Optimize location for tpch1000 (#59218)
---
 .../org/apache/doris/common/util/LocationPath.java | 89 ++++++++++++++++++----
 .../datasource/iceberg/source/IcebergScanNode.java | 87 ++++++++++++++++++++-
 .../property/storage/S3PropertyUtils.java          | 48 +++++++++++-
 3 files changed, 205 insertions(+), 19 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
index e4b9aa0b25c..cbe2b01d912 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
@@ -96,27 +96,25 @@ public class LocationPath {
     }
 
     private static String parseScheme(String finalLocation) {
-        String scheme = "";
-        String[] schemeSplit = finalLocation.split(SCHEME_DELIM);
-        if (schemeSplit.length > 1) {
-            scheme = schemeSplit[0];
-        } else {
-            schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM);
-            if (schemeSplit.length > 1) {
-                scheme = schemeSplit[0];
-            }
+        // Use indexOf instead of split for better performance
+        int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM);
+        if (schemeDelimIndex > 0) {
+            return finalLocation.substring(0, schemeDelimIndex);
+        }
+
+        int nonstandardDelimIndex = 
finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM);
+        if (nonstandardDelimIndex > 0) {
+            return finalLocation.substring(0, nonstandardDelimIndex);
         }
 
         // if not get scheme, need consider /path/to/local to no scheme
-        if (scheme.isEmpty()) {
-            try {
-                Paths.get(finalLocation);
-            } catch (InvalidPathException exception) {
-                throw new IllegalArgumentException("Fail to parse scheme, 
invalid location: " + finalLocation);
-            }
+        try {
+            Paths.get(finalLocation);
+        } catch (InvalidPathException exception) {
+            throw new IllegalArgumentException("Fail to parse scheme, invalid 
location: " + finalLocation);
         }
 
-        return scheme;
+        return "";
     }
 
     /**
@@ -201,6 +199,65 @@ public class LocationPath {
         }
     }
 
+    /**
+     * Ultra-fast factory method that directly constructs LocationPath without 
any parsing.
+     * This is used when the normalized location is already known (e.g., from 
prefix transformation).
+     *
+     * @param normalizedLocation the already-normalized location string
+     * @param schema             pre-computed schema
+     * @param fsIdentifier       pre-computed filesystem identifier
+     * @param storageProperties  the storage properties (can be null)
+     * @return a new LocationPath instance
+     */
+    public static LocationPath ofDirect(String normalizedLocation,
+                                        String schema,
+                                        String fsIdentifier,
+                                        StorageProperties storageProperties) {
+        return new LocationPath(schema, normalizedLocation, fsIdentifier, 
storageProperties);
+    }
+
+    /**
+     * Fast factory method that reuses pre-computed schema and fsIdentifier.
+     * This is optimized for batch processing where many files share the same 
bucket/prefix.
+     *
+     * @param location           the input URI location string
+     * @param storageProperties  pre-computed storage properties used for 
normalization (must not be null)
+     * @param cachedSchema       pre-computed URI scheme (can be null to compute)
+     * @param cachedFsIdPrefix   pre-computed fsIdentifier prefix like "s3://" 
(can be null to compute)
+     * @return a new LocationPath instance
+     */
+    public static LocationPath ofWithCache(String location,
+                                           StorageProperties storageProperties,
+                                           String cachedSchema,
+                                           String cachedFsIdPrefix) {
+        try {
+            String normalizedLocation = 
storageProperties.validateAndNormalizeUri(location);
+
+            String fsIdentifier;
+            if (cachedFsIdPrefix != null && 
normalizedLocation.startsWith(cachedFsIdPrefix)) {
+                // Fast path: extract authority from normalized location 
without full URI parsing
+                int authorityStart = cachedFsIdPrefix.length();
+                int authorityEnd = normalizedLocation.indexOf('/', 
authorityStart);
+                if (authorityEnd == -1) {
+                    authorityEnd = normalizedLocation.length();
+                }
+                String authority = 
normalizedLocation.substring(authorityStart, authorityEnd);
+                fsIdentifier = cachedFsIdPrefix + authority;
+            } else {
+                // Fallback to full URI parsing
+                String encodedLocation = encodedLocation(normalizedLocation);
+                URI uri = URI.create(encodedLocation);
+                fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://"
+                        + Strings.nullToEmpty(uri.getAuthority());
+            }
+
+            String schema = cachedSchema != null ? cachedSchema : 
extractScheme(location);
+            return new LocationPath(schema, normalizedLocation, fsIdentifier, 
storageProperties);
+        } catch (UserException e) {
+            throw new StoragePropertiesException("Failed to create 
LocationPath for location: " + location, e);
+        }
+    }
+
     /**
      * Extracts the URI scheme (e.g., "s3", "hdfs") from the location string.
      *
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
index 133ac067644..698a6a380f0 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
@@ -133,6 +133,17 @@ public class IcebergScanNode extends FileQueryScanNode {
     private Map<String, String> backendStorageProperties;
 
     private Boolean isBatchMode = null;
+    // Cached values for LocationPath creation optimization
+    // These are lazily initialized on first use to avoid parsing overhead for 
each file
+    private volatile StorageProperties cachedStorageProperties;
+    private volatile String cachedSchema;
+    private volatile String cachedFsIdPrefix;
+    private volatile boolean locationPathCacheInitialized = false;
+    // Cache for path prefix transformation to avoid repeated S3URI parsing
+    // Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to 
normalized prefix (e.g., "s3://bucket/")
+    private volatile String cachedOriginalPathPrefix;
+    private volatile String cachedNormalizedPathPrefix;
+    private volatile String cachedFsIdentifier;
 
     // for test
     @VisibleForTesting
@@ -547,9 +558,83 @@ public class IcebergScanNode extends FileQueryScanNode {
         return 
TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), 
targetSplitSize);
     }
 
+    /**
+     * Initialize cached values for LocationPath creation on first use.
+     * This avoids repeated StorageProperties lookup, scheme parsing, and 
S3URI regex parsing for each file.
+     */
+    private void initLocationPathCache(String samplePath) {
+        if (locationPathCacheInitialized) {
+            return;
+        }
+        synchronized (this) {
+            if (locationPathCacheInitialized) {
+                return;
+            }
+            try {
+                // Create a LocationPath using the full method to get all 
cached values
+                LocationPath sampleLocationPath = LocationPath.of(samplePath, 
storagePropertiesMap);
+                cachedStorageProperties = 
sampleLocationPath.getStorageProperties();
+                cachedSchema = sampleLocationPath.getSchema();
+                cachedFsIdentifier = sampleLocationPath.getFsIdentifier();
+
+                // Extract fsIdPrefix like "s3://" from fsIdentifier like 
"s3://bucket"
+                int schemeEnd = cachedFsIdentifier.indexOf("://");
+                if (schemeEnd > 0) {
+                    cachedFsIdPrefix = cachedFsIdentifier.substring(0, 
schemeEnd + 3);
+                }
+
+                // Cache path prefix mapping for fast transformation
+                // This allows subsequent files to skip S3URI regex parsing 
entirely
+                String normalizedPath = 
sampleLocationPath.getNormalizedLocation();
+
+                // Find the common prefix by looking for the last '/' before 
the filename
+                int lastSlashInOriginal = samplePath.lastIndexOf('/');
+                int lastSlashInNormalized = normalizedPath.lastIndexOf('/');
+
+                if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) {
+                    cachedOriginalPathPrefix = samplePath.substring(0, 
lastSlashInOriginal + 1);
+                    cachedNormalizedPathPrefix = normalizedPath.substring(0, 
lastSlashInNormalized + 1);
+                }
+
+                locationPathCacheInitialized = true;
+            } catch (Exception e) {
+                // If caching fails, we'll fall back to the full method each 
time
+                LOG.warn("Failed to initialize LocationPath cache, will use 
full parsing", e);
+                locationPathCacheInitialized = true;
+            }
+        }
+    }
+
+    /**
+     * Create a LocationPath with cached values for better performance.
+     * Uses cached path prefix mapping to completely bypass S3URI regex 
parsing for most files.
+     * Falls back to full parsing if cache is not available or path doesn't 
match cached prefix.
+     */
+    private LocationPath createLocationPathWithCache(String path) {
+        // Initialize cache on first call
+        if (!locationPathCacheInitialized) {
+            initLocationPathCache(path);
+        }
+
+        // Fast path: if path starts with cached original prefix, directly 
transform without any parsing
+        if (cachedOriginalPathPrefix != null && 
path.startsWith(cachedOriginalPathPrefix)) {
+            // Transform: replace original prefix with normalized prefix
+            String normalizedPath = cachedNormalizedPathPrefix + 
path.substring(cachedOriginalPathPrefix.length());
+            return LocationPath.ofDirect(normalizedPath, cachedSchema, 
cachedFsIdentifier, cachedStorageProperties);
+        }
+
+        // Medium path: use cached StorageProperties but still need 
validateAndNormalizeUri
+        if (cachedStorageProperties != null) {
+            return LocationPath.ofWithCache(path, cachedStorageProperties, 
cachedSchema, cachedFsIdPrefix);
+        }
+
+        // Fallback to full parsing
+        return LocationPath.of(path, storagePropertiesMap);
+    }
+
     private Split createIcebergSplit(FileScanTask fileScanTask) {
         String originalPath = fileScanTask.file().path().toString();
-        LocationPath locationPath = LocationPath.of(originalPath, 
storagePropertiesMap);
+        LocationPath locationPath = createLocationPathWithCache(originalPath);
         IcebergSplit split = new IcebergSplit(
                 locationPath,
                 fileScanTask.start(),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
index 99064b4e2e2..71360fc4799 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
@@ -33,6 +33,15 @@ import java.util.Optional;
 public class S3PropertyUtils {
     private static final Logger LOG = 
LogManager.getLogger(S3PropertyUtils.class);
 
+    private static final String SCHEME_DELIM = "://";
+    private static final String S3_SCHEME_PREFIX = "s3://";
+
+    // S3-compatible schemes that can be converted to s3:// with simple string 
replacement
+    // Format: scheme://bucket/key -> s3://bucket/key
+    private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = {
+            "s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs"
+    };
+
     /**
      * Constructs the S3 endpoint from a given URI in the props map.
      *
@@ -113,7 +122,8 @@ public class S3PropertyUtils {
 
     /**
      * Validates and normalizes the given path into a standard S3 URI.
-     * If the input already starts with "s3://", it is returned as-is.
+     * If the input already starts with "s3://", it is returned as-is; simple 
S3-compatible schemes
+     * (s3a://, oss://, etc.) are rewritten to "s3://" without regex parsing.
      * Otherwise, it is parsed and converted into an S3-compatible URI format.
      *
      * @param path                            the raw S3-style path or full URI
@@ -132,16 +142,50 @@ public class S3PropertyUtils {
         if (StringUtils.isBlank(path)) {
             throw new StoragePropertiesException("path is null");
         }
-        if (path.startsWith("s3://")) {
+
+        // Fast path 1: s3:// paths are already in the normalized format 
expected by BE
+        if (path.startsWith(S3_SCHEME_PREFIX)) {
             return path;
         }
 
+        // Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://, 
etc.)
+        // can be converted with simple string replacement: 
scheme://bucket/key -> s3://bucket/key
+        String normalized = trySimpleSchemeConversion(path);
+        if (normalized != null) {
+            return normalized;
+        }
+
+        // Full parsing path: for HTTP URLs and other complex formats
         boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle);
         boolean forceParsingByStandardUri = 
Boolean.parseBoolean(stringForceParsingByStandardUri);
         S3URI s3uri = S3URI.create(path, usePathStyle, 
forceParsingByStandardUri);
         return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() + 
S3URI.PATH_DELIM + s3uri.getKey();
     }
 
+    /**
+     * Try to convert simple S3-compatible scheme URIs to s3:// format using 
string replacement.
+     * This avoids expensive regex parsing for common cases like 
oss://bucket/key, s3a://bucket/key, etc.
+     *
+     * @param path the input path
+     * @return converted s3:// path if successful, null if the path doesn't 
match simple pattern
+     */
+    private static String trySimpleSchemeConversion(String path) {
+        int delimIndex = path.indexOf(SCHEME_DELIM);
+        if (delimIndex <= 0) {
+            return null;
+        }
+
+        String scheme = path.substring(0, delimIndex).toLowerCase();
+        for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) {
+            if (compatibleScheme.equals(scheme)) {
+                // Simple conversion: replace scheme with "s3"
+                // e.g., "oss://bucket/key" -> "s3://bucket/key"
+                return S3_SCHEME_PREFIX + path.substring(delimIndex + 
SCHEME_DELIM.length());
+            }
+        }
+        return null;
+    }
+
     /**
      * Extracts and returns the raw URI string from the given props map.
      *


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to