This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new d8f7adca512 branch-4.0: [Improvement](Iceberg) Optimize
LocationPath.of performance for Iceberg table queries #59217 (#59427)
d8f7adca512 is described below
commit d8f7adca512f86c4166482744e41ce8983bc3c17
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Dec 29 10:22:36 2025 +0800
branch-4.0: [Improvement](Iceberg) Optimize LocationPath.of performance for
Iceberg table queries #59217 (#59427)
Cherry-picked from #59217
Co-authored-by: Socrates <[email protected]>
---
.../org/apache/doris/common/util/LocationPath.java | 96 ++++++++++++++++++----
.../datasource/iceberg/source/IcebergScanNode.java | 79 +++++++++++++++++-
.../property/storage/S3PropertyUtils.java | 52 +++++++++++-
.../apache/doris/common/util/LocationPathTest.java | 40 +++++++++
.../storage/S3ConnectorPropertiesUtilsTest.java | 12 +++
5 files changed, 260 insertions(+), 19 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
index e4b9aa0b25c..b41cb950ff8 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java
@@ -96,27 +96,25 @@ public class LocationPath {
}
private static String parseScheme(String finalLocation) {
- String scheme = "";
- String[] schemeSplit = finalLocation.split(SCHEME_DELIM);
- if (schemeSplit.length > 1) {
- scheme = schemeSplit[0];
- } else {
- schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM);
- if (schemeSplit.length > 1) {
- scheme = schemeSplit[0];
- }
+ // Use indexOf instead of split for better performance
+ int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM);
+ if (schemeDelimIndex > 0) {
+ return finalLocation.substring(0, schemeDelimIndex);
+ }
+
+ int nonstandardDelimIndex =
finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM);
+ if (nonstandardDelimIndex > 0) {
+ return finalLocation.substring(0, nonstandardDelimIndex);
}
// If no scheme was found, treat the location as a scheme-less local path (e.g. /path/to/local)
- if (scheme.isEmpty()) {
- try {
- Paths.get(finalLocation);
- } catch (InvalidPathException exception) {
- throw new IllegalArgumentException("Fail to parse scheme,
invalid location: " + finalLocation);
- }
+ try {
+ Paths.get(finalLocation);
+ } catch (InvalidPathException exception) {
+ throw new IllegalArgumentException("Fail to parse scheme, invalid
location: " + finalLocation);
}
- return scheme;
+ return "";
}
/**
@@ -201,6 +199,72 @@ public class LocationPath {
}
}
+ /**
+ * Ultra-fast factory method that directly constructs LocationPath without
any parsing.
+ * This is used when the normalized location is already known (e.g., from
prefix transformation).
+ *
+ * @param normalizedLocation the already-normalized location string
+ * @param schema pre-computed schema
+ * @param fsIdentifier pre-computed filesystem identifier
+ * @param storageProperties the storage properties (can be null)
+ * @return a new LocationPath instance
+ */
+ public static LocationPath ofDirect(String normalizedLocation,
+ String schema,
+ String fsIdentifier,
+ StorageProperties storageProperties) {
+ return new LocationPath(schema, normalizedLocation, fsIdentifier,
storageProperties);
+ }
+
+ /**
+ * Fast factory method that reuses pre-computed schema and fsIdentifier.
+ * This is optimized for batch processing where many files share the same
bucket/prefix.
+ *
+ * @param location the input URI location string
+ * @param storageProperties pre-computed storage properties for
normalization
+ * @param cachedSchema pre-computed schema (can be null to compute)
+ * @param cachedFsIdPrefix pre-computed fsIdentifier prefix like "s3://"
(can be null to compute)
+ * @return a new LocationPath instance
+ */
+ public static LocationPath ofWithCache(String location,
+ StorageProperties storageProperties,
+ String cachedSchema,
+ String cachedFsIdPrefix) {
+ try {
+ String normalizedLocation =
storageProperties.validateAndNormalizeUri(location);
+
+ String fsIdentifier;
+ if (cachedFsIdPrefix != null &&
normalizedLocation.startsWith(cachedFsIdPrefix)) {
+ // Fast path: extract authority from normalized location
without full URI parsing
+ int authorityStart = cachedFsIdPrefix.length();
+ int authorityEnd = normalizedLocation.indexOf('/',
authorityStart);
+ if (authorityEnd == -1) {
+ authorityEnd = normalizedLocation.length();
+ }
+ String authority =
normalizedLocation.substring(authorityStart, authorityEnd);
+ if (authority.isEmpty()) {
+ throw new StoragePropertiesException("Invalid location,
missing authority: " + normalizedLocation);
+ }
+ fsIdentifier = cachedFsIdPrefix + authority;
+ } else {
+ // Fallback to full URI parsing
+ String encodedLocation = encodedLocation(normalizedLocation);
+ URI uri = URI.create(encodedLocation);
+ String authority = uri.getAuthority();
+ if (Strings.isNullOrEmpty(authority)) {
+ throw new StoragePropertiesException("Invalid location,
missing authority: " + normalizedLocation);
+ }
+ fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://"
+ + authority;
+ }
+
+ String schema = cachedSchema != null ? cachedSchema :
extractScheme(location);
+ return new LocationPath(schema, normalizedLocation, fsIdentifier,
storageProperties);
+ } catch (UserException e) {
+ throw new StoragePropertiesException("Failed to create
LocationPath for location: " + location, e);
+ }
+ }
+
/**
* Extracts the URI scheme (e.g., "s3", "hdfs") from the location string.
*
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
index fb3d746fb10..696b1dacd76 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
@@ -117,6 +117,18 @@ public class IcebergScanNode extends FileQueryScanNode {
private Map<StorageProperties.Type, StorageProperties>
storagePropertiesMap;
private Map<String, String> backendStorageProperties;
+ // Cached values for LocationPath creation optimization
+ // These are lazily initialized on first use to avoid parsing overhead for
each file
+ private boolean locationPathCacheInitialized = false;
+ private StorageProperties cachedStorageProperties;
+ private String cachedSchema;
+ private String cachedFsIdPrefix;
+ // Cache for path prefix transformation to avoid repeated S3URI parsing
+ // Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to
normalized prefix (e.g., "s3://bucket/")
+ private String cachedOriginalPathPrefix;
+ private String cachedNormalizedPathPrefix;
+ private String cachedFsIdentifier;
+
// for test
@VisibleForTesting
public IcebergScanNode(PlanNodeId id, TupleDescriptor desc,
SessionVariable sv) {
@@ -364,9 +376,74 @@ public class IcebergScanNode extends FileQueryScanNode {
return TableScanUtil.splitFiles(scan.planFiles(), targetSplitSize);
}
+ /**
+ * Initialize cached values for LocationPath creation on first use.
+ * This avoids repeated StorageProperties lookup, scheme parsing, and
S3URI regex parsing for each file.
+ */
+ private void initLocationPathCache(String samplePath) {
+ try {
+ // Create a LocationPath using the full method to get all cached
values
+ LocationPath sampleLocationPath = LocationPath.of(samplePath,
storagePropertiesMap);
+ cachedStorageProperties =
sampleLocationPath.getStorageProperties();
+ cachedSchema = sampleLocationPath.getSchema();
+ cachedFsIdentifier = sampleLocationPath.getFsIdentifier();
+
+ // Extract fsIdPrefix like "s3://" from fsIdentifier like
"s3://bucket"
+ int schemeEnd = cachedFsIdentifier.indexOf("://");
+ if (schemeEnd > 0) {
+ cachedFsIdPrefix = cachedFsIdentifier.substring(0, schemeEnd +
3);
+ }
+
+ // Cache path prefix mapping for fast transformation
+ // This allows subsequent files to skip S3URI regex parsing
entirely
+ String normalizedPath = sampleLocationPath.getNormalizedLocation();
+
+ // Find the common prefix by looking for the last '/' before the
filename
+ int lastSlashInOriginal = samplePath.lastIndexOf('/');
+ int lastSlashInNormalized = normalizedPath.lastIndexOf('/');
+
+ if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) {
+ cachedOriginalPathPrefix = samplePath.substring(0,
lastSlashInOriginal + 1);
+ cachedNormalizedPathPrefix = normalizedPath.substring(0,
lastSlashInNormalized + 1);
+ }
+
+ locationPathCacheInitialized = true;
+ } catch (Exception e) {
+ // If caching fails, try to initialize again on next use
+ LOG.warn("Failed to initialize LocationPath cache, will use full
parsing", e);
+ }
+ }
+
+ /**
+ * Create a LocationPath with cached values for better performance.
+ * Uses cached path prefix mapping to completely bypass S3URI regex
parsing for most files.
+ * Falls back to full parsing if cache is not available or path doesn't
match cached prefix.
+ */
+ private LocationPath createLocationPathWithCache(String path) {
+ // Initialize cache on first call
+ if (!locationPathCacheInitialized) {
+ initLocationPathCache(path);
+ }
+
+ // Fast path: if path starts with cached original prefix, directly
transform without any parsing
+ if (cachedOriginalPathPrefix != null &&
path.startsWith(cachedOriginalPathPrefix)) {
+ // Transform: replace original prefix with normalized prefix
+ String normalizedPath = cachedNormalizedPathPrefix +
path.substring(cachedOriginalPathPrefix.length());
+ return LocationPath.ofDirect(normalizedPath, cachedSchema,
cachedFsIdentifier, cachedStorageProperties);
+ }
+
+ // Medium path: use cached StorageProperties but still need
validateAndNormalizeUri
+ if (cachedStorageProperties != null) {
+ return LocationPath.ofWithCache(path, cachedStorageProperties,
cachedSchema, cachedFsIdPrefix);
+ }
+
+ // Fallback to full parsing
+ return LocationPath.of(path, storagePropertiesMap);
+ }
+
private Split createIcebergSplit(FileScanTask fileScanTask) {
String originalPath = fileScanTask.file().path().toString();
- LocationPath locationPath = LocationPath.of(originalPath,
storagePropertiesMap);
+ LocationPath locationPath = createLocationPathWithCache(originalPath);
IcebergSplit split = new IcebergSplit(
locationPath,
fileScanTask.start(),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
index 99064b4e2e2..4e64f13a635 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java
@@ -33,6 +33,15 @@ import java.util.Optional;
public class S3PropertyUtils {
private static final Logger LOG =
LogManager.getLogger(S3PropertyUtils.class);
+ private static final String SCHEME_DELIM = "://";
+ private static final String S3_SCHEME_PREFIX = "s3://";
+
+ // S3-compatible schemes that can be converted to s3:// with simple string
replacement
+ // Format: scheme://bucket/key -> s3://bucket/key
+ private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = {
+ "s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs"
+ };
+
/**
* Constructs the S3 endpoint from a given URI in the props map.
*
@@ -113,7 +122,8 @@ public class S3PropertyUtils {
/**
* Validates and normalizes the given path into a standard S3 URI.
- * If the input already starts with "s3://", it is returned as-is.
+ * If the input already starts with "s3://", it is returned as-is. Simple S3-compatible schemes
+ * (s3a://, oss://, cos://, etc.) are rewritten to "s3://" by string replacement, avoiding expensive regex parsing.
* Otherwise, it is parsed and converted into an S3-compatible URI format.
*
* @param path the raw S3-style path or full URI
@@ -132,16 +142,54 @@ public class S3PropertyUtils {
if (StringUtils.isBlank(path)) {
throw new StoragePropertiesException("path is null");
}
- if (path.startsWith("s3://")) {
+
+ // Fast path 1: s3:// paths are already in the normalized format
expected by BE
+ if (path.startsWith(S3_SCHEME_PREFIX)) {
return path;
}
+ // Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://,
etc.)
+ // can be converted with simple string replacement:
scheme://bucket/key -> s3://bucket/key
+ String normalized = trySimpleSchemeConversion(path);
+ if (normalized != null) {
+ return normalized;
+ }
+
+ // Full parsing path: for HTTP URLs and other complex formats
boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle);
boolean forceParsingByStandardUri =
Boolean.parseBoolean(stringForceParsingByStandardUri);
S3URI s3uri = S3URI.create(path, usePathStyle,
forceParsingByStandardUri);
return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() +
S3URI.PATH_DELIM + s3uri.getKey();
}
+ /**
+ * Try to convert simple S3-compatible scheme URIs to s3:// format using
string replacement.
+ * This avoids expensive regex parsing for common cases like
oss://bucket/key, s3a://bucket/key, etc.
+ *
+ * @param path the input path
+ * @return converted s3:// path if successful, null if the path doesn't
match simple pattern
+ */
+ private static String trySimpleSchemeConversion(String path) {
+ int delimIndex = path.indexOf(SCHEME_DELIM);
+ if (delimIndex <= 0) {
+ return null;
+ }
+
+ String scheme = path.substring(0, delimIndex).toLowerCase();
+ for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) {
+ if (compatibleScheme.equals(scheme)) {
+ String rest = path.substring(delimIndex +
SCHEME_DELIM.length());
+ if (rest.isEmpty() || rest.startsWith(S3URI.PATH_DELIM) ||
rest.contains(SCHEME_DELIM)) {
+ return null;
+ }
+ // Simple conversion: replace scheme with "s3"
+ // e.g., "oss://bucket/key" -> "s3://bucket/key"
+ return S3_SCHEME_PREFIX + rest;
+ }
+ }
+ return null;
+ }
+
/**
* Extracts and returns the raw URI string from the given props map.
*
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java
b/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java
index 0051ea494b0..8a73619824c 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java
@@ -19,6 +19,7 @@ package org.apache.doris.common.util;
import org.apache.doris.common.UserException;
import org.apache.doris.datasource.property.storage.StorageProperties;
+import
org.apache.doris.datasource.property.storage.exception.StoragePropertiesException;
import org.apache.doris.fs.FileSystemType;
import org.apache.doris.thrift.TFileType;
@@ -300,4 +301,43 @@ public class LocationPathTest {
}
+ @Test
+ public void testLocationPathDirect() {
+ StorageProperties storageProperties =
STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
+ LocationPath locationPath = LocationPath.ofDirect("s3://bucket/key",
"s3", "s3://bucket", storageProperties);
+ Assertions.assertEquals("s3://bucket/key",
locationPath.getNormalizedLocation());
+ Assertions.assertEquals("s3", locationPath.getSchema());
+ Assertions.assertEquals("s3://bucket", locationPath.getFsIdentifier());
+ Assertions.assertEquals(storageProperties,
locationPath.getStorageProperties());
+ }
+
+ @Test
+ public void testLocationPathWithCacheFastPath() {
+ StorageProperties storageProperties =
STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
+ String location = "s3://bucket/path/to/file";
+ LocationPath cached = LocationPath.ofWithCache(location,
storageProperties, "s3", "s3://");
+ LocationPath full = LocationPath.of(location, STORAGE_PROPERTIES_MAP);
+ Assertions.assertEquals(full.getNormalizedLocation(),
cached.getNormalizedLocation());
+ Assertions.assertEquals(full.getFsIdentifier(),
cached.getFsIdentifier());
+ Assertions.assertEquals(full.getSchema(), cached.getSchema());
+ }
+
+ @Test
+ public void testLocationPathWithCacheFallback() {
+ StorageProperties storageProperties =
STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
+ String location = "s3://bucket/path/to/file";
+ LocationPath cached = LocationPath.ofWithCache(location,
storageProperties, "s3", null);
+ LocationPath full = LocationPath.of(location, STORAGE_PROPERTIES_MAP);
+ Assertions.assertEquals(full.getNormalizedLocation(),
cached.getNormalizedLocation());
+ Assertions.assertEquals(full.getFsIdentifier(),
cached.getFsIdentifier());
+ Assertions.assertEquals(full.getSchema(), cached.getSchema());
+ }
+
+ @Test
+ public void testLocationPathWithCacheMissingAuthority() {
+ StorageProperties storageProperties =
STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
+ Assertions.assertThrows(StoragePropertiesException.class,
+ () -> LocationPath.ofWithCache("s3:///path",
storageProperties, "s3", "s3://"));
+ }
+
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/S3ConnectorPropertiesUtilsTest.java
b/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/S3ConnectorPropertiesUtilsTest.java
index 261f8f273c4..a1aba03d5aa 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/S3ConnectorPropertiesUtilsTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/S3ConnectorPropertiesUtilsTest.java
@@ -108,4 +108,16 @@ class S3ConnectorPropertiesUtilsTest {
Assertions.assertThrows(StoragePropertiesException.class, () ->
S3PropertyUtils.validateAndNormalizeUri("", "false", "false"));
Assertions.assertThrows(UserException.class, () ->
S3PropertyUtils.validateAndNormalizeUri("not a uri", "true", "true"));
}
+
+ @Test
+ void testSimpleSchemeConversion() throws UserException {
+ String[] schemes = {"s3a", "s3n", "oss", "cos", "cosn", "obs", "bos",
"gs"};
+ for (String scheme : schemes) {
+ String input = scheme + "://bucket/key";
+ Assertions.assertEquals("s3://bucket/key",
+ S3PropertyUtils.validateAndNormalizeUri(input, "false",
"false"));
+ }
+ Assertions.assertEquals("s3://bucket/key",
+ S3PropertyUtils.validateAndNormalizeUri("OSS://bucket/key",
"false", "false"));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]