This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch tpc_preview4-external in repository https://gitbox.apache.org/repos/asf/doris.git
commit c7572648fb955873e24cadc5cbbdcd2bb816b758 Author: Socrates <[email protected]> AuthorDate: Sat Dec 20 22:35:38 2025 +0800 Optimize location for tpch1000 (#59218) --- .../org/apache/doris/common/util/LocationPath.java | 89 ++++++++++++++++++---- .../datasource/iceberg/source/IcebergScanNode.java | 87 ++++++++++++++++++++- .../property/storage/S3PropertyUtils.java | 48 +++++++++++- 3 files changed, 205 insertions(+), 19 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java index e4b9aa0b25c..cbe2b01d912 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java @@ -96,27 +96,25 @@ public class LocationPath { } private static String parseScheme(String finalLocation) { - String scheme = ""; - String[] schemeSplit = finalLocation.split(SCHEME_DELIM); - if (schemeSplit.length > 1) { - scheme = schemeSplit[0]; - } else { - schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM); - if (schemeSplit.length > 1) { - scheme = schemeSplit[0]; - } + // Use indexOf instead of split for better performance + int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM); + if (schemeDelimIndex > 0) { + return finalLocation.substring(0, schemeDelimIndex); + } + + int nonstandardDelimIndex = finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM); + if (nonstandardDelimIndex > 0) { + return finalLocation.substring(0, nonstandardDelimIndex); } // if not get scheme, need consider /path/to/local to no scheme - if (scheme.isEmpty()) { - try { - Paths.get(finalLocation); - } catch (InvalidPathException exception) { - throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation); - } + try { + Paths.get(finalLocation); + } catch (InvalidPathException exception) { + throw new IllegalArgumentException("Fail to parse scheme, invalid location: " 
+ finalLocation); } - return scheme; + return ""; } /** @@ -201,6 +199,65 @@ public class LocationPath { } } + /** + * Ultra-fast factory method that directly constructs LocationPath without any parsing. + * This is used when the normalized location is already known (e.g., from prefix transformation). + * + * @param normalizedLocation the already-normalized location string + * @param schema pre-computed schema + * @param fsIdentifier pre-computed filesystem identifier + * @param storageProperties the storage properties (can be null) + * @return a new LocationPath instance + */ + public static LocationPath ofDirect(String normalizedLocation, + String schema, + String fsIdentifier, + StorageProperties storageProperties) { + return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties); + } + + /** + * Fast factory method that reuses pre-computed schema and fsIdentifier. + * This is optimized for batch processing where many files share the same bucket/prefix. + * + * @param location the input URI location string + * @param storageProperties pre-computed storage properties for normalization + * @param cachedSchema pre-computed schema (can be null to compute) + * @param cachedFsIdPrefix pre-computed fsIdentifier prefix like "s3://" (can be null to compute) + * @return a new LocationPath instance + */ + public static LocationPath ofWithCache(String location, + StorageProperties storageProperties, + String cachedSchema, + String cachedFsIdPrefix) { + try { + String normalizedLocation = storageProperties.validateAndNormalizeUri(location); + + String fsIdentifier; + if (cachedFsIdPrefix != null && normalizedLocation.startsWith(cachedFsIdPrefix)) { + // Fast path: extract authority from normalized location without full URI parsing + int authorityStart = cachedFsIdPrefix.length(); + int authorityEnd = normalizedLocation.indexOf('/', authorityStart); + if (authorityEnd == -1) { + authorityEnd = normalizedLocation.length(); + } + String authority = 
normalizedLocation.substring(authorityStart, authorityEnd); + fsIdentifier = cachedFsIdPrefix + authority; + } else { + // Fallback to full URI parsing + String encodedLocation = encodedLocation(normalizedLocation); + URI uri = URI.create(encodedLocation); + fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://" + + Strings.nullToEmpty(uri.getAuthority()); + } + + String schema = cachedSchema != null ? cachedSchema : extractScheme(location); + return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties); + } catch (UserException e) { + throw new StoragePropertiesException("Failed to create LocationPath for location: " + location, e); + } + } + /** * Extracts the URI scheme (e.g., "s3", "hdfs") from the location string. * diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index 133ac067644..698a6a380f0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -133,6 +133,17 @@ public class IcebergScanNode extends FileQueryScanNode { private Map<String, String> backendStorageProperties; private Boolean isBatchMode = null; + // Cached values for LocationPath creation optimization + // These are lazily initialized on first use to avoid parsing overhead for each file + private volatile StorageProperties cachedStorageProperties; + private volatile String cachedSchema; + private volatile String cachedFsIdPrefix; + private volatile boolean locationPathCacheInitialized = false; + // Cache for path prefix transformation to avoid repeated S3URI parsing + // Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to normalized prefix (e.g., "s3://bucket/") + private volatile String cachedOriginalPathPrefix; + private volatile String 
cachedNormalizedPathPrefix; + private volatile String cachedFsIdentifier; // for test @VisibleForTesting @@ -547,9 +558,83 @@ public class IcebergScanNode extends FileQueryScanNode { return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize); } + /** + * Initialize cached values for LocationPath creation on first use. + * This avoids repeated StorageProperties lookup, scheme parsing, and S3URI regex parsing for each file. + */ + private void initLocationPathCache(String samplePath) { + if (locationPathCacheInitialized) { + return; + } + synchronized (this) { + if (locationPathCacheInitialized) { + return; + } + try { + // Create a LocationPath using the full method to get all cached values + LocationPath sampleLocationPath = LocationPath.of(samplePath, storagePropertiesMap); + cachedStorageProperties = sampleLocationPath.getStorageProperties(); + cachedSchema = sampleLocationPath.getSchema(); + cachedFsIdentifier = sampleLocationPath.getFsIdentifier(); + + // Extract fsIdPrefix like "s3://" from fsIdentifier like "s3://bucket" + int schemeEnd = cachedFsIdentifier.indexOf("://"); + if (schemeEnd > 0) { + cachedFsIdPrefix = cachedFsIdentifier.substring(0, schemeEnd + 3); + } + + // Cache path prefix mapping for fast transformation + // This allows subsequent files to skip S3URI regex parsing entirely + String normalizedPath = sampleLocationPath.getNormalizedLocation(); + + // Find the common prefix by looking for the last '/' before the filename + int lastSlashInOriginal = samplePath.lastIndexOf('/'); + int lastSlashInNormalized = normalizedPath.lastIndexOf('/'); + + if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) { + cachedOriginalPathPrefix = samplePath.substring(0, lastSlashInOriginal + 1); + cachedNormalizedPathPrefix = normalizedPath.substring(0, lastSlashInNormalized + 1); + } + + locationPathCacheInitialized = true; + } catch (Exception e) { + // If caching fails, we'll fall back to the full method each time + 
LOG.warn("Failed to initialize LocationPath cache, will use full parsing", e); + locationPathCacheInitialized = true; + } + } + } + + /** + * Create a LocationPath with cached values for better performance. + * Uses cached path prefix mapping to completely bypass S3URI regex parsing for most files. + * Falls back to full parsing if cache is not available or path doesn't match cached prefix. + */ + private LocationPath createLocationPathWithCache(String path) { + // Initialize cache on first call + if (!locationPathCacheInitialized) { + initLocationPathCache(path); + } + + // Fast path: if path starts with cached original prefix, directly transform without any parsing + if (cachedOriginalPathPrefix != null && path.startsWith(cachedOriginalPathPrefix)) { + // Transform: replace original prefix with normalized prefix + String normalizedPath = cachedNormalizedPathPrefix + path.substring(cachedOriginalPathPrefix.length()); + return LocationPath.ofDirect(normalizedPath, cachedSchema, cachedFsIdentifier, cachedStorageProperties); + } + + // Medium path: use cached StorageProperties but still need validateAndNormalizeUri + if (cachedStorageProperties != null) { + return LocationPath.ofWithCache(path, cachedStorageProperties, cachedSchema, cachedFsIdPrefix); + } + + // Fallback to full parsing + return LocationPath.of(path, storagePropertiesMap); + } + private Split createIcebergSplit(FileScanTask fileScanTask) { String originalPath = fileScanTask.file().path().toString(); - LocationPath locationPath = LocationPath.of(originalPath, storagePropertiesMap); + LocationPath locationPath = createLocationPathWithCache(originalPath); IcebergSplit split = new IcebergSplit( locationPath, fileScanTask.start(), diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java index 99064b4e2e2..71360fc4799 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java @@ -33,6 +33,15 @@ import java.util.Optional; public class S3PropertyUtils { private static final Logger LOG = LogManager.getLogger(S3PropertyUtils.class); + private static final String SCHEME_DELIM = "://"; + private static final String S3_SCHEME_PREFIX = "s3://"; + + // S3-compatible schemes that can be converted to s3:// with simple string replacement + // Format: scheme://bucket/key -> s3://bucket/key + private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = { + "s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs" + }; + /** * Constructs the S3 endpoint from a given URI in the props map. * @@ -113,7 +122,8 @@ public class S3PropertyUtils { /** * Validates and normalizes the given path into a standard S3 URI. - * If the input already starts with "s3://", it is returned as-is. + * If the input already starts with "s3://", it is returned as-is; if it uses another simple + * S3-compatible scheme (s3a://, oss://, etc.), only the scheme is rewritten to "s3://" without regex parsing. * Otherwise, it is parsed and converted into an S3-compatible URI format. * * @param path the raw S3-style path or full URI @@ -132,16 +142,50 @@ public class S3PropertyUtils { if (StringUtils.isBlank(path)) { throw new StoragePropertiesException("path is null"); } - if (path.startsWith("s3://")) { + + // Fast path 1: s3:// paths are already in the normalized format expected by BE + if (path.startsWith(S3_SCHEME_PREFIX)) { return path; } + // Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://, etc.) 
+ // can be converted with simple string replacement: scheme://bucket/key -> s3://bucket/key + String normalized = trySimpleSchemeConversion(path); + if (normalized != null) { + return normalized; + } + + // Full parsing path: for HTTP URLs and other complex formats boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle); boolean forceParsingByStandardUri = Boolean.parseBoolean(stringForceParsingByStandardUri); S3URI s3uri = S3URI.create(path, usePathStyle, forceParsingByStandardUri); return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() + S3URI.PATH_DELIM + s3uri.getKey(); } + /** + * Try to convert simple S3-compatible scheme URIs to s3:// format using string replacement. + * This avoids expensive regex parsing for common cases like oss://bucket/key, s3a://bucket/key, etc. + * + * @param path the input path + * @return converted s3:// path if successful, null if the path doesn't match simple pattern + */ + private static String trySimpleSchemeConversion(String path) { + int delimIndex = path.indexOf(SCHEME_DELIM); + if (delimIndex <= 0) { + return null; + } + + String scheme = path.substring(0, delimIndex).toLowerCase(); + for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) { + if (compatibleScheme.equals(scheme)) { + // Simple conversion: replace scheme with "s3" + // e.g., "oss://bucket/key" -> "s3://bucket/key" + return S3_SCHEME_PREFIX + path.substring(delimIndex + SCHEME_DELIM.length()); + } + } + return null; + } + /** * Extracts and returns the raw URI string from the given props map. * --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
