Nitin-Kashyap commented on code in PR #27784: URL: https://github.com/apache/doris/pull/27784#discussion_r1414247571
########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java: ########## @@ -438,9 +469,85 @@ public List<Column> initSchema() { columns = tmpSchema; } initPartitionColumns(columns); + initBucketingColumns(columns); return columns; } + private void initBucketingColumns(List<Column> columns) { + List<String> bucketCols = new ArrayList<>(5); + int numBuckets = getBucketColums(bucketCols); + if (bucketCols.isEmpty()) { + bucketColumns = ImmutableList.of(); + distributionInfo = new RandomDistributionInfo(1, true); + return; + } + + int bucketingVersion = Integer.valueOf(remoteTable.getParameters().getOrDefault(BUCKETING_VERSION, + "2")); + ImmutableList.Builder<Column> bucketColBuilder = ImmutableList.builder(); + for (String colName : bucketCols) { + // do not use "getColum()", which will cause dead loop + for (Column column : columns) { + if (colName.equals(column.getName())) { + // For partition column, if it is string type, change it to varchar(65535) + // to be same as doris managed table. + // This is to avoid some unexpected behavior such as different partition pruning result + // between doris managed table and external table. 
+ if (column.getType().getPrimitiveType() == PrimitiveType.STRING) { + column.setType(ScalarType.createVarcharType(ScalarType.MAX_VARCHAR_LENGTH)); + } + bucketColBuilder.add(column); + break; + } + } + } + + bucketColumns = bucketColBuilder.build(); + distributionInfo = new HiveExternalDistributionInfo(numBuckets, bucketColumns, bucketingVersion); + LOG.debug("get {} bucket columns for table: {}", bucketColumns.size(), name); + } + + private int getBucketColums(List<String> bucketCols) { + StorageDescriptor descriptor = remoteTable.getSd(); + int numBuckets = -1; + if (descriptor.isSetBucketCols() && !descriptor.getBucketCols().isEmpty()) { + /* Hive Bucketed Table */ + bucketCols.addAll(descriptor.getBucketCols()); + numBuckets = descriptor.getNumBuckets(); + } else if (remoteTable.isSetParameters() + && !Collections.disjoint(SUPPORTED_BUCKET_PROPERTIES, remoteTable.getParameters().keySet())) { + Map<String, String> parameters = remoteTable.getParameters(); + for (String key : SUPPORTED_BUCKET_PROPERTIES) { + if (parameters.containsKey(key)) { + switch (key) { + case SPARK_BUCKET + "0": + bucketCols.add(0, parameters.get(key)); + break; + case SPARK_BUCKET + "1": + bucketCols.add(1, parameters.get(key)); + break; + case SPARK_BUCKET + "2": + bucketCols.add(2, parameters.get(key)); + break; + case SPARK_BUCKET + "3": + bucketCols.add(3, parameters.get(key)); + break; + case SPARK_BUCKET + "4": Review Comment: No, it can be more. I have reworked the code for this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org