xxiao2018 commented on code in PR #19133:
URL: https://github.com/apache/doris/pull/19133#discussion_r1184622189
##########
fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java:
##########
@@ -104,6 +105,10 @@ public void setSortColumn(String column) {
         sortColumn = column;
     }

+    protected List<Split> getSplits() throws UserException {
+        return null;

Review Comment:
   Better to throw `NotImplementedException`?
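For illustration, the suggested default might look like the sketch below. It assumes Doris's `org.apache.doris.common.NotImplementedException`; if that class does not extend `UserException`, the `throws` clause would need adjusting:

```java
// Sketch only: fail loudly instead of returning null, so a ScanNode subclass
// that forgets to override getSplits() surfaces a clear error.
protected List<Split> getSplits() throws UserException {
    throw new NotImplementedException(
            "getSplits() is not implemented for " + getClass().getSimpleName());
}
```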
##########
fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java:
##########
@@ -17,56 +17,104 @@
 package org.apache.doris.planner.external;

-import org.apache.doris.analysis.Expr;
+import org.apache.doris.analysis.Analyzer;
+import org.apache.doris.analysis.SlotDescriptor;
+import org.apache.doris.analysis.TupleDescriptor;
+import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.HiveMetaStoreClientHelper;
 import org.apache.doris.catalog.ListPartitionItem;
 import org.apache.doris.catalog.PartitionItem;
+import org.apache.doris.catalog.TableIf;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.catalog.external.HMSExternalTable;
+import org.apache.doris.common.DdlException;
+import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.UserException;
 import org.apache.doris.common.util.Util;
 import org.apache.doris.datasource.HMSExternalCatalog;
 import org.apache.doris.datasource.hive.HiveMetaStoreCache;
 import org.apache.doris.datasource.hive.HivePartition;
-import org.apache.doris.external.hive.util.HiveUtil;
-import org.apache.doris.planner.ColumnRange;
 import org.apache.doris.planner.ListPartitionPrunerV2;
-import org.apache.doris.planner.Split;
-import org.apache.doris.planner.Splitter;
+import org.apache.doris.planner.PlanNodeId;
 import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.spi.Split;
+import org.apache.doris.statistics.StatisticalType;
+import org.apache.doris.thrift.TFileAttributes;
+import org.apache.doris.thrift.TFileFormatType;
+import org.apache.doris.thrift.TFileTextScanRangeParams;
+import org.apache.doris.thrift.TFileType;

 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import org.apache.hadoop.fs.BlockLocation;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.apache.hadoop.mapred.InputFormat;
-import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;

-public class HiveSplitter implements Splitter {
+public class HiveScanNode extends FileQueryScanNode {
+    private static final Logger LOG = LogManager.getLogger(HiveScanNode.class);

-    private static final Logger LOG = LogManager.getLogger(HiveSplitter.class);
+    public static final String PROP_FIELD_DELIMITER = "field.delim";
+    public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"
+    public static final String DEFAULT_LINE_DELIMITER = "\n";

-    private HMSExternalTable hmsTable;
-    private Map<String, ColumnRange> columnNameToRange;
-    private int totalPartitionNum = 0;
-    private int readPartitionNum = 0;
+    private final HMSExternalTable hmsTable;

-    public HiveSplitter(HMSExternalTable hmsTable, Map<String, ColumnRange> columnNameToRange) {
-        this.hmsTable = hmsTable;
-        this.columnNameToRange = columnNameToRange;
+    /**
+     * * External file scan node for Query Hive table
+     * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv
+     * eg: s3 tvf
+     * These scan nodes do not have corresponding catalog/database/table info, so no need to do priv check
+     */
+    public HiveScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumnPriv) {
+        super(id, desc, "HIVE_SCAN_NODE", StatisticalType.HIVE_SCAN_NODE, needCheckColumnPriv);
+        hmsTable = (HMSExternalTable) desc.getTable();
+    }
+
+    public HiveScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName,
+            StatisticalType statisticalType, boolean needCheckColumnPriv) {
+        super(id, desc, planNodeName, statisticalType, needCheckColumnPriv);
+        hmsTable = (HMSExternalTable) desc.getTable();
+    }
+
+    @Override
+    public void init(Analyzer analyzer) throws UserException {
+        super.init(analyzer);
+        doInitialize();
+    }
+
+    /**
+     * Init ExternalFileScanNode, ONLY used for Nereids. Should NOT use this function in anywhere else.
+     */
+    public void init() throws UserException {
+        doInitialize();
+    }
+
+    @Override
+    protected void doInitialize() throws UserException {
+        super.doInitialize();
+        genSlotToSchemaIdMap();
+        String inputFormat = hmsTable.getRemoteTable().getSd().getInputFormat();
+        if (inputFormat.contains("TextInputFormat")) {
+            for (SlotDescriptor slot : desc.getSlots()) {
+                if (!slot.getType().isScalarType()) {
+                    throw new UserException("For column `" + slot.getColumn().getName()
+                            + "`, The column types ARRAY/MAP/STRUCT are not supported yet"
+                            + " for text input format of Hive. ");
+                }
+            }
+        }
     }

     @Override
-    public List<Split> getSplits(List<Expr> exprs) throws UserException {
+    protected List<Split> getSplits() throws UserException {

Review Comment:
   The logic of checking `useSelfSplitter` can be extracted as a method of `Catalog`.
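A sketch of the extraction the comment suggests; the method name, property key, and default are illustrative, not taken from the PR:

```java
// In HMSExternalCatalog (hypothetical): let the catalog decide whether Doris
// splits files itself instead of asking the Hadoop InputFormat for splits.
public boolean useSelfSplitter() {
    // Assumes catalog properties are exposed as a Map<String, String>;
    // the key "file.split.use.self.splitter" is made up for this sketch.
    return Boolean.parseBoolean(
            getProperties().getOrDefault("file.split.use.self.splitter", "true"));
}
```

`HiveScanNode.getSplits()` would then query the catalog, e.g. `((HMSExternalCatalog) hmsTable.getCatalog()).useSelfSplitter()`, instead of re-deriving the flag locally.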
##########
fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java:
##########
@@ -17,56 +17,104 @@
 package org.apache.doris.planner.external;

-import org.apache.doris.analysis.Expr;
+import org.apache.doris.analysis.Analyzer;
+import org.apache.doris.analysis.SlotDescriptor;
+import org.apache.doris.analysis.TupleDescriptor;
+import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.HiveMetaStoreClientHelper;
 import org.apache.doris.catalog.ListPartitionItem;
 import org.apache.doris.catalog.PartitionItem;
+import org.apache.doris.catalog.TableIf;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.catalog.external.HMSExternalTable;
+import org.apache.doris.common.DdlException;
+import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.UserException;
 import org.apache.doris.common.util.Util;
 import org.apache.doris.datasource.HMSExternalCatalog;
 import org.apache.doris.datasource.hive.HiveMetaStoreCache;
 import org.apache.doris.datasource.hive.HivePartition;
-import org.apache.doris.external.hive.util.HiveUtil;
-import org.apache.doris.planner.ColumnRange;
 import org.apache.doris.planner.ListPartitionPrunerV2;
-import org.apache.doris.planner.Split;
-import org.apache.doris.planner.Splitter;
+import org.apache.doris.planner.PlanNodeId;
 import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.spi.Split;
+import org.apache.doris.statistics.StatisticalType;
+import org.apache.doris.thrift.TFileAttributes;
+import org.apache.doris.thrift.TFileFormatType;
+import org.apache.doris.thrift.TFileTextScanRangeParams;
+import org.apache.doris.thrift.TFileType;

 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import org.apache.hadoop.fs.BlockLocation;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
-import org.apache.hadoop.mapred.InputFormat;
-import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;

-public class HiveSplitter implements Splitter {
+public class HiveScanNode extends FileQueryScanNode {
+    private static final Logger LOG = LogManager.getLogger(HiveScanNode.class);

-    private static final Logger LOG = LogManager.getLogger(HiveSplitter.class);
+    public static final String PROP_FIELD_DELIMITER = "field.delim";
+    public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"
+    public static final String DEFAULT_LINE_DELIMITER = "\n";

-    private HMSExternalTable hmsTable;
-    private Map<String, ColumnRange> columnNameToRange;
-    private int totalPartitionNum = 0;
-    private int readPartitionNum = 0;
+    private final HMSExternalTable hmsTable;

-    public HiveSplitter(HMSExternalTable hmsTable, Map<String, ColumnRange> columnNameToRange) {
-        this.hmsTable = hmsTable;
-        this.columnNameToRange = columnNameToRange;
+    /**
+     * * External file scan node for Query Hive table
+     * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv
+     * eg: s3 tvf
+     * These scan nodes do not have corresponding catalog/database/table info, so no need to do priv check
+     */
+    public HiveScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumnPriv) {
+        super(id, desc, "HIVE_SCAN_NODE", StatisticalType.HIVE_SCAN_NODE, needCheckColumnPriv);
+        hmsTable = (HMSExternalTable) desc.getTable();
+    }
+
+    public HiveScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName,
+            StatisticalType statisticalType, boolean needCheckColumnPriv) {
+        super(id, desc, planNodeName, statisticalType, needCheckColumnPriv);
+        hmsTable = (HMSExternalTable) desc.getTable();
+    }
+
+    @Override
+    public void init(Analyzer analyzer) throws UserException {
+        super.init(analyzer);
+        doInitialize();

Review Comment:
   `doInitialize()` is already called in `super.init()`, so there is no need to call it again here. The same issue applies to the other kinds of ScanNode.
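Assuming `super.init(analyzer)` already invokes `doInitialize()`, as the comment above states, the override would shrink to a plain delegation (or could be deleted entirely):

```java
@Override
public void init(Analyzer analyzer) throws UserException {
    // doInitialize() already runs inside FileQueryScanNode.init(), so calling it
    // again here would initialize the node twice.
    super.init(analyzer);
}
```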
##########
fe/fe-core/src/main/java/org/apache/doris/spi/Split.java:
##########
@@ -15,17 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.

-package org.apache.doris.planner;
+package org.apache.doris.spi;

-import lombok.Data;
+/**
+ * Split interface. e.g. Tablet for Olap Table.
+ */
+public interface Split {

-@Data
-public abstract class Split {
-    protected String[] hosts;
+    String[] getHosts();

-    public Split() {}
+    Object getInfo();

Review Comment:
   What is this for?
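For context on the interface, a hypothetical implementor might look like the sketch below; every name here is illustrative. `getInfo()` is the under-specified part: it returns an opaque, implementation-defined payload, which is what the question above is probing.

```java
// Hypothetical file-based split, illustrating the two methods of the interface.
public class ExampleFileSplit implements Split {
    private final String path;
    private final long start;
    private final long length;
    private final String[] hosts;

    public ExampleFileSplit(String path, long start, long length, String[] hosts) {
        this.path = path;
        this.start = start;
        this.length = length;
        this.hosts = hosts;
    }

    @Override
    public String[] getHosts() {
        // Hosts holding a replica of this split's data, for locality-aware scheduling.
        return hosts;
    }

    @Override
    public Object getInfo() {
        // Opaque per-implementation payload; a tighter contract (or javadoc)
        // would answer the reviewer's question.
        return path + ":" + start + "+" + length;
    }
}
```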
desc, "HIVE_SCAN_NODE", StatisticalType.HIVE_SCAN_NODE, needCheckColumnPriv); + hmsTable = (HMSExternalTable) desc.getTable(); + } + + public HiveScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName, + StatisticalType statisticalType, boolean needCheckColumnPriv) { + super(id, desc, planNodeName, statisticalType, needCheckColumnPriv); + hmsTable = (HMSExternalTable) desc.getTable(); + } + + @Override + public void init(Analyzer analyzer) throws UserException { + super.init(analyzer); + doInitialize(); + } + + /** + * Init ExternalFileScanNode, ONLY used for Nereids. Should NOT use this function in anywhere else. + */ + public void init() throws UserException { Review Comment: This init() is same as in parent class, no need to override it. ########## fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java: ########## @@ -263,6 +265,22 @@ private HivePartition loadPartitions(PartitionCacheKey key) { return new HivePartition(key.dbName, key.tblName, false, sd.getInputFormat(), sd.getLocation(), key.values); } + // Get File Status by using FileSystem API. + public FileCacheValue getFileCache(Path path, InputFormat<?, ?> inputFormat, Review Comment: ```suggestion private FileCacheValue getFileCache(Path path, InputFormat<?, ?> inputFormat, ``` ########## fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java: ########## @@ -1999,7 +2006,29 @@ private PlanNode createScanNode(Analyzer analyzer, TableRef tblRef, SelectStmt s break; case HMS_EXTERNAL_TABLE: case ICEBERG_EXTERNAL_TABLE: - scanNode = new FileQueryScanNode(ctx.getNextNodeId(), tblRef.getDesc(), true); + TableIf table = tblRef.getDesc().getTable(); Review Comment: It is strange to handle `HUDI` and `FunctionGenTable` table in case `HMS_EXTERNAL_TABLE` and `ICEBERG_EXTERNAL_TABLE` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org