HappenLee commented on code in PR #12730: URL: https://github.com/apache/doris/pull/12730#discussion_r975191153
########## be/src/exec/parquet_writer.h: ########## @@ -63,14 +63,35 @@ class ParquetOutputStream : public arrow::io::OutputStream { int64_t _written_len = 0; }; +class ParquetBuildHelper { +public: + static void build_schema_repetition_type( + parquet::Repetition::type& parquet_repetition_type, + const TParquetRepetitionType::type& column_repetition_type); + + static void build_schema_data_type(parquet::Type::type& parquet_data_type, + const TParquetDataType::type& column_data_type); + + static void build_compression_type(parquet::WriterProperties::Builder& builder, + const TParquetCompressionType::type& compression_type); + + static void build_version(parquet::WriterProperties::Builder& builder, + const TParquetVersion::type& parquet_version); +}; + // a wrapper of parquet output stream class ParquetWriterWrapper { public: - ParquetWriterWrapper(FileWriter* file_writer, const std::vector<ExprContext*>& output_expr_ctxs, - const std::map<std::string, std::string>& properties, - const std::vector<std::vector<std::string>>& schema, - bool output_object_data); - virtual ~ParquetWriterWrapper(); + ParquetWriterWrapper(doris::FileWriter* file_writer, + const std::vector<ExprContext*>& output_vexpr_ctxs, + const std::vector<TParquetRepetitionType::type>& schemas_repetition_type, + const std::vector<TParquetDataType::type>& schemas_data_type, + const std::vector<std::string>& schemas_column_name, + const TParquetCompressionType::type& compression_type, + const bool& parquet_disable_dictionary, + const TParquetVersion::type& parquet_version, bool output_object_data); + + virtual ~ParquetWriterWrapper() = default; Review Comment: if `ParquetWriterWrapper` do not have subclass, this function do need to be virtual ########## gensrc/thrift/DataSinks.thrift: ########## @@ -51,8 +84,18 @@ struct TResultFileSinkOptions { 6: optional list<Types.TNetworkAddress> broker_addresses; // only for remote file 7: optional map<string, string> broker_properties // only for remote file 8: optional string success_file_name - 9: optional list<list<string>> schema // for parquet/orc file - 10: optional map<string, string> file_properties // for parquet/orc file + 9: optional list<list<string>> schema // for orc file + 10: optional map<string, string> file_properties // for orc file + + //note: use outfile with parquet format, have deprecated 9:schema and 10:file_properties + //because when this info thrift to BE, BE hava to find useful info in string, + //have to check by use string directly, and maybe not so efficient Review Comment: if we deprecated 9 and 10, We should consider in detail the compatibility of the upgrade -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org