[GitHub] [doris] morningman commented on a diff in pull request #19677: [Feature](avro) Support Apache Avro file format

via GitHub Tue, 16 May 2023 02:35:10 -0700


morningman commented on code in PR #19677:
URL: https://github.com/apache/doris/pull/19677#discussion_r1194735479



##########
be/src/vec/exec/scan/avro_reader.cpp:
##########
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "avro_reader.h"
+
+#include <map>
+#include <ostream>
+
+#include "runtime/descriptors.h"
+#include "runtime/types.h"
+
+namespace doris::vectorized {
+
+AvroReader::AvroReader(RuntimeState *state, RuntimeProfile *profile,
+                       const TFileScanRangeParams &params,
+                       const std::vector<SlotDescriptor *> &file_slot_descs)
+        : _file_slot_descs(file_slot_descs), _state(state), _profile(profile) {
+    std::ostringstream required_fields;
+    std::ostringstream columns_types;
+    std::vector<std::string> column_names;
+    int index = 0;
+    for (auto &desc: _file_slot_descs) {
+        std::string field = desc->col_name();
+        column_names.emplace_back(field);
+        std::string type = JniConnector::get_hive_type(desc->type());
+        if (index == 0) {
+            required_fields << field;
+            columns_types << type;
+        } else {
+            required_fields << "," << field;
+            columns_types << "#" << type;
+        }
+        index++;
+    }
+
+    TFileType::type type = params.file_type;
+    std::map<String, String> required_param = {{"required_fields", 
required_fields.str()},
+                                               {"columns_types",   
columns_types.str()},
+                                               {"file_type",       
std::to_string(type)}};;
+    switch (type) {
+        case TFileType::FILE_HDFS:
+            required_param.insert(std::make_pair("uri", 
params.hdfs_params.hdfs_conf.data()->value));
+            break;
+        case TFileType::FILE_S3:
+            required_param.insert(params.properties.begin(), 
params.properties.end());
+            break;
+        case TFileType::FILE_LOCAL:
+        case TFileType::FILE_BROKER:
+        case TFileType::FILE_STREAM:
+        default:
+            Status::InternalError("unsupported file reader type: {}", 
std::to_string(type));

Review Comment:
   Can we return a value from a constructor? The `Status` created here is simply discarded.



##########
fe/fe-core/src/main/java/org/apache/doris/tablefunction/S3TableValuedFunction.java:
##########
@@ -79,12 +80,15 @@ public S3TableValuedFunction(Map<String, String> params) 
throws AnalysisExceptio
             
credential.setSessionToken(tvfParams.get(S3Properties.SESSION_TOKEN));
         }
 
-        parseProperties(tvfParams);
         // set S3 location properties
         // these five properties is necessary, no one can be lost.
         locationProperties = S3Properties.credentialToMap(credential);
         String usePathStyle = 
tvfParams.getOrDefault(PropertyConverter.USE_PATH_STYLE, "false");
         locationProperties.put(PropertyConverter.USE_PATH_STYLE, usePathStyle);
+        locationProperties.put(S3Properties.VIRTUAL_BUCKET, virtualBucket);
+        locationProperties.put(S3Properties.VIRTUAL_KEY, getVirtualKey());

Review Comment:
   What are the virtual bucket and key for?



##########
fe/java-udf/pom.xml:
##########
@@ -45,6 +45,26 @@ under the License.
             <artifactId>fe-common</artifactId>
             <version>${project.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.avro</groupId>
+            <artifactId>avro</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-client</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>

Review Comment:
   Add a comment explaining which classes need these dependencies,
   because later we may split them into different modules.



##########
be/src/vec/exec/scan/avro_reader.cpp:
##########
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "avro_reader.h"
+
+#include <map>
+#include <ostream>
+
+#include "runtime/descriptors.h"
+#include "runtime/types.h"
+
+namespace doris::vectorized {
+
+AvroReader::AvroReader(RuntimeState *state, RuntimeProfile *profile,
+                       const TFileScanRangeParams &params,
+                       const std::vector<SlotDescriptor *> &file_slot_descs)
+        : _file_slot_descs(file_slot_descs), _state(state), _profile(profile) {
+    std::ostringstream required_fields;
+    std::ostringstream columns_types;
+    std::vector<std::string> column_names;
+    int index = 0;
+    for (auto &desc: _file_slot_descs) {
+        std::string field = desc->col_name();
+        column_names.emplace_back(field);
+        std::string type = JniConnector::get_hive_type(desc->type());
+        if (index == 0) {
+            required_fields << field;
+            columns_types << type;
+        } else {
+            required_fields << "," << field;
+            columns_types << "#" << type;
+        }
+        index++;
+    }
+
+    TFileType::type type = params.file_type;
+    std::map<String, String> required_param = {{"required_fields", 
required_fields.str()},
+                                               {"columns_types",   
columns_types.str()},
+                                               {"file_type",       
std::to_string(type)}};;
+    switch (type) {
+        case TFileType::FILE_HDFS:
+            required_param.insert(std::make_pair("uri", 
params.hdfs_params.hdfs_conf.data()->value));
+            break;
+        case TFileType::FILE_S3:
+            required_param.insert(params.properties.begin(), 
params.properties.end());
+            break;
+        case TFileType::FILE_LOCAL:
+        case TFileType::FILE_BROKER:
+        case TFileType::FILE_STREAM:
+        default:
+            Status::InternalError("unsupported file reader type: {}", 
std::to_string(type));

Review Comment:
   I think this should be done in `init_reader`?



##########
fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java:
##########
@@ -290,6 +305,56 @@ public static void parseCsvSchema(List<Column> csvSchema, 
Map<String, String> va
         }
     }
 
+    private void parseAvroSchema(List<Column> tableSchema) throws 
AnalysisException {
+        String filePath = getFilePath();

Review Comment:
   File formats that carry their own schema (such as parquet/orc/avro) should get
   their schema by calling `getTableColumns()`, and let BE do the rest.
   
   `parseCsvSchema()` is only for parsing a CSV schema specified by the user
   (CSV files have no embedded schema, so we let the user specify one).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[GitHub] [doris] morningman commented on a diff in pull request #19677: [Feature](avro) Support Apache Avro file format

Reply via email to