zclllyybb commented on code in PR #48956: URL: https://github.com/apache/doris/pull/48956#discussion_r2009197877
########## be/src/geo/kml_parse_ctx.h: ########## @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "geo/geo_common.h" +#include "geo/geo_types.h" +#include "geo/wkt_parse_type.h" + +namespace doris { + +// Parsing context state for KML geometry +struct KmlParseContext { + // ========================== Global State ========================== + GeoShapeType current_shape_type = + GEO_SHAPE_ANY; // Type of geometry being parsed (Point/LineString/Polygon) + bool in_coordinates = false; // Flag for <coordinates> tag scope + std::string char_buffer; // Temporary buffer for character data accumulation + + // ========================== Geometry Data Storage ========================== + // GeoCoordinateList for Point/LineString + GeoCoordinateList* coordinates = nullptr; + + // Polygon-specific parsing state + struct { + bool in_outer_boundary = false; // Inside <outerBoundaryIs> tag + bool in_inner_boundary = false; // Inside <innerBoundaryIs> tag + GeoCoordinateListList rings; // Coordinate rings (outer + inner) + } polygon_ctx; + + // ========================== Output Handling ========================== + std::unique_ptr<GeoShape> shape = nullptr; + GeoParseStatus status = GEO_PARSE_OK; // Parsing status code + + // ========================== Utility Methods ========================== + // Reset temporary state before parsing new geometry + void reset() { + current_shape_type = GEO_SHAPE_ANY; + in_coordinates = false; + char_buffer.clear(); + if (coordinates == nullptr) { + coordinates = new GeoCoordinateList(); + } + coordinates->clear(); + polygon_ctx = {}; // Reset polygon context + shape = nullptr; + status = GEO_PARSE_OK; + } + + ~KmlParseContext() { delete coordinates; } + + // Parse coordinate string (format: "lng,lat lng,lat...]") + bool parse_coordinates(const std::string& str) { + std::istringstream iss(str); Review Comment: do not use iss, it's inefficient. you can just parse the string ########## be/src/geo/kml_parse_ctx.h: ########## @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "geo/geo_common.h" +#include "geo/geo_types.h" +#include "geo/wkt_parse_type.h" + +namespace doris { + +// Parsing context state for KML geometry +struct KmlParseContext { + // ========================== Global State ========================== + GeoShapeType current_shape_type = + GEO_SHAPE_ANY; // Type of geometry being parsed (Point/LineString/Polygon) + bool in_coordinates = false; // Flag for <coordinates> tag scope + std::string char_buffer; // Temporary buffer for character data accumulation + + // ========================== Geometry Data Storage ========================== + // GeoCoordinateList for Point/LineString + GeoCoordinateList* coordinates = nullptr; + + // Polygon-specific parsing state + struct { + bool in_outer_boundary = false; // Inside <outerBoundaryIs> tag + bool in_inner_boundary = false; // Inside <innerBoundaryIs> tag + GeoCoordinateListList rings; // Coordinate rings (outer + inner) + } polygon_ctx; + + // ========================== Output Handling ========================== + std::unique_ptr<GeoShape> shape = nullptr; + GeoParseStatus status = GEO_PARSE_OK; // Parsing status code + + // ========================== Utility Methods ========================== + // Reset temporary state before parsing new geometry + void reset() { + current_shape_type = GEO_SHAPE_ANY; + in_coordinates = false; + char_buffer.clear(); + if (coordinates == nullptr) { + coordinates = new GeoCoordinateList(); + } + coordinates->clear(); + polygon_ctx = {}; // Reset polygon context + shape = nullptr; + status = GEO_PARSE_OK; + } + + ~KmlParseContext() { delete coordinates; } + + // Parse coordinate string (format: "lng,lat lng,lat...]") + bool parse_coordinates(const std::string& str) { + std::istringstream iss(str); + std::string token; + while (std::getline(iss, token, ' ')) { + size_t comma_pos = token.find(','); + if (comma_pos == std::string::npos) { + status = GEO_PARSE_COORD_INVALID; + return false; + } + try { + double lng = std::stod(token.substr(0, comma_pos)); Review Comment: `substr` made an unnecessary copy. use the C-style function `strtod` to avoid this. ########## be/src/vec/functions/functions_geo.cpp: ########## @@ -904,6 +906,74 @@ struct StAsBinary { } }; +struct StGeomFromKML { + static constexpr auto NEED_CONTEXT = true; + static constexpr auto NAME = "st_geomfromkml"; + static const size_t NUM_ARGS = 1; + using Type = DataTypeString; + + static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result) { + DCHECK_EQ(arguments.size(), 1); + auto return_type = block.get_data_type(result); + const auto& kml_col = block.get_by_position(arguments[0]).column; + + const size_t num_rows = kml_col->size(); + auto res = ColumnString::create(); + auto null_map = ColumnUInt8::create(num_rows, 0); + auto& null_map_data = null_map->get_data(); + res->reserve(num_rows); + std::string buf; + KmlParseContext parse_ctx; + + for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { + // 1. Retrieve the KML input data for the current row + const auto kml_data_ref = kml_col->get_data_at(row_idx); + const std::string kml_input(kml_data_ref.data, kml_data_ref.size); Review Comment: will do useless copy. remove it. ########## be/src/vec/functions/functions_geo.cpp: ########## @@ -904,6 +906,74 @@ struct StAsBinary { } }; +struct StGeomFromKML { + static constexpr auto NEED_CONTEXT = true; + static constexpr auto NAME = "st_geomfromkml"; + static const size_t NUM_ARGS = 1; + using Type = DataTypeString; + + static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result) { + DCHECK_EQ(arguments.size(), 1); + auto return_type = block.get_data_type(result); + const auto& kml_col = block.get_by_position(arguments[0]).column; + + const size_t num_rows = kml_col->size(); + auto res = ColumnString::create(); + auto null_map = ColumnUInt8::create(num_rows, 0); + auto& null_map_data = null_map->get_data(); + res->reserve(num_rows); + std::string buf; + KmlParseContext parse_ctx; + + for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { + // 1. Retrieve the KML input data for the current row + const auto kml_data_ref = kml_col->get_data_at(row_idx); Review Comment: the input must be `ColumnString`, so use this to do `assert_cast` and use non-virtual function ########## be/src/geo/kml_parse.cpp: ########## @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kml_parse.h" + +#include <libxml/xmlstring.h> + +#include "kml_parse_ctx.h" + +namespace doris { + +xmlSAXHandler KmlParser::sax_handler = {.startElement = start_element, + .endElement = end_element, + .characters = characters, + .initialized = XML_SAX2_MAGIC}; + +GeoParseStatus KmlParser::parse(const std::string& kml_input, KmlParseContext& ctx) { + // Create a push parser context with the SAX handler and the provided context + xmlParserCtxtPtr ctxt = xmlCreatePushParserCtxt(&sax_handler, &ctx, nullptr, 0, nullptr); Review Comment: why pass ctx as user_data? ########## be/src/geo/kml_parse.cpp: ########## @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kml_parse.h" + +#include <libxml/xmlstring.h> + +#include "kml_parse_ctx.h" + +namespace doris { + +xmlSAXHandler KmlParser::sax_handler = {.startElement = start_element, + .endElement = end_element, + .characters = characters, + .initialized = XML_SAX2_MAGIC}; + +GeoParseStatus KmlParser::parse(const std::string& kml_input, KmlParseContext& ctx) { + // Create a push parser context with the SAX handler and the provided context + xmlParserCtxtPtr ctxt = xmlCreatePushParserCtxt(&sax_handler, &ctx, nullptr, 0, nullptr); + if (!ctxt) { + return GEO_PARSE_XML_SYNTAX_ERROR; + } + + // Parse the KML input data in chunks (here, the entire data is pushed at once) + xmlParseChunk(ctxt, kml_input.data(), kml_input.size(), 0); + + // Check if the context status indicates an error during parsing + if (ctx.status != GEO_PARSE_OK) { + // Stop the parser if an error is encountered + xmlStopParser(ctxt); + } + + // Finalize the parsing process by signaling the end of the input + xmlParseChunk(ctxt, nullptr, 0, 1); + + // Check if the XML is well-formed + if (ctxt->wellFormed != 1) { + // Free the parser context and return a syntax error if the XML is not well-formed + xmlFreeParserCtxt(ctxt); + return GEO_PARSE_XML_SYNTAX_ERROR; + } + + // Free the parser context + xmlFreeParserCtxt(ctxt); + + // Return the final status recorded in the context + return ctx.status; +} + +void KmlParser::start_element(void* ctx, const xmlChar* name, const xmlChar** attrs) { + auto* context = static_cast<KmlParseContext*>(ctx); + if (context->status != GEO_PARSE_OK) { + return; + } + + const char* tag = reinterpret_cast<const char*>(name); + if (strcasecmp(tag, "Point") == 0) { Review Comment: how to make sure that's all we need? I think there's `MultiGeometry` and more? ########## be/src/geo/kml_parse.cpp: ########## @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kml_parse.h" + +#include <libxml/xmlstring.h> + +#include "kml_parse_ctx.h" + +namespace doris { + +xmlSAXHandler KmlParser::sax_handler = {.startElement = start_element, + .endElement = end_element, + .characters = characters, + .initialized = XML_SAX2_MAGIC}; + +GeoParseStatus KmlParser::parse(const std::string& kml_input, KmlParseContext& ctx) { + // Create a push parser context with the SAX handler and the provided context + xmlParserCtxtPtr ctxt = xmlCreatePushParserCtxt(&sax_handler, &ctx, nullptr, 0, nullptr); + if (!ctxt) { + return GEO_PARSE_XML_SYNTAX_ERROR; + } + + // Parse the KML input data in chunks (here, the entire data is pushed at once) + xmlParseChunk(ctxt, kml_input.data(), kml_input.size(), 0); + + // Check if the context status indicates an error during parsing + if (ctx.status != GEO_PARSE_OK) { + // Stop the parser if an error is encountered + xmlStopParser(ctxt); + } + + // Finalize the parsing process by signaling the end of the input + xmlParseChunk(ctxt, nullptr, 0, 1); + + // Check if the XML is well-formed + if (ctxt->wellFormed != 1) { + // Free the parser context and return a syntax error if the XML is not well-formed + xmlFreeParserCtxt(ctxt); + return GEO_PARSE_XML_SYNTAX_ERROR; + } + + // Free the parser context + xmlFreeParserCtxt(ctxt); + + // Return the final status recorded in the context + return ctx.status; +} + +void KmlParser::start_element(void* ctx, const xmlChar* name, const xmlChar** attrs) { + auto* context = static_cast<KmlParseContext*>(ctx); + if (context->status != GEO_PARSE_OK) { + return; + } + + const char* tag = reinterpret_cast<const char*>(name); + if (strcasecmp(tag, "Point") == 0) { Review Comment: we need some cases to make sure we can ignore irrelative elements ########## be/src/geo/kml_parse.cpp: ########## @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kml_parse.h" + +#include <libxml/xmlstring.h> + +#include "kml_parse_ctx.h" + +namespace doris { + +xmlSAXHandler KmlParser::sax_handler = {.startElement = start_element, + .endElement = end_element, + .characters = characters, + .initialized = XML_SAX2_MAGIC}; + +GeoParseStatus KmlParser::parse(const std::string& kml_input, KmlParseContext& ctx) { + // Create a push parser context with the SAX handler and the provided context + xmlParserCtxtPtr ctxt = xmlCreatePushParserCtxt(&sax_handler, &ctx, nullptr, 0, nullptr); + if (!ctxt) { + return GEO_PARSE_XML_SYNTAX_ERROR; + } + + // Parse the KML input data in chunks (here, the entire data is pushed at once) + xmlParseChunk(ctxt, kml_input.data(), kml_input.size(), 0); + + // Check if the context status indicates an error during parsing + if (ctx.status != GEO_PARSE_OK) { + // Stop the parser if an error is encountered + xmlStopParser(ctxt); + } + + // Finalize the parsing process by signaling the end of the input + xmlParseChunk(ctxt, nullptr, 0, 1); + + // Check if the XML is well-formed + if (ctxt->wellFormed != 1) { + // Free the parser context and return a syntax error if the XML is not well-formed + xmlFreeParserCtxt(ctxt); + return GEO_PARSE_XML_SYNTAX_ERROR; + } + + // Free the parser context + xmlFreeParserCtxt(ctxt); + + // Return the final status recorded in the context + return ctx.status; +} + +void KmlParser::start_element(void* ctx, const xmlChar* name, const xmlChar** attrs) { + auto* context = static_cast<KmlParseContext*>(ctx); + if (context->status != GEO_PARSE_OK) { + return; + } + + const char* tag = reinterpret_cast<const char*>(name); Review Comment: why do casting here? It seems that `xmlChar` is just an alias of `unsigned char`? ########## be/src/geo/kml_parse_ctx.h: ########## @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "geo/geo_common.h" +#include "geo/geo_types.h" +#include "geo/wkt_parse_type.h" + +namespace doris { + +// Parsing context state for KML geometry +struct KmlParseContext { + // ========================== Global State ========================== + GeoShapeType current_shape_type = + GEO_SHAPE_ANY; // Type of geometry being parsed (Point/LineString/Polygon) + bool in_coordinates = false; // Flag for <coordinates> tag scope + std::string char_buffer; // Temporary buffer for character data accumulation + + // ========================== Geometry Data Storage ========================== + // GeoCoordinateList for Point/LineString + GeoCoordinateList* coordinates = nullptr; Review Comment: use `unique_ptr` instead is better -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org