bito-code-review[bot] commented on code in PR #37815: URL: https://github.com/apache/superset/pull/37815#discussion_r2875305944
########## superset/semantic_layers/mapper.py: ########## @@ -0,0 +1,947 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Functions for mapping `QueryObject` to semantic layers. + +These functions validate and convert a `QueryObject` into one or more `SemanticQuery`, +which are then passed to semantic layer implementations for execution, returning a +single dataframe. 
+ +""" + +from datetime import datetime, timedelta +from time import time +from typing import Any, cast, Sequence, TypeGuard + +import numpy as np +import pyarrow as pa +from superset_core.semantic_layers.semantic_view import SemanticViewFeature +from superset_core.semantic_layers.types import ( + AdhocExpression, + Day, + Dimension, + Filter, + FilterValues, + Grain, + GroupLimit, + Hour, + Metric, + Minute, + Month, + Operator, + OrderDirection, + OrderTuple, + PredicateType, + Quarter, + Second, + SemanticQuery, + SemanticResult, + Week, + Year, +) + +from superset.common.db_query_status import QueryStatus +from superset.common.query_object import QueryObject +from superset.common.utils.time_range_utils import get_since_until_from_query_object +from superset.connectors.sqla.models import BaseDatasource +from superset.constants import NO_TIME_RANGE +from superset.models.helpers import QueryResult +from superset.superset_typing import AdhocColumn +from superset.utils.core import ( + FilterOperator, + QueryObjectFilterClause, + TIME_COMPARISON, +) +from superset.utils.date_parser import get_past_or_future + + +class ValidatedQueryObjectFilterClause(QueryObjectFilterClause): + """ + A validated QueryObject filter clause with a string column name. + + The `col` in a `QueryObjectFilterClause` can be either a string (column name) or an + adhoc column, but we only support the former in semantic layers. + """ + + # overwrite to narrow type; mypy complains about more restrictive typed dicts, + # but the alternative would be to redefine the object + col: str # type: ignore[misc] + op: str # type: ignore[misc] + + +class ValidatedQueryObject(QueryObject): + """ + A query object that has a datasource defined. 
+ """ + + datasource: BaseDatasource + + # overwrite to narrow type; mypy complains about the assignment since the base type + # allows adhoc filters, but we only support validated filters here + filter: list[ValidatedQueryObjectFilterClause] # type: ignore[assignment] + series_columns: Sequence[str] # type: ignore[assignment] + series_limit_metric: str | None + + +def get_results(query_object: QueryObject) -> QueryResult: + """ + Run 1+ queries based on `QueryObject` and return the results. + + :param query_object: The QueryObject containing query specifications + :return: QueryResult compatible with Superset's query interface + """ + if not validate_query_object(query_object): + raise ValueError("QueryObject must have a datasource defined.") + + # Track execution time + start_time = time() + + semantic_view = query_object.datasource.implementation + dispatcher = ( + semantic_view.get_row_count + if query_object.is_rowcount + else semantic_view.get_dataframe + ) + + # Step 1: Convert QueryObject to list of SemanticQuery objects + # The first query is the main query, subsequent queries are for time offsets + queries = map_query_object(query_object) + + # Step 2: Execute the main query (first in the list) + main_query = queries[0] + main_result = dispatcher( + metrics=main_query.metrics, + dimensions=main_query.dimensions, + filters=main_query.filters, + order=main_query.order, + limit=main_query.limit, + offset=main_query.offset, + group_limit=main_query.group_limit, + ) + + main_df = main_result.results.to_pandas() + + # Collect all requests (SQL queries, HTTP requests, etc.) 
for troubleshooting + all_requests = list(main_result.requests) + + # If no time offsets, return the main result as-is + if not query_object.time_offsets or len(queries) <= 1: + duration = timedelta(seconds=time() - start_time) + return map_semantic_result_to_query_result( + main_result, + query_object, + duration, + ) + + # Get metric names from the main query + # These are the columns that will be renamed with offset suffixes + metric_names = [metric.name for metric in main_query.metrics] + + # Join keys are all columns except metrics + # These will be used to match rows between main and offset DataFrames + join_keys = [col for col in main_df.columns if col not in metric_names] + + # Step 3 & 4: Execute each time offset query and join results + for offset_query, time_offset in zip( + queries[1:], + query_object.time_offsets, + strict=False, + ): + # Execute the offset query + result = dispatcher( + metrics=offset_query.metrics, + dimensions=offset_query.dimensions, + filters=offset_query.filters, + order=offset_query.order, + limit=offset_query.limit, + offset=offset_query.offset, + group_limit=offset_query.group_limit, + ) + + # Add this query's requests to the collection + all_requests.extend(result.requests) + + offset_df = result.results.to_pandas() + + # Handle empty results - add NaN columns directly instead of merging + # This avoids dtype mismatch issues with empty DataFrames + if offset_df.empty: + # Add offset metric columns with NaN values directly to main_df + for metric in metric_names: + offset_col_name = TIME_COMPARISON.join([metric, time_offset]) + main_df[offset_col_name] = np.nan + else: + # Rename metric columns with time offset suffix + # Format: "{metric_name}__{time_offset}" + # Example: "revenue" -> "revenue__1 week ago" + offset_df = offset_df.rename( + columns={ + metric: TIME_COMPARISON.join([metric, time_offset]) + for metric in metric_names + } + ) + + # Step 5: Perform left join on dimension columns + # This preserves all rows from 
main_df and adds offset metrics + # where they match + main_df = main_df.merge( + offset_df, + on=join_keys, + how="left", + suffixes=("", "__duplicate"), + ) + + # Clean up any duplicate columns that might have been created + # (shouldn't happen with proper join keys, but defensive programming) + duplicate_cols = [ + col for col in main_df.columns if col.endswith("__duplicate") + ] + if duplicate_cols: + main_df = main_df.drop(columns=duplicate_cols) + + # Convert final result to QueryResult + semantic_result = SemanticResult( + requests=all_requests, + results=pa.Table.from_pandas(main_df), + ) + duration = timedelta(seconds=time() - start_time) + return map_semantic_result_to_query_result( + semantic_result, + query_object, + duration, + ) + + +def map_semantic_result_to_query_result( + semantic_result: SemanticResult, + query_object: ValidatedQueryObject, + duration: timedelta, +) -> QueryResult: + """ + Convert a SemanticResult to a QueryResult. + + :param semantic_result: Result from the semantic layer + :param query_object: Original QueryObject (for passthrough attributes) + :param duration: Time taken to execute the query + :return: QueryResult compatible with Superset's query interface + """ + # Get the query string from requests (typically one or more SQL queries) + query_str = "" + if semantic_result.requests: + # Join all requests for display (could be multiple for time comparisons) + query_str = "\n\n".join( + f"-- {req.type}\n{req.definition}" for req in semantic_result.requests + ) + + return QueryResult( + # Core data + df=semantic_result.results.to_pandas(), + query=query_str, + duration=duration, + # Template filters - not applicable to semantic layers + # (semantic layers don't use Jinja templates) + applied_template_filters=None, + # Filter columns - not applicable to semantic layers + # (semantic layers handle filter validation internally) + applied_filter_columns=None, + rejected_filter_columns=None, + # Status - always success if we got here 
+ # (errors would raise exceptions before reaching this point) + status=QueryStatus.SUCCESS, + error_message=None, + errors=None, + # Time range - pass through from original query_object + from_dttm=query_object.from_dttm, + to_dttm=query_object.to_dttm, + ) + + +def _normalize_column(column: str | AdhocColumn, dimension_names: set[str]) -> str: + """ + Normalize a column to its dimension name. + + Columns can be either: + - A string (dimension name directly) + - An AdhocColumn with isColumnReference=True and sqlExpression containing the + dimension name + """ + if isinstance(column, str): + return column + + # Handle column references (e.g., from time-series charts) + if column.get("isColumnReference") and (sql_expr := column.get("sqlExpression")): + if sql_expr in dimension_names: + return sql_expr + + raise ValueError("Adhoc dimensions are not supported in Semantic Views.") + + +def map_query_object(query_object: ValidatedQueryObject) -> list[SemanticQuery]: + """ + Convert a `QueryObject` into a list of `SemanticQuery`. + + This function maps the `QueryObject` into query objects that focus less on + visualization and more on semantics. 
+ """ + semantic_view = query_object.datasource.implementation + + all_metrics = {metric.name: metric for metric in semantic_view.metrics} + all_dimensions = { + dimension.name: dimension for dimension in semantic_view.dimensions + } + + # Normalize columns (may be dicts with isColumnReference=True for time-series) + dimension_names = set(all_dimensions.keys()) + normalized_columns = { + _normalize_column(column, dimension_names) for column in query_object.columns + } + + metrics = [all_metrics[metric] for metric in (query_object.metrics or [])] + + grain = ( + _convert_time_grain(query_object.extras["time_grain_sqla"]) + if "time_grain_sqla" in query_object.extras + else None + ) + dimensions = [ + dimension + for dimension in semantic_view.dimensions + if dimension.name in normalized_columns + and ( + # if a grain is specified, only include the time dimension if its grain + # matches the requested grain + grain is None + or dimension.name != query_object.granularity + or dimension.grain == grain + ) + ] + + order = _get_order_from_query_object(query_object, all_metrics, all_dimensions) + limit = query_object.row_limit + offset = query_object.row_offset + + group_limit = _get_group_limit_from_query_object( + query_object, + all_metrics, + all_dimensions, + ) + + queries = [] + for time_offset in [None] + query_object.time_offsets: + filters = _get_filters_from_query_object( + query_object, + time_offset, + all_dimensions, + ) + print(">>", filters) Review Comment: <div> <div id="suggestion"> <div id="issue"><b>Debug print statement in production code</b></div> <div id="fix"> Remove debug `print` statement before merging. This should be replaced with proper logging if needed. 
</div> <details> <summary> <b>Code suggestion</b> </summary> <blockquote>Check the AI-generated fix before applying</blockquote> <div id="code"> ``` - print(">>", filters) ``` </div> </details> </div> <small><i>Code Review Run #0e1b4d</i></small> </div> --- Should Bito avoid suggestions like this for future reviews? (<a href=https://alpha.bito.ai/home/ai-agents/review-rules>Manage Rules</a>) - [ ] Yes, avoid them ########## superset-core/pyproject.toml: ########## @@ -43,6 +43,7 @@ classifiers = [ ] dependencies = [ "flask-appbuilder>=5.0.2,<6", + "pyarrow>=16.0.0", "pydantic>=2.8.0", "sqlalchemy>=1.4.0,<2.0", Review Comment: <div> <div id="suggestion"> <div id="issue"><b>Dependency version inconsistency</b></div> <div id="fix"> The pyarrow dependency minimum version is set to 16.0.0, but requirements files pin to 16.1.0 which includes critical bug fixes for Parquet null value decoding and thread safety issues. Since pyarrow is used for CSV and Parquet data parsing in the codebase, using 16.0.0 could lead to incorrect data handling in production. Update to >=16.1.0 for consistency and reliability. </div> <details> <summary> <b>Code suggestion</b> </summary> <blockquote>Check the AI-generated fix before applying</blockquote> <div id="code"> ````suggestion dependencies = [ "flask-appbuilder>=5.0.2,<6", "pyarrow>=16.1.0", "pydantic>=2.8.0", "sqlalchemy>=1.4.0,<2.0", ```` </div> </details> </div> <small><i>Code Review Run #0e1b4d</i></small> </div> --- Should Bito avoid suggestions like this for future reviews? (<a href=https://alpha.bito.ai/home/ai-agents/review-rules>Manage Rules</a>) - [ ] Yes, avoid them -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
