This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new fa2f7d5e7449 [SPARK-53584][PYTHON] Improve process_column_param validation and column parameter docstring fa2f7d5e7449 is described below commit fa2f7d5e744918c9b4f04b6102a1b2cadba15429 Author: Xinrong Meng <xinr...@apache.org> AuthorDate: Tue Sep 16 11:57:53 2025 +0900 [SPARK-53584][PYTHON] Improve process_column_param validation and column parameter docstring ### What changes were proposed in this pull request? - process_column_param: switched from repeated linear scans of data.schema.fields to a one-time dictionary build, which makes validation faster, especially for wide schemas. - Clarified the behavior when column=None in docstrings. ### Why are the changes needed? Ensure user-facing docs describe behavior in edge cases without confusion; improve performance of column validation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #52344 from xinrong-meng/plot_util. Authored-by: Xinrong Meng <xinr...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/plot/core.py | 11 +++++++---- python/pyspark/sql/plot/plotly.py | 7 +++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/plot/core.py b/python/pyspark/sql/plot/core.py index 563731b81cc8..526f4897d390 100644 --- a/python/pyspark/sql/plot/core.py +++ b/python/pyspark/sql/plot/core.py @@ -373,7 +373,8 @@ class PySparkPlotAccessor: ---------- column: str or list of str, optional Column name or list of names to be used for creating the box plot. - If None (default), all numeric columns will be used. + If None (default), all numeric columns will be used. If no numeric columns exist, + behavior may depend on the plot backend. **kwargs Extra arguments to `precision`: refer to a float that is used by pyspark to compute approximate statistics for building a boxplot. 
@@ -426,7 +427,8 @@ class PySparkPlotAccessor: See KernelDensity in PySpark for more information. column: str or list of str, optional Column name or list of names to be used for creating the kde plot. - If None (default), all numeric columns will be used. + If None (default), all numeric columns will be used. If no numeric columns exist, + behavior may depend on the plot backend. ind : List of float, NumPy array or integer, optional Evaluation points for the estimated PDF. If None (default), 1000 equally spaced points are used. If `ind` is a NumPy array, the @@ -465,8 +467,9 @@ class PySparkPlotAccessor: Parameters ---------- column: str or list of str, optional - Column name or list of names to be used for creating the hostogram plot. - If None (default), all numeric columns will be used. + Column name or list of names to be used for creating the histogram plot. + If None (default), all numeric columns will be used. If no numeric columns exist, + behavior may depend on the plot backend. bins : integer, default 10 Number of histogram bins to be used. **kwargs diff --git a/python/pyspark/sql/plot/plotly.py b/python/pyspark/sql/plot/plotly.py index 3953308ae20c..584d3869fe36 100644 --- a/python/pyspark/sql/plot/plotly.py +++ b/python/pyspark/sql/plot/plotly.py @@ -262,15 +262,14 @@ def process_column_param(column: Optional[Union[str, List[str]]], data: "DataFra - Raises a PySparkTypeError if any column in the list is not present in the DataFrame or is not of NumericType. 
""" + fields_by_name = {f.name: f for f in data.schema.fields} if column is None: - return [ - field.name for field in data.schema.fields if isinstance(field.dataType, NumericType) - ] + return [name for name, f in fields_by_name.items() if isinstance(f.dataType, NumericType)] if isinstance(column, str): column = [column] for col in column: - field = next((f for f in data.schema.fields if f.name == col), None) + field = fields_by_name.get(col) if not field or not isinstance(field.dataType, NumericType): raise PySparkTypeError( errorClass="PLOT_INVALID_TYPE_COLUMN", --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org