[SPARK-49365][PS] Simplify the bucket aggregation in hist plot
### What changes were proposed in this pull request?
Simplify the bucket aggregation in hist plot

### Why are the changes needed?
To simplify the implementation by eliminating the union of multiple DataFrames.
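The idea can be sketched without Spark: instead of bucketizing each column separately and unioning the results, flatten all columns into `(group_id, value)` pairs in a single pass, the way `F.posexplode(F.array(...))` does in the new code. A minimal pure-Python sketch of that reshaping (function name and row layout are hypothetical, for illustration only):

```python
def posexplode_rows(rows, colnames):
    """Flatten rows of named columns into (group_id, value) pairs,
    mimicking F.posexplode(F.array(...)) over the selected columns."""
    out = []
    for row in rows:
        # posexplode pairs each array element with its position,
        # which serves as the column's group id.
        for group_id, colname in enumerate(colnames):
            out.append((group_id, row[colname]))
    # Drop nulls and NaNs, matching handleInvalid="skip" in Bucketizer
    # (v == v is False only for NaN).
    return [(g, v) for g, v in out if v is not None and v == v]

# Example: two rows, two columns; the None is skipped.
rows = [{"a": 1.0, "b": None}, {"a": 2.0, "b": 3.0}]
pairs = posexplode_rows(rows, ["a", "b"])
# pairs == [(0, 1.0), (0, 2.0), (1, 3.0)]
```

One scan over the data replaces N per-column scans plus N-1 unions, which is the point of the change.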

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI and manual checks

### Was this patch authored or co-authored using generative AI tooling?
No

Closes apache#47852 from zhengruifeng/plot_parallel_hist.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
zhengruifeng committed Aug 24, 2024
1 parent ee97caa commit 0c18fc0
Showing 1 changed file with 11 additions and 18 deletions.
`python/pyspark/pandas/plot/core.py`:

```diff
@@ -198,25 +198,18 @@ def binary_search_for_buckets(value):
             idx = bisect.bisect(bins, value) - 1
             return float(idx)
 
-        output_df = None
-        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
-            # sdf.na.drop to match handleInvalid="skip" in Bucketizer
-
-            bucket_df = sdf.na.drop(subset=[colname]).withColumn(
-                bucket_name,
-                binary_search_for_buckets(F.col(colname).cast("double")),
+        output_df = (
+            sdf.select(
+                F.posexplode(
+                    F.array([F.col(colname).cast("double") for colname in colnames])
+                ).alias("__group_id", "__value")
             )
-
-            if output_df is None:
-                output_df = bucket_df.select(
-                    F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
-                )
-            else:
-                output_df = output_df.union(
-                    bucket_df.select(
-                        F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
-                    )
-                )
+            # to match handleInvalid="skip" in Bucketizer
+            .where(F.col("__value").isNotNull() & ~F.col("__value").isNaN()).select(
+                F.col("__group_id"),
+                binary_search_for_buckets(F.col("__value")).alias("__bucket"),
+            )
+        )
 
         # 2. Calculate the count based on each group and bucket.
         # +----------+-------+------+
```
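For context, the `binary_search_for_buckets` UDF kept by this patch maps each value to its bucket index with `bisect`. A standalone sketch of that logic (the explicit `bins` parameter is added here for illustration; in the UDF, `bins` is captured from the enclosing scope):

```python
import bisect

def bucket_index(bins, value):
    # bisect.bisect returns the insertion point to the right of any
    # equal elements, so subtracting 1 gives the index of the bucket
    # whose left edge is the largest edge <= value.
    return float(bisect.bisect(bins, value) - 1)

# Example with bucket edges [0, 1, 2, 3]:
bins = [0.0, 1.0, 2.0, 3.0]
i0 = bucket_index(bins, 0.5)  # falls in [0.0, 1.0) -> bucket 0
i2 = bucket_index(bins, 2.0)  # falls in [2.0, 3.0] -> bucket 2
```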
