From a454360378ba872f76969ca2800165baec211e76 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:06:09 -0600 Subject: [PATCH 01/54] feat: add optimized PyArrow UDF execution (CometPythonMapInArrowExec) When Comet operators produce Arrow columnar data and the next operator is a Python UDF (mapInArrow/mapInPandas), Spark currently inserts an unnecessary ColumnarToRow transition. The Python runner then converts those rows back to Arrow to send to Python, creating a wasteful Arrow->Row->Arrow round-trip. This adds CometPythonMapInArrowExec which: - Accepts columnar input directly from Comet operators - Uses lightweight batch.rowIterator() instead of UnsafeProjection - Keeps the Python output as ColumnarBatch (no output row conversion) The optimization is detected in EliminateRedundantTransitions and controlled by spark.comet.exec.pythonMapInArrow.enabled (default: true). --- .../scala/org/apache/comet/CometConf.scala | 10 + .../rules/EliminateRedundantTransitions.scala | 42 +++- .../sql/comet/CometPythonMapInArrowExec.scala | 143 ++++++++++++++ .../resources/pyspark/test_pyarrow_udf.py | 183 ++++++++++++++++++ .../exec/CometPythonMapInArrowSuite.scala | 68 +++++++ 5 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala create mode 100644 spark/src/test/resources/pyspark/test_pyarrow_udf.py create mode 100644 spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index d3f51dfbe2..a06cd896ec 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -314,6 +314,16 @@ object CometConf extends ShimCometConf { .booleanConf .createWithDefault(false) + val COMET_PYTHON_MAP_IN_ARROW_ENABLED: ConfigEntry[Boolean] = + 
conf("spark.comet.exec.pythonMapInArrow.enabled") + .category(CATEGORY_EXEC) + .doc( + "Whether to enable optimized execution of PyArrow UDFs (mapInArrow/mapInPandas). " + + "When enabled, Comet passes Arrow columnar data directly to Python UDFs without " + + "the intermediate Arrow-to-Row-to-Arrow conversion that Spark normally performs.") + .booleanConf + .createWithDefault(true) + val COMET_TRACING_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.tracing.enabled") .category(CATEGORY_TUNING) .doc(s"Enable fine-grained tracing of events and memory usage. $TRACING_GUIDE.") diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index 7402a83248..272ef76484 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -20,13 +20,15 @@ package org.apache.comet.rules import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.sideBySide -import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometSparkToColumnarExec} +import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometPythonMapInArrowExec, CometSparkToColumnarExec} import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec} import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.QueryStageExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec +import org.apache.spark.sql.execution.python.{MapInPandasExec, PythonMapInArrowExec} import 
org.apache.comet.CometConf @@ -98,6 +100,32 @@ case class EliminateRedundantTransitions(session: SparkSession) extends Rule[Spa case CometNativeColumnarToRowExec(sparkToColumnar: CometSparkToColumnarExec) => sparkToColumnar.child case CometSparkToColumnarExec(child: CometSparkToColumnarExec) => child + // Replace MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that has a + // ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary + // Arrow->Row->Arrow round-trip. + case p: PythonMapInArrowExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() => + extractColumnarChild(p.child) + .map { columnarChild => + CometPythonMapInArrowExec( + p.func, + p.output, + columnarChild, + p.isBarrier, + p.func.asInstanceOf[PythonUDF].evalType) + } + .getOrElse(p) + case p: MapInPandasExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() => + extractColumnarChild(p.child) + .map { columnarChild => + CometPythonMapInArrowExec( + p.func, + p.output, + columnarChild, + p.isBarrier, + p.func.asInstanceOf[PythonUDF].evalType) + } + .getOrElse(p) + // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the // shuffle takes row-based input. case s @ CometShuffleExchangeExec( @@ -130,6 +158,18 @@ case class EliminateRedundantTransitions(session: SparkSession) extends Rule[Spa } } + /** + * If the given plan is a ColumnarToRow transition wrapping a columnar child, returns that + * columnar child. Used to detect and eliminate unnecessary transitions before Python UDF + * operators. + */ + private def extractColumnarChild(plan: SparkPlan): Option[SparkPlan] = plan match { + case ColumnarToRowExec(child) if child.supportsColumnar => Some(child) + case CometColumnarToRowExec(child) => Some(child) + case CometNativeColumnarToRowExec(child) => Some(child) + case _ => None + } + /** * Creates an appropriate columnar to row transition operator. 
* diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala new file mode 100644 index 0000000000..84b3c31113 --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet + +import scala.collection.JavaConverters._ + +import org.apache.spark.{ContextAwareIterator, TaskContext} +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.PythonUDF +import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, PythonSQLMetrics} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} + +/** + * An optimized version of Spark's MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that + * accepts columnar input directly from Comet operators, avoiding unnecessary Arrow -> Row -> + * Arrow conversions. + * + * Normal Spark flow: CometNativeExec (Arrow) -> ColumnarToRow -> PythonMapInArrowExec + * (internally: rows -> Arrow -> Python -> Arrow -> rows) + * + * Optimized flow: CometNativeExec (Arrow) -> CometPythonMapInArrowExec (batch.rowIterator() -> + * Arrow -> Python -> Arrow columnar output) + * + * This eliminates: + * 1. The UnsafeProjection in ColumnarToRow (expensive copy) 2. 
The output Arrow->Row conversion + * (keeps Python output as ColumnarBatch) + */ +case class CometPythonMapInArrowExec( + func: Expression, + output: Seq[Attribute], + child: SparkPlan, + isBarrier: Boolean, + pythonEvalType: Int) + extends UnaryExecNode + with PythonSQLMetrics { + + override def supportsColumnar: Boolean = true + + override def producedAttributes: AttributeSet = AttributeSet(output) + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override lazy val metrics: Map[String, SQLMetric] = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"), + "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows")) ++ + pythonMetrics + + override def doExecute(): RDD[InternalRow] = { + ColumnarToRowExec(this).doExecute() + } + + override def doExecuteColumnar(): RDD[ColumnarBatch] = { + val numOutputRows = longMetric("numOutputRows") + val numOutputBatches = longMetric("numOutputBatches") + val numInputRows = longMetric("numInputRows") + + val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf) + val pythonFunction = func.asInstanceOf[PythonUDF].func + val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonFunction))) + val localOutput = output + val localChildSchema = child.schema + val batchSize = conf.arrowMaxRecordsPerBatch + val sessionLocalTimeZone = conf.sessionLocalTimeZone + val largeVarTypes = conf.arrowUseLargeVarTypes + val localPythonEvalType = pythonEvalType + val localPythonMetrics = pythonMetrics + val jobArtifactUUID = + org.apache.spark.JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + + val inputRDD = child.executeColumnar() + + inputRDD.mapPartitionsInternal { batches => + val context = TaskContext.get() + val argOffsets = Array(Array(0)) + + // Convert columnar batches to rows using lightweight rowIterator + // (avoids UnsafeProjection copy that 
ColumnarToRow would do) + val rowIter = batches.flatMap { batch => + numInputRows += batch.numRows() + batch.rowIterator().asScala + } + + val contextAwareIterator = new ContextAwareIterator(context, rowIter) + + // Wrap rows as a struct, matching MapInBatchEvaluatorFactory behavior + val wrappedIter = contextAwareIterator.map(InternalRow(_)) + + val batchIter = + if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter) + + val columnarBatchIter = new ArrowPythonRunner( + chainedFunc, + localPythonEvalType, + argOffsets, + org.apache.spark.sql.types + .StructType(Array(org.apache.spark.sql.types.StructField("struct", localChildSchema))), + sessionLocalTimeZone, + largeVarTypes, + pythonRunnerConf, + localPythonMetrics, + jobArtifactUUID).compute(batchIter, context.partitionId(), context) + + columnarBatchIter.map { batch => + // Python returns a StructType column; flatten to individual columns + val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] + val outputVectors = localOutput.indices.map(structVector.getChild) + val flattenedBatch = new ColumnarBatch(outputVectors.toArray) + flattenedBatch.setNumRows(batch.numRows()) + numOutputRows += flattenedBatch.numRows() + numOutputBatches += 1 + flattenedBatch + } + } + } + + override protected def withNewChildInternal(newChild: SparkPlan): CometPythonMapInArrowExec = + copy(child = newChild) +} diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py new file mode 100644 index 0000000000..04b83fe66b --- /dev/null +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Integration test for CometPythonMapInArrowExec. + +This test verifies that Comet's optimized PyArrow UDF execution works correctly +by checking: +1. The plan uses CometPythonMapInArrowExec instead of PythonMapInArrow + ColumnarToRow +2. The UDF produces correct results +3. Performance improvement by eliminating unnecessary Arrow->Row->Arrow conversions + +Usage: + # Build Comet first: make release + # Then run with PySpark: + spark-submit --jars spark/target/comet-spark-spark3.5_2.12-*.jar \ + --conf spark.plugins=org.apache.comet.CometSparkSessionExtensions \ + --conf spark.comet.enabled=true \ + --conf spark.comet.exec.enabled=true \ + --conf spark.comet.exec.pythonMapInArrow.enabled=true \ + --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=2g \ + spark/src/test/resources/pyspark/test_pyarrow_udf.py +""" + +import sys +import pyarrow as pa +from pyspark.sql import SparkSession +from pyspark.sql import types as T + + +def test_map_in_arrow_basic(): + """Test basic mapInArrow with Comet optimization.""" + spark = SparkSession.builder.getOrCreate() + + # Create test data + data = [(i, float(i * 1.5), f"name_{i}") for i in range(100)] + df = spark.createDataFrame(data, ["id", "value", "name"]) + + # Write to parquet so CometScan can read it + 
df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data") + test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data") + + # Define a PyArrow UDF that doubles the value column + def double_value(batch: pa.RecordBatch) -> pa.RecordBatch: + pdf = batch.to_pandas() + pdf["value"] = pdf["value"] * 2 + return pa.RecordBatch.from_pandas(pdf) + + output_schema = T.StructType([ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + T.StructField("name", T.StringType()), + ]) + + # Apply mapInArrow + result_df = test_df.mapInArrow(double_value, output_schema) + + # Check the explain plan + print("=" * 60) + print("PHYSICAL PLAN:") + print("=" * 60) + result_df.explain(mode="extended") + print("=" * 60) + + plan_str = result_df.queryExecution.executedPlan.toString() + print(f"\nPlan string:\n{plan_str}\n") + + # Verify CometPythonMapInArrowExec is in the plan (if Comet is active) + if "CometPythonMapInArrowExec" in plan_str: + print("SUCCESS: CometPythonMapInArrowExec is in the plan!") + elif "CometScan" in plan_str and "ColumnarToRow" in plan_str: + print("WARNING: CometScan present but still using ColumnarToRow before Python UDF") + elif "CometScan" not in plan_str: + print("INFO: Comet is not active for this query (CometScan not found)") + else: + print("INFO: Plan does not contain CometPythonMapInArrowExec") + + # Verify correctness + result = result_df.orderBy("id").collect() + expected_first = data[0] + actual_first = result[0] + + assert actual_first["id"] == expected_first[0], \ + f"ID mismatch: {actual_first['id']} != {expected_first[0]}" + assert abs(actual_first["value"] - expected_first[1] * 2) < 0.001, \ + f"Value mismatch: {actual_first['value']} != {expected_first[1] * 2}" + assert actual_first["name"] == expected_first[2], \ + f"Name mismatch: {actual_first['name']} != {expected_first[2]}" + + print(f"\nFirst row: {actual_first}") + print(f"Expected value (doubled): {expected_first[1] * 2}") + print("CORRECTNESS: PASSED") 
+ + # Verify all rows + for i, row in enumerate(result): + expected_val = data[i][1] * 2 + assert abs(row["value"] - expected_val) < 0.001, \ + f"Row {i}: expected value {expected_val}, got {row['value']}" + + print(f"All {len(result)} rows verified correctly.") + return True + + +def test_map_in_arrow_type_change(): + """Test mapInArrow that changes the schema.""" + spark = SparkSession.builder.getOrCreate() + + data = [(i, float(i)) for i in range(50)] + df = spark.createDataFrame(data, ["id", "value"]) + df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data2") + test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data2") + + def add_computed_column(batch: pa.RecordBatch) -> pa.RecordBatch: + pdf = batch.to_pandas() + pdf["squared"] = pdf["value"] ** 2 + pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}") + return pa.RecordBatch.from_pandas(pdf) + + output_schema = T.StructType([ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + T.StructField("squared", T.DoubleType()), + T.StructField("label", T.StringType()), + ]) + + result_df = test_df.mapInArrow(add_computed_column, output_schema) + result = result_df.orderBy("id").collect() + + assert len(result) == 50 + for i, row in enumerate(result): + assert abs(row["squared"] - float(i) ** 2) < 0.001 + assert row["label"] == f"item_{i}" + + print("test_map_in_arrow_type_change: PASSED") + return True + + +if __name__ == "__main__": + print("Running PyArrow UDF integration tests for Comet...") + print() + + tests = [ + ("test_map_in_arrow_basic", test_map_in_arrow_basic), + ("test_map_in_arrow_type_change", test_map_in_arrow_type_change), + ] + + passed = 0 + failed = 0 + for name, test_fn in tests: + print(f"\n{'=' * 60}") + print(f"Running: {name}") + print(f"{'=' * 60}") + try: + test_fn() + passed += 1 + except Exception as e: + print(f"FAILED: {e}") + import traceback + traceback.print_exc() + failed += 1 + + print(f"\n{'=' * 60}") + print(f"Results: {passed} passed, 
{failed} failed") + print(f"{'=' * 60}") + + sys.exit(0 if failed == 0 else 1) diff --git a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala new file mode 100644 index 0000000000..94145cea2b --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.exec + +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.comet.CometPythonMapInArrowExec +import org.apache.spark.sql.execution.ColumnarToRowExec +import org.apache.spark.sql.execution.python.PythonMapInArrowExec + +import org.apache.comet.CometConf + +class CometPythonMapInArrowSuite extends CometTestBase { + + test("plan with CometScan has columnar support for Python UDF optimization") { + withSQLConf( + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXEC_ENABLED.key -> "true", + CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "true") { + withParquetTable( + (1 to 10).map(i => (i.toDouble, s"str_$i")), + "testTable", + withDictionary = false) { + val df = spark.sql("SELECT * FROM testTable") + val plan = df.queryExecution.executedPlan + val cometScans = plan.collect { case s if s.supportsColumnar => s } + assert(cometScans.nonEmpty, "Expected columnar operators that can feed Python UDFs") + } + } + } + + test("config disables Python map in arrow optimization") { + withSQLConf( + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXEC_ENABLED.key -> "true", + CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "false") { + withParquetTable( + (1 to 10).map(i => (i.toDouble, s"str_$i")), + "testTable", + withDictionary = false) { + val df = spark.sql("SELECT * FROM testTable") + val plan = df.queryExecution.executedPlan + // With the feature disabled, no CometPythonMapInArrowExec should appear + val cometPythonExecs = + plan.collect { case e: CometPythonMapInArrowExec => e } + assert( + cometPythonExecs.isEmpty, + "CometPythonMapInArrowExec should not appear when disabled") + } + } + } +} From 84aec8406f093abff96ca916ca9c4602065f9019 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:11:52 -0600 Subject: [PATCH 02/54] docs: add PyArrow UDF acceleration user guide page Documents the CometPythonMapInArrowExec optimization, including supported APIs, configuration, usage example, 
and how to verify the optimization is active in query plans. --- docs/source/user-guide/latest/index.rst | 1 + docs/source/user-guide/latest/pyarrow-udfs.md | 132 ++++++++++++++++++ .../resources/pyspark/test_pyarrow_udf.py | 3 +- 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 docs/source/user-guide/latest/pyarrow-udfs.md diff --git a/docs/source/user-guide/latest/index.rst b/docs/source/user-guide/latest/index.rst index 480ec4f702..c96dea7750 100644 --- a/docs/source/user-guide/latest/index.rst +++ b/docs/source/user-guide/latest/index.rst @@ -38,5 +38,6 @@ Comet $COMET_VERSION User Guide Understanding Comet Plans Tuning Guide Metrics Guide + PyArrow UDF Acceleration Iceberg Guide Kubernetes Guide diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md new file mode 100644 index 0000000000..71701960cd --- /dev/null +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -0,0 +1,132 @@ + + +# PyArrow UDF Acceleration + +Comet can accelerate Python UDFs that use PyArrow-backed batch processing, such as `mapInArrow` and `mapInPandas`. +These APIs are commonly used for ML inference, feature engineering, and data transformation workloads. + +## Background + +Spark's `mapInArrow` and `mapInPandas` APIs allow users to apply Python functions that operate on Arrow +RecordBatches or Pandas DataFrames. Under the hood, Spark communicates with the Python worker process +using the Arrow IPC format. + +Without Comet, the execution path for these UDFs involves unnecessary data conversions: + +1. Comet reads data in Arrow columnar format (via CometScan) +2. Spark inserts a ColumnarToRow transition (converts Arrow to UnsafeRow) +3. The Python runner converts those rows back to Arrow to send to Python +4. Python executes the UDF on Arrow batches +5. Results are returned as Arrow and then converted back to rows + +Steps 2 and 3 are redundant since the data starts and ends in Arrow format. 
+ +## How Comet Optimizes This + +When enabled, Comet detects `PythonMapInArrowExec` and `MapInPandasExec` operators in the physical plan +and replaces them with `CometPythonMapInArrowExec`, which: + +- Reads Arrow columnar batches directly from the upstream Comet operator +- Feeds them to the Python runner without the expensive UnsafeProjection copy +- Keeps the Python output in columnar format for downstream operators + +This eliminates the ColumnarToRow transition and the output row conversion, reducing CPU overhead +and memory allocations. + +## Configuration + +The optimization is controlled by: + +``` +spark.comet.exec.pythonMapInArrow.enabled=true (default) +``` + +It is enabled by default when Comet execution is active. + +## Supported APIs + +| PySpark API | Spark Plan Node | Supported | +|-------------|-----------------|-----------| +| `df.mapInArrow(func, schema)` | `PythonMapInArrowExec` | Yes | +| `df.mapInPandas(func, schema)` | `MapInPandasExec` | Yes | +| `@pandas_udf` (scalar) | `ArrowEvalPythonExec` | Not yet | +| `df.applyInPandas(func, schema)` | `FlatMapGroupsInPandasExec` | Not yet | + +## Example + +```python +import pyarrow as pa +from pyspark.sql import SparkSession, types as T + +spark = SparkSession.builder \ + .config("spark.plugins", "org.apache.spark.CometPlugin") \ + .config("spark.comet.enabled", "true") \ + .config("spark.comet.exec.enabled", "true") \ + .config("spark.comet.exec.pythonMapInArrow.enabled", "true") \ + .config("spark.memory.offHeap.enabled", "true") \ + .config("spark.memory.offHeap.size", "2g") \ + .getOrCreate() + +df = spark.read.parquet("data.parquet") + +def transform(iterator): + # mapInArrow passes an iterator of pa.RecordBatch and expects batches to be yielded + for batch in iterator: + # Your transformation logic here + table = batch.to_pandas() + table["new_col"] = table["value"] * 2 + yield pa.RecordBatch.from_pandas(table) + +output_schema = T.StructType([ + T.StructField("value", T.DoubleType()), + T.StructField("new_col", T.DoubleType()), +]) + +result = df.mapInArrow(transform, 
output_schema) +``` + +## Verifying the Optimization + +Use `explain()` to verify that the `CometPythonMapInArrow` operator (class `CometPythonMapInArrowExec`; Spark omits the `Exec` suffix when printing plan nodes) appears in your plan: + +```python +result.explain(mode="extended") +``` + +You should see: +``` +CometPythonMapInArrow ... ++- CometNativeExec ... + +- CometScan ... +``` + +Instead of the unoptimized plan: +``` +PythonMapInArrow ... ++- ColumnarToRow + +- CometNativeExec ... + +- CometScan ... +``` + +## Limitations + +- The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs + (`@pandas_udf`) and grouped operations (`applyInPandas`) are not yet supported. +- The internal row-to-Arrow conversion inside the Python runner is still present in this version. + A future optimization will write Arrow batches directly to the Python IPC stream, achieving + near zero-copy data transfer. diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 04b83fe66b..1993f29f9f 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -26,10 +26,11 @@ 3. Performance improvement by eliminating unnecessary Arrow->Row->Arrow conversions Usage: + # Requires Python 3.11 or 3.12 (PySpark 3.5 does not support 3.13+) # Build Comet first: make release # Then run with PySpark: spark-submit --jars spark/target/comet-spark-spark3.5_2.12-*.jar \ - --conf spark.plugins=org.apache.comet.CometSparkSessionExtensions \ + --conf spark.plugins=org.apache.spark.CometPlugin \ --conf spark.comet.enabled=true \ --conf spark.comet.exec.enabled=true \ --conf spark.comet.exec.pythonMapInArrow.enabled=true \ From af98fbba92faed24484ae32504218821b4eb59d7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:38:38 -0600 Subject: [PATCH 03/54] fix(test): correct PyArrow UDF integration test signatures and assertions Fix three issues that prevented test_pyarrow_udf.py from running: 1. 
mapInArrow callbacks must accept Iterator[pa.RecordBatch] and yield batches. The previous single-batch signatures crashed with "'map' object has no attribute 'to_pandas'". 2. PySpark DataFrame has no `queryExecution` attribute. Use `_jdf.queryExecution().executedPlan().toString()` instead. 3. Replace soft plan-string heuristics with assertions that fail loudly if the optimization regresses. Match on `CometPythonMapInArrow` (no `Exec` suffix in the plan toString) and assert no `ColumnarToRow` transition is present. --- .../resources/pyspark/test_pyarrow_udf.py | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 1993f29f9f..6acac6a912 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -58,11 +58,13 @@ def test_map_in_arrow_basic(): df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data") test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data") - # Define a PyArrow UDF that doubles the value column - def double_value(batch: pa.RecordBatch) -> pa.RecordBatch: - pdf = batch.to_pandas() - pdf["value"] = pdf["value"] * 2 - return pa.RecordBatch.from_pandas(pdf) + # Define a PyArrow UDF that doubles the value column. + # mapInArrow callbacks receive an iterator of RecordBatches and must yield batches. 
+ def double_value(iterator): + for batch in iterator: + pdf = batch.to_pandas() + pdf["value"] = pdf["value"] * 2 + yield pa.RecordBatch.from_pandas(pdf) output_schema = T.StructType([ T.StructField("id", T.LongType()), @@ -80,18 +82,17 @@ def double_value(batch: pa.RecordBatch) -> pa.RecordBatch: result_df.explain(mode="extended") print("=" * 60) - plan_str = result_df.queryExecution.executedPlan.toString() + plan_str = result_df._jdf.queryExecution().executedPlan().toString() print(f"\nPlan string:\n{plan_str}\n") - # Verify CometPythonMapInArrowExec is in the plan (if Comet is active) - if "CometPythonMapInArrowExec" in plan_str: - print("SUCCESS: CometPythonMapInArrowExec is in the plan!") - elif "CometScan" in plan_str and "ColumnarToRow" in plan_str: - print("WARNING: CometScan present but still using ColumnarToRow before Python UDF") - elif "CometScan" not in plan_str: - print("INFO: Comet is not active for this query (CometScan not found)") - else: - print("INFO: Plan does not contain CometPythonMapInArrowExec") + # Verify the optimized Comet operator is in the plan. The toString form is + # "CometPythonMapInArrow" (no Exec suffix) and the upstream scan prints as + # "CometNativeScan". 
+ assert "CometPythonMapInArrow" in plan_str, \ + f"CometPythonMapInArrow missing from plan:\n{plan_str}" + assert "ColumnarToRow" not in plan_str, \ + f"Unexpected ColumnarToRow in optimized plan:\n{plan_str}" + print("SUCCESS: CometPythonMapInArrow is in the plan with no ColumnarToRow transition.") # Verify correctness result = result_df.orderBy("id").collect() @@ -128,11 +129,12 @@ def test_map_in_arrow_type_change(): df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data2") test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data2") - def add_computed_column(batch: pa.RecordBatch) -> pa.RecordBatch: - pdf = batch.to_pandas() - pdf["squared"] = pdf["value"] ** 2 - pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}") - return pa.RecordBatch.from_pandas(pdf) + def add_computed_column(iterator): + for batch in iterator: + pdf = batch.to_pandas() + pdf["squared"] = pdf["value"] ** 2 + pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}") + yield pa.RecordBatch.from_pandas(pdf) output_schema = T.StructType([ T.StructField("id", T.LongType()), From f29cb2f53f5437edcfc906129a8ca3253fb0b0ea Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:43:55 -0600 Subject: [PATCH 04/54] test: convert PyArrow UDF script to pytest and add CI coverage - Rewrite test_pyarrow_udf.py as a pytest module. A session-scoped SparkSession fixture builds the Comet-enabled session once and a parametrized `accelerated` fixture toggles spark.comet.exec.pythonMapInArrow.enabled per test, so each case runs under both the optimized and fallback paths and asserts the expected plan operator (`CometPythonMapInArrow` vs vanilla `PythonMapInArrow`). The jar is auto-discovered from spark/target by matching the installed pyspark version, or taken from the COMET_JAR env var. - Add a dedicated `PyArrow UDF Tests` workflow that builds Comet against Spark 3.5 / Scala 2.12, installs pyspark/pyarrow/pandas/pytest, and runs the new pytest module. 
- Add CometPythonMapInArrowSuite to the `exec` suite list in both pr_build_linux.yml and pr_build_macos.yml so the JVM-side suite is exercised on every PR. --- .github/workflows/pr_build_linux.yml | 1 + .github/workflows/pr_build_macos.yml | 1 + .github/workflows/pyarrow_udf_test.yml | 96 ++++++ .../resources/pyspark/test_pyarrow_udf.py | 299 +++++++++--------- 4 files changed, 256 insertions(+), 141 deletions(-) create mode 100644 .github/workflows/pyarrow_udf_test.yml diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index b0f09bc43b..b62a000f6c 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -354,6 +354,7 @@ jobs: org.apache.comet.exec.CometGenerateExecSuite org.apache.comet.exec.CometWindowExecSuite org.apache.comet.exec.CometJoinSuite + org.apache.comet.exec.CometPythonMapInArrowSuite org.apache.comet.CometNativeSuite org.apache.comet.CometSparkSessionExtensionsSuite org.apache.spark.CometPluginsSuite diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index c743d1888a..fe972818e6 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -193,6 +193,7 @@ jobs: org.apache.comet.exec.CometGenerateExecSuite org.apache.comet.exec.CometWindowExecSuite org.apache.comet.exec.CometJoinSuite + org.apache.comet.exec.CometPythonMapInArrowSuite org.apache.comet.CometNativeSuite org.apache.comet.CometSparkSessionExtensionsSuite org.apache.spark.CometPluginsSuite diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml new file mode 100644 index 0000000000..0779f092a4 --- /dev/null +++ b/.github/workflows/pyarrow_udf_test.yml @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: PyArrow UDF Tests + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +on: + push: + branches: + - main + paths: + - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala" + - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala" + - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala" + - "spark/src/test/resources/pyspark/test_pyarrow_udf.py" + - ".github/workflows/pyarrow_udf_test.yml" + - "native/**" + pull_request: + paths: + - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala" + - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala" + - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala" + - "spark/src/test/resources/pyspark/test_pyarrow_udf.py" + - ".github/workflows/pyarrow_udf_test.yml" + - "native/**" + workflow_dispatch: + +env: + RUST_VERSION: stable + RUST_BACKTRACE: 1 + RUSTFLAGS: "-Clink-arg=-fuse-ld=bfd" + +jobs: + pyarrow-udf: + name: PyArrow UDF (Spark 3.5, JDK 17, Python 3.11) + runs-on: ubuntu-latest + container: + image: amd64/rust + env: + JAVA_TOOL_OPTIONS: "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED 
--add-opens=java.base/java.lang=ALL-UNNAMED" + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.RUST_VERSION }} + jdk-version: 17 + + - name: Cache Maven dependencies + uses: actions/cache@v5 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}-pyarrow-udf + restore-keys: | + ${{ runner.os }}-java-maven- + + - name: Build Comet (release, Spark 3.5 / Scala 2.12) + run: | + cd native && cargo build --release + cd .. && ./mvnw -B -Prelease install -DskipTests -Pspark-3.5 -Pscala-2.12 + + - name: Install Python 3.11 and pip + run: | + apt-get update + apt-get install -y --no-install-recommends python3.11 python3.11-venv python3-pip + python3.11 -m venv /tmp/venv + /tmp/venv/bin/pip install --upgrade pip + /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest + + - name: Run PyArrow UDF pytest + run: | + jar=$(ls "$PWD"/spark/target/comet-spark-spark3.5_2.12-*-SNAPSHOT.jar \ + | grep -v sources | grep -v tests | head -n1) + echo "Using $jar" + COMET_JAR="$jar" /tmp/venv/bin/python -m pytest -v \ + spark/src/test/resources/pyspark/test_pyarrow_udf.py diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 6acac6a912..462f4efdc6 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -17,117 +17,165 @@ # under the License. """ -Integration test for CometPythonMapInArrowExec. +Pytest-driven integration tests for Comet's PyArrow UDF acceleration. -This test verifies that Comet's optimized PyArrow UDF execution works correctly -by checking: -1. The plan uses CometPythonMapInArrowExec instead of PythonMapInArrow + ColumnarToRow -2. The UDF produces correct results -3. 
Performance improvement by eliminating unnecessary Arrow->Row->Arrow conversions +Each test runs against two execution paths: + - "accelerated": spark.comet.exec.pythonMapInArrow.enabled=true + (plan should contain CometPythonMapInArrow and no ColumnarToRow) + - "fallback": spark.comet.exec.pythonMapInArrow.enabled=false + (plan should contain vanilla PythonMapInArrow) Usage: - # Requires Python 3.11 or 3.12 (PySpark 3.5 does not support 3.13+) - # Build Comet first: make release - # Then run with PySpark: - spark-submit --jars spark/target/comet-spark-spark3.5_2.12-*.jar \ - --conf spark.plugins=org.apache.spark.CometPlugin \ - --conf spark.comet.enabled=true \ - --conf spark.comet.exec.enabled=true \ - --conf spark.comet.exec.pythonMapInArrow.enabled=true \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=2g \ - spark/src/test/resources/pyspark/test_pyarrow_udf.py -""" + # Build Comet first: + make release -import sys -import pyarrow as pa -from pyspark.sql import SparkSession -from pyspark.sql import types as T + # Then either let the test discover the jar from spark/target, or pass it + # explicitly via COMET_JAR: + export COMET_JAR=$PWD/spark/target/comet-spark-spark3.5_2.12-0.16.0-SNAPSHOT.jar + pip install pyspark==3.5.8 pyarrow pandas pytest + pytest -v spark/src/test/resources/pyspark/test_pyarrow_udf.py +""" -def test_map_in_arrow_basic(): - """Test basic mapInArrow with Comet optimization.""" - spark = SparkSession.builder.getOrCreate() +import glob +import os - # Create test data +import pyarrow as pa +import pytest +from pyspark.sql import SparkSession, types as T + + +REPO_ROOT = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..") +) + + +def _resolve_comet_jar() -> str: + explicit = os.environ.get("COMET_JAR") + if explicit: + if any(ch in explicit for ch in "*?["): + matches = 
sorted(glob.glob(explicit)) + if not matches: + raise FileNotFoundError( + f"COMET_JAR pattern matched nothing: {explicit}" + ) + return matches[-1] + return explicit + + # Pick the jar that matches the installed pyspark major.minor version. The + # Comet jars are published per Spark version (e.g., comet-spark-spark3.5_2.12-*.jar); + # using the wrong one yields ClassNotFoundException on Scala stdlib classes. + import pyspark + + major_minor = ".".join(pyspark.__version__.split(".")[:2]) + spark_tag = f"spark{major_minor}" + scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13" + pattern = os.path.join( + REPO_ROOT, + f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar", + ) + candidates = [ + m + for m in sorted(glob.glob(pattern)) + if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m) + ] + if not candidates: + raise FileNotFoundError( + "Comet jar not found. Set COMET_JAR or run `make release`. " + f"Looked under {pattern}." + ) + return candidates[-1] + + +@pytest.fixture(scope="session") +def spark(): + jar = _resolve_comet_jar() + # PYSPARK_SUBMIT_ARGS is consumed when pyspark launches its JVM. Setting + # --jars puts the Comet jar on both driver and executor classpaths so the + # CometPlugin can be loaded. 
+ os.environ["PYSPARK_SUBMIT_ARGS"] = ( + f"--jars {jar} --driver-class-path {jar} pyspark-shell" + ) + session = ( + SparkSession.builder.master("local[2]") + .appName("comet-pyarrow-udf-tests") + .config("spark.plugins", "org.apache.spark.CometPlugin") + .config("spark.comet.enabled", "true") + .config("spark.comet.exec.enabled", "true") + .config("spark.memory.offHeap.enabled", "true") + .config("spark.memory.offHeap.size", "2g") + .getOrCreate() + ) + try: + yield session + finally: + session.stop() + + +@pytest.fixture(params=[True, False], ids=["accelerated", "fallback"]) +def accelerated(request, spark) -> bool: + spark.conf.set( + "spark.comet.exec.pythonMapInArrow.enabled", + "true" if request.param else "false", + ) + return request.param + + +def _executed_plan(df) -> str: + return df._jdf.queryExecution().executedPlan().toString() + + +def _assert_plan_matches_mode(plan: str, accelerated: bool) -> None: + if accelerated: + assert "CometPythonMapInArrow" in plan, ( + f"expected CometPythonMapInArrow in accelerated plan, got:\n{plan}" + ) + assert "ColumnarToRow" not in plan, ( + f"unexpected ColumnarToRow in accelerated plan:\n{plan}" + ) + else: + assert "CometPythonMapInArrow" not in plan, ( + f"unexpected CometPythonMapInArrow in fallback plan:\n{plan}" + ) + assert "PythonMapInArrow" in plan, ( + f"expected PythonMapInArrow in fallback plan, got:\n{plan}" + ) + + +def test_map_in_arrow_doubles_value(spark, tmp_path, accelerated): data = [(i, float(i * 1.5), f"name_{i}") for i in range(100)] - df = spark.createDataFrame(data, ["id", "value", "name"]) + src = str(tmp_path / "src.parquet") + spark.createDataFrame(data, ["id", "value", "name"]).write.parquet(src) - # Write to parquet so CometScan can read it - df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data") - test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data") - - # Define a PyArrow UDF that doubles the value column. 
- # mapInArrow callbacks receive an iterator of RecordBatches and must yield batches. def double_value(iterator): for batch in iterator: pdf = batch.to_pandas() pdf["value"] = pdf["value"] * 2 yield pa.RecordBatch.from_pandas(pdf) - output_schema = T.StructType([ - T.StructField("id", T.LongType()), - T.StructField("value", T.DoubleType()), - T.StructField("name", T.StringType()), - ]) - - # Apply mapInArrow - result_df = test_df.mapInArrow(double_value, output_schema) - - # Check the explain plan - print("=" * 60) - print("PHYSICAL PLAN:") - print("=" * 60) - result_df.explain(mode="extended") - print("=" * 60) - - plan_str = result_df._jdf.queryExecution().executedPlan().toString() - print(f"\nPlan string:\n{plan_str}\n") - - # Verify the optimized Comet operator is in the plan. The toString form is - # "CometPythonMapInArrow" (no Exec suffix) and the upstream scan prints as - # "CometNativeScan". - assert "CometPythonMapInArrow" in plan_str, \ - f"CometPythonMapInArrow missing from plan:\n{plan_str}" - assert "ColumnarToRow" not in plan_str, \ - f"Unexpected ColumnarToRow in optimized plan:\n{plan_str}" - print("SUCCESS: CometPythonMapInArrow is in the plan with no ColumnarToRow transition.") - - # Verify correctness - result = result_df.orderBy("id").collect() - expected_first = data[0] - actual_first = result[0] - - assert actual_first["id"] == expected_first[0], \ - f"ID mismatch: {actual_first['id']} != {expected_first[0]}" - assert abs(actual_first["value"] - expected_first[1] * 2) < 0.001, \ - f"Value mismatch: {actual_first['value']} != {expected_first[1] * 2}" - assert actual_first["name"] == expected_first[2], \ - f"Name mismatch: {actual_first['name']} != {expected_first[2]}" - - print(f"\nFirst row: {actual_first}") - print(f"Expected value (doubled): {expected_first[1] * 2}") - print("CORRECTNESS: PASSED") - - # Verify all rows - for i, row in enumerate(result): - expected_val = data[i][1] * 2 - assert abs(row["value"] - expected_val) < 0.001, \ - 
f"Row {i}: expected value {expected_val}, got {row['value']}" - - print(f"All {len(result)} rows verified correctly.") - return True - - -def test_map_in_arrow_type_change(): - """Test mapInArrow that changes the schema.""" - spark = SparkSession.builder.getOrCreate() + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + T.StructField("name", T.StringType()), + ] + ) + result_df = spark.read.parquet(src).mapInArrow(double_value, schema) + + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + rows = result_df.orderBy("id").collect() + assert len(rows) == len(data) + for row, original in zip(rows, data): + assert row["id"] == original[0] + assert abs(row["value"] - original[1] * 2) < 1e-6 + assert row["name"] == original[2] + +def test_map_in_arrow_changes_schema(spark, tmp_path, accelerated): data = [(i, float(i)) for i in range(50)] - df = spark.createDataFrame(data, ["id", "value"]) - df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data2") - test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data2") + src = str(tmp_path / "src.parquet") + spark.createDataFrame(data, ["id", "value"]).write.parquet(src) def add_computed_column(iterator): for batch in iterator: @@ -136,51 +184,20 @@ def add_computed_column(iterator): pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}") yield pa.RecordBatch.from_pandas(pdf) - output_schema = T.StructType([ - T.StructField("id", T.LongType()), - T.StructField("value", T.DoubleType()), - T.StructField("squared", T.DoubleType()), - T.StructField("label", T.StringType()), - ]) - - result_df = test_df.mapInArrow(add_computed_column, output_schema) - result = result_df.orderBy("id").collect() - - assert len(result) == 50 - for i, row in enumerate(result): - assert abs(row["squared"] - float(i) ** 2) < 0.001 + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + T.StructField("squared", 
T.DoubleType()), + T.StructField("label", T.StringType()), + ] + ) + result_df = spark.read.parquet(src).mapInArrow(add_computed_column, schema) + + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + rows = result_df.orderBy("id").collect() + assert len(rows) == 50 + for i, row in enumerate(rows): + assert abs(row["squared"] - float(i) ** 2) < 1e-6 assert row["label"] == f"item_{i}" - - print("test_map_in_arrow_type_change: PASSED") - return True - - -if __name__ == "__main__": - print("Running PyArrow UDF integration tests for Comet...") - print() - - tests = [ - ("test_map_in_arrow_basic", test_map_in_arrow_basic), - ("test_map_in_arrow_type_change", test_map_in_arrow_type_change), - ] - - passed = 0 - failed = 0 - for name, test_fn in tests: - print(f"\n{'=' * 60}") - print(f"Running: {name}") - print(f"{'=' * 60}") - try: - test_fn() - passed += 1 - except Exception as e: - print(f"FAILED: {e}") - import traceback - traceback.print_exc() - failed += 1 - - print(f"\n{'=' * 60}") - print(f"Results: {passed} passed, {failed} failed") - print(f"{'=' * 60}") - - sys.exit(0 if failed == 0 else 1) From f7515397e4aada8fc956552b9042d3ce00ceb039 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:43:57 -0600 Subject: [PATCH 05/54] docs: run prettier on pyarrow-udfs user guide page --- docs/source/user-guide/latest/pyarrow-udfs.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 71701960cd..2d555cedc4 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -62,12 +62,12 @@ It is enabled by default when Comet execution is active. 
## Supported APIs -| PySpark API | Spark Plan Node | Supported | -|-------------|-----------------|-----------| -| `df.mapInArrow(func, schema)` | `PythonMapInArrowExec` | Yes | -| `df.mapInPandas(func, schema)` | `MapInPandasExec` | Yes | -| `@pandas_udf` (scalar) | `ArrowEvalPythonExec` | Not yet | -| `df.applyInPandas(func, schema)` | `FlatMapGroupsInPandasExec` | Not yet | +| PySpark API | Spark Plan Node | Supported | +| -------------------------------- | --------------------------- | --------- | +| `df.mapInArrow(func, schema)` | `PythonMapInArrowExec` | Yes | +| `df.mapInPandas(func, schema)` | `MapInPandasExec` | Yes | +| `@pandas_udf` (scalar) | `ArrowEvalPythonExec` | Not yet | +| `df.applyInPandas(func, schema)` | `FlatMapGroupsInPandasExec` | Not yet | ## Example @@ -109,6 +109,7 @@ result.explain(mode="extended") ``` You should see: + ``` CometPythonMapInArrowExec ... +- CometNativeExec ... @@ -116,6 +117,7 @@ CometPythonMapInArrowExec ... ``` Instead of the unoptimized plan: + ``` PythonMapInArrow ... 
+- ColumnarToRow From b14fbfb58adaf3b9219e8f06171f450ef7fd1deb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:46:57 -0600 Subject: [PATCH 06/54] style: apply spotless formatting --- .../org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala | 1 - .../org/apache/comet/exec/CometPythonMapInArrowSuite.scala | 2 -- 2 files changed, 3 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala index 84b3c31113..223153d7d8 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala @@ -31,7 +31,6 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, PythonSQLMetrics} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} /** diff --git a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala index 94145cea2b..7b1e17c4ed 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala @@ -21,8 +21,6 @@ package org.apache.comet.exec import org.apache.spark.sql.CometTestBase import org.apache.spark.sql.comet.CometPythonMapInArrowExec -import org.apache.spark.sql.execution.ColumnarToRowExec -import org.apache.spark.sql.execution.python.PythonMapInArrowExec import org.apache.comet.CometConf From ca0bbbf50892860e7e103af8c016163d9d4310ef Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 
18:46:57 -0600 Subject: [PATCH 07/54] ci: broaden pyarrow_udf_test triggers to match pr_build_linux Replace the narrow paths allowlist with the same paths-ignore list used by pr_build_linux.yml so the workflow runs on any source change that could affect Comet's PyArrow UDF execution path, not just the few files explicitly named. --- .github/workflows/pyarrow_udf_test.yml | 34 +++++++++++++++----------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index 0779f092a4..46c5fbe079 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -25,21 +25,27 @@ on: push: branches: - main - paths: - - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala" - - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala" - - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala" - - "spark/src/test/resources/pyspark/test_pyarrow_udf.py" - - ".github/workflows/pyarrow_udf_test.yml" - - "native/**" + paths-ignore: + - "benchmarks/**" + - "doc/**" + - "docs/**" + - "**.md" + - "dev/changelog/*.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" + - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala" pull_request: - paths: - - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala" - - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala" - - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala" - - "spark/src/test/resources/pyspark/test_pyarrow_udf.py" - - ".github/workflows/pyarrow_udf_test.yml" - - "native/**" + paths-ignore: + - "benchmarks/**" + - "doc/**" + - "docs/**" + - "**.md" + - "dev/changelog/*.md" + - "native/core/benches/**" + - "native/spark-expr/benches/**" + - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" + - 
"spark/src/main/scala/org/apache/comet/GenerateDocs.scala" workflow_dispatch: env: From 55c28c32a187cce9bdf6b49a2b4113e845ed1d44 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 18:47:48 -0600 Subject: [PATCH 08/54] ci: restrict GITHUB_TOKEN to contents:read in pyarrow_udf_test --- .github/workflows/pyarrow_udf_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index 46c5fbe079..0740842413 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -48,6 +48,9 @@ on: - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala" workflow_dispatch: +permissions: + contents: read + env: RUST_VERSION: stable RUST_BACKTRACE: 1 From 05b1e7afd38437c9eb72309ac2f4f5f764a97adc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 22:19:01 -0600 Subject: [PATCH 09/54] fix: shim CometPythonMapInArrowExec for cross-version Spark builds The PR's `CometPythonMapInArrowExec` and `EliminateRedundantTransitions` rule directly reference Spark 3.5 APIs that differ across supported Spark versions: the `ArrowPythonRunner` constructor (4 distinct signatures across 3.4/3.5/4.0/4.1+/4.2), `arrowUseLargeVarTypes`, `JobArtifactSet`, `MapInBatchExec.isBarrier`, and the `PythonMapInArrowExec` type itself (renamed to `MapInArrowExec` in 4.0+). This breaks compile on every profile other than 3.5. Introduce a per-version `ShimCometPythonMapInArrow` trait under `org.apache.spark.sql.comet.shims` (placed in the spark namespace so it can reach `private[spark]` members) that: * matches the Spark-version-specific MapInArrow / MapInPandas exec types and exposes their `(func, output, child, isBarrier, evalType)` tuple, * constructs the right `ArrowPythonRunner` for the version, * hides `arrowUseLargeVarTypes` / `JobArtifactSet` / `getPythonRunnerConfMap` behind helper methods. 
Spark 3.4 lacks the prerequisite APIs (no `isBarrier`, no `JobArtifactSet`, no `arrowUseLargeVarTypes`), so its shim returns `None` from the matchers and the optimization is a no-op there. --- .../rules/EliminateRedundantTransitions.scala | 41 ++++----- .../sql/comet/CometPythonMapInArrowExec.scala | 32 +++---- .../shims/ShimCometPythonMapInArrow.scala | 68 +++++++++++++++ .../shims/ShimCometPythonMapInArrow.scala | 84 ++++++++++++++++++ .../shims/ShimCometPythonMapInArrow.scala | 86 ++++++++++++++++++ .../shims/ShimCometPythonMapInArrow.scala | 87 +++++++++++++++++++ .../shims/ShimCometPythonMapInArrow.scala | 86 ++++++++++++++++++ 7 files changed, 446 insertions(+), 38 deletions(-) create mode 100644 spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index 272ef76484..e7218ab935 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -20,15 +20,14 @@ package org.apache.comet.rules import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, 
CometNativeWriteExec, CometPlan, CometPythonMapInArrowExec, CometSparkToColumnarExec} import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec} +import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.QueryStageExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec -import org.apache.spark.sql.execution.python.{MapInPandasExec, PythonMapInArrowExec} import org.apache.comet.CometConf @@ -53,7 +52,9 @@ import org.apache.comet.CometConf // various reasons) or Spark requests row-based output such as a `collect` call. Spark will adds // another `ColumnarToRowExec` on top of `CometSparkToColumnarExec`. In this case, the pair could // be removed. -case class EliminateRedundantTransitions(session: SparkSession) extends Rule[SparkPlan] { +case class EliminateRedundantTransitions(session: SparkSession) + extends Rule[SparkPlan] + with ShimCometPythonMapInArrow { private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get() @@ -100,29 +101,23 @@ case class EliminateRedundantTransitions(session: SparkSession) extends Rule[Spa case CometNativeColumnarToRowExec(sparkToColumnar: CometSparkToColumnarExec) => sparkToColumnar.child case CometSparkToColumnarExec(child: CometSparkToColumnarExec) => child - // Replace MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that has a - // ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary - // Arrow->Row->Arrow round-trip. - case p: PythonMapInArrowExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() => - extractColumnarChild(p.child) + // Replace MapInBatchExec (PythonMapInArrowExec / MapInArrowExec / MapInPandasExec) that has + // a ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary + // Arrow->Row->Arrow round-trip. 
The matchers are version-shimmed: Spark 3.4 returns None + // (it lacks the required APIs) and Spark 4.0+ matches the renamed `MapInArrowExec`. + case p: SparkPlan + if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() && + matchMapInArrow(p).orElse(matchMapInPandas(p)).isDefined => + val (mapFunc, mapOutput, mapChild, mapIsBarrier, mapEvalType) = + matchMapInArrow(p).orElse(matchMapInPandas(p)).get + extractColumnarChild(mapChild) .map { columnarChild => CometPythonMapInArrowExec( - p.func, - p.output, + mapFunc, + mapOutput, columnarChild, - p.isBarrier, - p.func.asInstanceOf[PythonUDF].evalType) - } - .getOrElse(p) - case p: MapInPandasExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() => - extractColumnarChild(p.child) - .map { columnarChild => - CometPythonMapInArrowExec( - p.func, - p.output, - columnarChild, - p.isBarrier, - p.func.asInstanceOf[PythonUDF].evalType) + mapIsBarrier, + mapEvalType) } .getOrElse(p) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala index 223153d7d8..9b3e820023 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala @@ -22,15 +22,16 @@ package org.apache.spark.sql.comet import scala.collection.JavaConverters._ import org.apache.spark.{ContextAwareIterator, TaskContext} -import org.apache.spark.api.python.ChainedPythonFunctions import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode} import
org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, PythonSQLMetrics} +import org.apache.spark.sql.execution.python.{BatchIterator, PythonSQLMetrics} +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} /** @@ -55,7 +56,8 @@ case class CometPythonMapInArrowExec( isBarrier: Boolean, pythonEvalType: Int) extends UnaryExecNode - with PythonSQLMetrics { + with PythonSQLMetrics + with ShimCometPythonMapInArrow { override def supportsColumnar: Boolean = true @@ -78,18 +80,16 @@ case class CometPythonMapInArrowExec( val numOutputBatches = longMetric("numOutputBatches") val numInputRows = longMetric("numInputRows") - val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf) - val pythonFunction = func.asInstanceOf[PythonUDF].func - val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonFunction))) + val pythonUDF = func.asInstanceOf[PythonUDF] val localOutput = output val localChildSchema = child.schema val batchSize = conf.arrowMaxRecordsPerBatch val sessionLocalTimeZone = conf.sessionLocalTimeZone - val largeVarTypes = conf.arrowUseLargeVarTypes + val useLargeVarTypes = largeVarTypes(conf) + val pythonRunnerConf = getPythonRunnerConfMap(conf) val localPythonEvalType = pythonEvalType val localPythonMetrics = pythonMetrics - val jobArtifactUUID = - org.apache.spark.JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + val jobArtifactUUID = currentJobArtifactUUID() val inputRDD = child.executeColumnar() @@ -112,17 +112,19 @@ case class CometPythonMapInArrowExec( val batchIter = if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter) - val columnarBatchIter = new ArrowPythonRunner( - chainedFunc, + val columnarBatchIter = computeArrowPython( + pythonUDF, localPythonEvalType, argOffsets, - org.apache.spark.sql.types - 
.StructType(Array(org.apache.spark.sql.types.StructField("struct", localChildSchema))), + StructType(Array(StructField("struct", localChildSchema))), sessionLocalTimeZone, - largeVarTypes, + useLargeVarTypes, pythonRunnerConf, localPythonMetrics, - jobArtifactUUID).compute(batchIter, context.partitionId(), context) + jobArtifactUUID, + batchIter, + context.partitionId(), + context) columnarBatchIter.map { batch => // Python returns a StructType column; flatten to individual columns diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala new file mode 100644 index 0000000000..30736d99b3 --- /dev/null +++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.TaskContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Spark 3.4 shim for the PyArrow UDF acceleration support. + * + * Spark 3.4 lacks several APIs that the optimization relies on (`isBarrier` on `MapInBatchExec`, + * `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor), so the + * matchers return `None` and the runner factory throws. The optimization is effectively a no-op + * on Spark 3.4. + */ +trait ShimCometPythonMapInArrow { + + protected def matchMapInArrow( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None + + protected def matchMapInPandas( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None + + protected def currentJobArtifactUUID(): Option[String] = None + + protected def largeVarTypes(conf: SQLConf): Boolean = false + + protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = Map.empty + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = + throw new UnsupportedOperationException( + "CometPythonMapInArrowExec is not supported on Spark 3.4") +} diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala 
b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala new file mode 100644 index 0000000000..f7c8221d9e --- /dev/null +++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInPandasExec, PythonMapInArrowExec} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometPythonMapInArrow { + + protected def matchMapInArrow( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: PythonMapInArrowExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def matchMapInPandas( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInPandasExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def currentJobArtifactUUID(): Option[String] = + JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + + protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes + + protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = + ArrowPythonRunner.getPythonRunnerConfMap(conf) + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + 
val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))) + new ArrowPythonRunner( + chainedFunc, + evalType, + argOffsets, + schema, + timeZoneId, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID).compute(batchIter, partitionId, context) + } +} diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala new file mode 100644 index 0000000000..78935f54c5 --- /dev/null +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometPythonMapInArrow { + + protected def matchMapInArrow( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInArrowExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def matchMapInPandas( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInPandasExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def currentJobArtifactUUID(): Option[String] = + JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + + protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes + + protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = + ArrowPythonRunner.getPythonRunnerConfMap(conf) + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + val 
chainedFunc = + Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)) + new ArrowPythonRunner( + chainedFunc, + evalType, + argOffsets, + schema, + timeZoneId, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID, + None).compute(batchIter, partitionId, context) + } +} diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala new file mode 100644 index 0000000000..f7f775b1fa --- /dev/null +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometPythonMapInArrow { + + protected def matchMapInArrow( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInArrowExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def matchMapInPandas( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInPandasExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def currentJobArtifactUUID(): Option[String] = + JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + + protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes + + protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = + ArrowPythonRunner.getPythonRunnerConfMap(conf) + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + val 
chainedFunc = + Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)) + new ArrowPythonRunner( + chainedFunc, + evalType, + argOffsets, + schema, + timeZoneId, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID, + None, + None).compute(batchIter, partitionId, context) + } +} diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala new file mode 100644 index 0000000000..78935f54c5 --- /dev/null +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometPythonMapInArrow { + + protected def matchMapInArrow( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInArrowExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def matchMapInPandas( + plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + plan match { + case p: MapInPandasExec => + Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + case _ => None + } + + protected def currentJobArtifactUUID(): Option[String] = + JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + + protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes + + protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = + ArrowPythonRunner.getPythonRunnerConfMap(conf) + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + val 
chainedFunc = + Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)) + new ArrowPythonRunner( + chainedFunc, + evalType, + argOffsets, + schema, + timeZoneId, + largeVarTypes, + pythonRunnerConf, + pythonMetrics, + jobArtifactUUID, + None).compute(batchIter, partitionId, context) + } +} From 66eb246d3cb9f04b6b25878a02a32f6a2007b669 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 5 May 2026 22:19:07 -0600 Subject: [PATCH 10/54] ci: switch pyarrow_udf_test container to rust:bookworm The default `amd64/rust` image is Debian 13 (trixie), where the system `python3` is 3.13 and there is no `python3.11` apt package. The workflow installed `python3.11` explicitly, which fails on trixie with `Unable to locate package python3.11`. Switching to `rust:bookworm` gives a Debian 12 base where `python3` is 3.11, matching the job name and pyspark 3.5.x's supported runtime. --- .github/workflows/pyarrow_udf_test.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index 0740842413..622ee59fd0 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -61,7 +61,10 @@ jobs: name: PyArrow UDF (Spark 3.5, JDK 17, Python 3.11) runs-on: ubuntu-latest container: - image: amd64/rust + # Pinned to the Debian 12 (bookworm) base so the system `python3` is 3.11. The default + # `amd64/rust` image is Debian 13 (trixie) which ships Python 3.13 and no python3.11 apt + # package, breaking `apt-get install python3.11`. 
+ image: rust:bookworm env: JAVA_TOOL_OPTIONS: "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED" steps: @@ -91,8 +94,8 @@ jobs: - name: Install Python 3.11 and pip run: | apt-get update - apt-get install -y --no-install-recommends python3.11 python3.11-venv python3-pip - python3.11 -m venv /tmp/venv + apt-get install -y --no-install-recommends python3 python3-venv python3-pip + python3 -m venv /tmp/venv /tmp/venv/bin/pip install --upgrade pip /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest From ec6fa783ed9bb9495dfd709159f5c10cdf37a60b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 6 May 2026 06:23:52 -0600 Subject: [PATCH 11/54] ci: set PYSPARK_PYTHON to venv python for pyarrow_udf_test Spark launches Python workers in fresh subprocesses that look up python3 on PATH. Without PYSPARK_PYTHON, workers use the system python (no pyarrow installed) and UDF execution fails with ModuleNotFoundError. Point both PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON at /tmp/venv/bin/python so workers inherit the same interpreter that pytest uses. --- .github/workflows/pyarrow_udf_test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index 622ee59fd0..e8018889cc 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -100,6 +100,13 @@ jobs: /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest - name: Run PyArrow UDF pytest + env: + # Spark launches Python workers in a fresh subprocess and looks up `python3` + # on PATH unless PYSPARK_PYTHON is set. Without this, workers use the system + # python which has no pyarrow installed and UDF execution fails with + # ModuleNotFoundError. 
+ PYSPARK_PYTHON: /tmp/venv/bin/python + PYSPARK_DRIVER_PYTHON: /tmp/venv/bin/python run: | jar=$(ls "$PWD"/spark/target/comet-spark-spark3.5_2.12-*-SNAPSHOT.jar \ | grep -v sources | grep -v tests | head -n1) From 1de2c2f815607115c03cc3075200ec4bc28d8223 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 6 May 2026 07:44:54 -0600 Subject: [PATCH 12/54] feat: default-disable PyArrow UDF optimization while experimental Flip spark.comet.exec.pythonMapInArrow.enabled default from true to false and prefix the config doc with "Experimental:" so the default matches the "[experimental]" label on the feature. Update the user guide to instruct users to opt in explicitly. --- common/src/main/scala/org/apache/comet/CometConf.scala | 10 ++++++---- docs/source/user-guide/latest/pyarrow-udfs.md | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index a06cd896ec..675e872b6e 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -318,11 +318,13 @@ object CometConf extends ShimCometConf { conf("spark.comet.exec.pythonMapInArrow.enabled") .category(CATEGORY_EXEC) .doc( - "Whether to enable optimized execution of PyArrow UDFs (mapInArrow/mapInPandas). " + - "When enabled, Comet passes Arrow columnar data directly to Python UDFs without " + - "the intermediate Arrow-to-Row-to-Arrow conversion that Spark normally performs.") + "Experimental: whether to enable optimized execution of PyArrow UDFs " + + "(mapInArrow/mapInPandas). When enabled, Comet passes Arrow columnar data " + + "directly to Python UDFs without the intermediate Arrow-to-Row-to-Arrow " + + "conversion that Spark normally performs. 
Disabled by default while the " + + "feature stabilizes.") .booleanConf - .createWithDefault(true) + .createWithDefault(false) val COMET_TRACING_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.tracing.enabled") .category(CATEGORY_TUNING) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 2d555cedc4..374948c039 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -52,13 +52,13 @@ and memory allocations. ## Configuration -The optimization is controlled by: +The optimization is experimental and disabled by default. Enable it with: ``` -spark.comet.exec.pythonMapInArrow.enabled=true (default) +spark.comet.exec.pythonMapInArrow.enabled=true ``` -It is enabled by default when Comet execution is active. +The default is `false` while the feature stabilizes. ## Supported APIs From 3f68cbeb56f6be4a3235b73113630d9b9a928249 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 6 May 2026 07:45:02 -0600 Subject: [PATCH 13/54] test: expand PyArrow UDF pytest coverage Add coverage for cases that the original pytest module did not exercise: - mapInPandas (claimed supported, previously zero coverage) - Null preservation across long and string columns via Arrow passthrough - Empty input from a CometScan via filter pushdown - Python exception propagation (sentinel must surface in driver-side error) - DecimalType(18,6), DateType, TimestampType round-trip with nulls - ArrayType and nested StructType, including null arrays/structs and arrays containing null elements - repartition between scan and UDF (correctness only; the optimization itself does not fire across a vanilla Exchange and is documented as such in the test) Generalize _assert_plan_matches_mode to take the vanilla node name so the fallback assertion can match either PythonMapInArrow or MapInPandas. 
--- .../resources/pyspark/test_pyarrow_udf.py | 280 +++++++++++++++++- 1 file changed, 277 insertions(+), 3 deletions(-) diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 462f4efdc6..b62db73be1 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -37,8 +37,10 @@ pytest -v spark/src/test/resources/pyspark/test_pyarrow_udf.py """ +import datetime as dt import glob import os +from decimal import Decimal import pyarrow as pa import pytest @@ -125,7 +127,9 @@ def _executed_plan(df) -> str: return df._jdf.queryExecution().executedPlan().toString() -def _assert_plan_matches_mode(plan: str, accelerated: bool) -> None: +def _assert_plan_matches_mode( + plan: str, accelerated: bool, vanilla_node: str = "PythonMapInArrow" +) -> None: if accelerated: assert "CometPythonMapInArrow" in plan, ( f"expected CometPythonMapInArrow in accelerated plan, got:\n{plan}" @@ -137,8 +141,8 @@ def _assert_plan_matches_mode(plan: str, accelerated: bool) -> None: assert "CometPythonMapInArrow" not in plan, ( f"unexpected CometPythonMapInArrow in fallback plan:\n{plan}" ) - assert "PythonMapInArrow" in plan, ( - f"expected PythonMapInArrow in fallback plan, got:\n{plan}" + assert vanilla_node in plan, ( + f"expected {vanilla_node} in fallback plan, got:\n{plan}" ) @@ -201,3 +205,273 @@ def add_computed_column(iterator): for i, row in enumerate(rows): assert abs(row["squared"] - float(i) ** 2) < 1e-6 assert row["label"] == f"item_{i}" + + +def test_map_in_pandas_doubles_value(spark, tmp_path, accelerated): + data = [(i, float(i * 1.5)) for i in range(100)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(data, ["id", "value"]).write.parquet(src) + + def double_value(iterator): + for pdf in iterator: + pdf = pdf.copy() + pdf["value"] = pdf["value"] * 2 + yield pdf + + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + 
T.StructField("value", T.DoubleType()), + ] + ) + result_df = spark.read.parquet(src).mapInPandas(double_value, schema) + + _assert_plan_matches_mode( + _executed_plan(result_df), accelerated, vanilla_node="MapInPandas" + ) + + rows = result_df.orderBy("id").collect() + assert len(rows) == len(data) + for row, original in zip(rows, data): + assert row["id"] == original[0] + assert abs(row["value"] - original[1] * 2) < 1e-6 + + +def test_map_in_pandas_changes_schema(spark, tmp_path, accelerated): + data = [(i, float(i)) for i in range(50)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(data, ["id", "value"]).write.parquet(src) + + def add_squared(iterator): + for pdf in iterator: + pdf = pdf.copy() + pdf["squared"] = pdf["value"] ** 2 + yield pdf + + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + T.StructField("squared", T.DoubleType()), + ] + ) + result_df = spark.read.parquet(src).mapInPandas(add_squared, schema) + + _assert_plan_matches_mode( + _executed_plan(result_df), accelerated, vanilla_node="MapInPandas" + ) + + rows = result_df.orderBy("id").collect() + assert len(rows) == 50 + for i, row in enumerate(rows): + assert abs(row["squared"] - float(i) ** 2) < 1e-6 + + +def test_map_in_arrow_preserves_nulls(spark, tmp_path, accelerated): + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("name", T.StringType()), + ] + ) + rows = [ + (1, "a"), + (2, None), + (None, "c"), + (None, None), + (5, "e"), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + # Pure Arrow passthrough so nulls survive without a pandas roundtrip + # (pandas would coerce null longs to NaN floats). 
+ for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], r["name"]) for r in result_df.collect()} + assert out == set(rows) + + +def test_map_in_arrow_empty_input(spark, tmp_path, accelerated): + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + ] + ) + src = str(tmp_path / "src.parquet") + spark.createDataFrame([(1, 1.0), (2, 2.0)], schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + # Filter all rows out so the operator sees an empty stream from CometScan. + result_df = ( + spark.read.parquet(src).where("id < 0").mapInArrow(passthrough, schema_in) + ) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + assert result_df.count() == 0 + + +def test_map_in_arrow_python_exception_propagates(spark, tmp_path, accelerated): + schema_in = T.StructType([T.StructField("id", T.LongType())]) + data = [(i,) for i in range(10)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(data, schema_in).write.parquet(src) + + sentinel = "boom-from-pyarrow-udf" + + def boom(iterator): + for _batch in iterator: + raise ValueError(sentinel) + # Unreachable, but mapInArrow requires the callable to be a generator. 
+ yield # pragma: no cover + + result_df = spark.read.parquet(src).mapInArrow(boom, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + with pytest.raises(Exception) as exc_info: + result_df.collect() + assert sentinel in str(exc_info.value), ( + f"expected sentinel {sentinel!r} in exception, got: {exc_info.value}" + ) + + +def test_map_in_arrow_decimal_type(spark, tmp_path, accelerated): + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("amount", T.DecimalType(18, 6)), + ] + ) + rows = [ + (1, Decimal("123.456789")), + (2, Decimal("0.000001")), + (3, Decimal("-99999999.999999")), + (4, None), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], r["amount"]) for r in result_df.collect()} + assert out == set(rows) + + +def test_map_in_arrow_date_and_timestamp(spark, tmp_path, accelerated): + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("d", T.DateType()), + T.StructField("ts", T.TimestampType()), + ] + ) + rows = [ + (1, dt.date(2024, 1, 1), dt.datetime(2024, 1, 1, 12, 30, 45)), + (2, dt.date(1999, 12, 31), dt.datetime(2000, 6, 15, 0, 0, 0)), + (3, None, None), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], r["d"], r["ts"]) for r in result_df.collect()} + assert out == set(rows) + + +def test_map_in_arrow_array_and_struct(spark, tmp_path, accelerated): + schema_in = T.StructType( + [ + 
T.StructField("id", T.LongType()), + T.StructField("nums", T.ArrayType(T.IntegerType())), + T.StructField( + "addr", + T.StructType( + [ + T.StructField("city", T.StringType()), + T.StructField("zip", T.IntegerType()), + ] + ), + ), + ] + ) + rows = [ + (1, [1, 2, 3], ("Berlin", 10115)), + (2, [], ("NYC", 10001)), + (3, None, None), + (4, [None, 5], ("Tokyo", None)), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + def _normalize(row): + nums = tuple(row["nums"]) if row["nums"] is not None else None + addr = row["addr"] + addr_tuple = (addr["city"], addr["zip"]) if addr is not None else None + return (row["id"], nums, addr_tuple) + + out = {_normalize(r) for r in result_df.collect()} + expected = { + (r[0], tuple(r[1]) if r[1] is not None else None, r[2]) for r in rows + } + assert out == expected + + +def test_map_in_arrow_after_shuffle(spark, tmp_path, accelerated): + """ + Verifies correctness when a shuffle sits between the Comet scan and the + Python UDF. Without `spark.shuffle.manager` configured at session startup + the shuffle stays a vanilla `Exchange`, which is not columnar, so the + optimization does not fire across it today. This test does not assert on + the plan; it only ensures the path produces correct results in both modes + so a future change that wires Comet shuffle into the optimization does + not silently break correctness. 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + ] + ) + rows = [(i, float(i)) for i in range(50)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = ( + spark.read.parquet(src) + .repartition(4, "id") + .mapInArrow(passthrough, schema_in) + ) + + out = sorted((r["id"], r["value"]) for r in result_df.collect()) + assert out == sorted(rows) From e2ca2d2d91e5a10a829bd3793cac31727c27f6d4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 6 May 2026 07:45:52 -0600 Subject: [PATCH 14/54] docs: document PyArrow UDF limitations and AQE explain quirk Expand the user guide with the limitations a user should know before enabling the experimental optimization: - The remaining row-to-Arrow round-trip inside the Python runner is documented more precisely (the input goes through ColumnarBatch.rowIterator to feed ArrowPythonRunner, which re-encodes to Arrow IPC). - A vanilla Spark Exchange between the Comet scan and the UDF prevents the optimization from firing. Users must configure Comet's native shuffle manager at session startup to keep the data columnar. - Spark 3.4 lacks the prerequisite APIs and the feature is a no-op there. - isBarrier is captured by the operator constructor but not yet propagated to the Python runner. Also explain the AQE display quirk: with AQE on and a shuffle present, the pre-execution plan shows the unoptimized form because the rule only sees the materialized subplan after stage execution. Running an action and re-inspecting explain() reveals the optimized plan. 
--- docs/source/user-guide/latest/pyarrow-udfs.md | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 374948c039..08a731e5de 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -102,7 +102,7 @@ result = df.mapInArrow(transform, output_schema) ## Verifying the Optimization -Use `explain()` to verify that `CometPythonMapInArrowExec` appears in your plan: +Use `explain()` to verify that `CometPythonMapInArrow` appears in your plan: ```python result.explain(mode="extended") @@ -111,7 +111,7 @@ result.explain(mode="extended") You should see: ``` -CometPythonMapInArrowExec ... +CometPythonMapInArrow ... +- CometNativeExec ... +- CometScan ... ``` @@ -125,10 +125,39 @@ PythonMapInArrow ... +- CometScan ... ``` +When AQE is enabled (the Spark default) and the query contains a shuffle, the +optimization is applied during stage materialization. Calling `explain()` before +running an action will show the unoptimized plan: + +``` +AdaptiveSparkPlan isFinalPlan=false ++- PythonMapInArrow ... + +- CometExchange ... +``` + +To see the optimized plan, run an action first (for example `result.collect()` or +`result.cache(); result.count()`) and then call `explain()`. The post-execution +plan shows the materialized stages and includes `CometPythonMapInArrow` if the +optimization fired. + ## Limitations - The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs (`@pandas_udf`) and grouped operations (`applyInPandas`) are not yet supported. - The internal row-to-Arrow conversion inside the Python runner is still present in this version. - A future optimization will write Arrow batches directly to the Python IPC stream, achieving - near zero-copy data transfer. 
+ Comet currently routes columnar input through `ColumnarBatch.rowIterator()` so that the existing + `ArrowPythonRunner` can re-encode the rows back to Arrow IPC. A future optimization will write + Arrow batches directly to the Python IPC stream, eliminating the remaining round-trip and + achieving near zero-copy data transfer. +- The optimization requires Arrow data on the input side. If a shuffle sits between the upstream + Comet operator and the Python UDF, you need Comet's native shuffle for the optimization to + apply. Set `spark.shuffle.manager` to + `org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager` and enable + `spark.comet.exec.shuffle.enabled=true` at session startup. With a vanilla Spark `Exchange` + in the plan the data leaves the shuffle as rows and the optimization cannot fire. +- Spark 3.4 lacks several APIs the optimization depends on (`MapInBatchExec.isBarrier`, + `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor). On + Spark 3.4 the feature is a no-op even when enabled. Spark 3.5+ is required. +- The `isBarrier` flag on `mapInArrow` / `mapInPandas` is currently captured but not propagated + through to the Python runner. If your job depends on barrier-execution semantics, leave the + optimization disabled until this is fixed. From f4b5c3274cc45400fe5e7102b72e2fea96ed8496 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 6 May 2026 07:57:26 -0600 Subject: [PATCH 15/54] bench: add Python end-to-end benchmark for PyArrow UDF acceleration Standalone Python script that times df.mapInArrow(passthrough).count() and the equivalent mapInPandas query with the optimization toggled on and off. Numbers are wall-clock seconds, so they include the Python worker, Arrow IPC, and downstream count() costs. That is the right unit for a feature whose user surface is Python: it shows what fraction of end-to-end time the optimization shaves off, not just the JVM-side delta in isolation. 
Three workloads exercise the dimension where the optimization helps most: - narrow primitives (long, int, double) - mixed with strings (variable-length encoding) - wide rows (50 columns, projection cost scales with column count) Local smoke run with 200k rows shows 1.17x to 1.45x speedup across mapInArrow and mapInPandas, narrow/wide schemas. The script is configurable via BENCHMARK_ROWS / BENCHMARK_WARMUP / BENCHMARK_ITERS env vars for users who want longer or shorter runs. --- .../pyspark/benchmark_pyarrow_udf.py | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py diff --git a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py new file mode 100644 index 0000000000..8a3b4333c4 --- /dev/null +++ b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +End-to-end wall-clock benchmark for Comet's PyArrow UDF acceleration. 
+ +Times `df.mapInArrow(passthrough, schema).count()` and the equivalent +`mapInPandas` query with `spark.comet.exec.pythonMapInArrow.enabled` set +to false (vanilla Spark path) and true (Comet's optimized path). Both +modes run the same Python worker, so the measured delta covers what the +optimization actually changes for users: + + * vanilla: CometScan -> ColumnarToRow + UnsafeProjection -> ArrowPythonRunner + * optimized: CometScan -> rowIterator -> ArrowPythonRunner (same runner; + no UnsafeProjection, output kept as ColumnarBatch) + +Results are wall-clock seconds, so they include Python interpreter, +Arrow IPC, and downstream count() costs. That's intentional: the +optimization's user-visible value is what fraction of end-to-end time +it shaves off, not the JVM-side delta in isolation. + +Usage: + # Build Comet (release for representative numbers): + make release + + pip install pyspark==3.5.8 pyarrow pandas + + python3 spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py + +Override defaults via environment variables: + COMET_JAR=/path/to/comet.jar path to the Comet jar + BENCHMARK_ROWS=2000000 rows per run + BENCHMARK_WARMUP=2 warmup iterations per case + BENCHMARK_ITERS=5 measured iterations per case +""" + +import contextlib +import glob +import os +import statistics +import tempfile +import time + +from pyspark.sql import SparkSession + + +REPO_ROOT = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..") +) + + +def _resolve_comet_jar() -> str: + explicit = os.environ.get("COMET_JAR") + if explicit: + return explicit + import pyspark + + major_minor = ".".join(pyspark.__version__.split(".")[:2]) + spark_tag = f"spark{major_minor}" + scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13" + pattern = os.path.join( + REPO_ROOT, + f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar", + ) + candidates = [ + m + for m in sorted(glob.glob(pattern)) + if "sources" not in os.path.basename(m) and 
"tests" not in os.path.basename(m) + ] + if not candidates: + raise FileNotFoundError( + "Comet jar not found. Set COMET_JAR or run `make release`. " + f"Looked under {pattern}." + ) + return candidates[-1] + + +def _build_spark() -> SparkSession: + jar = _resolve_comet_jar() + os.environ["PYSPARK_SUBMIT_ARGS"] = ( + f"--jars {jar} --driver-class-path {jar} pyspark-shell" + ) + return ( + SparkSession.builder.master("local[2]") + .appName("comet-pyarrow-udf-benchmark") + .config("spark.plugins", "org.apache.spark.CometPlugin") + .config("spark.comet.enabled", "true") + .config("spark.comet.exec.enabled", "true") + .config("spark.memory.offHeap.enabled", "true") + .config("spark.memory.offHeap.size", "4g") + .config("spark.driver.memory", "4g") + # Pin AQE off so the explain output and plan structure are stable + # across iterations. AQE doesn't change the optimization's behavior; + # it just makes plan inspection harder. + .config("spark.sql.adaptive.enabled", "false") + .getOrCreate() + ) + + +def _passthrough_arrow(iterator): + for batch in iterator: + yield batch + + +def _passthrough_pandas(iterator): + for pdf in iterator: + yield pdf + + +def _narrow_primitives(spark: SparkSession, n: int): + return spark.range(n).selectExpr( + "id as id_long", + "cast(id as int) as id_int", + "cast(id as double) as id_double", + ) + + +def _mixed_with_strings(spark: SparkSession, n: int): + return spark.range(n).selectExpr( + "id as id_long", + "cast(id as int) as id_int", + "cast(id as double) as id_double", + "concat('row_', cast(id as string)) as id_str", + "cast(id % 2 as boolean) as id_bool", + ) + + +def _wide_rows(spark: SparkSession, n: int): + types = ["int", "long", "double"] + cols = [ + f"cast(id + {i} as {types[i % len(types)]}) as col_{i}" for i in range(50) + ] + return spark.range(n).selectExpr(*cols) + + +WORKLOADS = [ + ("narrow primitives", _narrow_primitives), + ("mixed with strings", _mixed_with_strings), + ("wide rows (50 cols)", _wide_rows), +] + + 
+@contextlib.contextmanager +def _temp_parquet(spark: SparkSession, build_df, n: int): + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, "src.parquet") + build_df(spark, n).write.parquet(path) + yield path + + +def _time_run(spark: SparkSession, parquet_path: str, accelerate: bool, api: str) -> float: + spark.conf.set( + "spark.comet.exec.pythonMapInArrow.enabled", + "true" if accelerate else "false", + ) + df = spark.read.parquet(parquet_path) + schema = df.schema + if api == "mapInArrow": + df = df.mapInArrow(_passthrough_arrow, schema) + else: + df = df.mapInPandas(_passthrough_pandas, schema) + t0 = time.perf_counter() + df.count() + return time.perf_counter() - t0 + + +def main() -> None: + rows = int(os.environ.get("BENCHMARK_ROWS", 1024 * 1024)) + warmup = int(os.environ.get("BENCHMARK_WARMUP", 2)) + iters = int(os.environ.get("BENCHMARK_ITERS", 5)) + + spark = _build_spark() + spark.sparkContext.setLogLevel("WARN") + + print(f"\nrows per run: {rows:,}") + print(f"warmup iters: {warmup}, measured iters: {iters}") + print(f"jar: {_resolve_comet_jar()}\n") + + header = " {:<14} {:<10} {:>10} {:>10} {:>10} {:>13} {:>9}".format( + "api", "mode", "min (s)", "median (s)", "max (s)", "rows/s", "speedup" + ) + print(header) + print(" " + "-" * (len(header) - 2)) + + for name, build_df in WORKLOADS: + print(f"\n=== {name} ===") + with _temp_parquet(spark, build_df, rows) as parquet_path: + for api in ("mapInArrow", "mapInPandas"): + samples_by_mode = {} + for mode, accelerate in (("vanilla", False), ("optimized", True)): + for _ in range(warmup): + _time_run(spark, parquet_path, accelerate, api) + samples = [ + _time_run(spark, parquet_path, accelerate, api) + for _ in range(iters) + ] + samples_by_mode[mode] = samples + median = statistics.median(samples) + speedup = "" + if mode == "optimized": + speedup = "{:.2f}x".format( + statistics.median(samples_by_mode["vanilla"]) / median + ) + print( + " {:<14} {:<10} {:>10} {:>10} {:>10} {:>13} 
{:>9}".format( + api, + mode, + "{:.3f}".format(min(samples)), + "{:.3f}".format(median), + "{:.3f}".format(max(samples)), + "{:,.0f}".format(rows / median), + speedup, + ) + ) + + spark.stop() + + +if __name__ == "__main__": + main() From 3822ed7d90368a0746a577a681d2bf56efccb087 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 6 May 2026 08:17:49 -0600 Subject: [PATCH 16/54] fix: propagate isBarrier through CometPythonMapInArrowExec The operator captured isBarrier in its constructor but always called inputRDD.mapPartitionsInternal, dropping the barrier execution mode semantics that mapInArrow(..., barrier=True) requests. Stages running under the optimization lost gang scheduling and the BarrierTaskContext APIs the UDF expects. Branch on isBarrier and route through inputRDD.barrier().mapPartitions in the barrier case, matching what Spark's MapInBatchExec.doExecute does. Add a pytest case that calls BarrierTaskContext.get() inside the UDF, which raises if the task is not running in a barrier stage; runs in both vanilla and optimized modes. Drop the isBarrier limitation note from the user guide. --- docs/source/user-guide/latest/pyarrow-udfs.md | 3 -- .../sql/comet/CometPythonMapInArrowExec.scala | 15 ++++++-- .../resources/pyspark/test_pyarrow_udf.py | 38 +++++++++++++++++++ 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 08a731e5de..6a95fbac0d 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -158,6 +158,3 @@ optimization fired. - Spark 3.4 lacks several APIs the optimization depends on (`MapInBatchExec.isBarrier`, `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor). On Spark 3.4 the feature is a no-op even when enabled. Spark 3.5+ is required. 
-- The `isBarrier` flag on `mapInArrow` / `mapInPandas` is currently captured but not propagated - through to the Python runner. If your job depends on barrier-execution semantics, leave the - optimization disabled until this is fixed. diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala index 9b3e820023..68e27b9355 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala @@ -93,12 +93,13 @@ case class CometPythonMapInArrowExec( val inputRDD = child.executeColumnar() - inputRDD.mapPartitionsInternal { batches => + // Run on every partition. Identical to what MapInBatchExec does, except the input + // is columnar; we intentionally avoid the UnsafeProjection copy that ColumnarToRow + // would do. + def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = { val context = TaskContext.get() val argOffsets = Array(Array(0)) - // Convert columnar batches to rows using lightweight rowIterator - // (avoids UnsafeProjection copy that ColumnarToRow would do) val rowIter = batches.flatMap { batch => numInputRows += batch.numRows() batch.rowIterator().asScala @@ -137,6 +138,14 @@ case class CometPythonMapInArrowExec( flattenedBatch } } + + // Preserve isBarrier semantics: when set, run inside a barrier stage so all tasks + // are gang-scheduled and BarrierTaskContext.barrier() works inside the UDF. 
+ if (isBarrier) { + inputRDD.barrier().mapPartitions(processPartition) + } else { + inputRDD.mapPartitionsInternal(processPartition) + } } override protected def withNewChildInternal(newChild: SparkPlan): CometPythonMapInArrowExec = diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index b62db73be1..ea72436841 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -475,3 +475,41 @@ def passthrough(iterator): out = sorted((r["id"], r["value"]) for r in result_df.collect()) assert out == sorted(rows) + + +def test_map_in_arrow_barrier_mode(spark, tmp_path, accelerated): + """ + `mapInArrow(..., barrier=True)` runs the stage in barrier execution mode + (gang scheduling, all-or-nothing failure semantics, BarrierTaskContext + available inside the UDF). The optimization captures isBarrier in the + operator constructor and must propagate it through to RDD.barrier(); + otherwise the runtime context the UDF sees changes when the optimization + fires and any code calling BarrierTaskContext APIs breaks. + """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + ] + ) + rows = [(i, float(i)) for i in range(20)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def assert_barrier_context(iterator): + from pyspark import BarrierTaskContext + + # Will raise if the task is not running inside a barrier stage. 
+ BarrierTaskContext.get() + for batch in iterator: + yield batch + + result_df = ( + spark.read.parquet(src).mapInArrow( + assert_barrier_context, schema_in, barrier=True + ) + ) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = sorted((r["id"], r["value"]) for r in result_df.collect()) + assert out == sorted(rows) From 24dc84b86368a73cf0a509a457cfab0af15cfda7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 8 May 2026 04:56:15 -0600 Subject: [PATCH 17/54] refactor: address PR review feedback for PyArrow UDF acceleration Conf and operator rename: - spark.comet.exec.pythonMapInArrow.enabled -> spark.comet.exec.pyarrowUdf.enabled - CometPythonMapInArrowExec -> CometMapInBatchExec, matching Spark's MapInBatchExec parent class and reflecting that this op handles MapInArrow + MapInPandas Shim and rule cleanup: - Mix CometPlan into the operator - Use PythonEvalType.SQL_MAP_ARROW_ITER_UDF / SQL_MAP_PANDAS_ITER_UDF directly in the matchers instead of dereferencing PythonUDF.evalType - Replace 5-tuple shim return with named MapInBatchInfo case class - Collapse double matcher evaluation in the rule into a single match - Remove unreachable ColumnarToRowExec branch in extractColumnarChild - Reduce computeArrowPython parameter count by passing SQLConf and deriving timeZoneId / largeVarTypes / pythonRunnerConf / jobArtifactUUID inside - Add a why-comment to the doExecute fallback - Drop comments that restate the code Spark 4.x shim consolidation: - Move shared 4.x matchers and runnerInputs helper into spark-4.x/ Spark4xMapInBatchSupport, leaving each minor's ShimCometMapInBatch as a small ArrowPythonRunner constructor factory CI workflow: - Switch pyarrow_udf_test workflow from Spark 3.5 to Spark 4.0 to cover the 4.x shim path; build in debug mode (no -Prelease, no cargo --release) Tests: - Replace the no-op CometPythonMapInArrowSuite with per-Spark-version CometMapInBatchSuite under spark/src/test/spark-{3.5,4.x} that constructs a 
PythonMapInArrowExec / MapInArrowExec over a stub CometPlan leaf and verifies EliminateRedundantTransitions rewrites it to CometMapInBatchExec (and does not when the conf is disabled) - Consolidate jar resolution into spark/src/test/resources/pyspark/conftest.py; pytest and the benchmark script both import resolve_comet_jar from there, and the workflow no longer needs an inline ls/grep - Update plan-string assertions to look for CometMapInBatch and the substring MapInArrow which is shared by Spark 3.5's PythonMapInArrowExec and Spark 4.x's MapInArrowExec node names Docs: - Rename references in the user guide and add a barrier=True section noting that isBarrier is propagated through RDD.barrier() --- .github/workflows/pyarrow_udf_test.yml | 15 +-- .../scala/org/apache/comet/CometConf.scala | 4 +- docs/source/user-guide/latest/pyarrow-udfs.md | 44 +++++-- .../rules/EliminateRedundantTransitions.scala | 52 +++++---- ...owExec.scala => CometMapInBatchExec.scala} | 62 +++++----- .../sql/comet/shims/MapInBatchInfo.scala | 36 ++++++ ...nArrow.scala => ShimCometMapInBatch.scala} | 24 +--- ...nArrow.scala => ShimCometMapInBatch.scala} | 48 ++++---- .../sql/comet/shims/ShimCometMapInBatch.scala | 56 +++++++++ .../shims/ShimCometPythonMapInArrow.scala | 86 -------------- .../sql/comet/shims/ShimCometMapInBatch.scala | 57 ++++++++++ .../shims/ShimCometPythonMapInArrow.scala | 87 -------------- .../sql/comet/shims/ShimCometMapInBatch.scala | 56 +++++++++ .../shims/ShimCometPythonMapInArrow.scala | 86 -------------- .../shims/Spark4xMapInBatchSupport.scala | 81 +++++++++++++ .../pyspark/benchmark_pyarrow_udf.py | 48 +++----- spark/src/test/resources/pyspark/conftest.py | 73 ++++++++++++ .../resources/pyspark/test_pyarrow_udf.py | 72 +++--------- .../exec/CometPythonMapInArrowSuite.scala | 66 ----------- .../sql/comet/CometMapInBatchSuite.scala | 106 +++++++++++++++++ .../sql/comet/CometMapInBatchSuite.scala | 107 ++++++++++++++++++ 21 files changed, 730 insertions(+), 536 
deletions(-) rename spark/src/main/scala/org/apache/spark/sql/comet/{CometPythonMapInArrowExec.scala => CometMapInBatchExec.scala} (71%) create mode 100644 spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala rename spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/{ShimCometPythonMapInArrow.scala => ShimCometMapInBatch.scala} (69%) rename spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/{ShimCometPythonMapInArrow.scala => ShimCometMapInBatch.scala} (64%) create mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala delete mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala delete mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala delete mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala create mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala create mode 100644 spark/src/test/resources/pyspark/conftest.py delete mode 100644 spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala create mode 100644 spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala create mode 100644 spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index e8018889cc..211a9bd23a 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -58,7 +58,7 @@ env: jobs: pyarrow-udf: - name: PyArrow UDF (Spark 3.5, JDK 17, Python 3.11) + name: PyArrow UDF (Spark 4.0, JDK 17, Python 3.11) runs-on: ubuntu-latest container: 
# Pinned to the Debian 12 (bookworm) base so the system `python3` is 3.11. The default @@ -86,10 +86,10 @@ jobs: restore-keys: | ${{ runner.os }}-java-maven- - - name: Build Comet (release, Spark 3.5 / Scala 2.12) + - name: Build Comet (debug, Spark 4.0 / Scala 2.13) run: | - cd native && cargo build --release - cd .. && ./mvnw -B -Prelease install -DskipTests -Pspark-3.5 -Pscala-2.12 + cd native && cargo build + cd .. && ./mvnw -B install -DskipTests -Pspark-4.0 -Pscala-2.13 - name: Install Python 3.11 and pip run: | @@ -97,7 +97,7 @@ jobs: apt-get install -y --no-install-recommends python3 python3-venv python3-pip python3 -m venv /tmp/venv /tmp/venv/bin/pip install --upgrade pip - /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest + /tmp/venv/bin/pip install "pyspark==4.0.1" "pyarrow>=14" pandas pytest - name: Run PyArrow UDF pytest env: @@ -108,8 +108,5 @@ jobs: PYSPARK_PYTHON: /tmp/venv/bin/python PYSPARK_DRIVER_PYTHON: /tmp/venv/bin/python run: | - jar=$(ls "$PWD"/spark/target/comet-spark-spark3.5_2.12-*-SNAPSHOT.jar \ - | grep -v sources | grep -v tests | head -n1) - echo "Using $jar" - COMET_JAR="$jar" /tmp/venv/bin/python -m pytest -v \ + /tmp/venv/bin/python -m pytest -v \ spark/src/test/resources/pyspark/test_pyarrow_udf.py diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 675e872b6e..0bdc35d3ce 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -314,8 +314,8 @@ object CometConf extends ShimCometConf { .booleanConf .createWithDefault(false) - val COMET_PYTHON_MAP_IN_ARROW_ENABLED: ConfigEntry[Boolean] = - conf("spark.comet.exec.pythonMapInArrow.enabled") + val COMET_PYARROW_UDF_ENABLED: ConfigEntry[Boolean] = + conf("spark.comet.exec.pyarrowUdf.enabled") .category(CATEGORY_EXEC) .doc( "Experimental: whether to enable optimized execution of PyArrow UDFs " + diff --git 
a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 6a95fbac0d..23ef50e79c 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -40,22 +40,43 @@ Steps 2 and 3 are redundant since the data starts and ends in Arrow format. ## How Comet Optimizes This -When enabled, Comet detects `PythonMapInArrowExec` and `MapInPandasExec` operators in the physical plan -and replaces them with `CometPythonMapInArrowExec`, which: +When enabled, Comet detects `PythonMapInArrowExec` / `MapInArrowExec` and `MapInPandasExec` +operators in the physical plan and replaces them with `CometMapInBatchExec`, which: - Reads Arrow columnar batches directly from the upstream Comet operator - Feeds them to the Python runner without the expensive UnsafeProjection copy - Keeps the Python output in columnar format for downstream operators This eliminates the ColumnarToRow transition and the output row conversion, reducing CPU overhead -and memory allocations. +and memory allocations. The internal row-to-Arrow IPC re-encoding inside Spark's +`ArrowPythonRunner` is unchanged in this version; full round-trip elimination is tracked in +[#4240](https://github.com/apache/datafusion-comet/issues/4240). + +### Plan flow + +Without Comet's optimization: + +``` +PythonMapInArrow / MapInArrow / MapInPandas ++- ColumnarToRow <- Arrow -> Row copy + +- CometNativeExec <- Arrow batch + +- CometScan +``` + +With the optimization enabled: + +``` +CometMapInBatch <- Arrow batch in/out, Python runner attached ++- CometNativeExec + +- CometScan +``` ## Configuration The optimization is experimental and disabled by default. Enable it with: ``` -spark.comet.exec.pythonMapInArrow.enabled=true +spark.comet.exec.pyarrowUdf.enabled=true ``` The default is `false` while the feature stabilizes. 
@@ -79,7 +100,7 @@ spark = SparkSession.builder \ .config("spark.plugins", "org.apache.spark.CometPlugin") \ .config("spark.comet.enabled", "true") \ .config("spark.comet.exec.enabled", "true") \ - .config("spark.comet.exec.pythonMapInArrow.enabled", "true") \ + .config("spark.comet.exec.pyarrowUdf.enabled", "true") \ .config("spark.memory.offHeap.enabled", "true") \ .config("spark.memory.offHeap.size", "2g") \ .getOrCreate() @@ -102,7 +123,7 @@ result = df.mapInArrow(transform, output_schema) ## Verifying the Optimization -Use `explain()` to verify that `CometPythonMapInArrow` appears in your plan: +Use `explain()` to verify that `CometMapInBatch` appears in your plan: ```python result.explain(mode="extended") @@ -111,7 +132,7 @@ result.explain(mode="extended") You should see: ``` -CometPythonMapInArrow ... +CometMapInBatch ... +- CometNativeExec ... +- CometScan ... ``` @@ -137,9 +158,16 @@ AdaptiveSparkPlan isFinalPlan=false To see the optimized plan, run an action first (for example `result.collect()` or `result.cache(); result.count()`) and then call `explain()`. The post-execution -plan shows the materialized stages and includes `CometPythonMapInArrow` if the +plan shows the materialized stages and includes `CometMapInBatch` if the optimization fired. +## Barrier execution + +`mapInArrow(..., barrier=True)` and `mapInPandas(..., barrier=True)` are honored: the +optimized operator propagates `isBarrier` through `RDD.barrier()`, so all tasks are +gang-scheduled and `BarrierTaskContext.barrier()` works inside the UDF the same way it does +on the unoptimized path. + ## Limitations - The optimization currently applies only to `mapInArrow` and `mapInPandas`. 
Scalar pandas UDFs diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index e7218ab935..24c969c173 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -22,9 +22,9 @@ package org.apache.comet.rules import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.sideBySide -import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometPythonMapInArrowExec, CometSparkToColumnarExec} +import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometMapInBatchExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometSparkToColumnarExec} import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec} -import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow +import org.apache.spark.sql.comet.shims.ShimCometMapInBatch import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.QueryStageExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec @@ -54,7 +54,7 @@ import org.apache.comet.CometConf // be removed. 
case class EliminateRedundantTransitions(session: SparkSession) extends Rule[SparkPlan] - with ShimCometPythonMapInArrow { + with ShimCometMapInBatch { private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get() @@ -102,24 +102,25 @@ case class EliminateRedundantTransitions(session: SparkSession) sparkToColumnar.child case CometSparkToColumnarExec(child: CometSparkToColumnarExec) => child // Replace MapInBatchExec (PythonMapInArrowExec / MapInArrowExec / MapInPandasExec) that has - // a ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary - // Arrow->Row->Arrow round-trip. The matchers are version-shimmed: Spark 3.4 returns None - // (it lacks the required APIs) and Spark 4.1+ matches the renamed `MapInArrowExec`. - case p: SparkPlan - if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() && - matchMapInArrow(p).orElse(matchMapInPandas(p)).isDefined => - val (mapFunc, mapOutput, mapChild, mapIsBarrier, mapEvalType) = - matchMapInArrow(p).orElse(matchMapInPandas(p)).get - extractColumnarChild(mapChild) - .map { columnarChild => - CometPythonMapInArrowExec( - mapFunc, - mapOutput, - columnarChild, - mapIsBarrier, - mapEvalType) - } - .getOrElse(p) + // a ColumnarToRow child with CometMapInBatchExec, eliminating the input and output + // UnsafeProjection copies and keeping the stage columnar. The matchers are + // version-shimmed: Spark 3.4 returns None (it lacks the required APIs) and Spark 4.1+ + // matches the renamed `MapInArrowExec`. + case p: SparkPlan if CometConf.COMET_PYARROW_UDF_ENABLED.get() => + matchMapInArrow(p).orElse(matchMapInPandas(p)) match { + case Some(info) => + extractColumnarChild(info.child) + .map { columnarChild => + CometMapInBatchExec( + info.func, + info.output, + columnarChild, + info.isBarrier, + info.pythonEvalType) + } + .getOrElse(p) + case None => p + } // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the // shuffle takes row-based input. 
@@ -154,12 +155,13 @@ case class EliminateRedundantTransitions(session: SparkSession) } /** - * If the given plan is a ColumnarToRow transition wrapping a columnar child, returns that - * columnar child. Used to detect and eliminate unnecessary transitions before Python UDF - * operators. + * If the given plan is a Comet ColumnarToRow transition, returns the columnar child the Python + * UDF operator can consume directly. By the time this rule runs the earlier + * `hasCometNativeChild` arm has already rewritten any `ColumnarToRowExec` over a Comet columnar + * source to one of the Comet variants, so vanilla `ColumnarToRowExec` cannot reach here on a + * Comet-driven plan and is intentionally not handled. */ private def extractColumnarChild(plan: SparkPlan): Option[SparkPlan] = plan match { - case ColumnarToRowExec(child) if child.supportsColumnar => Some(child) case CometColumnarToRowExec(child) => Some(child) case CometNativeColumnarToRowExec(child) => Some(child) case _ => None diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala similarity index 71% rename from spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala rename to spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala index 68e27b9355..77dbfff7ce 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow +import org.apache.spark.sql.comet.shims.ShimCometMapInBatch import org.apache.spark.sql.execution.{ColumnarToRowExec, 
SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.python.{BatchIterator, PythonSQLMetrics} @@ -35,29 +35,26 @@ import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} /** - * An optimized version of Spark's MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that - * accepts columnar input directly from Comet operators, avoiding unnecessary Arrow -> Row -> - * Arrow conversions. + * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` / + * `MapInArrowExec` in 4.1+ / `MapInPandasExec`). Accepts columnar input directly from a Comet + * child instead of going through the per-row `UnsafeProjection` that `ColumnarToRowExec` applies, + * and keeps the Python runner output as `ColumnarBatch` so downstream Comet operators consume it + * natively. * - * Normal Spark flow: CometNativeExec (Arrow) -> ColumnarToRow -> PythonMapInArrowExec - * (internally: rows -> Arrow -> Python -> Arrow -> rows) - * - * Optimized flow: CometNativeExec (Arrow) -> CometPythonMapInArrowExec (batch.rowIterator() -> - * Arrow -> Python -> Arrow columnar output) - * - * This eliminates: - * 1. The UnsafeProjection in ColumnarToRow (expensive copy) 2. The output Arrow->Row conversion - * (keeps Python output as ColumnarBatch) + * What this eliminates: two `UnsafeProjection` copies (input and output) and the row transition + * between Comet and the Python operator. The internal row-to-Arrow IPC re-encoding inside + * `ArrowPythonRunner` is unchanged; full round-trip elimination is tracked in #4240. 
*/ -case class CometPythonMapInArrowExec( +case class CometMapInBatchExec( func: Expression, output: Seq[Attribute], child: SparkPlan, isBarrier: Boolean, pythonEvalType: Int) extends UnaryExecNode + with CometPlan with PythonSQLMetrics - with ShimCometPythonMapInArrow { + with ShimCometMapInBatch { override def supportsColumnar: Boolean = true @@ -71,6 +68,9 @@ case class CometPythonMapInArrowExec( "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows")) ++ pythonMetrics + // Fallback for row-consuming parents (e.g. a top-level `collect()` that produces rows). + // Wraps this columnar exec in `ColumnarToRowExec`, reintroducing exactly the row transition + // this operator otherwise eliminates. Only fires when nothing downstream consumes columnar. override def doExecute(): RDD[InternalRow] = { ColumnarToRowExec(this).doExecute() } @@ -81,21 +81,15 @@ case class CometPythonMapInArrowExec( val numInputRows = longMetric("numInputRows") val pythonUDF = func.asInstanceOf[PythonUDF] - val localOutput = output - val localChildSchema = child.schema + val outputAttrs = output + val childSchema = child.schema val batchSize = conf.arrowMaxRecordsPerBatch - val sessionLocalTimeZone = conf.sessionLocalTimeZone - val useLargeVarTypes = largeVarTypes(conf) - val pythonRunnerConf = getPythonRunnerConfMap(conf) - val localPythonEvalType = pythonEvalType - val localPythonMetrics = pythonMetrics - val jobArtifactUUID = currentJobArtifactUUID() + val evalType = pythonEvalType + val sqlConf = conf + val metricsCopy = pythonMetrics val inputRDD = child.executeColumnar() - // Run on every partition. Identical to what MapInBatchExec does, except the input - // is columnar; we intentionally avoid the UnsafeProjection copy that ColumnarToRow - // would do. 
def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = { val context = TaskContext.get() val argOffsets = Array(Array(0)) @@ -115,22 +109,18 @@ case class CometPythonMapInArrowExec( val columnarBatchIter = computeArrowPython( pythonUDF, - localPythonEvalType, + evalType, argOffsets, - StructType(Array(StructField("struct", localChildSchema))), - sessionLocalTimeZone, - useLargeVarTypes, - pythonRunnerConf, - localPythonMetrics, - jobArtifactUUID, + StructType(Array(StructField("struct", childSchema))), + sqlConf, + metricsCopy, batchIter, context.partitionId(), context) columnarBatchIter.map { batch => - // Python returns a StructType column; flatten to individual columns val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] - val outputVectors = localOutput.indices.map(structVector.getChild) + val outputVectors = outputAttrs.indices.map(structVector.getChild) val flattenedBatch = new ColumnarBatch(outputVectors.toArray) flattenedBatch.setNumRows(batch.numRows()) numOutputRows += flattenedBatch.numRows() @@ -148,6 +138,6 @@ case class CometPythonMapInArrowExec( } } - override protected def withNewChildInternal(newChild: SparkPlan): CometPythonMapInArrowExec = + override protected def withNewChildInternal(newChild: SparkPlan): CometMapInBatchExec = copy(child = newChild) } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala b/spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala new file mode 100644 index 0000000000..f610c575b1 --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.execution.SparkPlan + +/** + * Spark-version-agnostic projection of a `MapInBatchExec` (`PythonMapInArrowExec`, + * `MapInArrowExec`, or `MapInPandasExec`) that the Comet rewrite needs. Lives in the shared + * (non-version-specific) source tree so the Comet planner can pattern-match on it without + * depending on which concrete Spark class was matched.
+ */ +case class MapInBatchInfo( + func: Expression, + output: Seq[Attribute], + child: SparkPlan, + isBarrier: Boolean, + pythonEvalType: Int) diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala similarity index 69% rename from spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala rename to spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index 30736d99b3..c7d6ae2f97 100644 --- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala +++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -21,7 +21,7 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf @@ -36,33 +36,21 @@ import org.apache.spark.sql.vectorized.ColumnarBatch * matchers return `None` and the runner factory throws. The optimization is effectively a no-op * on Spark 3.4. 
*/ -trait ShimCometPythonMapInArrow { +trait ShimCometMapInBatch { - protected def matchMapInArrow( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None + protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = None - protected def matchMapInPandas( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None - - protected def currentJobArtifactUUID(): Option[String] = None - - protected def largeVarTypes(conf: SQLConf): Boolean = false - - protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = Map.empty + protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None protected def computeArrowPython( pythonUDF: PythonUDF, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - timeZoneId: String, - largeVarTypes: Boolean, - pythonRunnerConf: Map[String, String], + conf: SQLConf, pythonMetrics: Map[String, SQLMetric], - jobArtifactUUID: Option[String], batchIter: Iterator[Iterator[InternalRow]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = - throw new UnsupportedOperationException( - "CometPythonMapInArrowExec is not supported on Spark 3.4") + throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4") } diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala similarity index 64% rename from spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala rename to spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index f7c8221d9e..42d66465f4 100644 --- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala +++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.comet.shims import 
org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInPandasExec, PythonMapInArrowExec} @@ -30,54 +30,54 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch -trait ShimCometPythonMapInArrow { +trait ShimCometMapInBatch { - protected def matchMapInArrow( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = plan match { case p: PythonMapInArrowExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + Some( + MapInBatchInfo( + p.func, + p.output, + p.child, + p.isBarrier, + PythonEvalType.SQL_MAP_ARROW_ITER_UDF)) case _ => None } - protected def matchMapInPandas( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = + protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = plan match { case p: MapInPandasExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) + Some( + MapInBatchInfo( + p.func, + p.output, + p.child, + p.isBarrier, + PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)) case _ => None } - protected def currentJobArtifactUUID(): Option[String] = - JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) - - protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes - - protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, 
String] = - ArrowPythonRunner.getPythonRunnerConfMap(conf) - protected def computeArrowPython( pythonUDF: PythonUDF, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - timeZoneId: String, - largeVarTypes: Boolean, - pythonRunnerConf: Map[String, String], + conf: SQLConf, pythonMetrics: Map[String, SQLMetric], - jobArtifactUUID: Option[String], batchIter: Iterator[Iterator[InternalRow]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = { val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))) + val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) new ArrowPythonRunner( chainedFunc, evalType, argOffsets, schema, - timeZoneId, - largeVarTypes, - pythonRunnerConf, + conf.sessionLocalTimeZone, + conf.arrowUseLargeVarTypes, + ArrowPythonRunner.getPythonRunnerConfMap(conf), pythonMetrics, jobArtifactUUID).compute(batchIter, partitionId, context) } diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala new file mode 100644 index 0000000000..0c21cb3738 --- /dev/null +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.TaskContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.PythonUDF +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.ArrowPythonRunner +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + conf: SQLConf, + pythonMetrics: Map[String, SQLMetric], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + val r = runnerInputs(pythonUDF, conf) + new ArrowPythonRunner( + r.chainedFunc, + evalType, + argOffsets, + schema, + r.timeZoneId, + r.largeVarTypes, + r.pythonRunnerConf, + pythonMetrics, + r.jobArtifactUUID, + None).compute(batchIter, partitionId, context) + } +} diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala deleted file mode 100644 index 78935f54c5..0000000000 --- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.spark.sql.comet.shims - -import org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.api.python.ChainedPythonFunctions -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch - -trait ShimCometPythonMapInArrow { - - protected def matchMapInArrow( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = - plan match { - case p: MapInArrowExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) - case _ => None - } - - protected def matchMapInPandas( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = - plan match { - case p: MapInPandasExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) - case _ => None - } - - protected def currentJobArtifactUUID(): Option[String] = - JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) - - protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes - - protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = - ArrowPythonRunner.getPythonRunnerConfMap(conf) - - 
protected def computeArrowPython( - pythonUDF: PythonUDF, - evalType: Int, - argOffsets: Array[Array[Int]], - schema: StructType, - timeZoneId: String, - largeVarTypes: Boolean, - pythonRunnerConf: Map[String, String], - pythonMetrics: Map[String, SQLMetric], - jobArtifactUUID: Option[String], - batchIter: Iterator[Iterator[InternalRow]], - partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val chainedFunc = - Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)) - new ArrowPythonRunner( - chainedFunc, - evalType, - argOffsets, - schema, - timeZoneId, - largeVarTypes, - pythonRunnerConf, - pythonMetrics, - jobArtifactUUID, - None).compute(batchIter, partitionId, context) - } -} diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala new file mode 100644 index 0000000000..e73748aafe --- /dev/null +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.TaskContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.PythonUDF +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.ArrowPythonRunner +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + conf: SQLConf, + pythonMetrics: Map[String, SQLMetric], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + val r = runnerInputs(pythonUDF, conf) + new ArrowPythonRunner( + r.chainedFunc, + evalType, + argOffsets, + schema, + r.timeZoneId, + r.largeVarTypes, + r.pythonRunnerConf, + pythonMetrics, + r.jobArtifactUUID, + None, + None).compute(batchIter, partitionId, context) + } +} diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala deleted file mode 100644 index f7f775b1fa..0000000000 --- a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.spark.sql.comet.shims - -import org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.api.python.ChainedPythonFunctions -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch - -trait ShimCometPythonMapInArrow { - - protected def matchMapInArrow( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = - plan match { - case p: MapInArrowExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) - case _ => None - } - - protected def matchMapInPandas( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = - plan match { - case p: MapInPandasExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) - case _ => None - } - - protected def currentJobArtifactUUID(): Option[String] = - JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) - - protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes - - protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = - ArrowPythonRunner.getPythonRunnerConfMap(conf) - - 
protected def computeArrowPython( - pythonUDF: PythonUDF, - evalType: Int, - argOffsets: Array[Array[Int]], - schema: StructType, - timeZoneId: String, - largeVarTypes: Boolean, - pythonRunnerConf: Map[String, String], - pythonMetrics: Map[String, SQLMetric], - jobArtifactUUID: Option[String], - batchIter: Iterator[Iterator[InternalRow]], - partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val chainedFunc = - Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)) - new ArrowPythonRunner( - chainedFunc, - evalType, - argOffsets, - schema, - timeZoneId, - largeVarTypes, - pythonRunnerConf, - pythonMetrics, - jobArtifactUUID, - None, - None).compute(batchIter, partitionId, context) - } -} diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala new file mode 100644 index 0000000000..0c21cb3738 --- /dev/null +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.TaskContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.PythonUDF +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.ArrowPythonRunner +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { + + protected def computeArrowPython( + pythonUDF: PythonUDF, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + conf: SQLConf, + pythonMetrics: Map[String, SQLMetric], + batchIter: Iterator[Iterator[InternalRow]], + partitionId: Int, + context: TaskContext): Iterator[ColumnarBatch] = { + val r = runnerInputs(pythonUDF, conf) + new ArrowPythonRunner( + r.chainedFunc, + evalType, + argOffsets, + schema, + r.timeZoneId, + r.largeVarTypes, + r.pythonRunnerConf, + pythonMetrics, + r.jobArtifactUUID, + None).compute(batchIter, partitionId, context) + } +} diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala deleted file mode 100644 index 78935f54c5..0000000000 --- a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.spark.sql.comet.shims - -import org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.api.python.ChainedPythonFunctions -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch - -trait ShimCometPythonMapInArrow { - - protected def matchMapInArrow( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = - plan match { - case p: MapInArrowExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) - case _ => None - } - - protected def matchMapInPandas( - plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = - plan match { - case p: MapInPandasExec => - Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType)) - case _ => None - } - - protected def currentJobArtifactUUID(): Option[String] = - JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) - - protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes - - protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = - ArrowPythonRunner.getPythonRunnerConfMap(conf) - - 
protected def computeArrowPython( - pythonUDF: PythonUDF, - evalType: Int, - argOffsets: Array[Array[Int]], - schema: StructType, - timeZoneId: String, - largeVarTypes: Boolean, - pythonRunnerConf: Map[String, String], - pythonMetrics: Map[String, SQLMetric], - jobArtifactUUID: Option[String], - batchIter: Iterator[Iterator[InternalRow]], - partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val chainedFunc = - Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)) - new ArrowPythonRunner( - chainedFunc, - evalType, - argOffsets, - schema, - timeZoneId, - largeVarTypes, - pythonRunnerConf, - pythonMetrics, - jobArtifactUUID, - None).compute(batchIter, partitionId, context) - } -} diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala new file mode 100644 index 0000000000..78672aea5e --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet.shims + +import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.PythonUDF +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Shared 4.x bits for `ShimCometMapInBatch`. The matchers and `runnerInputs` helper are + * identical across 4.0/4.1/4.2; only the `ArrowPythonRunner` constructor parameter list differs + * per minor, so each minor's `ShimCometMapInBatch` provides only `computeArrowPython`. + */ +trait Spark4xMapInBatchSupport { + + protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = + plan match { + case p: MapInArrowExec => + Some( + MapInBatchInfo( + p.func, + p.output, + p.child, + p.isBarrier, + PythonEvalType.SQL_MAP_ARROW_ITER_UDF)) + case _ => None + } + + protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = + plan match { + case p: MapInPandasExec => + Some( + MapInBatchInfo( + p.func, + p.output, + p.child, + p.isBarrier, + PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)) + case _ => None + } + + /** Inputs every 4.x `ArrowPythonRunner` constructor needs in the same shape.
*/ + protected case class RunnerInputs( + chainedFunc: Seq[(ChainedPythonFunctions, Long)], + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + jobArtifactUUID: Option[String]) + + protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = + RunnerInputs( + chainedFunc = Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)), + timeZoneId = conf.sessionLocalTimeZone, + largeVarTypes = conf.arrowUseLargeVarTypes, + pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf), + jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)) +} diff --git a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py index 8a3b4333c4..49574130c0 100644 --- a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py @@ -20,7 +20,7 @@ End-to-end wall-clock benchmark for Comet's PyArrow UDF acceleration. Times `df.mapInArrow(passthrough, schema).count()` and the equivalent -`mapInPandas` query with `spark.comet.exec.pythonMapInArrow.enabled` set +`mapInPandas` query with `spark.comet.exec.pyarrowUdf.enabled` set to false (vanilla Spark path) and true (Comet's optimized path). Both modes run the same Python worker, so the measured delta covers what the optimization actually changes for users: @@ -34,6 +34,12 @@ optimization's user-visible value is what fraction of end-to-end time it shaves off, not the JVM-side delta in isolation. +Caveat: the workload here is `passthrough_udf` + `count()` on `local[2]`, +so most of the wall time is Spark's Python fork/IPC overhead with very +little real Python work. Real UDFs (PyArrow compute, pandas ops, model +inference) increase the per-row Python cost, which dilutes the JVM-side +savings and shrinks the speedup ratio relative to what you see here. 
+ Usage: # Build Comet (release for representative numbers): make release @@ -50,48 +56,20 @@ """ import contextlib -import glob import os import statistics +import sys import tempfile import time from pyspark.sql import SparkSession - -REPO_ROOT = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..") -) - - -def _resolve_comet_jar() -> str: - explicit = os.environ.get("COMET_JAR") - if explicit: - return explicit - import pyspark - - major_minor = ".".join(pyspark.__version__.split(".")[:2]) - spark_tag = f"spark{major_minor}" - scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13" - pattern = os.path.join( - REPO_ROOT, - f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar", - ) - candidates = [ - m - for m in sorted(glob.glob(pattern)) - if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m) - ] - if not candidates: - raise FileNotFoundError( - "Comet jar not found. Set COMET_JAR or run `make release`. " - f"Looked under {pattern}." 
- ) - return candidates[-1] +sys.path.insert(0, os.path.dirname(__file__)) +from conftest import resolve_comet_jar def _build_spark() -> SparkSession: - jar = _resolve_comet_jar() + jar = resolve_comet_jar() os.environ["PYSPARK_SUBMIT_ARGS"] = ( f"--jars {jar} --driver-class-path {jar} pyspark-shell" ) @@ -165,7 +143,7 @@ def _temp_parquet(spark: SparkSession, build_df, n: int): def _time_run(spark: SparkSession, parquet_path: str, accelerate: bool, api: str) -> float: spark.conf.set( - "spark.comet.exec.pythonMapInArrow.enabled", + "spark.comet.exec.pyarrowUdf.enabled", "true" if accelerate else "false", ) df = spark.read.parquet(parquet_path) @@ -189,7 +167,7 @@ def main() -> None: print(f"\nrows per run: {rows:,}") print(f"warmup iters: {warmup}, measured iters: {iters}") - print(f"jar: {_resolve_comet_jar()}\n") + print(f"jar: {resolve_comet_jar()}\n") header = " {:<14} {:<10} {:>10} {:>10} {:>10} {:>13} {:>9}".format( "api", "mode", "min (s)", "median (s)", "max (s)", "rows/s", "speedup" diff --git a/spark/src/test/resources/pyspark/conftest.py b/spark/src/test/resources/pyspark/conftest.py new file mode 100644 index 0000000000..35d6d85191 --- /dev/null +++ b/spark/src/test/resources/pyspark/conftest.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Shared helpers for the pytest modules under this directory and for the +benchmark scripts that import them. + +`resolve_comet_jar` returns the path to the Comet jar a Spark session needs. +Resolution order: the `COMET_JAR` env var (expanded as a glob if it contains +`*`, `?` or `[`, taken verbatim otherwise), then `REPO_ROOT/spark/target` +matched against the installed pyspark major.minor version. +""" + +import glob +import os + + +REPO_ROOT = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..") +) + + +def resolve_comet_jar() -> str: + explicit = os.environ.get("COMET_JAR") + if explicit: + if any(ch in explicit for ch in "*?["): + matches = sorted(glob.glob(explicit)) + if not matches: + raise FileNotFoundError( + f"COMET_JAR pattern matched nothing: {explicit}" + ) + return matches[-1] + return explicit + + # Pick the jar that matches the installed pyspark major.minor version. The + # Comet jars are published per Spark version (e.g. + # comet-spark-spark3.5_2.12-*.jar); using the wrong one yields + # ClassNotFoundException on Scala stdlib classes. + import pyspark + + major_minor = ".".join(pyspark.__version__.split(".")[:2]) + spark_tag = f"spark{major_minor}" + scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13" + pattern = os.path.join( + REPO_ROOT, + f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar", + ) + candidates = [ + m + for m in sorted(glob.glob(pattern)) + if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m) + ] + if not candidates: + raise FileNotFoundError( + "Comet jar not found. Set COMET_JAR or run `make release`. " + f"Looked under {pattern}."
+ ) + return candidates[-1] diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index ea72436841..87558ec057 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -20,14 +20,14 @@ Pytest-driven integration tests for Comet's PyArrow UDF acceleration. Each test runs against two execution paths: - - "accelerated": spark.comet.exec.pythonMapInArrow.enabled=true - (plan should contain CometPythonMapInArrow and no ColumnarToRow) - - "fallback": spark.comet.exec.pythonMapInArrow.enabled=false - (plan should contain vanilla PythonMapInArrow) + - "accelerated": spark.comet.exec.pyarrowUdf.enabled=true + (plan should contain CometMapInBatch and no ColumnarToRow) + - "fallback": spark.comet.exec.pyarrowUdf.enabled=false + (plan should contain vanilla PythonMapInArrow / MapInArrow) Usage: # Build Comet first: - make release + make # Then either let the test discover the jar from spark/target, or pass it # explicitly via COMET_JAR: @@ -38,7 +38,6 @@ """ import datetime as dt -import glob import os from decimal import Decimal @@ -46,52 +45,12 @@ import pytest from pyspark.sql import SparkSession, types as T - -REPO_ROOT = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..") -) - - -def _resolve_comet_jar() -> str: - explicit = os.environ.get("COMET_JAR") - if explicit: - if any(ch in explicit for ch in "*?["): - matches = sorted(glob.glob(explicit)) - if not matches: - raise FileNotFoundError( - f"COMET_JAR pattern matched nothing: {explicit}" - ) - return matches[-1] - return explicit - - # Pick the jar that matches the installed pyspark major.minor version. The - # Comet jars are published per Spark version (e.g., comet-spark-spark3.5_2.12-*.jar); - # using the wrong one yields ClassNotFoundException on Scala stdlib classes. 
- import pyspark - - major_minor = ".".join(pyspark.__version__.split(".")[:2]) - spark_tag = f"spark{major_minor}" - scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13" - pattern = os.path.join( - REPO_ROOT, - f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar", - ) - candidates = [ - m - for m in sorted(glob.glob(pattern)) - if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m) - ] - if not candidates: - raise FileNotFoundError( - "Comet jar not found. Set COMET_JAR or run `make release`. " - f"Looked under {pattern}." - ) - return candidates[-1] +from conftest import resolve_comet_jar @pytest.fixture(scope="session") def spark(): - jar = _resolve_comet_jar() + jar = resolve_comet_jar() # PYSPARK_SUBMIT_ARGS is consumed when pyspark launches its JVM. Setting # --jars puts the Comet jar on both driver and executor classpaths so the # CometPlugin can be loaded. @@ -117,7 +76,7 @@ def spark(): @pytest.fixture(params=[True, False], ids=["accelerated", "fallback"]) def accelerated(request, spark) -> bool: spark.conf.set( - "spark.comet.exec.pythonMapInArrow.enabled", + "spark.comet.exec.pyarrowUdf.enabled", "true" if request.param else "false", ) return request.param @@ -128,18 +87,18 @@ def _executed_plan(df) -> str: def _assert_plan_matches_mode( - plan: str, accelerated: bool, vanilla_node: str = "PythonMapInArrow" + plan: str, accelerated: bool, vanilla_node: str = "MapInArrow" ) -> None: if accelerated: - assert "CometPythonMapInArrow" in plan, ( - f"expected CometPythonMapInArrow in accelerated plan, got:\n{plan}" + assert "CometMapInBatch" in plan, ( + f"expected CometMapInBatch in accelerated plan, got:\n{plan}" ) assert "ColumnarToRow" not in plan, ( f"unexpected ColumnarToRow in accelerated plan:\n{plan}" ) else: - assert "CometPythonMapInArrow" not in plan, ( - f"unexpected CometPythonMapInArrow in fallback plan:\n{plan}" + assert "CometMapInBatch" not in plan, ( + f"unexpected CometMapInBatch in fallback 
plan:\n{plan}" ) assert vanilla_node in plan, ( f"expected {vanilla_node} in fallback plan, got:\n{plan}" @@ -176,6 +135,11 @@ def double_value(iterator): assert row["name"] == original[2] +# All other tests use the default `vanilla_node="MapInArrow"`. The mapInPandas tests below +# pass `MapInPandas` explicitly. The substring is the same on Spark 3.5 (PythonMapInArrowExec) +# and Spark 4.x (MapInArrowExec) since the latter is a substring of the former. + + def test_map_in_arrow_changes_schema(spark, tmp_path, accelerated): data = [(i, float(i)) for i in range(50)] src = str(tmp_path / "src.parquet") diff --git a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala deleted file mode 100644 index 7b1e17c4ed..0000000000 --- a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.comet.exec - -import org.apache.spark.sql.CometTestBase -import org.apache.spark.sql.comet.CometPythonMapInArrowExec - -import org.apache.comet.CometConf - -class CometPythonMapInArrowSuite extends CometTestBase { - - test("plan with CometScan has columnar support for Python UDF optimization") { - withSQLConf( - CometConf.COMET_ENABLED.key -> "true", - CometConf.COMET_EXEC_ENABLED.key -> "true", - CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "true") { - withParquetTable( - (1 to 10).map(i => (i.toDouble, s"str_$i")), - "testTable", - withDictionary = false) { - val df = spark.sql("SELECT * FROM testTable") - val plan = df.queryExecution.executedPlan - val cometScans = plan.collect { case s if s.supportsColumnar => s } - assert(cometScans.nonEmpty, "Expected columnar operators that can feed Python UDFs") - } - } - } - - test("config disables Python map in arrow optimization") { - withSQLConf( - CometConf.COMET_ENABLED.key -> "true", - CometConf.COMET_EXEC_ENABLED.key -> "true", - CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "false") { - withParquetTable( - (1 to 10).map(i => (i.toDouble, s"str_$i")), - "testTable", - withDictionary = false) { - val df = spark.sql("SELECT * FROM testTable") - val plan = df.queryExecution.executedPlan - // With the feature disabled, no CometPythonMapInArrowExec should appear - val cometPythonExecs = - plan.collect { case e: CometPythonMapInArrowExec => e } - assert( - cometPythonExecs.isEmpty, - "CometPythonMapInArrowExec should not appear when disabled") - } - } - } -} diff --git a/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala new file mode 100644 index 0000000000..af960c5c97 --- /dev/null +++ b/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import org.apache.spark.api.python.{PythonAccumulatorV2, PythonBroadcast, PythonEvalType, PythonFunction} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId, PythonUDF} +import org.apache.spark.sql.execution.{ColumnarToRowExec, LeafExecNode} +import org.apache.spark.sql.execution.python.PythonMapInArrowExec +import org.apache.spark.sql.types.{LongType, StructField, StructType} +import org.apache.spark.sql.vectorized.ColumnarBatch + +import org.apache.comet.CometConf +import org.apache.comet.rules.EliminateRedundantTransitions + +/** Minimal CometPlan leaf used to anchor the rule's transform without triggering execution. 
*/ +private case class StubCometLeaf(override val output: Seq[Attribute]) + extends LeafExecNode + with CometPlan { + override def supportsColumnar: Boolean = true + override protected def doExecute(): RDD[InternalRow] = + throw new UnsupportedOperationException + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = + throw new UnsupportedOperationException +} + +/** + * Plan-rule test for the `EliminateRedundantTransitions` rewrite that produces + * `CometMapInBatchExec`. Pure Python execution paths are covered by the pytest module + * `test_pyarrow_udf.py`; this suite verifies the JVM-side rule without spinning up Python. + * + * Lives under `org.apache.spark.sql.comet` so it can reference Spark's `private[spark]` + * `PythonFunction` / `PythonAccumulatorV2` / `PythonBroadcast` classes when fabricating a stub + * `PythonUDF` for `PythonMapInArrowExec` to wrap. + */ +class CometMapInBatchSuite extends CometTestBase { + + private def stubPythonUDF: PythonUDF = { + val pyFunc = new PythonFunction { + override val command: Seq[Byte] = Seq.empty[Byte] + override val envVars: java.util.Map[String, String] = + new java.util.HashMap[String, String]() + override val pythonIncludes: java.util.List[String] = + java.util.Collections.emptyList[String]() + override val pythonExec: String = "python3" + override val pythonVer: String = "3" + override val broadcastVars: java.util.List[Broadcast[PythonBroadcast]] = + java.util.Collections.emptyList[Broadcast[PythonBroadcast]]() + override val accumulator: PythonAccumulatorV2 = null + } + PythonUDF( + name = "test_udf", + func = pyFunc, + dataType = StructType(Seq(StructField("id", LongType))), + children = Seq(AttributeReference("id", LongType)(ExprId(0L))), + evalType = PythonEvalType.SQL_MAP_ARROW_ITER_UDF, + udfDeterministic = true) + } + + private def buildPlan(): PythonMapInArrowExec = { + val cometChild = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L)))) + PythonMapInArrowExec( + stubPythonUDF, + 
cometChild.output, + ColumnarToRowExec(cometChild), + isBarrier = false) + } + + test("rule rewrites PythonMapInArrowExec over Comet to CometMapInBatchExec") { + withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") { + val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan()) + assert( + rewritten.exists(_.isInstanceOf[CometMapInBatchExec]), + s"expected CometMapInBatchExec in rewritten plan:\n$rewritten") + } + } + + test("rule does not rewrite when feature is disabled") { + withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") { + val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan()) + assert( + !rewritten.exists(_.isInstanceOf[CometMapInBatchExec]), + s"unexpected CometMapInBatchExec when disabled:\n$rewritten") + } + } +} diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala new file mode 100644 index 0000000000..5ab0b927a2 --- /dev/null +++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.comet + +import org.apache.spark.api.python.{PythonAccumulatorV2, PythonBroadcast, PythonEvalType, PythonFunction} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId, PythonUDF} +import org.apache.spark.sql.execution.{ColumnarToRowExec, LeafExecNode} +import org.apache.spark.sql.execution.python.MapInArrowExec +import org.apache.spark.sql.types.{LongType, StructField, StructType} +import org.apache.spark.sql.vectorized.ColumnarBatch + +import org.apache.comet.CometConf +import org.apache.comet.rules.EliminateRedundantTransitions + +/** Minimal CometPlan leaf used to anchor the rule's transform without triggering execution. */ +private case class StubCometLeaf(override val output: Seq[Attribute]) + extends LeafExecNode + with CometPlan { + override def supportsColumnar: Boolean = true + override protected def doExecute(): RDD[InternalRow] = + throw new UnsupportedOperationException + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = + throw new UnsupportedOperationException +} + +/** + * Plan-rule test for the `EliminateRedundantTransitions` rewrite that produces + * `CometMapInBatchExec`. Pure Python execution paths are covered by the pytest module + * `test_pyarrow_udf.py`; this suite verifies the JVM-side rule without spinning up Python. + * + * Lives under `org.apache.spark.sql.comet` so it can reference Spark's `private[spark]` + * `PythonFunction` / `PythonAccumulatorV2` / `PythonBroadcast` classes when fabricating a stub + * `PythonUDF` for `MapInArrowExec` to wrap. 
+ */ +class CometMapInBatchSuite extends CometTestBase { + + private def stubPythonUDF: PythonUDF = { + val pyFunc = new PythonFunction { + override val command: Seq[Byte] = Seq.empty[Byte] + override val envVars: java.util.Map[String, String] = + new java.util.HashMap[String, String]() + override val pythonIncludes: java.util.List[String] = + java.util.Collections.emptyList[String]() + override val pythonExec: String = "python3" + override val pythonVer: String = "3" + override val broadcastVars: java.util.List[Broadcast[PythonBroadcast]] = + java.util.Collections.emptyList[Broadcast[PythonBroadcast]]() + override val accumulator: PythonAccumulatorV2 = null + } + PythonUDF( + name = "test_udf", + func = pyFunc, + dataType = StructType(Seq(StructField("id", LongType))), + children = Seq(AttributeReference("id", LongType)(ExprId(0L))), + evalType = PythonEvalType.SQL_MAP_ARROW_ITER_UDF, + udfDeterministic = true) + } + + private def buildPlan(): MapInArrowExec = { + val cometChild = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L)))) + MapInArrowExec( + stubPythonUDF, + cometChild.output, + ColumnarToRowExec(cometChild), + isBarrier = false, + profile = None) + } + + test("rule rewrites MapInArrowExec over Comet to CometMapInBatchExec") { + withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") { + val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan()) + assert( + rewritten.exists(_.isInstanceOf[CometMapInBatchExec]), + s"expected CometMapInBatchExec in rewritten plan:\n$rewritten") + } + } + + test("rule does not rewrite when feature is disabled") { + withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") { + val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan()) + assert( + !rewritten.exists(_.isInstanceOf[CometMapInBatchExec]), + s"unexpected CometMapInBatchExec when disabled:\n$rewritten") + } + } +} From 6f5aca3b30d6d84d06db7f6dc1556ab3ad50a3e8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 
11 May 2026 18:40:12 -0600 Subject: [PATCH 18/54] fix: resolve ArrowPythonRunner inputs on driver, not in task closure CometMapInBatchExec previously captured `SQLConf` in the partition closure and resolved `sessionLocalTimeZone` / `arrowUseLargeVarTypes` / `getPythonRunnerConfMap` on the executor inside `runnerInputs`. SQLConf reads from a thread-local ConfigReader that only exists on the driver, so this NPEs on the executor (reported by wForget on #4234). Move the `runnerInputs(...)` call to the driver in `doExecuteColumnar` and pass the resolved primitives into `computeArrowPython` as a serializable `RunnerInputs` case class. The per-minor shims now take `RunnerInputs` instead of `(PythonUDF, SQLConf)`. Also drop now-unused imports from `Spark4xMapInBatchSupport` which were flagged by scalafix on the Spark 4.0 lint job. --- .../spark/sql/comet/CometMapInBatchExec.scala | 10 +++-- .../sql/comet/shims/ShimCometMapInBatch.scala | 9 ++++- .../sql/comet/shims/ShimCometMapInBatch.scala | 39 +++++++++++++------ .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++----- .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++----- .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++----- .../shims/Spark4xMapInBatchSupport.scala | 17 ++++---- 7 files changed, 71 insertions(+), 61 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala index 77dbfff7ce..233df4d0dc 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala @@ -80,14 +80,17 @@ case class CometMapInBatchExec( val numOutputBatches = longMetric("numOutputBatches") val numInputRows = longMetric("numInputRows") - val pythonUDF = func.asInstanceOf[PythonUDF] val outputAttrs = output val childSchema = child.schema val batchSize = conf.arrowMaxRecordsPerBatch val evalType = 
pythonEvalType - val sqlConf = conf val metricsCopy = pythonMetrics + // Resolve every `SQLConf`-derived input on the driver. `SQLConf.get` reads from a thread-local + // `ConfigReader` that only exists on the driver, so dereferencing `conf` from inside the task + // closure NPEs (see #4234 review). + val resolvedRunnerInputs = runnerInputs(func.asInstanceOf[PythonUDF], conf) + val inputRDD = child.executeColumnar() def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = { @@ -108,11 +111,10 @@ case class CometMapInBatchExec( if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter) val columnarBatchIter = computeArrowPython( - pythonUDF, + resolvedRunnerInputs, evalType, argOffsets, StructType(Array(StructField("struct", childSchema))), - sqlConf, metricsCopy, batchIter, context.partitionId(), diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index c7d6ae2f97..1bde7ca094 100644 --- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -42,12 +42,17 @@ trait ShimCometMapInBatch { protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None + /** Stub; never constructed on Spark 3.4 because the matchers always return `None`. 
*/ + protected case class RunnerInputs() + + protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = + throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4") + protected def computeArrowPython( - pythonUDF: PythonUDF, + runnerInputs: RunnerInputs, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - conf: SQLConf, pythonMetrics: Map[String, SQLMetric], batchIter: Iterator[Iterator[InternalRow]], partitionId: Int, diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index 42d66465f4..a04681044c 100644 --- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -58,27 +58,44 @@ trait ShimCometMapInBatch { case _ => None } + /** Inputs Spark 3.5's `ArrowPythonRunner` constructor needs. */ + protected case class RunnerInputs( + chainedFunc: Seq[ChainedPythonFunctions], + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + jobArtifactUUID: Option[String]) + + /** + * Resolves the `SQLConf`-derived inputs the `ArrowPythonRunner` needs. Must be called on the + * driver: `conf.sessionLocalTimeZone` etc. read from a thread-local `ConfigReader` that only + * exists on the driver, so dereferencing them from a task closure NPEs. 
+ */ + protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = + RunnerInputs( + chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))), + timeZoneId = conf.sessionLocalTimeZone, + largeVarTypes = conf.arrowUseLargeVarTypes, + pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf), + jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)) + protected def computeArrowPython( - pythonUDF: PythonUDF, + runnerInputs: RunnerInputs, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - conf: SQLConf, pythonMetrics: Map[String, SQLMetric], batchIter: Iterator[Iterator[InternalRow]], partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))) - val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid) + context: TaskContext): Iterator[ColumnarBatch] = new ArrowPythonRunner( - chainedFunc, + runnerInputs.chainedFunc, evalType, argOffsets, schema, - conf.sessionLocalTimeZone, - conf.arrowUseLargeVarTypes, - ArrowPythonRunner.getPythonRunnerConfMap(conf), + runnerInputs.timeZoneId, + runnerInputs.largeVarTypes, + runnerInputs.pythonRunnerConf, pythonMetrics, - jobArtifactUUID).compute(batchIter, partitionId, context) - } + runnerInputs.jobArtifactUUID).compute(batchIter, partitionId, context) } diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index 0c21cb3738..fdc9a03e14 100644 --- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -21,36 +21,31 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.PythonUDF import 
org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.ArrowPythonRunner -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { protected def computeArrowPython( - pythonUDF: PythonUDF, + runnerInputs: RunnerInputs, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - conf: SQLConf, pythonMetrics: Map[String, SQLMetric], batchIter: Iterator[Iterator[InternalRow]], partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val r = runnerInputs(pythonUDF, conf) + context: TaskContext): Iterator[ColumnarBatch] = new ArrowPythonRunner( - r.chainedFunc, + runnerInputs.chainedFunc, evalType, argOffsets, schema, - r.timeZoneId, - r.largeVarTypes, - r.pythonRunnerConf, + runnerInputs.timeZoneId, + runnerInputs.largeVarTypes, + runnerInputs.pythonRunnerConf, pythonMetrics, - r.jobArtifactUUID, + runnerInputs.jobArtifactUUID, None).compute(batchIter, partitionId, context) - } } diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index e73748aafe..b0e6ecc3a0 100644 --- a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -21,37 +21,32 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.ArrowPythonRunner -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch trait ShimCometMapInBatch extends 
Spark4xMapInBatchSupport { protected def computeArrowPython( - pythonUDF: PythonUDF, + runnerInputs: RunnerInputs, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - conf: SQLConf, pythonMetrics: Map[String, SQLMetric], batchIter: Iterator[Iterator[InternalRow]], partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val r = runnerInputs(pythonUDF, conf) + context: TaskContext): Iterator[ColumnarBatch] = new ArrowPythonRunner( - r.chainedFunc, + runnerInputs.chainedFunc, evalType, argOffsets, schema, - r.timeZoneId, - r.largeVarTypes, - r.pythonRunnerConf, + runnerInputs.timeZoneId, + runnerInputs.largeVarTypes, + runnerInputs.pythonRunnerConf, pythonMetrics, - r.jobArtifactUUID, + runnerInputs.jobArtifactUUID, None, None).compute(batchIter, partitionId, context) - } } diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index 0c21cb3738..fdc9a03e14 100644 --- a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -21,36 +21,31 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.ArrowPythonRunner -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { protected def computeArrowPython( - pythonUDF: PythonUDF, + runnerInputs: RunnerInputs, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType, - conf: SQLConf, pythonMetrics: Map[String, SQLMetric], batchIter: Iterator[Iterator[InternalRow]], 
partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = { - val r = runnerInputs(pythonUDF, conf) + context: TaskContext): Iterator[ColumnarBatch] = new ArrowPythonRunner( - r.chainedFunc, + runnerInputs.chainedFunc, evalType, argOffsets, schema, - r.timeZoneId, - r.largeVarTypes, - r.pythonRunnerConf, + runnerInputs.timeZoneId, + runnerInputs.largeVarTypes, + runnerInputs.pythonRunnerConf, pythonMetrics, - r.jobArtifactUUID, + runnerInputs.jobArtifactUUID, None).compute(batchIter, partitionId, context) - } } diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala index 78672aea5e..bfb56427cf 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala @@ -19,21 +19,17 @@ package org.apache.spark.sql.comet.shims -import org.apache.spark.{JobArtifactSet, TaskContext} +import org.apache.spark.JobArtifactSet import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch /** - * Shared 4.x bits for `ShimCometMapInBatch`. The matchers and `getRunnerInputs` helper are - * identical across 4.0/4.1/4.2; only the `ArrowPythonRunner` constructor parameter list differs - * per minor, so each minor's `ShimCometMapInBatch` provides only `computeArrowPython`. + * Shared 4.x bits for `ShimCometMapInBatch`. 
The matchers and `runnerInputs` helper are identical + * across 4.0/4.1/4.2; only the `ArrowPythonRunner` constructor parameter list differs per minor, + * so each minor's `ShimCometMapInBatch` provides only `computeArrowPython`. */ trait Spark4xMapInBatchSupport { @@ -71,6 +67,11 @@ trait Spark4xMapInBatchSupport { pythonRunnerConf: Map[String, String], jobArtifactUUID: Option[String]) + /** + * Resolves the `SQLConf`-derived inputs the `ArrowPythonRunner` needs. Must be called on the + * driver: `SQLConf.get` reads from a thread-local `ConfigReader` that only exists on the + * driver, so dereferencing `conf.sessionLocalTimeZone` etc. from a task closure NPEs. + */ protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = RunnerInputs( chainedFunc = Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)), From 213e96cd5ca21c3274e5db1a7161c1f73a7e9ca2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 18:40:17 -0600 Subject: [PATCH 19/54] ci: register CometMapInBatchSuite in pr_build_linux suite list The refactor commit renamed `CometPythonMapInArrowSuite` to `CometMapInBatchSuite` and moved it under `org.apache.spark.sql.comet`, but the pr_build_linux workflow still referenced the old FQN, so `check-missing-suites` failed. 
--- .github/workflows/pr_build_linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index b62a000f6c..d40226c941 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -354,7 +354,7 @@ jobs: org.apache.comet.exec.CometGenerateExecSuite org.apache.comet.exec.CometWindowExecSuite org.apache.comet.exec.CometJoinSuite - org.apache.comet.exec.CometPythonMapInArrowSuite + org.apache.spark.sql.comet.CometMapInBatchSuite org.apache.comet.CometNativeSuite org.apache.comet.CometSparkSessionExtensionsSuite org.apache.spark.CometPluginsSuite From a7e3fa39f8028d960a6bb580f805a2b5d75d1b2a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 18:40:19 -0600 Subject: [PATCH 20/54] ci: tighten pyarrow_udf_test triggers to feature-specific paths Switch from `paths-ignore` to an explicit allowlist anchored on the files that actually affect the feature. The previous filter re-ran the 15-minute workflow on any unrelated Rust or Scala change. Per review feedback on #4234. 
--- .github/workflows/pyarrow_udf_test.yml | 40 +++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index 211a9bd23a..e325ab8b6d 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -25,27 +25,27 @@ on: push: branches: - main - paths-ignore: - - "benchmarks/**" - - "doc/**" - - "docs/**" - - "**.md" - - "dev/changelog/*.md" - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala" + paths: &feature-paths + - "pom.xml" + - "common/pom.xml" + - "common/src/main/scala/org/apache/comet/CometConf.scala" + - "spark/pom.xml" + - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala" + - "spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala" + - "spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala" + - "spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala" + - "spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala" + - "spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala" + - "spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala" + - "spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala" + - "spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala" + - "spark/src/test/resources/pyspark/conftest.py" + - "spark/src/test/resources/pyspark/test_pyarrow_udf.py" + - "spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala" + - "spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala" + - ".github/workflows/pyarrow_udf_test.yml" pull_request: - paths-ignore: - - "benchmarks/**" - - "doc/**" 
- - "docs/**" - - "**.md" - - "dev/changelog/*.md" - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala" + paths: *feature-paths workflow_dispatch: permissions: From 338dcc1e945af7738c2757aa4c0a84523453792a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 18:40:21 -0600 Subject: [PATCH 21/54] docs: clarify pyarrowUdf conf vs Spark's PySpark Arrow conversion conf Address review question on #4234: `spark.comet.exec.pyarrowUdf.enabled` is distinct from `spark.sql.execution.arrow.pyspark.enabled` (which controls `toPandas()` / `createDataFrame(pandas_df)` and pandas UDFs, not `mapInArrow` / `mapInPandas`). Add a short section to the user guide so readers don't conflate the two. --- docs/source/user-guide/latest/pyarrow-udfs.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 23ef50e79c..8495184812 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -81,6 +81,15 @@ spark.comet.exec.pyarrowUdf.enabled=true The default is `false` while the feature stabilizes. +### Relationship to Spark's PySpark Arrow conversion conf + +`spark.comet.exec.pyarrowUdf.enabled` is **not** the same as PySpark's +[`spark.sql.execution.arrow.pyspark.enabled`](https://spark.apache.org/docs/latest/api/python/tutorial/sql/arrow_pandas.html#enabling-for-conversion-to-from-pandas). +That conf controls whether Spark uses Arrow when materializing a DataFrame to a Pandas DataFrame +(`toPandas()`) or constructing one from Pandas. The Comet conf controls a planner rewrite for +`mapInArrow` / `mapInPandas`, and only affects how Comet's columnar batches feed the Python +worker. Both confs can be set independently. 
+ ## Supported APIs | PySpark API | Spark Plan Node | Supported | From 46a238ed9ad76d44d8494d61d5b40d4ccc9a82de Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 18:44:59 -0600 Subject: [PATCH 22/54] ci: register CometMapInBatchSuite in pr_build_macos suite list Mirror the pr_build_linux fix: the macos workflow also still referenced the old CometPythonMapInArrowSuite FQN. --- .github/workflows/pr_build_macos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index 6f2d2fb41b..0faca13269 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -194,7 +194,7 @@ jobs: org.apache.comet.exec.CometGenerateExecSuite org.apache.comet.exec.CometWindowExecSuite org.apache.comet.exec.CometJoinSuite - org.apache.comet.exec.CometPythonMapInArrowSuite + org.apache.spark.sql.comet.CometMapInBatchSuite org.apache.comet.CometNativeSuite org.apache.comet.CometSetOpWithGroupBySuite org.apache.comet.CometSparkSessionExtensionsSuite From 5fed7342f4841df881e739be028059115659756e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 19:38:58 -0600 Subject: [PATCH 23/54] test: add cross-allocator transfer probe for pyarrow UDF runner [skip ci] --- .../CometColumnarPythonInputProbeSuite.scala | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala new file mode 100644 index 0000000000..2ee43b5982 --- /dev/null +++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.util.concurrent.atomic.AtomicReference + +import org.apache.arrow.vector.{FieldVector, VectorSchemaRoot} +import org.apache.arrow.vector.complex.StructVector +import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.comet.util.Utils +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} + +/** + * Probe test for the invariant `CometColumnarPythonInput` relies on: `makeTransferPair` between a + * Comet `ColumnarBatch`'s field vectors and a `VectorSchemaRoot` allocated from + * `ArrowUtils.rootAllocator` must succeed, and the resulting Arrow IPC stream must round-trip to + * equivalent row counts and values. + * + * If this test starts failing because of allocator changes, `CometColumnarPythonInput` must grow + * a per-buffer copy fallback before the regression lands on main. 
+ */ +class CometColumnarPythonInputProbeSuite extends CometTestBase { + + test("Comet vectors transfer-pair into ArrowUtils.rootAllocator VSR and round-trip via IPC") { + withTempPath { path => + val pathStr = path.getCanonicalPath + spark + .range(0, 1000, 1, 1) + .selectExpr("id AS id", "CAST(id AS DOUBLE) * 1.5 AS value") + .write + .mode("overwrite") + .parquet(pathStr) + + val df = spark.read.parquet(pathStr) + val childSchema = df.schema + val wireSchema = StructType(Array(StructField("struct", childSchema))) + // Capture timezone on the driver before entering the partition closure. + val timeZoneId = conf.sessionLocalTimeZone + + // ColumnarBatch is not serializable, so we can't use take()/collect(). Run all Arrow + // operations inside foreachPartition. In local mode this executes in the same JVM. + // A shared AtomicReference carries any failure back to the test thread. + val failureRef = new AtomicReference[Throwable](null) + + df.queryExecution.executedPlan + .collectFirst { case n: CometNativeExec => n } + .getOrElse(fail("Expected CometNativeExec in plan")) + .executeColumnar() + .foreachPartition { (batches: Iterator[ColumnarBatch]) => + if (batches.hasNext) { + val batch = batches.next() + try { + val arrowSchema = Utils.toArrowSchema(wireSchema, timeZoneId) + val rootAlloc = org.apache.spark.sql.util.ArrowUtils.rootAllocator + val allocator = rootAlloc.newChildAllocator("probe-allocator", 0, Long.MaxValue) + val root = VectorSchemaRoot.create(arrowSchema, allocator) + try { + val structVec = root.getVector(0).asInstanceOf[StructVector] + var i = 0 + while (i < batch.numCols()) { + val src = batch.column(i).asInstanceOf[ArrowColumnVector].getValueVector + val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] + src.makeTransferPair(dst).transfer() + i += 1 + } + structVec.setValueCount(batch.numRows()) + root.setRowCount(batch.numRows()) + + val baos = new ByteArrayOutputStream() + val writer = new ArrowStreamWriter(root, null, baos) + 
writer.start() + writer.writeBatch() + writer.end() + + val readAllocator = + rootAlloc.newChildAllocator("probe-read", 0, Long.MaxValue) + try { + val reader = new ArrowStreamReader( + new ByteArrayInputStream(baos.toByteArray), + readAllocator) + try { + if (!reader.loadNextBatch()) { + failureRef.set( + new AssertionError("IPC round-trip: expected at least one record batch")) + } else { + val readRoot = reader.getVectorSchemaRoot + if (readRoot.getRowCount != batch.numRows()) { + failureRef.set( + new AssertionError( + s"row count mismatch: read=${readRoot.getRowCount}, " + + s"expected=${batch.numRows()}")) + } + } + } finally { + reader.close() + } + } finally { + readAllocator.close() + } + } finally { + root.close() + allocator.close() + batch.close() + } + } catch { + case t: Throwable => failureRef.set(t) + } + } + } + + val failure = failureRef.get() + if (failure != null) throw failure + } + } +} From 03f74898e07f0ab104c289405759ceddead2cee2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 20:34:26 -0600 Subject: [PATCH 24/54] Revert "test: add cross-allocator transfer probe for pyarrow UDF runner" [skip ci] Probe's cross-allocator transfer invariant turned out false: Comet's Parquet readers each construct their own RootAllocator, separate from ArrowUtils.rootAllocator. The original probe also had a silent-pass bug (AtomicReference doesn't cross Spark task boundaries). The redesigned trait uses per-buffer byte copy instead of TransferPair, so the probe is no longer load-bearing. 
--- .../CometColumnarPythonInputProbeSuite.scala | 135 ------------------ 1 file changed, 135 deletions(-) delete mode 100644 spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala deleted file mode 100644 index 2ee43b5982..0000000000 --- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.spark.sql.comet - -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import java.util.concurrent.atomic.AtomicReference - -import org.apache.arrow.vector.{FieldVector, VectorSchemaRoot} -import org.apache.arrow.vector.complex.StructVector -import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} -import org.apache.spark.sql.CometTestBase -import org.apache.spark.sql.comet.util.Utils -import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} - -/** - * Probe test for the invariant `CometColumnarPythonInput` relies on: `makeTransferPair` between a - * Comet `ColumnarBatch`'s field vectors and a `VectorSchemaRoot` allocated from - * `ArrowUtils.rootAllocator` must succeed, and the resulting Arrow IPC stream must round-trip to - * equivalent row counts and values. - * - * If this test starts failing because of allocator changes, `CometColumnarPythonInput` must grow - * a per-buffer copy fallback before the regression lands on main. - */ -class CometColumnarPythonInputProbeSuite extends CometTestBase { - - test("Comet vectors transfer-pair into ArrowUtils.rootAllocator VSR and round-trip via IPC") { - withTempPath { path => - val pathStr = path.getCanonicalPath - spark - .range(0, 1000, 1, 1) - .selectExpr("id AS id", "CAST(id AS DOUBLE) * 1.5 AS value") - .write - .mode("overwrite") - .parquet(pathStr) - - val df = spark.read.parquet(pathStr) - val childSchema = df.schema - val wireSchema = StructType(Array(StructField("struct", childSchema))) - // Capture timezone on the driver before entering the partition closure. - val timeZoneId = conf.sessionLocalTimeZone - - // ColumnarBatch is not serializable, so we can't use take()/collect(). Run all Arrow - // operations inside foreachPartition. In local mode this executes in the same JVM. - // A shared AtomicReference carries any failure back to the test thread. 
- val failureRef = new AtomicReference[Throwable](null) - - df.queryExecution.executedPlan - .collectFirst { case n: CometNativeExec => n } - .getOrElse(fail("Expected CometNativeExec in plan")) - .executeColumnar() - .foreachPartition { (batches: Iterator[ColumnarBatch]) => - if (batches.hasNext) { - val batch = batches.next() - try { - val arrowSchema = Utils.toArrowSchema(wireSchema, timeZoneId) - val rootAlloc = org.apache.spark.sql.util.ArrowUtils.rootAllocator - val allocator = rootAlloc.newChildAllocator("probe-allocator", 0, Long.MaxValue) - val root = VectorSchemaRoot.create(arrowSchema, allocator) - try { - val structVec = root.getVector(0).asInstanceOf[StructVector] - var i = 0 - while (i < batch.numCols()) { - val src = batch.column(i).asInstanceOf[ArrowColumnVector].getValueVector - val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] - src.makeTransferPair(dst).transfer() - i += 1 - } - structVec.setValueCount(batch.numRows()) - root.setRowCount(batch.numRows()) - - val baos = new ByteArrayOutputStream() - val writer = new ArrowStreamWriter(root, null, baos) - writer.start() - writer.writeBatch() - writer.end() - - val readAllocator = - rootAlloc.newChildAllocator("probe-read", 0, Long.MaxValue) - try { - val reader = new ArrowStreamReader( - new ByteArrayInputStream(baos.toByteArray), - readAllocator) - try { - if (!reader.loadNextBatch()) { - failureRef.set( - new AssertionError("IPC round-trip: expected at least one record batch")) - } else { - val readRoot = reader.getVectorSchemaRoot - if (readRoot.getRowCount != batch.numRows()) { - failureRef.set( - new AssertionError( - s"row count mismatch: read=${readRoot.getRowCount}, " + - s"expected=${batch.numRows()}")) - } - } - } finally { - reader.close() - } - } finally { - readAllocator.close() - } - } finally { - root.close() - allocator.close() - batch.close() - } - } catch { - case t: Throwable => failureRef.set(t) - } - } - } - - val failure = failureRef.get() - if (failure != 
null) throw failure - } - } -} From 1ba717c509bd99d050c5754da14a1e501e0179d6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 21:02:57 -0600 Subject: [PATCH 25/54] feat: add CometColumnarPythonInput trait for bulk-copy Arrow IPC [skip ci] Also add arrow-vector as provided scope to spark/pom.xml so Scala can resolve org.apache.arrow.vector during compilation; the partial org/apache/arrow/c/ tree in common/target/classes otherwise masks the package and causes "object vector is not a member of package org.apache.arrow" errors. The unloader is created inline per-batch rather than as a class field to stay compatible across Spark 4.0/4.1/4.2, which differ in whether PythonArrowInput declares unloader as abstract. --- spark/pom.xml | 10 +- .../python/CometColumnarPythonInput.scala | 177 ++++++++++++++++++ 2 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala diff --git a/spark/pom.xml b/spark/pom.xml index d3c18ccf87..e35832d9a6 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -130,7 +130,15 @@ under the License. + classpath, since the Maven shading happens in 'package' phase which is after 'test'. + arrow-vector is listed as provided (not test) so that Scala sees the full + org.apache.arrow.vector package during compile; without it, the partial + org/apache/arrow/c/ tree in common/target/classes masks the package. 
--> + + org.apache.arrow + arrow-vector + provided + org.apache.arrow arrow-memory-unsafe diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala new file mode 100644 index 0000000000..b27d8f0568 --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.execution.python + +import java.io.DataOutputStream +import java.nio.channels.Channels + +import org.apache.arrow.vector.{BaseFixedWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader} +import org.apache.arrow.vector.complex.StructVector +import org.apache.arrow.vector.compression.{CompressionCodec, CompressionUtil, NoCompressionCodec} +import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel} +import org.apache.arrow.vector.ipc.message.MessageSerializer +import org.apache.spark.SparkException +import org.apache.spark.api.python.BasePythonRunner +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.vectorized.ColumnarBatch + +import org.apache.comet.vector.CometDecodedVector + +/** + * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python + * worker as Arrow IPC, bypassing the row materialization that `BasicPythonArrowInput` performs. + * The persistent root supplied by `PythonArrowInput` carries the wrapped-struct schema + * (`StructType(Array(StructField("struct", childSchema)))`) so the Python worker contract is + * preserved. + * + * Each call writes one Comet batch. The runner contract repeats `writeNextBatchToArrowStream` + * until it returns `false`. Per-batch the input trait allocates a destination vector in the + * persistent root and copies each source buffer via `ArrowBuf.setBytes` -- this is bulk per + * buffer, not per row, but it is NOT zero-copy: Comet's Parquet reader allocators are independent + * roots from `ArrowUtils.rootAllocator`. + */ +private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] { + self: BasePythonRunner[Iterator[ColumnarBatch], _] => + + private var currentGroup: Iterator[ColumnarBatch] = _ + + // Read the codec name via raw config key so this compiles against Spark 4.0 (which lacks + // SQLConf.arrowCompressionCodec) as well as 4.1/4.2. 
The codec instances are obtained + // through CompressionCodec.Factory (arrow-vector) rather than importing the concrete + // Lz4CompressionCodec / ZstdCompressionCodec from the separate arrow-compression artifact. + private lazy val cometCodec: CompressionCodec = { + val factory = CompressionCodec.Factory.INSTANCE + SQLConf.get.getConfString("spark.sql.execution.arrow.compression.codec", "none") match { + case "none" => NoCompressionCodec.INSTANCE + case "lz4" => + factory.createCodec(CompressionUtil.CodecType.LZ4_FRAME) + case "zstd" => + val level = + SQLConf.get.getConfString("spark.sql.execution.arrow.compression.zstd.level", "3").toInt + factory.createCodec(CompressionUtil.CodecType.ZSTD, level) + case other => + throw SparkException.internalError( + s"Unsupported Arrow compression codec: $other. Supported values: none, lz4, zstd") + } + } + + override protected def writeNextBatchToArrowStream( + root: VectorSchemaRoot, + writer: ArrowStreamWriter, + dataOut: DataOutputStream, + inputIterator: Iterator[Iterator[ColumnarBatch]]): Boolean = { + + while (currentGroup == null || !currentGroup.hasNext) { + if (!inputIterator.hasNext) { + super[PythonArrowInput].close() + return false + } + currentGroup = inputIterator.next() + } + + val cometBatch = currentGroup.next() + val startData = dataOut.size() + val structVec = root.getVector(0).asInstanceOf[StructVector] + + var i = 0 + while (i < cometBatch.numCols()) { + val src = cometBatch + .column(i) + .asInstanceOf[CometDecodedVector] + .getValueVector + .asInstanceOf[FieldVector] + val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] + copyVector(src, dst) + i += 1 + } + structVec.setValueCount(cometBatch.numRows()) + root.setRowCount(cometBatch.numRows()) + + // VectorUnloader is lightweight (wraps root); create per-batch to stay compatible + // across Spark 4.0/4.1/4.2 which differ in how the unloader field is managed. 
+ val batchUnloader = + new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true) + val recordBatch = batchUnloader.getRecordBatch + try { + val writeChannel = new WriteChannel(Channels.newChannel(dataOut)) + MessageSerializer.serialize(writeChannel, recordBatch) + } finally { + recordBatch.close() + } + + pythonMetrics("pythonDataSent") += dataOut.size() - startData + true + } + + /** + * Copy `src` into `dst` via per-buffer memcpy. Allocates `dst` sized to match `src`, then + * `ArrowBuf.setBytes` copies each field buffer (validity, offsets, data) wholesale. Recurses + * into struct / list children. + * + * This does NOT transfer buffer ownership and does NOT change refcounts: `src` retains its + * buffers, `dst` allocates new ones in the runner's allocator. Required because Comet's Parquet + * reader allocators are independent roots from `ArrowUtils.rootAllocator`. + */ + private def copyVector(src: FieldVector, dst: FieldVector): Unit = { + val numRows = src.getValueCount + + dst match { + case bfwv: BaseFixedWidthVector => + bfwv.allocateNew(numRows) + case bvwv: BaseVariableWidthVector => + // Variable-width data buffer size depends on actual byte content, not just numRows. + // Match the source data buffer's readable bytes. 
+ val srcFieldBufs = src.getFieldBuffers + val dataBufIdx = srcFieldBufs.size - 1 + val srcDataSize = srcFieldBufs.get(dataBufIdx).readableBytes + bvwv.allocateNew(srcDataSize, numRows) + case _ => + dst.setInitialCapacity(numRows) + dst.allocateNew() + } + + val srcBufs = src.getFieldBuffers + val dstBufs = dst.getFieldBuffers + require( + srcBufs.size == dstBufs.size, + s"buffer count mismatch for ${src.getField}: src=${srcBufs.size} dst=${dstBufs.size}") + var bi = 0 + while (bi < srcBufs.size) { + val sBuf = srcBufs.get(bi) + val dBuf = dstBufs.get(bi) + dBuf.setBytes(0L, sBuf, 0L, sBuf.readableBytes) + bi += 1 + } + + val srcChildren = src.getChildrenFromFields + val dstChildren = dst.getChildrenFromFields + require( + srcChildren.size == dstChildren.size, + s"child count mismatch for ${src.getField}: " + + s"src=${srcChildren.size} dst=${dstChildren.size}") + var ci = 0 + while (ci < srcChildren.size) { + copyVector(srcChildren.get(ci), dstChildren.get(ci)) + ci += 1 + } + + dst.setValueCount(numRows) + } +} From 06a284568a87f7c442db5a7c8d38e8596880590d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 21:44:15 -0600 Subject: [PATCH 26/54] feat: add CometArrowPythonRunner for Spark 4.0 [skip ci] --- .../python/CometArrowPythonRunner.scala | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala new file mode 100644 index 0000000000..63d282e8b9 --- /dev/null +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.python + +import java.io.DataOutputStream + +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Comet's Arrow Python runner for Spark 4.0. Extends `BasePythonRunner` directly because Spark + * 4.0's `BaseArrowPythonRunner` is bound to `Iterator[InternalRow]` and mixes in + * `BasicPythonArrowInput`, so we cannot inherit from it. Wires the SQLConf-driven fields that + * `BaseArrowPythonRunner` provides. 
+ */ +class CometArrowPythonRunner( + funcs: Seq[(ChainedPythonFunctions, Long)], + evalType: Int, + argOffsets: Array[Array[Int]], + protected override val schema: StructType, + protected override val timeZoneId: String, + protected override val largeVarTypes: Boolean, + override val workerConf: Map[String, String], + override val pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String]) + extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( + funcs.map(_._1), + evalType, + argOffsets, + jobArtifactUUID, + pythonMetrics) + with CometColumnarPythonInput + with BasicPythonArrowOutput { + + override val pythonExec: String = + SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head._1.funcs.head.pythonExec) + + override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled + override val idleTimeoutSeconds: Long = SQLConf.get.pythonUDFWorkerIdleTimeoutSeconds + override val errorOnDuplicatedFieldNames: Boolean = true + override val hideTraceback: Boolean = SQLConf.get.pysparkHideTraceback + override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback + + override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize + require( + bufferSize >= 4, + "Pandas execution requires more than 4 bytes. Please set higher buffer. 
" + + s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.") + + override protected def writeUDF(dataOut: DataOutputStream): Unit = + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, jobArtifactUUID) +} From 95a4dd0d4876a18d1b5731c60839fe6038e39c38 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 21:48:21 -0600 Subject: [PATCH 27/54] refactor: swap 4.0 shim to CometArrowPythonRunner with columnar input [skip ci] --- .../spark/sql/comet/shims/ShimCometMapInBatch.scala | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index fdc9a03e14..ddb73ac95c 100644 --- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.ArrowPythonRunner +import org.apache.spark.sql.execution.python.CometArrowPythonRunner import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -34,10 +33,10 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { argOffsets: Array[Array[Int]], schema: StructType, pythonMetrics: Map[String, SQLMetric], - batchIter: Iterator[Iterator[InternalRow]], + batchIter: Iterator[Iterator[ColumnarBatch]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = - new ArrowPythonRunner( + new CometArrowPythonRunner( runnerInputs.chainedFunc, evalType, argOffsets, @@ -46,6 +45,5 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { runnerInputs.largeVarTypes, runnerInputs.pythonRunnerConf, pythonMetrics, - 
runnerInputs.jobArtifactUUID, - None).compute(batchIter, partitionId, context) + runnerInputs.jobArtifactUUID).compute(batchIter, partitionId, context) } From 1c4a6e258ec681dcc24c950f6284f91a7ab2a500 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 21:48:24 -0600 Subject: [PATCH 28/54] feat: switch CometMapInBatchExec to columnar Python runner input [skip ci] --- .../spark/sql/comet/CometMapInBatchExec.scala | 44 ++++++------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala index 233df4d0dc..14e19ab50f 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala @@ -19,9 +19,7 @@ package org.apache.spark.sql.comet -import scala.collection.JavaConverters._ - -import org.apache.spark.{ContextAwareIterator, TaskContext} +import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -30,20 +28,17 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.comet.shims.ShimCometMapInBatch import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.python.{BatchIterator, PythonSQLMetrics} +import org.apache.spark.sql.execution.python.PythonSQLMetrics import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} /** * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` / - * `MapInArrowExec` in 4.1+ / `MapInPandasExec`). 
Accepts columnar input directly from a Comet - * child instead of going through the per-row `UnsafeProjection` that `ColumnarToRowExec` applies, - * and keeps the Python runner output as `ColumnarBatch` so downstream Comet operators consume it - * natively. + * `MapInArrowExec` in 4.1+ / `MapInPandasExec`). Feeds upstream Comet `ColumnarBatch` values + * directly to a `CometArrowPythonRunner`, eliminating the per-row `InternalRow.getXXX` loop that + * vanilla Spark's `ArrowPythonRunner` performs. * - * What this eliminates: two `UnsafeProjection` copies (input and output) and the row transition - * between Comet and the Python operator. The internal row-to-Arrow IPC re-encoding inside - * `ArrowPythonRunner` is unchanged; full round-trip elimination is tracked in #4240. + * Per-Spark-minor wiring lives in `ShimCometMapInBatch.computeArrowPython`. */ case class CometMapInBatchExec( func: Expression, @@ -69,8 +64,8 @@ case class CometMapInBatchExec( pythonMetrics // Fallback for row-consuming parents (e.g. a top-level `collect()` that produces rows). - // Wraps this columnar exec in `ColumnarToRowExec`, reintroducing exactly the row transition - // this operator otherwise eliminates. Only fires when nothing downstream consumes columnar. + // Wraps this columnar exec in `ColumnarToRowExec`, reintroducing the row transition this + // operator otherwise eliminates. Only fires when nothing downstream consumes columnar. override def doExecute(): RDD[InternalRow] = { ColumnarToRowExec(this).doExecute() } @@ -82,45 +77,32 @@ case class CometMapInBatchExec( val outputAttrs = output val childSchema = child.schema - val batchSize = conf.arrowMaxRecordsPerBatch val evalType = pythonEvalType val metricsCopy = pythonMetrics // Resolve every `SQLConf`-derived input on the driver. `SQLConf.get` reads from a thread-local // `ConfigReader` that only exists on the driver, so dereferencing `conf` from inside the task - // closure NPEs (see #4234 review). + // closure NPEs. 
val resolvedRunnerInputs = runnerInputs(func.asInstanceOf[PythonUDF], conf) val inputRDD = child.executeColumnar() def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = { val context = TaskContext.get() - val argOffsets = Array(Array(0)) - - val rowIter = batches.flatMap { batch => - numInputRows += batch.numRows() - batch.rowIterator().asScala - } - - val contextAwareIterator = new ContextAwareIterator(context, rowIter) - - // Wrap rows as a struct, matching MapInBatchEvaluatorFactory behavior - val wrappedIter = contextAwareIterator.map(InternalRow(_)) - - val batchIter = - if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter) + val counting = batches.map { b => numInputRows += b.numRows(); b } val columnarBatchIter = computeArrowPython( resolvedRunnerInputs, evalType, - argOffsets, + Array(Array(0)), StructType(Array(StructField("struct", childSchema))), metricsCopy, - batchIter, + Iterator(counting), context.partitionId(), context) columnarBatchIter.map { batch => + // Python returns a single struct column; flatten to the user's output columns. 
val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] val outputVectors = outputAttrs.indices.map(structVector.getChild) val flattenedBatch = new ColumnarBatch(outputVectors.toArray) From 8874b5729e0355e5587cbf6dad773fbab26087e0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 21:53:50 -0600 Subject: [PATCH 29/54] refactor: collapse 3.5 shim to no-op; columnar path targets 4.x only [skip ci] --- .../sql/comet/shims/ShimCometMapInBatch.scala | 72 +++--------- .../sql/comet/CometMapInBatchSuite.scala | 106 ------------------ 2 files changed, 15 insertions(+), 163 deletions(-) delete mode 100644 spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index a04681044c..73a1077de2 100644 --- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -19,65 +19,32 @@ package org.apache.spark.sql.comet.shims -import org.apache.spark.{JobArtifactSet, TaskContext} -import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInPandasExec, PythonMapInArrowExec} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch +/** + * Spark 3.5 shim for the PyArrow UDF acceleration support. + * + * The columnar runner introduced in #4234 only targets Spark 4.0+. 
On Spark 3.5 the matchers + * return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` / + * `mapInPandas` unchanged. 3.5 support can be added later if there is user demand. + */ trait ShimCometMapInBatch { - protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = - plan match { - case p: PythonMapInArrowExec => - Some( - MapInBatchInfo( - p.func, - p.output, - p.child, - p.isBarrier, - PythonEvalType.SQL_MAP_ARROW_ITER_UDF)) - case _ => None - } + protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = None - protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = - plan match { - case p: MapInPandasExec => - Some( - MapInBatchInfo( - p.func, - p.output, - p.child, - p.isBarrier, - PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)) - case _ => None - } + protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None - /** Inputs Spark 3.5's `ArrowPythonRunner` constructor needs. */ - protected case class RunnerInputs( - chainedFunc: Seq[ChainedPythonFunctions], - timeZoneId: String, - largeVarTypes: Boolean, - pythonRunnerConf: Map[String, String], - jobArtifactUUID: Option[String]) + /** Stub; never constructed on Spark 3.5 because the matchers always return `None`. */ + protected case class RunnerInputs() - /** - * Resolves the `SQLConf`-derived inputs the `ArrowPythonRunner` needs. Must be called on the - * driver: `conf.sessionLocalTimeZone` etc. read from a thread-local `ConfigReader` that only - * exists on the driver, so dereferencing them from a task closure NPEs. 
- */ protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = - RunnerInputs( - chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))), - timeZoneId = conf.sessionLocalTimeZone, - largeVarTypes = conf.arrowUseLargeVarTypes, - pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf), - jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)) + throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5") protected def computeArrowPython( runnerInputs: RunnerInputs, @@ -85,17 +52,8 @@ trait ShimCometMapInBatch { argOffsets: Array[Array[Int]], schema: StructType, pythonMetrics: Map[String, SQLMetric], - batchIter: Iterator[Iterator[InternalRow]], + batchIter: Iterator[Iterator[ColumnarBatch]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = - new ArrowPythonRunner( - runnerInputs.chainedFunc, - evalType, - argOffsets, - schema, - runnerInputs.timeZoneId, - runnerInputs.largeVarTypes, - runnerInputs.pythonRunnerConf, - pythonMetrics, - runnerInputs.jobArtifactUUID).compute(batchIter, partitionId, context) + throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5") } diff --git a/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala deleted file mode 100644 index af960c5c97..0000000000 --- a/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.spark.sql.comet - -import org.apache.spark.api.python.{PythonAccumulatorV2, PythonBroadcast, PythonEvalType, PythonFunction} -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.CometTestBase -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId, PythonUDF} -import org.apache.spark.sql.execution.{ColumnarToRowExec, LeafExecNode} -import org.apache.spark.sql.execution.python.PythonMapInArrowExec -import org.apache.spark.sql.types.{LongType, StructField, StructType} -import org.apache.spark.sql.vectorized.ColumnarBatch - -import org.apache.comet.CometConf -import org.apache.comet.rules.EliminateRedundantTransitions - -/** Minimal CometPlan leaf used to anchor the rule's transform without triggering execution. */ -private case class StubCometLeaf(override val output: Seq[Attribute]) - extends LeafExecNode - with CometPlan { - override def supportsColumnar: Boolean = true - override protected def doExecute(): RDD[InternalRow] = - throw new UnsupportedOperationException - override protected def doExecuteColumnar(): RDD[ColumnarBatch] = - throw new UnsupportedOperationException -} - -/** - * Plan-rule test for the `EliminateRedundantTransitions` rewrite that produces - * `CometMapInBatchExec`. Pure Python execution paths are covered by the pytest module - * `test_pyarrow_udf.py`; this suite verifies the JVM-side rule without spinning up Python. 
- * - * Lives under `org.apache.spark.sql.comet` so it can reference Spark's `private[spark]` - * `PythonFunction` / `PythonAccumulatorV2` / `PythonBroadcast` classes when fabricating a stub - * `PythonUDF` for `PythonMapInArrowExec` to wrap. - */ -class CometMapInBatchSuite extends CometTestBase { - - private def stubPythonUDF: PythonUDF = { - val pyFunc = new PythonFunction { - override val command: Seq[Byte] = Seq.empty[Byte] - override val envVars: java.util.Map[String, String] = - new java.util.HashMap[String, String]() - override val pythonIncludes: java.util.List[String] = - java.util.Collections.emptyList[String]() - override val pythonExec: String = "python3" - override val pythonVer: String = "3" - override val broadcastVars: java.util.List[Broadcast[PythonBroadcast]] = - java.util.Collections.emptyList[Broadcast[PythonBroadcast]]() - override val accumulator: PythonAccumulatorV2 = null - } - PythonUDF( - name = "test_udf", - func = pyFunc, - dataType = StructType(Seq(StructField("id", LongType))), - children = Seq(AttributeReference("id", LongType)(ExprId(0L))), - evalType = PythonEvalType.SQL_MAP_ARROW_ITER_UDF, - udfDeterministic = true) - } - - private def buildPlan(): PythonMapInArrowExec = { - val cometChild = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L)))) - PythonMapInArrowExec( - stubPythonUDF, - cometChild.output, - ColumnarToRowExec(cometChild), - isBarrier = false) - } - - test("rule rewrites PythonMapInArrowExec over Comet to CometMapInBatchExec") { - withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") { - val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan()) - assert( - rewritten.exists(_.isInstanceOf[CometMapInBatchExec]), - s"expected CometMapInBatchExec in rewritten plan:\n$rewritten") - } - } - - test("rule does not rewrite when feature is disabled") { - withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") { - val rewritten = 
EliminateRedundantTransitions(spark).apply(buildPlan()) - assert( - !rewritten.exists(_.isInstanceOf[CometMapInBatchExec]), - s"unexpected CometMapInBatchExec when disabled:\n$rewritten") - } - } -} From 6523eb9f9f80f2a3b4ce9a9a3bd0595fb31b6667 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 May 2026 21:55:44 -0600 Subject: [PATCH 30/54] feat: add CometArrowPythonRunner for Spark 4.1 [skip ci] --- .../sql/comet/shims/ShimCometMapInBatch.scala | 8 +-- .../python/CometArrowPythonRunner.scala | 64 +++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index b0e6ecc3a0..ad27b7de42 100644 --- a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.ArrowPythonRunner +import org.apache.spark.sql.execution.python.CometArrowPythonRunner import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -34,10 +33,10 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { argOffsets: Array[Array[Int]], schema: StructType, pythonMetrics: Map[String, SQLMetric], - batchIter: Iterator[Iterator[InternalRow]], + batchIter: Iterator[Iterator[ColumnarBatch]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = - new ArrowPythonRunner( + new CometArrowPythonRunner( runnerInputs.chainedFunc, evalType, argOffsets, @@ -47,6 +46,5 @@ trait ShimCometMapInBatch 
extends Spark4xMapInBatchSupport { runnerInputs.pythonRunnerConf, pythonMetrics, runnerInputs.jobArtifactUUID, - None, None).compute(batchIter, partitionId, context) } diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala new file mode 100644 index 0000000000..7b82b0aed8 --- /dev/null +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.python + +import java.io.DataOutputStream + +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Comet's Arrow Python runner for Spark 4.1. Extends `BaseArrowPythonRunner` parameterized over + * `Iterator[ColumnarBatch]` input, and supplies the columnar input via `CometColumnarPythonInput` + * instead of `BasicPythonArrowInput`. 
+ * + * Spark 4.1's `PythonUDFRunner.writeUDFs` takes a `profiler: Option[String]` fourth argument; we + * pass `None` since Comet does not support Python profiling. + */ +class CometArrowPythonRunner( + funcs: Seq[(ChainedPythonFunctions, Long)], + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + workerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + sessionUUID: Option[String]) + extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( + funcs, + evalType, + argOffsets, + schema, + timeZoneId, + largeVarTypes, + workerConf, + pythonMetrics, + jobArtifactUUID, + sessionUUID) + with CometColumnarPythonInput + with BasicPythonArrowOutput { + + override protected def writeUDF(dataOut: DataOutputStream): Unit = + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, None) +} From 173e1971e8efd75b757281e3dbd49d695a57897c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 00:19:43 -0600 Subject: [PATCH 31/54] feat: add CometArrowPythonRunner for Spark 4.2 [skip ci] --- .../sql/comet/shims/ShimCometMapInBatch.scala | 7 +-- .../python/CometArrowPythonRunner.scala | 63 +++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index fdc9a03e14..ad27b7de42 100644 --- a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext -import org.apache.spark.sql.catalyst.InternalRow import 
org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.python.ArrowPythonRunner +import org.apache.spark.sql.execution.python.CometArrowPythonRunner import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -34,10 +33,10 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport { argOffsets: Array[Array[Int]], schema: StructType, pythonMetrics: Map[String, SQLMetric], - batchIter: Iterator[Iterator[InternalRow]], + batchIter: Iterator[Iterator[ColumnarBatch]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = - new ArrowPythonRunner( + new CometArrowPythonRunner( runnerInputs.chainedFunc, evalType, argOffsets, diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala new file mode 100644 index 0000000000..c9714ce068 --- /dev/null +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.execution.python + +import java.io.DataOutputStream + +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Comet's Arrow Python runner for Spark 4.2. Spark 4.2's `BaseArrowPythonRunner` no longer + * accepts `workerConf` in its constructor; the subclass overrides `runnerConf` instead. + * `PythonUDFRunner.writeUDFs` drops the `profiler` argument compared to 4.1. + */ +class CometArrowPythonRunner( + funcs: Seq[(ChainedPythonFunctions, Long)], + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, + pythonRunnerConf: Map[String, String], + pythonMetrics: Map[String, SQLMetric], + jobArtifactUUID: Option[String], + sessionUUID: Option[String]) + extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( + funcs, + evalType, + argOffsets, + schema, + timeZoneId, + largeVarTypes, + pythonMetrics, + jobArtifactUUID, + sessionUUID) + with CometColumnarPythonInput + with BasicPythonArrowOutput { + + override protected def runnerConf: Map[String, String] = + super.runnerConf ++ pythonRunnerConf + + override protected def writeUDF(dataOut: DataOutputStream): Unit = + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) +} From d6128d6c5246f9991bd5600f01e935f914ef2d53 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 00:23:21 -0600 Subject: [PATCH 32/54] fix: align Spark 3.4 shim ColumnarBatch signature [skip ci] --- .../org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index 1bde7ca094..1fd4b96f09 100644 --- 
a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.comet.shims import org.apache.spark.TaskContext -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.PythonUDF import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.SQLMetric @@ -54,7 +53,7 @@ trait ShimCometMapInBatch { argOffsets: Array[Array[Int]], schema: StructType, pythonMetrics: Map[String, SQLMetric], - batchIter: Iterator[Iterator[InternalRow]], + batchIter: Iterator[Iterator[ColumnarBatch]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4") From e64ce0ecd26c3d5f77f56e17f665a5e78e742859 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 00:26:22 -0600 Subject: [PATCH 33/54] test: add end-to-end runner check to CometMapInBatchSuite [skip ci] --- .../sql/comet/CometMapInBatchSuite.scala | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala index 5ab0b927a2..79d75bb2cf 100644 --- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala @@ -104,4 +104,38 @@ class CometMapInBatchSuite extends CometTestBase { s"unexpected CometMapInBatchExec when disabled:\n$rewritten") } } + + test("end-to-end: rewrite-on output matches rewrite-off output for primitives + varchar") { + // This test needs PySpark workers; only run if PYSPARK_PYTHON is set in the env. 
+ assume( + sys.env.contains("PYSPARK_PYTHON"), + "set PYSPARK_PYTHON to enable end-to-end pyarrow UDF tests") + + withTempPath { path => + val pathStr = path.getCanonicalPath + spark + .range(0, 1000, 1, 4) + .selectExpr( + "id AS id", + "CAST(id AS DOUBLE) * 1.5 AS dbl", + "CASE WHEN id % 10 = 0 THEN NULL ELSE CONCAT('row_', CAST(id AS STRING)) END AS s") + .write + .mode("overwrite") + .parquet(pathStr) + + // Baseline: rewrite disabled, vanilla MapInArrowExec runs. + val baseline = withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") { + spark.read.parquet(pathStr).collect().map(_.toSeq).toSet + } + + // Optimized: rewrite enabled, CometMapInBatchExec + CometArrowPythonRunner runs. + withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") { + val df = spark.read.parquet(pathStr) + val result = df.collect().map(_.toSeq).toSet + assert( + result == baseline, + s"optimized output differs from baseline:\noptimized=$result\nbaseline=$baseline") + } + } + } } From 7fa8dcbad883c8b96f587101ef836a5e01a0cb51 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 00:32:00 -0600 Subject: [PATCH 34/54] build: allowlist Comet pyarrow UDF runner classes in jar-contents check [skip ci] --- dev/ensure-jars-have-correct-contents.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dev/ensure-jars-have-correct-contents.sh b/dev/ensure-jars-have-correct-contents.sh index 084936475d..3c8b7a3afa 100755 --- a/dev/ensure-jars-have-correct-contents.sh +++ b/dev/ensure-jars-have-correct-contents.sh @@ -91,6 +91,13 @@ allowed_expr+="|^org/apache/spark/shuffle/comet/.*$" allowed_expr+="|^org/apache/spark/sql/$" # allow ExplainPlanGenerator trait since it may not be available in older Spark versions allowed_expr+="|^org/apache/spark/sql/ExtendedExplainGenerator.*$" +# PyArrow UDF acceleration runner classes are under org/apache/spark/sql/execution/python +# because PythonArrowInput and BasicPythonArrowOutput are private[python]; Comet's classes +# must be 
in that package to mix them in. +allowed_expr+="|^org/apache/spark/sql/execution/$" +allowed_expr+="|^org/apache/spark/sql/execution/python/$" +allowed_expr+="|^org/apache/spark/sql/execution/python/CometColumnarPythonInput.*$" +allowed_expr+="|^org/apache/spark/sql/execution/python/CometArrowPythonRunner.*$" allowed_expr+="|^org/apache/spark/CometPlugin.class$" allowed_expr+="|^org/apache/spark/CometDriverPlugin.*$" allowed_expr+="|^org/apache/spark/CometSource.*$" From dbe603b344cbc77373bdb1ed8c497049ba355ef4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 01:35:01 -0600 Subject: [PATCH 35/54] feat: add CometVectorIpcCopier helper to copy comet vector bytes by address [skip ci] Adds a helper class in comet-common that copies Arrow buffer bytes from a CometDecodedVector to caller-supplied memory addresses without exposing shaded Arrow types across the module boundary. Uses getFieldBuffers()/getChildrenFromFields() traversal consistent with VectorUnloader so buffer ordering matches between source (shaded) and destination (unshaded) sides. --- .../comet/vector/CometVectorIpcCopier.java | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java diff --git a/common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java b/common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java new file mode 100644 index 0000000000..368f02b0ec --- /dev/null +++ b/common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.vector; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.util.MemoryUtil; +import org.apache.arrow.vector.FieldVector; + +/** + * Helpers that copy the contents of a {@link CometDecodedVector} (whose underlying Arrow buffers + * live in the shaded {@code org.apache.comet.shaded.arrow.*} package after the comet-common jar is + * built) into destination buffer addresses provided by the caller. + * + *

Callers in {@code comet-spark} reference the unshaded {@code org.apache.arrow.*} classes + * supplied by Spark at runtime. Direct cross-package access from the spark module would fail with a + * {@code ClassCastException}. Crossing the boundary via raw memory addresses (long primitives) + * sidesteps the class identity issue: the bytes on disk are identical regardless of which Arrow + * Java distribution produced them. + * + *

All traversals use {@code getFieldBuffers()} and {@code getChildrenFromFields()} — the same + * API that {@code VectorUnloader} uses — so buffer ordering and counts are consistent between the + * source (shaded) and destination (unshaded) sides. + */ +public final class CometVectorIpcCopier { + + private CometVectorIpcCopier() {} + + /** + * Returns the readable byte counts of all buffers in {@code cometVec}'s underlying Arrow tree, in + * depth-first order (the same order {@code VectorUnloader} uses). + * + *

The caller can use this to size destination buffers before calling {@link + * #copyBuffersToAddresses}. + */ + public static long[] bufferReadableBytes(CometDecodedVector cometVec) { + List sizes = new ArrayList<>(); + collectBufferSizes((FieldVector) cometVec.getValueVector(), sizes); + long[] out = new long[sizes.size()]; + for (int i = 0; i < sizes.size(); i++) { + out[i] = sizes.get(i); + } + return out; + } + + /** + * Returns the {@code valueCount} of every {@link FieldVector} node in {@code cometVec}'s tree, in + * depth-first order. The first entry is the value count of the top-level vector; subsequent + * entries are for nested children (struct fields, list elements). + */ + public static int[] valueCounts(CometDecodedVector cometVec) { + List counts = new ArrayList<>(); + collectValueCounts((FieldVector) cometVec.getValueVector(), counts); + int[] out = new int[counts.size()]; + for (int i = 0; i < counts.size(); i++) { + out[i] = counts.get(i); + } + return out; + } + + /** + * Copies all of {@code cometVec}'s buffer bytes into {@code destAddresses}, in the same + * depth-first order as {@link #bufferReadableBytes}. Each destination address must be backed by + * at least the corresponding entry from {@code bufferReadableBytes} bytes of writable memory. 
+ */ + public static void copyBuffersToAddresses(CometDecodedVector cometVec, long[] destAddresses) { + walkAndCopy((FieldVector) cometVec.getValueVector(), destAddresses, new int[] {0}); + } + + private static void collectBufferSizes(FieldVector vec, List out) { + for (ArrowBuf buf : vec.getFieldBuffers()) { + out.add(buf.readableBytes()); + } + for (FieldVector child : vec.getChildrenFromFields()) { + collectBufferSizes(child, out); + } + } + + private static void collectValueCounts(FieldVector vec, List out) { + out.add(vec.getValueCount()); + for (FieldVector child : vec.getChildrenFromFields()) { + collectValueCounts(child, out); + } + } + + private static void walkAndCopy(FieldVector vec, long[] addrs, int[] cursor) { + for (ArrowBuf buf : vec.getFieldBuffers()) { + if (cursor[0] >= addrs.length) { + throw new IllegalArgumentException( + "destAddresses too small at cursor=" + + cursor[0] + + " (have " + + addrs.length + + " addresses)"); + } + MemoryUtil.copyMemory(buf.memoryAddress(), addrs[cursor[0]], buf.readableBytes()); + cursor[0]++; + } + for (FieldVector child : vec.getChildrenFromFields()) { + walkAndCopy(child, addrs, cursor); + } + } +} From f02caed1268d93471fde37cc5a3488456d6fb13e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 01:35:08 -0600 Subject: [PATCH 36/54] fix: route comet-to-arrow buffer copy through unshaded primitive API [skip ci] Rewrites CometColumnarPythonInput to copy Comet vector bytes via CometVectorIpcCopier (long-address API) rather than casting shaded FieldVector to unshaded FieldVector, which caused ClassCastException at runtime. 
Additional fixes for correct Arrow IPC semantics: - Fill struct validity buffer with 0xFF so Python sees non-null struct rows - Set lastSet before setValueCount on variable-width and list vectors to prevent fillHoles from overwriting correctly copied offset buffers - Process nodes bottom-up so parent setValueCount cascade does not clobber children that have not yet had lastSet updated --- .../python/CometColumnarPythonInput.scala | 183 +++++++++++------- 1 file changed, 117 insertions(+), 66 deletions(-) diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala index b27d8f0568..cd50c8b238 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala @@ -22,8 +22,11 @@ package org.apache.spark.sql.execution.python import java.io.DataOutputStream import java.nio.channels.Channels -import org.apache.arrow.vector.{BaseFixedWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader} -import org.apache.arrow.vector.complex.StructVector +import scala.collection.mutable.ArrayBuffer +import scala.jdk.CollectionConverters._ + +import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader} +import org.apache.arrow.vector.complex.{LargeListVector, ListVector, StructVector} import org.apache.arrow.vector.compression.{CompressionCodec, CompressionUtil, NoCompressionCodec} import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel} import org.apache.arrow.vector.ipc.message.MessageSerializer @@ -31,21 +34,23 @@ import org.apache.spark.SparkException import org.apache.spark.api.python.BasePythonRunner import org.apache.spark.sql.internal.SQLConf import 
org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.unsafe.Platform -import org.apache.comet.vector.CometDecodedVector +import org.apache.comet.vector.{CometDecodedVector, CometVectorIpcCopier} /** * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python - * worker as Arrow IPC, bypassing the row materialization that `BasicPythonArrowInput` performs. - * The persistent root supplied by `PythonArrowInput` carries the wrapped-struct schema - * (`StructType(Array(StructField("struct", childSchema)))`) so the Python worker contract is - * preserved. + * worker as Arrow IPC. + * + * Comet's vectors live in the shaded `org.apache.comet.shaded.arrow.*` package at runtime + * (relocated by comet-common's maven-shade-plugin). This trait must not reference shaded Arrow + * types directly; buffer copying is delegated to `CometVectorIpcCopier` in comet-common, which + * crosses the module boundary using only `long` primitives. * - * Each call writes one Comet batch. The runner contract repeats `writeNextBatchToArrowStream` - * until it returns `false`. Per-batch the input trait allocates a destination vector in the - * persistent root and copies each source buffer via `ArrowBuf.setBytes` -- this is bulk per - * buffer, not per row, but it is NOT zero-copy: Comet's Parquet reader allocators are independent - * roots from `ArrowUtils.rootAllocator`. + * Per-batch: walk the destination struct's children (unshaded, allocated from the runner's + * persistent root), allocate each child sized to match the corresponding Comet column, collect + * dst buffer addresses into a `long[]`, and call the helper for a single bulk memcpy across all + * buffers. 
*/ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] { self: BasePythonRunner[Iterator[ColumnarBatch], _] => @@ -92,20 +97,20 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator var i = 0 while (i < cometBatch.numCols()) { - val src = cometBatch - .column(i) - .asInstanceOf[CometDecodedVector] - .getValueVector - .asInstanceOf[FieldVector] + val src = cometBatch.column(i).asInstanceOf[CometDecodedVector] val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] copyVector(src, dst) i += 1 } - structVec.setValueCount(cometBatch.numRows()) - root.setRowCount(cometBatch.numRows()) + val numRows = cometBatch.numRows() + structVec.setValueCount(numRows) + // Mark every row in the struct as non-null (all-1 validity bits). The struct validity + // buffer is freshly allocated (or cleared) and zero-initialised, so without this step + // Python would see an all-null struct column and return null for every output row. + val validityBytes = (numRows + 7) / 8 + Platform.setMemory(structVec.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes) + root.setRowCount(numRows) - // VectorUnloader is lightweight (wraps root); create per-batch to stay compatible - // across Spark 4.0/4.1/4.2 which differ in how the unloader field is managed. val batchUnloader = new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true) val recordBatch = batchUnloader.getRecordBatch @@ -121,57 +126,103 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator } /** - * Copy `src` into `dst` via per-buffer memcpy. Allocates `dst` sized to match `src`, then - * `ArrowBuf.setBytes` copies each field buffer (validity, offsets, data) wholesale. Recurses - * into struct / list children. - * - * This does NOT transfer buffer ownership and does NOT change refcounts: `src` retains its - * buffers, `dst` allocates new ones in the runner's allocator. 
Required because Comet's Parquet - * reader allocators are independent roots from `ArrowUtils.rootAllocator`. + * Copy a Comet column (whose Arrow buffers are in the shaded class tree) into the destination + * FieldVector (allocated from the runner's persistent root, in the unshaded class tree). The + * actual byte copy happens inside `CometVectorIpcCopier` in comet-common, which references only + * shaded Arrow types internally and exposes the buffer addresses as `long` primitives. */ - private def copyVector(src: FieldVector, dst: FieldVector): Unit = { - val numRows = src.getValueCount - - dst match { - case bfwv: BaseFixedWidthVector => - bfwv.allocateNew(numRows) - case bvwv: BaseVariableWidthVector => - // Variable-width data buffer size depends on actual byte content, not just numRows. - // Match the source data buffer's readable bytes. - val srcFieldBufs = src.getFieldBuffers - val dataBufIdx = srcFieldBufs.size - 1 - val srcDataSize = srcFieldBufs.get(dataBufIdx).readableBytes - bvwv.allocateNew(srcDataSize, numRows) - case _ => - dst.setInitialCapacity(numRows) - dst.allocateNew() - } + private def copyVector(src: CometDecodedVector, dst: FieldVector): Unit = { + val srcBufSizes = CometVectorIpcCopier.bufferReadableBytes(src) + val srcValueCounts = CometVectorIpcCopier.valueCounts(src) - val srcBufs = src.getFieldBuffers - val dstBufs = dst.getFieldBuffers + val dstNodes = collectFieldVectors(dst) require( - srcBufs.size == dstBufs.size, - s"buffer count mismatch for ${src.getField}: src=${srcBufs.size} dst=${dstBufs.size}") - var bi = 0 - while (bi < srcBufs.size) { - val sBuf = srcBufs.get(bi) - val dBuf = dstBufs.get(bi) - dBuf.setBytes(0L, sBuf, 0L, sBuf.readableBytes) - bi += 1 + dstNodes.size == srcValueCounts.length, + s"tree node count mismatch for ${dst.getField}: " + + s"dst=${dstNodes.size}, src=${srcValueCounts.length}") + + var bufIdx = 0 + var nodeIdx = 0 + while (nodeIdx < dstNodes.size) { + val node = dstNodes(nodeIdx) + val valueCount = 
srcValueCounts(nodeIdx) + node match { + case bfwv: BaseFixedWidthVector => + bfwv.allocateNew(valueCount) + case bvwv: BaseVariableWidthVector => + val ownBufCount = node.getFieldBuffers.size + val dataSize = srcBufSizes(bufIdx + ownBufCount - 1) + bvwv.allocateNew(dataSize, valueCount) + case blvwv: BaseLargeVariableWidthVector => + val ownBufCount = node.getFieldBuffers.size + val dataSize = srcBufSizes(bufIdx + ownBufCount - 1) + blvwv.allocateNew(dataSize, valueCount) + case _ => + node.setInitialCapacity(valueCount) + node.allocateNew() + } + bufIdx += node.getFieldBuffers.size + nodeIdx += 1 } - - val srcChildren = src.getChildrenFromFields - val dstChildren = dst.getChildrenFromFields require( - srcChildren.size == dstChildren.size, - s"child count mismatch for ${src.getField}: " + - s"src=${srcChildren.size} dst=${dstChildren.size}") - var ci = 0 - while (ci < srcChildren.size) { - copyVector(srcChildren.get(ci), dstChildren.get(ci)) - ci += 1 + bufIdx == srcBufSizes.length, + s"buffer count mismatch for ${dst.getField}: dst=$bufIdx, src=${srcBufSizes.length}") + + val dstAddrs = collectBufferAddresses(dstNodes, srcBufSizes.length) + CometVectorIpcCopier.copyBuffersToAddresses(src, dstAddrs) + + // Process nodes bottom-up (leaves first) so that when a composite vector (struct, list) + // calls setValueCount on its children recursively, those children have already had their + // lastSet field updated and fillHoles becomes a no-op. + var fi = dstNodes.size - 1 + while (fi >= 0) { + val node = dstNodes(fi) + val vc = srcValueCounts(fi) + // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list + // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied + // offset bytes are preserved. 
+ node match { + case v: BaseVariableWidthVector => v.setLastSet(vc - 1) + case v: BaseLargeVariableWidthVector => v.setLastSet(vc - 1) + case v: ListVector => v.setLastSet(vc - 1) + case v: LargeListVector => v.setLastSet(vc - 1) + case _ => + } + node.setValueCount(vc) + fi -= 1 + } + } + + private def collectFieldVectors(vec: FieldVector): IndexedSeq[FieldVector] = { + val buf = ArrayBuffer.empty[FieldVector] + walkFieldVectors(vec, buf) + buf.toIndexedSeq + } + + private def walkFieldVectors(vec: FieldVector, buf: ArrayBuffer[FieldVector]): Unit = { + buf += vec + vec.getChildrenFromFields.asScala.foreach { child => + walkFieldVectors(child.asInstanceOf[FieldVector], buf) } + } - dst.setValueCount(numRows) + private def collectBufferAddresses( + nodes: IndexedSeq[FieldVector], + expected: Int): Array[Long] = { + val addrs = new Array[Long](expected) + var idx = 0 + var ni = 0 + while (ni < nodes.size) { + val bufs = nodes(ni).getFieldBuffers + var bi = 0 + while (bi < bufs.size) { + addrs(idx) = bufs.get(bi).memoryAddress() + idx += 1 + bi += 1 + } + ni += 1 + } + require(idx == expected, s"collected $idx addresses, expected $expected") + addrs } } From 0a98c7debc2210df9d8b0924933285db23dbe627 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 01:41:01 -0600 Subject: [PATCH 37/54] test: refresh pyarrow UDF benchmark header for bulk-copy path [skip ci] --- .../src/test/resources/pyspark/benchmark_pyarrow_udf.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py index 49574130c0..08f2c6540f 100644 --- a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py @@ -19,6 +19,9 @@ """ End-to-end wall-clock benchmark for Comet's PyArrow UDF acceleration. 
+Requires PySpark 4.0.1+ (Comet's columnar runner targets Spark 4.0+ only; +3.5 and 3.4 are documented no-ops). + Times `df.mapInArrow(passthrough, schema).count()` and the equivalent `mapInPandas` query with `spark.comet.exec.pyarrowUdf.enabled` set to false (vanilla Spark path) and true (Comet's optimized path). Both @@ -26,8 +29,10 @@ optimization actually changes for users: * vanilla: CometScan -> ColumnarToRow + UnsafeProjection -> ArrowPythonRunner - * optimized: CometScan -> rowIterator -> ArrowPythonRunner (same runner; - no UnsafeProjection, output kept as ColumnarBatch) + (per-row InternalRow.getXXX() loop inside ArrowWriter.write) + * optimized: CometScan -> CometMapInBatchExec -> CometArrowPythonRunner + (per-buffer Unsafe.copyMemory from Comet's vectors into the + runner's persistent VectorSchemaRoot; no row materialization) Results are wall-clock seconds, so they include Python interpreter, Arrow IPC, and downstream count() costs. That's intentional: the From 7e4ace85d76595a31ef2401edac881bb7d007afb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 01:41:29 -0600 Subject: [PATCH 38/54] docs: bump pyarrow UDF user guide to Spark 4.0+; note buffer-copy boundary --- docs/source/user-guide/latest/pyarrow-udfs.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 8495184812..f22d926541 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -181,17 +181,18 @@ on the unoptimized path. - The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs (`@pandas_udf`) and grouped operations (`applyInPandas`) are not yet supported. -- The internal row-to-Arrow conversion inside the Python runner is still present in this version. 
- Comet currently routes columnar input through `ColumnarBatch.rowIterator()` so that the existing - `ArrowPythonRunner` can re-encode the rows back to Arrow IPC. A future optimization will write - Arrow batches directly to the Python IPC stream, eliminating the remaining round-trip and - achieving near zero-copy data transfer. - The optimization requires Arrow data on the input side. If a shuffle sits between the upstream Comet operator and the Python UDF, you need Comet's native shuffle for the optimization to apply. Set `spark.shuffle.manager` to `org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager` and enable `spark.comet.exec.shuffle.enabled=true` at session startup. With a vanilla Spark `Exchange` in the plan the data leaves the shuffle as rows and the optimization cannot fire. -- Spark 3.4 lacks several APIs the optimization depends on (`MapInBatchExec.isBarrier`, - `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor). On - Spark 3.4 the feature is a no-op even when enabled. Spark 3.5+ is required. +- Spark 4.0 or newer is required. On Spark 3.4 and 3.5 the optimization is a no-op even when + enabled; vanilla `PythonMapInArrowExec` / `MapInPandasExec` handle the operation. The Spark 3.5 + `PythonArrowInput` trait has a different contract than 4.x and a separate implementation has + not been written. Track 3.5 support as a future follow-on if there is user demand. +- The current implementation copies Comet's vector buffers into Spark's allocator via + `Unsafe.copyMemory` (one bulk memcpy per buffer per column). True zero-copy via + `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator` + (rather than each reader constructing its own independent `RootAllocator`). A future PR that + unifies the allocator parent would unlock zero-copy. 
From e4dfd2a64a3b5e41cda0591bdb14c99486c0a154 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 May 2026 02:01:38 -0600 Subject: [PATCH 39/54] docs: link pyarrow UDF zero-copy follow-on issue (#4294) --- docs/source/user-guide/latest/pyarrow-udfs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index f22d926541..42130c18eb 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -194,5 +194,5 @@ on the unoptimized path. - The current implementation copies Comet's vector buffers into Spark's allocator via `Unsafe.copyMemory` (one bulk memcpy per buffer per column). True zero-copy via `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator` - (rather than each reader constructing its own independent `RootAllocator`). A future PR that - unifies the allocator parent would unlock zero-copy. + (rather than each reader constructing its own independent `RootAllocator`). Tracked in + [#4294](https://github.com/apache/datafusion-comet/issues/4294). From 35c03b3c00a5a61a6274bfab22741ce520f80a9b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 20 May 2026 12:16:47 -0600 Subject: [PATCH 40/54] refactor: remove Arrow shading workaround from pyarrow UDF input trait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The long[]-address indirection through CometVectorIpcCopier existed because comet-common shaded org.apache.arrow.* into org.apache.comet.shaded.arrow.*, making source vectors and Spark's IPC root different JVM types. After #4325 moved most JVM code into comet-spark and dropped the shading, both sides see the same Arrow classes — the helper is no longer needed. Replace with a direct walk of the source/destination FieldVector trees using ArrowBuf.setBytes for the buffer copy. 
Same per-buffer memcpy semantics; the cross-RootAllocator constraint that blocks true zero-copy is independent of shading and still tracked in #4294. --- docs/source/user-guide/latest/pyarrow-udfs.md | 8 +- .../comet/vector/CometVectorIpcCopier.java | 122 -------------- .../python/CometColumnarPythonInput.scala | 159 +++++++----------- 3 files changed, 61 insertions(+), 228 deletions(-) delete mode 100644 spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 42130c18eb..68b3ba15c3 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -191,8 +191,8 @@ on the unoptimized path. enabled; vanilla `PythonMapInArrowExec` / `MapInPandasExec` handle the operation. The Spark 3.5 `PythonArrowInput` trait has a different contract than 4.x and a separate implementation has not been written. Track 3.5 support as a future follow-on if there is user demand. -- The current implementation copies Comet's vector buffers into Spark's allocator via - `Unsafe.copyMemory` (one bulk memcpy per buffer per column). True zero-copy via - `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator` - (rather than each reader constructing its own independent `RootAllocator`). Tracked in +- The current implementation copies Comet's vector buffers into Spark's allocator one + buffer at a time. True zero-copy via `TransferPair` is blocked on Comet's Parquet + readers allocating from `ArrowUtils.rootAllocator` (rather than each reader + constructing its own independent `RootAllocator`). Tracked in [#4294](https://github.com/apache/datafusion-comet/issues/4294). 
diff --git a/spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java b/spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java deleted file mode 100644 index 368f02b0ec..0000000000 --- a/spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.comet.vector; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.arrow.memory.ArrowBuf; -import org.apache.arrow.memory.util.MemoryUtil; -import org.apache.arrow.vector.FieldVector; - -/** - * Helpers that copy the contents of a {@link CometDecodedVector} (whose underlying Arrow buffers - * live in the shaded {@code org.apache.comet.shaded.arrow.*} package after the comet-common jar is - * built) into destination buffer addresses provided by the caller. - * - *

Callers in {@code comet-spark} reference the unshaded {@code org.apache.arrow.*} classes - * supplied by Spark at runtime. Direct cross-package access from the spark module would fail with a - * {@code ClassCastException}. Crossing the boundary via raw memory addresses (long primitives) - * sidesteps the class identity issue: the bytes on disk are identical regardless of which Arrow - * Java distribution produced them. - * - *

All traversals use {@code getFieldBuffers()} and {@code getChildrenFromFields()} — the same - * API that {@code VectorUnloader} uses — so buffer ordering and counts are consistent between the - * source (shaded) and destination (unshaded) sides. - */ -public final class CometVectorIpcCopier { - - private CometVectorIpcCopier() {} - - /** - * Returns the readable byte counts of all buffers in {@code cometVec}'s underlying Arrow tree, in - * depth-first order (the same order {@code VectorUnloader} uses). - * - *

The caller can use this to size destination buffers before calling {@link - * #copyBuffersToAddresses}. - */ - public static long[] bufferReadableBytes(CometDecodedVector cometVec) { - List sizes = new ArrayList<>(); - collectBufferSizes((FieldVector) cometVec.getValueVector(), sizes); - long[] out = new long[sizes.size()]; - for (int i = 0; i < sizes.size(); i++) { - out[i] = sizes.get(i); - } - return out; - } - - /** - * Returns the {@code valueCount} of every {@link FieldVector} node in {@code cometVec}'s tree, in - * depth-first order. The first entry is the value count of the top-level vector; subsequent - * entries are for nested children (struct fields, list elements). - */ - public static int[] valueCounts(CometDecodedVector cometVec) { - List counts = new ArrayList<>(); - collectValueCounts((FieldVector) cometVec.getValueVector(), counts); - int[] out = new int[counts.size()]; - for (int i = 0; i < counts.size(); i++) { - out[i] = counts.get(i); - } - return out; - } - - /** - * Copies all of {@code cometVec}'s buffer bytes into {@code destAddresses}, in the same - * depth-first order as {@link #bufferReadableBytes}. Each destination address must be backed by - * at least the corresponding entry from {@code bufferReadableBytes} bytes of writable memory. 
- */ - public static void copyBuffersToAddresses(CometDecodedVector cometVec, long[] destAddresses) { - walkAndCopy((FieldVector) cometVec.getValueVector(), destAddresses, new int[] {0}); - } - - private static void collectBufferSizes(FieldVector vec, List out) { - for (ArrowBuf buf : vec.getFieldBuffers()) { - out.add(buf.readableBytes()); - } - for (FieldVector child : vec.getChildrenFromFields()) { - collectBufferSizes(child, out); - } - } - - private static void collectValueCounts(FieldVector vec, List out) { - out.add(vec.getValueCount()); - for (FieldVector child : vec.getChildrenFromFields()) { - collectValueCounts(child, out); - } - } - - private static void walkAndCopy(FieldVector vec, long[] addrs, int[] cursor) { - for (ArrowBuf buf : vec.getFieldBuffers()) { - if (cursor[0] >= addrs.length) { - throw new IllegalArgumentException( - "destAddresses too small at cursor=" - + cursor[0] - + " (have " - + addrs.length - + " addresses)"); - } - MemoryUtil.copyMemory(buf.memoryAddress(), addrs[cursor[0]], buf.readableBytes()); - cursor[0]++; - } - for (FieldVector child : vec.getChildrenFromFields()) { - walkAndCopy(child, addrs, cursor); - } - } -} diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala index cd50c8b238..5cce2aaf28 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala @@ -22,7 +22,6 @@ package org.apache.spark.sql.execution.python import java.io.DataOutputStream import java.nio.channels.Channels -import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters._ import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader} @@ -36,21 +35,18 
@@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.unsafe.Platform -import org.apache.comet.vector.{CometDecodedVector, CometVectorIpcCopier} +import org.apache.comet.vector.CometDecodedVector /** * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python * worker as Arrow IPC. * - * Comet's vectors live in the shaded `org.apache.comet.shaded.arrow.*` package at runtime - * (relocated by comet-common's maven-shade-plugin). This trait must not reference shaded Arrow - * types directly; buffer copying is delegated to `CometVectorIpcCopier` in comet-common, which - * crosses the module boundary using only `long` primitives. - * - * Per-batch: walk the destination struct's children (unshaded, allocated from the runner's - * persistent root), allocate each child sized to match the corresponding Comet column, collect - * dst buffer addresses into a `long[]`, and call the helper for a single bulk memcpy across all - * buffers. + * Per batch: walk the destination struct's children, allocate each child sized to match the + * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The source (Comet's + * vectors) and the destination (Spark's persistent IPC root) live in different `RootAllocator` + * trees, so `TransferPair` / `VectorLoader.load` cannot rebind buffers across the boundary; + * per-buffer memcpy is the available alternative until the readers share a parent allocator + * (tracked in #4294). 
*/ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] { self: BasePythonRunner[Iterator[ColumnarBatch], _] => @@ -97,7 +93,12 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator var i = 0 while (i < cometBatch.numCols()) { - val src = cometBatch.column(i).asInstanceOf[CometDecodedVector] + val src = + cometBatch + .column(i) + .asInstanceOf[CometDecodedVector] + .getValueVector + .asInstanceOf[FieldVector] val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] copyVector(src, dst) i += 1 @@ -126,103 +127,57 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator } /** - * Copy a Comet column (whose Arrow buffers are in the shaded class tree) into the destination - * FieldVector (allocated from the runner's persistent root, in the unshaded class tree). The - * actual byte copy happens inside `CometVectorIpcCopier` in comet-common, which references only - * shaded Arrow types internally and exposes the buffer addresses as `long` primitives. + * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes + * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then + * sets value counts bottom-up so `setValueCount` does not rewrite the offset bytes we just + * copied. 
*/ - private def copyVector(src: CometDecodedVector, dst: FieldVector): Unit = { - val srcBufSizes = CometVectorIpcCopier.bufferReadableBytes(src) - val srcValueCounts = CometVectorIpcCopier.valueCounts(src) - - val dstNodes = collectFieldVectors(dst) - require( - dstNodes.size == srcValueCounts.length, - s"tree node count mismatch for ${dst.getField}: " + - s"dst=${dstNodes.size}, src=${srcValueCounts.length}") - - var bufIdx = 0 - var nodeIdx = 0 - while (nodeIdx < dstNodes.size) { - val node = dstNodes(nodeIdx) - val valueCount = srcValueCounts(nodeIdx) - node match { - case bfwv: BaseFixedWidthVector => - bfwv.allocateNew(valueCount) - case bvwv: BaseVariableWidthVector => - val ownBufCount = node.getFieldBuffers.size - val dataSize = srcBufSizes(bufIdx + ownBufCount - 1) - bvwv.allocateNew(dataSize, valueCount) - case blvwv: BaseLargeVariableWidthVector => - val ownBufCount = node.getFieldBuffers.size - val dataSize = srcBufSizes(bufIdx + ownBufCount - 1) - blvwv.allocateNew(dataSize, valueCount) - case _ => - node.setInitialCapacity(valueCount) - node.allocateNew() - } - bufIdx += node.getFieldBuffers.size - nodeIdx += 1 + private def copyVector(src: FieldVector, dst: FieldVector): Unit = { + val valueCount = src.getValueCount + + dst match { + case bfwv: BaseFixedWidthVector => + bfwv.allocateNew(valueCount) + case bvwv: BaseVariableWidthVector => + bvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount) + case blvwv: BaseLargeVariableWidthVector => + blvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount) + case _ => + dst.setInitialCapacity(valueCount) + dst.allocateNew() } + + val srcBufs = src.getFieldBuffers + val dstBufs = dst.getFieldBuffers require( - bufIdx == srcBufSizes.length, - s"buffer count mismatch for ${dst.getField}: dst=$bufIdx, src=${srcBufSizes.length}") - - val dstAddrs = collectBufferAddresses(dstNodes, srcBufSizes.length) - CometVectorIpcCopier.copyBuffersToAddresses(src, dstAddrs) - - // Process nodes bottom-up (leaves 
first) so that when a composite vector (struct, list) - // calls setValueCount on its children recursively, those children have already had their - // lastSet field updated and fillHoles becomes a no-op. - var fi = dstNodes.size - 1 - while (fi >= 0) { - val node = dstNodes(fi) - val vc = srcValueCounts(fi) - // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list - // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied - // offset bytes are preserved. - node match { - case v: BaseVariableWidthVector => v.setLastSet(vc - 1) - case v: BaseLargeVariableWidthVector => v.setLastSet(vc - 1) - case v: ListVector => v.setLastSet(vc - 1) - case v: LargeListVector => v.setLastSet(vc - 1) - case _ => - } - node.setValueCount(vc) - fi -= 1 + srcBufs.size == dstBufs.size, + s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}") + var b = 0 + while (b < srcBufs.size) { + val s = srcBufs.get(b) + dstBufs.get(b).setBytes(0, s, 0, s.readableBytes) + b += 1 } - } - - private def collectFieldVectors(vec: FieldVector): IndexedSeq[FieldVector] = { - val buf = ArrayBuffer.empty[FieldVector] - walkFieldVectors(vec, buf) - buf.toIndexedSeq - } - private def walkFieldVectors(vec: FieldVector, buf: ArrayBuffer[FieldVector]): Unit = { - buf += vec - vec.getChildrenFromFields.asScala.foreach { child => - walkFieldVectors(child.asInstanceOf[FieldVector], buf) + val srcChildren = src.getChildrenFromFields + val dstChildren = dst.getChildrenFromFields + require( + srcChildren.size == dstChildren.size, + s"child count mismatch for ${dst.getField}: src=${srcChildren.size}, dst=${dstChildren.size}") + srcChildren.asScala.zip(dstChildren.asScala).foreach { case (sc, dc) => + copyVector(sc.asInstanceOf[FieldVector], dc.asInstanceOf[FieldVector]) } - } - private def collectBufferAddresses( - nodes: IndexedSeq[FieldVector], - expected: Int): Array[Long] = { - val addrs = new Array[Long](expected) - var idx 
= 0 - var ni = 0 - while (ni < nodes.size) { - val bufs = nodes(ni).getFieldBuffers - var bi = 0 - while (bi < bufs.size) { - addrs(idx) = bufs.get(bi).memoryAddress() - idx += 1 - bi += 1 - } - ni += 1 + // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list + // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied + // offset bytes are preserved. + dst match { + case v: BaseVariableWidthVector => v.setLastSet(valueCount - 1) + case v: BaseLargeVariableWidthVector => v.setLastSet(valueCount - 1) + case v: ListVector => v.setLastSet(valueCount - 1) + case v: LargeListVector => v.setLastSet(valueCount - 1) + case _ => } - require(idx == expected, s"collected $idx addresses, expected $expected") - addrs + dst.setValueCount(valueCount) } } From 693afef91b514fff870d3f05acf30257844b231e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 20 May 2026 15:39:15 -0600 Subject: [PATCH 41/54] test: expand pyarrow UDF type coverage and fall back on useLargeVarTypes Adds pytest cases for the data-type branches in CometColumnarPythonInput that were previously unexercised: numeric scalars (boolean/byte/short/float), binary, timestamp NTZ, map, and a deeply nested array/struct combination. Falls back to vanilla Spark when spark.sql.execution.arrow.useLargeVarTypes is enabled. With that conf on, Spark widens StringType/BinaryType to 8-byte-offset variants in the destination IPC root while Comet's source vectors keep 4-byte offsets, so the per-buffer memcpy in copyVector would corrupt the offset buffer. While narrowing the rule to gate on largeVarTypes, also fix a pre-existing greedy match: the MapInBatch case used `p: SparkPlan` with the pyarrow conf as a guard, which matched every plan when the conf was on and consumed the later CometShuffleExchangeExec arm. The case now gates on a structural check via eligibleMapInBatchInfo so unrelated plans flow through. 
--- docs/source/user-guide/latest/pyarrow-udfs.md | 5 + .../rules/EliminateRedundantTransitions.scala | 61 +++- .../resources/pyspark/test_pyarrow_udf.py | 318 ++++++++++++++++++ 3 files changed, 368 insertions(+), 16 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 68b3ba15c3..0b2cd9aebb 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -191,6 +191,11 @@ on the unoptimized path. enabled; vanilla `PythonMapInArrowExec` / `MapInPandasExec` handle the operation. The Spark 3.5 `PythonArrowInput` trait has a different contract than 4.x and a separate implementation has not been written. Track 3.5 support as a future follow-on if there is user demand. +- `spark.sql.execution.arrow.useLargeVarTypes=true` is not supported. With this conf enabled, + Spark widens `StringType` and `BinaryType` to Arrow's 8-byte-offset variants in the + destination IPC root, while Comet's source vectors always use 4-byte offsets. The buffer-copy + path cannot bridge that mismatch, so `EliminateRedundantTransitions` skips the rewrite and + vanilla Spark handles the operation. - The current implementation copies Comet's vector buffers into Spark's allocator one buffer at a time. 
True zero-copy via `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator` (rather than each reader diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index 24c969c173..ee7a9e085b 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometMapInBatchExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometSparkToColumnarExec} import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec} -import org.apache.spark.sql.comet.shims.ShimCometMapInBatch +import org.apache.spark.sql.comet.shims.{MapInBatchInfo, ShimCometMapInBatch} import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.QueryStageExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec @@ -106,21 +106,29 @@ case class EliminateRedundantTransitions(session: SparkSession) // UnsafeProjection copies and keeping the stage columnar. The matchers are // version-shimmed: Spark 3.4 returns None (it lacks the required APIs) and Spark 4.1+ // matches the renamed `MapInArrowExec`. 
- case p: SparkPlan if CometConf.COMET_PYARROW_UDF_ENABLED.get() => - matchMapInArrow(p).orElse(matchMapInPandas(p)) match { - case Some(info) => - extractColumnarChild(info.child) - .map { columnarChild => - CometMapInBatchExec( - info.func, - info.output, - columnarChild, - info.isBarrier, - info.pythonEvalType) - } - .getOrElse(p) - case None => p - } + // + // Falls back to vanilla Spark when `spark.sql.execution.arrow.useLargeVarTypes` is enabled: + // CometColumnarPythonInput.copyVector does raw `setBytes` on each Arrow buffer, but Comet's + // source string/binary vectors always use 4-byte offsets while the destination root is + // allocated with 8-byte offsets when this conf is on. The buffer counts match but the + // offset width does not, so a direct memcpy would corrupt the offsets. + // + // The guard runs `eligibleMapInBatchInfo` so this case only matches actual MapInArrow / + // MapInPandas operators. Without the structural check the case would match every + // `SparkPlan` whenever the pyarrow conf is on, short-circuiting the + // `CometShuffleExchangeExec` arm below. + case p if eligibleMapInBatchInfo(p).isDefined => + val info = eligibleMapInBatchInfo(p).get + extractColumnarChild(info.child) + .map { columnarChild => + CometMapInBatchExec( + info.func, + info.output, + columnarChild, + info.isBarrier, + info.pythonEvalType) + } + .getOrElse(p) // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the // shuffle takes row-based input. @@ -167,6 +175,27 @@ case class EliminateRedundantTransitions(session: SparkSession) case _ => None } + /** + * Returns `Some(info)` only when this rule should attempt to rewrite `plan` to + * `CometMapInBatchExec`, i.e. when the conf is on, the largeVarTypes fallback does not apply, + * and the plan is one of the version-shimmed MapInArrow / MapInPandas operators. 
Used in the + * pattern guard so the case only fires for plans we actually want to rewrite - without that + * narrowing, the `case` would match every `SparkPlan` whenever the conf is on and consume the + * later `CometShuffleExchangeExec` arm. Read the conf via the raw key string so this compiles + * against Spark 3.4, which lacks `SQLConf.arrowUseLargeVarTypes`. + */ + private def eligibleMapInBatchInfo(plan: SparkPlan): Option[MapInBatchInfo] = { + if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) { + None + } else if (plan.conf + .getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false") + .toBoolean) { + None + } else { + matchMapInArrow(plan).orElse(matchMapInPandas(plan)) + } + } + /** * Creates an appropriate columnar to row transition operator. * diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 87558ec057..fc671053a6 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -407,6 +407,324 @@ def _normalize(row): assert out == expected +def test_map_in_arrow_numeric_scalars(spark, tmp_path, accelerated): + """ + Covers the BaseFixedWidthVector branch in CometColumnarPythonInput.copyVector for + every fixed-width primitive Comet's scan supports beyond the long/double/int already + exercised by other tests: boolean, byte, short, float. Each has a distinct buffer + size, and the validity bit handling is independent per column. 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("b", T.BooleanType()), + T.StructField("tiny", T.ByteType()), + T.StructField("small", T.ShortType()), + T.StructField("flt", T.FloatType()), + ] + ) + rows = [ + (1, True, 1, 1000, 1.5), + (2, False, -128, -32768, -3.25), + (3, True, 127, 32767, float("inf")), + (4, None, None, None, None), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], r["b"], r["tiny"], r["small"], r["flt"]) for r in result_df.collect()} + assert out == set(rows) + + +def test_map_in_arrow_binary_type(spark, tmp_path, accelerated): + """ + BinaryType is the BaseVariableWidthVector path with non-string content. StringType + already exercises that path for utf-8 data; binary covers the case where the data + buffer can hold arbitrary bytes (including null bytes mid-string). 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("payload", T.BinaryType()), + ] + ) + rows = [ + (1, b"\x00\x01\x02\x03"), + (2, b""), + (3, b"\xff" * 64), + (4, None), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], bytes(r["payload"]) if r["payload"] is not None else None) + for r in result_df.collect()} + expected = set(rows) + assert out == expected + + +def test_map_in_arrow_timestamp_ntz(spark, tmp_path, accelerated): + """ + TimestampNTZType is a separate Arrow type from TimestampType (no timezone) and goes + through a different ArrowType.Timestamp(..., tz=None) on the wire. + """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("ts_ntz", T.TimestampNTZType()), + ] + ) + rows = [ + (1, dt.datetime(2024, 1, 1, 12, 30, 45)), + (2, dt.datetime(1970, 1, 1, 0, 0, 0)), + (3, None), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], r["ts_ntz"]) for r in result_df.collect()} + assert out == set(rows) + + +def test_map_in_arrow_map_type(spark, tmp_path, accelerated): + """ + MapType is encoded in Arrow as a List<Struct<key, value>> with extra metadata. The + buffer layout (offsets + struct child + key/value children) is distinct from a plain + list, and CometMapVector is a separate vector class from CometListVector. Without + this test the recursive copy path through map-typed columns is unexercised. 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField( + "attrs", T.MapType(T.StringType(), T.IntegerType(), valueContainsNull=True) + ), + ] + ) + rows = [ + (1, {"a": 1, "b": 2}), + (2, {}), + (3, None), + (4, {"only": None}), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + def _normalize(row): + attrs = row["attrs"] + attrs_norm = ( + tuple(sorted(attrs.items(), key=lambda kv: kv[0])) + if attrs is not None + else None + ) + return (row["id"], attrs_norm) + + out = {_normalize(r) for r in result_df.collect()} + expected = { + ( + r[0], + tuple(sorted(r[1].items(), key=lambda kv: kv[0])) if r[1] is not None else None, + ) + for r in rows + } + assert out == expected + + +def test_map_in_arrow_deeply_nested(spark, tmp_path, accelerated): + """ + Exercises the recursive descent in CometColumnarPythonInput.copyVector at depth > 1, + in every nesting combination: array-of-array, array-of-struct, struct-of-array, + struct-of-struct. Single-level nesting is covered by test_map_in_arrow_array_and_struct; + the bug surface here is that setLastSet / setValueCount must be applied bottom-up + correctly at every level. 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("matrix", T.ArrayType(T.ArrayType(T.IntegerType()))), + T.StructField( + "people", + T.ArrayType( + T.StructType( + [ + T.StructField("name", T.StringType()), + T.StructField("age", T.IntegerType()), + ] + ) + ), + ), + T.StructField( + "config", + T.StructType( + [ + T.StructField("flags", T.ArrayType(T.StringType())), + T.StructField( + "limits", + T.StructType( + [ + T.StructField("min", T.IntegerType()), + T.StructField("max", T.IntegerType()), + ] + ), + ), + ] + ), + ), + ] + ) + rows = [ + ( + 1, + [[1, 2], [3, 4, 5]], + [("alice", 30), ("bob", 25)], + (["x", "y"], (0, 100)), + ), + ( + 2, + [[], [None, 7]], + [("solo", None)], + ([], (None, None)), + ), + (3, None, None, None), + (4, [None, [9]], [None, ("ghost", 0)], (None, None)), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + def _norm_array(a): + return tuple(a) if a is not None else None + + def _norm_matrix(m): + return tuple(_norm_array(inner) for inner in m) if m is not None else None + + def _norm_people(p): + if p is None: + return None + return tuple( + (item["name"], item["age"]) if item is not None else None for item in p + ) + + def _norm_config(c): + if c is None: + return None + flags = _norm_array(c["flags"]) + limits = c["limits"] + limits_norm = (limits["min"], limits["max"]) if limits is not None else None + return (flags, limits_norm) + + def _norm_row(r): + return ( + r["id"], + _norm_matrix(r["matrix"]), + _norm_people(r["people"]), + _norm_config(r["config"]), + ) + + def _norm_input_people(p): + if p is None: + return None + return tuple(item if item is not None else None for item in p) + + def _norm_input_config(c): + if c is 
None: + return None + flags, limits = c + return (_norm_array(flags), limits) + + out = {_norm_row(r) for r in result_df.collect()} + expected = { + ( + r[0], + _norm_matrix(r[1]), + _norm_input_people(r[2]), + _norm_input_config(r[3]), + ) + for r in rows + } + assert out == expected + + +def test_map_in_arrow_falls_back_when_use_large_var_types(spark, tmp_path): + """ + `spark.sql.execution.arrow.useLargeVarTypes=true` widens StringType / BinaryType to + LargeUtf8 / LargeBinary in the destination IPC root (8-byte offsets). Comet's source + vectors always use 4-byte offsets; CometColumnarPythonInput.copyVector does a raw + setBytes per buffer and would corrupt the offset buffer in this configuration. + EliminateRedundantTransitions must skip the rewrite in that case so vanilla Spark + handles the operation. This test does not use the `accelerated` fixture: it sets + pyarrowUdf.enabled=true AND useLargeVarTypes=true and asserts the plan still falls + back to vanilla MapInArrow. + """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("name", T.StringType()), + ] + ) + rows = [(i, f"name_{i}") for i in range(20)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + prev_pyarrow = spark.conf.get("spark.comet.exec.pyarrowUdf.enabled", "false") + prev_large = spark.conf.get("spark.sql.execution.arrow.useLargeVarTypes", "false") + spark.conf.set("spark.comet.exec.pyarrowUdf.enabled", "true") + spark.conf.set("spark.sql.execution.arrow.useLargeVarTypes", "true") + try: + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + plan = _executed_plan(result_df) + assert "CometMapInBatch" not in plan, ( + f"useLargeVarTypes=true should force fallback, but plan has " + f"CometMapInBatch:\n{plan}" + ) + assert "MapInArrow" in plan, ( + f"expected vanilla MapInArrow in fallback plan, got:\n{plan}" + ) + out = 
sorted((r["id"], r["name"]) for r in result_df.collect()) + assert out == sorted(rows) + finally: + spark.conf.set("spark.comet.exec.pyarrowUdf.enabled", prev_pyarrow) + spark.conf.set("spark.sql.execution.arrow.useLargeVarTypes", prev_large) + + def test_map_in_arrow_after_shuffle(spark, tmp_path, accelerated): """ Verifies correctness when a shuffle sits between the Comet scan and the From f79e905334773d467409ac2d3a2d638658f241ed Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 21 May 2026 09:05:37 -0600 Subject: [PATCH 42/54] fix: emit CometVector from CometMapInBatchExec so downstream Comet consumers work Wrap each ArrowColumnVector from the Python output as a CometVector via CometVector.getVector before emitting the batch. This makes the operator a proper Comet columnar producer: - a CometMapInBatchExec stacked above another one can cast the input column to CometDecodedVector (as CometColumnarPythonInput already does) - NativeUtil.exportBatch's case match handles the output (CometVector arm, not the SparkException-throwing fallthrough), so a Comet native aggregate or join probe over the UDF output does not blow up at FFI handoff Adds pytest cases that exercise the consumer shapes (chained mapInArrow, filter on UDF output, groupBy/agg on UDF output) plus a Scala plan-level test pinning the chained-rewrite structure. Addresses mbutrovich's correctness comment on #4234. 
--- .../spark/sql/comet/CometMapInBatchExec.scala | 21 +++- .../resources/pyspark/test_pyarrow_udf.py | 113 ++++++++++++++++++ .../sql/comet/CometMapInBatchSuite.scala | 33 +++++ 3 files changed, 163 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala index 14e19ab50f..4c40e68809 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala @@ -30,7 +30,9 @@ import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNo import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.python.PythonSQLMetrics import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} + +import org.apache.comet.vector.CometVector /** * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` / @@ -102,10 +104,21 @@ case class CometMapInBatchExec( context) columnarBatchIter.map { batch => - // Python returns a single struct column; flatten to the user's output columns. + // Python returns a single struct column; flatten to the user's output columns and + // re-wrap each child as CometVector so consumers that expect Comet's vector hierarchy + // (e.g. another CometMapInBatchExec stacked on top, or NativeUtil.exportBatch for a + // downstream native Comet operator) see the right type. Sharing the underlying Arrow + // ValueVector with the original ArrowColumnVector is safe: close() on either ends up + // releasing the same buffers, and arrow-vector's release path is idempotent. 
val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] - val outputVectors = outputAttrs.indices.map(structVector.getChild) - val flattenedBatch = new ColumnarBatch(outputVectors.toArray) + val outputVectors: Array[ColumnVector] = outputAttrs.indices.map { i => + val childArrow = structVector.getChild(i) + CometVector.getVector( + childArrow.getValueVector, + /* useDecimal128 */ true, + /* dictionaryProvider */ null) + }.toArray + val flattenedBatch = new ColumnarBatch(outputVectors) flattenedBatch.setNumRows(batch.numRows()) numOutputRows += flattenedBatch.numRows() numOutputBatches += 1 diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index fc671053a6..4567bedf6d 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -759,6 +759,119 @@ def passthrough(iterator): assert out == sorted(rows) +def test_chained_map_in_arrow(spark, tmp_path, accelerated): + """ + `df.mapInArrow(udf1).mapInArrow(udf2)` stacks two operators. With the rewrite + enabled both become `CometMapInBatchExec`, so the inner one's output feeds + the outer one's input. The outer operator's input path expects vectors of + `CometDecodedVector` type: if the inner's output is plain `ArrowColumnVector` + the outer throws `ClassCastException` on the first batch. 
+ """ + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.DoubleType()), + ] + ) + rows = [(i, float(i)) for i in range(50)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema).write.parquet(src) + + def add_one(iterator): + for batch in iterator: + pdf = batch.to_pandas() + pdf["value"] = pdf["value"] + 1.0 + yield pa.RecordBatch.from_pandas(pdf) + + def double_value(iterator): + for batch in iterator: + pdf = batch.to_pandas() + pdf["value"] = pdf["value"] * 2.0 + yield pa.RecordBatch.from_pandas(pdf) + + result_df = ( + spark.read.parquet(src) + .mapInArrow(add_one, schema) + .mapInArrow(double_value, schema) + ) + + if accelerated: + plan = _executed_plan(result_df) + assert plan.count("CometMapInBatch") >= 2, ( + f"expected two CometMapInBatch operators in accelerated plan, got:\n{plan}" + ) + + out = sorted((r["id"], r["value"]) for r in result_df.collect()) + expected = sorted((i, (float(i) + 1.0) * 2.0) for i in range(50)) + assert out == expected + + +def test_filter_on_map_in_arrow_output(spark, tmp_path, accelerated): + """ + A filter on the UDF output column is a downstream Comet operator (when Comet's + native filter applies) reading from `CometMapInBatchExec`'s output. If the + output were plain `ArrowColumnVector`, NativeUtil.exportBatch's case match + would fall to the `case c =>` arm and throw SparkException. 
+ """ + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.LongType()), + ] + ) + rows = [(i, i * 2) for i in range(100)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = ( + spark.read.parquet(src).mapInArrow(passthrough, schema).filter("value > 50") + ) + + out = sorted((r["id"], r["value"]) for r in result_df.collect()) + expected = sorted((i, i * 2) for i in range(100) if i * 2 > 50) + assert out == expected + + +def test_aggregate_on_map_in_arrow_output(spark, tmp_path, accelerated): + """ + `mapInArrow(...).groupBy(...).agg(...)` puts an aggregate over the UDF output. + The aggregate is a Comet operator and reads from `CometMapInBatchExec`'s + output via NativeUtil.exportBatch when promoted to the native pipeline. If + the output were ArrowColumnVector, exportBatch would throw on every batch. + """ + schema = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("grp", T.LongType()), + T.StructField("value", T.LongType()), + ] + ) + rows = [(i, i % 5, i) for i in range(100)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = ( + spark.read.parquet(src) + .mapInArrow(passthrough, schema) + .groupBy("grp") + .agg({"value": "sum"}) + ) + + out = {r["grp"]: r["sum(value)"] for r in result_df.collect()} + expected = {} + for i in range(100): + expected[i % 5] = expected.get(i % 5, 0) + i + assert out == expected + + def test_map_in_arrow_barrier_mode(spark, tmp_path, accelerated): """ `mapInArrow(..., barrier=True)` runs the stage in barrier execution mode diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala index 
79d75bb2cf..b4f3d64d76 100644 --- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala @@ -105,6 +105,39 @@ class CometMapInBatchSuite extends CometTestBase { } } + test("rule handles chained MapInArrowExec without crashing") { + // df.mapInArrow(...).mapInArrow(...) produces two MapInArrowExec operators. The outer + // consumes rows from the inner directly (MapInArrowExec is a row producer), so there is + // no ColumnarToRow between them. After the rule's bottom-up rewrite the inner becomes + // CometMapInBatchExec; the outer keeps its row contract and is satisfied by + // CometMapInBatchExec.doExecute() reintroducing a ColumnarToRow internally. The + // assertion exists mainly to pin the structure: regress this if a future change makes + // both rewrite (the bulk-copy input path would then need to accept a CometVector input + // that did not come from a CometDecodedVector chain). + withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") { + val cometLeaf = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L)))) + val inner = MapInArrowExec( + stubPythonUDF, + cometLeaf.output, + ColumnarToRowExec(cometLeaf), + isBarrier = false, + profile = None) + val outer = MapInArrowExec( + stubPythonUDF, + cometLeaf.output, + inner, + isBarrier = false, + profile = None) + + val rewritten = EliminateRedundantTransitions(spark).apply(outer) + val cometOps = rewritten.collect { case op: CometMapInBatchExec => op } + assert( + cometOps.size == 1, + s"expected the inner MapInArrowExec to be rewritten, but the chain produced " + + s"${cometOps.size} CometMapInBatchExec(s):\n$rewritten") + } + } + test("end-to-end: rewrite-on output matches rewrite-off output for primitives + varchar") { // This test needs PySpark workers; only run if PYSPARK_PYTHON is set in the env. 
assume( From 2b11f4612d3d8dbf3d9b8b2f4de05f669e4fd4ba Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 21 May 2026 09:17:28 -0600 Subject: [PATCH 43/54] refactor: tighten EliminateRedundantTransitions arm and dedupe 3.x stub - Move the 3.4 / 3.5 ShimCometMapInBatch stubs into a single spark-3.x shim (they were byte-identical). The matchers still return None on both versions so the rule is a no-op on Spark 3.x. - Replace the eligibleMapInBatchInfo guard + .get unpack with an EligibleMapInBatch extractor that runs the matchers and conf reads once per visited plan. - Add arrowUseLargeVarTypes(conf) to ShimSQLConf so the rule no longer reads the conf stringly. 4.x and 3.5 forward to the typed accessor; 3.4 falls back to getConfString because that version has no accessor. - Hoist the per-batch VectorUnloader in CometColumnarPythonInput to a lazy val. getRecordBatch reads root.getFieldVectors on every call so reuse is safe; this drops one allocation per batch. - Clarify the comment on cometCodec: 4.0.x has no SQLConf.arrowCompressionCodec accessor (added after 4.0 branch was cut), so a typed ShimSQLConf forwarder would still need a stringly fallback for the 4.0 build. The 4.1+ codec instances live in the separate arrow-compression artifact, which Comet does not depend on; the CompressionCodec.Factory path keeps that dependency contained. Addresses mbutrovich's items 5, 6, 8, 10 on #4234. 
--- .../rules/EliminateRedundantTransitions.scala | 63 ++++++++----------- .../org/apache/comet/shims/ShimSQLConf.scala | 9 +++ .../sql/comet/shims/ShimCometMapInBatch.scala | 60 ------------------ .../org/apache/comet/shims/ShimSQLConf.scala | 8 ++- .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++-- .../org/apache/comet/shims/ShimSQLConf.scala | 10 ++- .../python/CometColumnarPythonInput.scala | 18 ++++-- 7 files changed, 77 insertions(+), 110 deletions(-) delete mode 100644 spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala rename spark/src/main/{spark-3.5 => spark-3.x}/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala (72%) diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index ee7a9e085b..ce3b78a9fa 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.execution.adaptive.QueryStageExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.comet.CometConf +import org.apache.comet.shims.ShimSQLConf // This rule is responsible for eliminating redundant transitions between row-based and // columnar-based operators for Comet. Currently, three potential redundant transitions are: @@ -54,7 +55,8 @@ import org.apache.comet.CometConf // be removed. 
case class EliminateRedundantTransitions(session: SparkSession) extends Rule[SparkPlan] - with ShimCometMapInBatch { + with ShimCometMapInBatch + with ShimSQLConf { private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get() @@ -104,31 +106,21 @@ case class EliminateRedundantTransitions(session: SparkSession) // Replace MapInBatchExec (PythonMapInArrowExec / MapInArrowExec / MapInPandasExec) that has // a ColumnarToRow child with CometMapInBatchExec, eliminating the input and output // UnsafeProjection copies and keeping the stage columnar. The matchers are - // version-shimmed: Spark 3.4 returns None (it lacks the required APIs) and Spark 4.1+ - // matches the renamed `MapInArrowExec`. + // version-shimmed: Spark 3.4 / 3.5 return None (they lack the required APIs) and Spark + // 4.1+ matches the renamed `MapInArrowExec`. // // Falls back to vanilla Spark when `spark.sql.execution.arrow.useLargeVarTypes` is enabled: // CometColumnarPythonInput.copyVector does raw `setBytes` on each Arrow buffer, but Comet's // source string/binary vectors always use 4-byte offsets while the destination root is // allocated with 8-byte offsets when this conf is on. The buffer counts match but the // offset width does not, so a direct memcpy would corrupt the offsets. - // - // The guard runs `eligibleMapInBatchInfo` so this case only matches actual MapInArrow / - // MapInPandas operators. Without the structural check the case would match every - // `SparkPlan` whenever the pyarrow conf is on, short-circuiting the - // `CometShuffleExchangeExec` arm below. 
- case p if eligibleMapInBatchInfo(p).isDefined => - val info = eligibleMapInBatchInfo(p).get - extractColumnarChild(info.child) - .map { columnarChild => - CometMapInBatchExec( - info.func, - info.output, - columnarChild, - info.isBarrier, - info.pythonEvalType) - } - .getOrElse(p) + case EligibleMapInBatch(info, columnarChild) => + CometMapInBatchExec( + info.func, + info.output, + columnarChild, + info.isBarrier, + info.pythonEvalType) // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the // shuffle takes row-based input. @@ -176,23 +168,22 @@ case class EliminateRedundantTransitions(session: SparkSession) } /** - * Returns `Some(info)` only when this rule should attempt to rewrite `plan` to - * `CometMapInBatchExec`, i.e. when the conf is on, the largeVarTypes fallback does not apply, - * and the plan is one of the version-shimmed MapInArrow / MapInPandas operators. Used in the - * pattern guard so the case only fires for plans we actually want to rewrite - without that - * narrowing, the `case` would match every `SparkPlan` whenever the conf is on and consume the - * later `CometShuffleExchangeExec` arm. Read the conf via the raw key string so this compiles - * against Spark 3.4, which lacks `SQLConf.arrowUseLargeVarTypes`. + * Matches the plans this rule should rewrite to `CometMapInBatchExec`. Single extractor used in + * the `transformUp` arm above so the matchers and conf reads run once per visited plan. Returns + * `(info, columnarChild)` where `columnarChild` is the Comet columnar producer that + * `CometMapInBatchExec` will consume directly. Returns `None` (and the arm misses) when the + * conf is off, when `useLargeVarTypes` forces the fallback, when the plan is not one of the + * version-shimmed MapInArrow / MapInPandas operators, or when the child is not a Comet + * columnar-to-row transition we can strip. 
*/ - private def eligibleMapInBatchInfo(plan: SparkPlan): Option[MapInBatchInfo] = { - if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) { - None - } else if (plan.conf - .getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false") - .toBoolean) { - None - } else { - matchMapInArrow(plan).orElse(matchMapInPandas(plan)) + private object EligibleMapInBatch { + def unapply(plan: SparkPlan): Option[(MapInBatchInfo, SparkPlan)] = { + if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) None + else if (arrowUseLargeVarTypes(plan.conf)) None + else + matchMapInArrow(plan) + .orElse(matchMapInPandas(plan)) + .flatMap(info => extractColumnarChild(info.child).map(child => (info, child))) } } diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala index 0bff426c21..e809e33904 100644 --- a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala +++ b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala @@ -19,9 +19,18 @@ package org.apache.comet.shims +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy trait ShimSQLConf { protected val LEGACY = LegacyBehaviorPolicy.LEGACY protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED + + /** + * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.4 has no typed accessor for this + * conf, so read by raw key. The conf only governs the destination Arrow IPC root width on + * Spark 4.x, so the value returned here matters only to callers that look it up explicitly. 
+ */ + protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = + conf.getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false").toBoolean } diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala deleted file mode 100644 index 1fd4b96f09..0000000000 --- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.spark.sql.comet.shims - -import org.apache.spark.TaskContext -import org.apache.spark.sql.catalyst.expressions.PythonUDF -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch - -/** - * Spark 3.4 shim for the PyArrow UDF acceleration support. 
- * - * Spark 3.4 lacks several APIs that the optimization relies on (`isBarrier` on `MapInBatchExec`, - * `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor), so the - * matchers return `None` and the runner factory throws. The optimization is effectively a no-op - * on Spark 3.4. - */ -trait ShimCometMapInBatch { - - protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = None - - protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None - - /** Stub; never constructed on Spark 3.4 because the matchers always return `None`. */ - protected case class RunnerInputs() - - protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = - throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4") - - protected def computeArrowPython( - runnerInputs: RunnerInputs, - evalType: Int, - argOffsets: Array[Array[Int]], - schema: StructType, - pythonMetrics: Map[String, SQLMetric], - batchIter: Iterator[Iterator[ColumnarBatch]], - partitionId: Int, - context: TaskContext): Iterator[ColumnarBatch] = - throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4") -} diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala index bdb2739460..219e0f2a2e 100644 --- a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala +++ b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala @@ -19,9 +19,15 @@ package org.apache.comet.shims -import org.apache.spark.sql.internal.LegacyBehaviorPolicy +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} trait ShimSQLConf { protected val LEGACY = LegacyBehaviorPolicy.LEGACY protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED + + /** + * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.5 has the typed accessor; + * forward to it. 
+ */ + protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes } diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala similarity index 72% rename from spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala rename to spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index 73a1077de2..c0a31c6e52 100644 --- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -28,11 +28,18 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch /** - * Spark 3.5 shim for the PyArrow UDF acceleration support. + * Spark 3.x stub for the PyArrow UDF acceleration support. * - * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.5 the matchers + * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.4 / 3.5 the matchers * return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` / - * `mapInPandas` unchanged. 3.5 support can be added later if there is user demand. + * `mapInPandas` unchanged. The runner factory throws; it is never called because the matchers + * always return `None`. 3.x support can be added later if there is user demand. + * + * Shared across spark-3.4 and spark-3.5 because both are identical: 3.4 lacks the modern + * `ArrowPythonRunner` constructor and `arrowUseLargeVarTypes`, and 3.5's `PythonArrowInput` + * trait has a different contract (`writeIteratorToArrowStream` one-shot vs 4.x's + * `writeNextBatchToArrowStream` batch-at-a-time), so neither version can host the columnar input + * implementation without a separate rewrite. 
*/ trait ShimCometMapInBatch { @@ -40,11 +47,11 @@ trait ShimCometMapInBatch { protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None - /** Stub; never constructed on Spark 3.5 because the matchers always return `None`. */ + /** Stub; never constructed on Spark 3.x because the matchers always return `None`. */ protected case class RunnerInputs() protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs = - throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5") + throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.x") protected def computeArrowPython( runnerInputs: RunnerInputs, @@ -55,5 +62,5 @@ trait ShimCometMapInBatch { batchIter: Iterator[Iterator[ColumnarBatch]], partitionId: Int, context: TaskContext): Iterator[ColumnarBatch] = - throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5") + throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.x") } diff --git a/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala index bdb2739460..3157889b43 100644 --- a/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala +++ b/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala @@ -19,9 +19,17 @@ package org.apache.comet.shims -import org.apache.spark.sql.internal.LegacyBehaviorPolicy +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} trait ShimSQLConf { protected val LEGACY = LegacyBehaviorPolicy.LEGACY protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED + + /** + * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 4.x exposes a typed accessor; 3.4 + * lacks it, so the 3.4 shim reads the raw key instead (3.5 has the accessor and its shim + * forwards to it as well).
Forward to the accessor here so callers do not depend on which version they're + * compiled against. + */ + protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes } diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala index 5cce2aaf28..ac821901c0 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala @@ -53,10 +53,18 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator private var currentGroup: Iterator[ColumnarBatch] = _ - // Read the codec name via raw config key so this compiles against Spark 4.0 (which lacks - // SQLConf.arrowCompressionCodec) as well as 4.1/4.2. The codec instances are obtained - // through CompressionCodec.Factory (arrow-vector) rather than importing the concrete - // Lz4CompressionCodec / ZstdCompressionCodec from the separate arrow-compression artifact. + // Constructed once per task: `root` (the trait's persistent destination IPC root) and + // `cometCodec` are both stable across the partition. `getRecordBatch` reads the current + // contents of `root.getFieldVectors` on every call, so re-using the unloader is safe. + private lazy val batchUnloader: VectorUnloader = + new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true) + + // Read the codec name via raw config key. Spark 4.0.x has no `SQLConf.arrowCompressionCodec` + // accessor at all (it was added after the 4.0 line was cut), so a typed `ShimSQLConf` + // forwarder would still need a stringly-typed fallback for the 4.0 build. 
The codec instances + // are obtained through `CompressionCodec.Factory` (arrow-vector) rather than importing the + // concrete `Lz4CompressionCodec` / `ZstdCompressionCodec` from the separate + // arrow-compression artifact, which Comet does not depend on. private lazy val cometCodec: CompressionCodec = { val factory = CompressionCodec.Factory.INSTANCE SQLConf.get.getConfString("spark.sql.execution.arrow.compression.codec", "none") match { @@ -112,8 +120,6 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator Platform.setMemory(structVec.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes) root.setRowCount(numRows) - val batchUnloader = - new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true) val recordBatch = batchUnloader.getRecordBatch try { val writeChannel = new WriteChannel(Channels.newChannel(dataOut)) From b69da2ebe64211df47e24e08bbb7fc19bbea18a7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 21 May 2026 09:18:24 -0600 Subject: [PATCH 44/54] docs: ground pyarrow UDF allocator framing in actual code The PR description, CometColumnarPythonInput header, and pyarrow-udfs.md all blamed the per-buffer copy on 'Comet's Parquet readers each constructing their own RootAllocator'. The repo only has one process-wide RootAllocator (CometArrowAllocator), and native scan does Parquet reading on the Rust side: arrow buffers cross the boundary via Arrow C Data Interface, not a JVM allocator. The actual blocker on TransferPair is that imported buffers carry a ReferenceManager whose release routes through FFI, while Spark's destination IPC root is a child of ArrowUtils.rootAllocator. The two reference managers cannot share buffers. 
Reframe the per-batch work as 'two copies, one structural': - copy 1 (Comet -> destination IPC root) is droppable, tracked in #4294 - copy 2 (root -> pipe via VectorUnloader / MessageSerializer) is the structural floor; Spark's transport to Python is fork + pipe + Arrow IPC, so the bytes must reach the pipe at least once Addresses mbutrovich's items 1 and 3 (framing) on #4234. The PR description update is a separate step. --- docs/source/user-guide/latest/pyarrow-udfs.md | 16 +++++++++++----- .../python/CometColumnarPythonInput.scala | 17 ++++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md index 0b2cd9aebb..d0e2346998 100644 --- a/docs/source/user-guide/latest/pyarrow-udfs.md +++ b/docs/source/user-guide/latest/pyarrow-udfs.md @@ -196,8 +196,14 @@ on the unoptimized path. destination IPC root, while Comet's source vectors always use 4-byte offsets. The buffer-copy path cannot bridge that mismatch, so `EliminateRedundantTransitions` skips the rewrite and vanilla Spark handles the operation. -- The current implementation copies Comet's vector buffers into Spark's allocator one - buffer at a time. True zero-copy via `TransferPair` is blocked on Comet's Parquet - readers allocating from `ArrowUtils.rootAllocator` (rather than each reader - constructing its own independent `RootAllocator`). Tracked in - [#4294](https://github.com/apache/datafusion-comet/issues/4294). +- Each batch is copied twice on the JVM side: once from Comet's vectors into Spark's + destination IPC root (per-buffer `setBytes`), and a second time inside the IPC writer when + `VectorUnloader` / `MessageSerializer.serialize` walks the root and writes bytes to the + pipe to the Python worker. 
The pipe write is structural (Spark's transport to Python is + fork + pipe + Arrow IPC, so the buffer bytes must reach the pipe at least once); dropping + the first copy by serialising directly from Comet's vectors is tracked in + [#4294](https://github.com/apache/datafusion-comet/issues/4294). Even after that, + true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s are + imported from native via Arrow C Data Interface (their buffers route `release` through FFI), + while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two + reference managers cannot share buffers via `TransferPair`. diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala index ac821901c0..cf4f324a23 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala @@ -42,11 +42,18 @@ import org.apache.comet.vector.CometDecodedVector * worker as Arrow IPC. * * Per batch: walk the destination struct's children, allocate each child sized to match the - * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The source (Comet's - * vectors) and the destination (Spark's persistent IPC root) live in different `RootAllocator` - * trees, so `TransferPair` / `VectorLoader.load` cannot rebind buffers across the boundary; - * per-buffer memcpy is the available alternative until the readers share a parent allocator - * (tracked in #4294). + * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The current path + * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a + * second one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). 
The pipe + * write is structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer + * bytes must reach the pipe at least once. Dropping the first copy by serialising directly + * from Comet's vectors is tracked in #4294; once done, the path is at the single-copy floor. + * + * The cross-allocator constraint on `TransferPair` is independent of the copy count: even after + * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s + * are imported from native via Arrow C Data Interface (their buffers route `release` through + * FFI), while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two + * reference managers cannot share buffers. */ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] { self: BasePythonRunner[Iterator[ColumnarBatch], _] => From a520321765a44fd8809c272ae50fc27c13963bce Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 21 May 2026 09:19:45 -0600 Subject: [PATCH 45/54] test: expand pyarrow UDF coverage for vector-copy edge cases Hand-written cases that pin the boundaries mbutrovich called out as gaps: - decimal precision sweep (1, 9, 17, 18, 19, 28, 38; scale 0/half/max) covering the short-decimal (long-backed) and long-decimal (16-byte FixedSizeBinary) paths and the 18/19 boundary - null density sweep (0, 0.01, 0.5, 0.99, 1.0) for validity-buffer memcpy - multi-batch per partition (batch size 16, 4000 rows in 1 partition) so the persistent destination IPC root is exercised across many batches - wide schema (50 cols, mixed primitives + strings + booleans) for the flattened-tree address arithmetic - mid-stream zero-row batch so setValueCount(0) + validity sizing is hit while the iterator continues - transforming array UDF (reverse each list) to catch symmetric encode/decode mistakes that a passthrough would invert A randomised fuzz harness (analogous to CometCodegenFuzzSuite) is the right next step for the recursive 
vector-tree walk; deferred to a separate follow-on. --- .../resources/pyspark/test_pyarrow_udf.py | 239 ++++++++++++++++++ 1 file changed, 239 insertions(+) diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 4567bedf6d..24e3b65049 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -335,6 +335,245 @@ def passthrough(iterator): assert out == set(rows) +@pytest.mark.parametrize( + "precision,scale", + [ + (1, 0), + (9, 0), + (9, 4), + (17, 8), + (18, 0), + (18, 18), + (19, 0), + (28, 14), + (38, 0), + (38, 18), + (38, 38), + ], +) +def test_map_in_arrow_decimal_precision_sweep( + spark, tmp_path, accelerated, precision, scale +): + """ + Spark's `BaseFixedWidthVector` handles short decimals (precision <= 18, long-backed) and long + decimals (precision >= 19, 16-byte `FixedSizeBinary`) on different code paths. The 18/19 + boundary is where buffer-width assumptions in `copyVector` can hide bugs. Sweep over + representative precisions and scale extremes (0, half, max). 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("amount", T.DecimalType(precision, scale)), + ] + ) + integer_digits = precision - scale + abs_int = (10**integer_digits - 1) if integer_digits > 0 else 0 + abs_frac = (10**scale - 1) if scale > 0 else 0 + largest = Decimal(f"{abs_int}.{abs_frac:0{scale}d}") if scale else Decimal(abs_int) + rows = [ + (1, Decimal(0)), + (2, largest), + (3, -largest), + (4, None), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = {(r["id"], r["amount"]) for r in result_df.collect()} + assert out == set(rows) + + +@pytest.mark.parametrize("null_fraction", [0.0, 0.01, 0.5, 0.99, 1.0]) +def test_map_in_arrow_null_density_sweep( + spark, tmp_path, accelerated, null_fraction +): + """ + Validity-buffer memcpy is where Arrow Java vector copies historically break. Sweep null + density across the corner cases: all-non-null, sparse-null, half-null, sparse-non-null, + all-null. Catches off-by-one in validity packing and edge cases where source/destination + null counts diverge. 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.LongType()), + ] + ) + n = 256 + rows = [ + (i, None if (i * 9973) % 100 < int(null_fraction * 100) else i * 2) + for i in range(n) + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = sorted((r["id"], r["value"]) for r in result_df.collect()) + assert out == sorted(rows) + + +def test_map_in_arrow_multi_batch_per_partition(spark, tmp_path, accelerated): + """ + Force many small batches in a single partition so the writer/unloader exercises the + persistent destination IPC root over multiple batches. Catches buffer-reuse bugs and + variable-width data-buffer growth across batches that single-batch tests miss. + """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("s", T.StringType()), + ] + ) + n = 4000 + rows = [(i, f"row_{i}" if i % 7 != 0 else None) for i in range(n)] + src = str(tmp_path / "src.parquet") + # Single partition; small arrow batch limit forces ~250 batches per partition. 
+ spark.createDataFrame(rows, schema_in).coalesce(1).write.parquet(src) + + prev_records = spark.conf.get("spark.sql.execution.arrow.maxRecordsPerBatch") + spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "16") + try: + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = sorted((r["id"], r["s"]) for r in result_df.collect()) + assert out == sorted(rows) + finally: + spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", prev_records) + + +def test_map_in_arrow_wide_schema(spark, tmp_path, accelerated): + """ + 50-column mixed-type schema. The bulk-copy path walks a flattened addresses[] array indexed + across the whole vector tree; off-by-one in flattening logic surfaces at depth * width. + """ + fields = [T.StructField("id", T.LongType())] + for i in range(15): + fields.append(T.StructField(f"i{i}", T.IntegerType())) + for i in range(15): + fields.append(T.StructField(f"d{i}", T.DoubleType())) + for i in range(15): + fields.append(T.StructField(f"s{i}", T.StringType())) + for i in range(4): + fields.append(T.StructField(f"b{i}", T.BooleanType())) + assert len(fields) == 50 + schema_in = T.StructType(fields) + + rows = [] + for i in range(60): + row = [i] + row += [i + k if k % 3 != 0 else None for k in range(15)] + row += [float(i + k) * 0.5 if k % 4 != 0 else None for k in range(15)] + row += [f"s{i}_{k}" if k % 5 != 0 else None for k in range(15)] + row += [bool((i + k) % 2) for k in range(4)] + rows.append(tuple(row)) + + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def passthrough(iterator): + for batch in iterator: + yield batch + + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = sorted(tuple(r[name] for name in 
schema_in.names) for r in result_df.collect()) + assert out == sorted(rows) + + +def test_map_in_arrow_zero_row_batch_in_stream(spark, tmp_path, accelerated): + """ + A non-empty stream that contains a 0-row batch mid-stream. The existing empty-input test + filters everything out so the operator sees zero batches; this one keeps later batches so + the writer must handle a 0-row batch and continue. setValueCount(0) + validity buffer + sizing are the candidates that can break here. + """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("value", T.LongType()), + ] + ) + rows = [(i, i * 3) for i in range(50)] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).coalesce(1).write.parquet(src) + + def emit_with_empty(iterator): + for batch in iterator: + # Yield an empty record batch first, then the real one. + yield batch.slice(0, 0) + yield batch + + result_df = spark.read.parquet(src).mapInArrow(emit_with_empty, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + out = sorted((r["id"], r["value"]) for r in result_df.collect()) + assert out == sorted(rows) + + +def test_map_in_arrow_transforming_array(spark, tmp_path, accelerated): + """ + Mutating UDF over a complex type: reverse each array. Catches symmetric encode/decode + mistakes that a passthrough UDF would invert and hide. 
+ """ + schema_in = T.StructType( + [ + T.StructField("id", T.LongType()), + T.StructField("nums", T.ArrayType(T.IntegerType())), + ] + ) + rows = [ + (1, [1, 2, 3, 4]), + (2, [None, 5, None]), + (3, []), + (4, None), + (5, [42]), + ] + src = str(tmp_path / "src.parquet") + spark.createDataFrame(rows, schema_in).write.parquet(src) + + def reverse_arrays(iterator): + for batch in iterator: + pdf = batch.to_pandas() + pdf["nums"] = pdf["nums"].apply( + lambda lst: list(reversed(lst)) if lst is not None else None + ) + yield pa.RecordBatch.from_pandas(pdf) + + result_df = spark.read.parquet(src).mapInArrow(reverse_arrays, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + + def _norm(row): + nums = row["nums"] + return (row["id"], None if nums is None else tuple(nums)) + + out = {_norm(r) for r in result_df.collect()} + expected = set() + for id_, nums in rows: + rev = None if nums is None else tuple(reversed(nums)) + expected.add((id_, rev)) + assert out == expected + + def test_map_in_arrow_date_and_timestamp(spark, tmp_path, accelerated): schema_in = T.StructType( [ From 1c00e1b1db28a9b3e2ba32dcca56673fcebcb10b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 21 May 2026 09:29:08 -0600 Subject: [PATCH 46/54] test: disable Comet shuffle in pyarrow UDF pytest session CometSparkSessionExtensions.isCometLoaded short-circuits the whole extension (returning false; no rules registered) when spark.comet.exec.shuffle.enabled is true but spark.shuffle.manager is not Comet's manager. The pytest conftest only sets the basic Comet configs, so this guard fired and CometScanRule never ran. The plan stayed vanilla Parquet, the rewrite chain never had a Comet columnar producer to match, and every [accelerated] assertion that checks for CometMapInBatch failed. These tests do not exercise shuffle, so disable Comet shuffle in the session. Comet's scan and exec rules then run normally and the rewrite fires. 
Diagnoses the wholesale PyArrow UDF Spark 4.0 CI failure on #4234. --- spark/src/test/resources/pyspark/test_pyarrow_udf.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 24e3b65049..6347411cb7 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -63,6 +63,12 @@ def spark(): .config("spark.plugins", "org.apache.spark.CometPlugin") .config("spark.comet.enabled", "true") .config("spark.comet.exec.enabled", "true") + # spark.comet.exec.shuffle.enabled defaults to true, and + # CometSparkSessionExtensions.isCometLoaded refuses to register Comet's rules + # at all when shuffle is on but spark.shuffle.manager is not the Comet manager. + # These tests do not need Comet shuffle, so disable it explicitly to keep + # Comet's scan and exec rules active without configuring shuffle. + .config("spark.comet.exec.shuffle.enabled", "false") .config("spark.memory.offHeap.enabled", "true") .config("spark.memory.offHeap.size", "2g") .getOrCreate() From c43b203dd20a99c2ed9aed3cb45a658fa07cb9ef Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 22 Jun 2026 17:06:47 -0600 Subject: [PATCH 47/54] fix: resolve build failures after merging main into pyarrow-udf Drop the removed useDecimal128 argument from the CometVector.getVector call in CometMapInBatchExec, which no longer compiles after main removed that parameter. Add braces to the EligibleMapInBatch if/else to satisfy scalastyle, remove a redundant string interpolator flagged by scalafix, and apply spotless formatting. 
--- .../rules/EliminateRedundantTransitions.scala | 9 ++++++--- .../spark/sql/comet/CometMapInBatchExec.scala | 1 - .../org/apache/comet/shims/ShimSQLConf.scala | 4 ++-- .../org/apache/comet/shims/ShimSQLConf.scala | 4 ++-- .../sql/comet/shims/ShimCometMapInBatch.scala | 8 ++++---- .../python/CometColumnarPythonInput.scala | 18 +++++++++--------- .../spark/sql/comet/CometMapInBatchSuite.scala | 10 +++------- 7 files changed, 26 insertions(+), 28 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index ce3b78a9fa..fa42f441e5 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -178,12 +178,15 @@ case class EliminateRedundantTransitions(session: SparkSession) */ private object EligibleMapInBatch { def unapply(plan: SparkPlan): Option[(MapInBatchInfo, SparkPlan)] = { - if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) None - else if (arrowUseLargeVarTypes(plan.conf)) None - else + if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) { + None + } else if (arrowUseLargeVarTypes(plan.conf)) { + None + } else { matchMapInArrow(plan) .orElse(matchMapInPandas(plan)) .flatMap(info => extractColumnarChild(info.child).map(child => (info, child))) + } } } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala index 4c40e68809..c39eb405ad 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala @@ -115,7 +115,6 @@ case class CometMapInBatchExec( val childArrow = structVector.getChild(i) CometVector.getVector( childArrow.getValueVector, - /* useDecimal128 */ true, /* dictionaryProvider */ null) }.toArray val 
flattenedBatch = new ColumnarBatch(outputVectors) diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala index e809e33904..3ab5e5cbac 100644 --- a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala +++ b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala @@ -28,8 +28,8 @@ trait ShimSQLConf { /** * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.4 has no typed accessor for this - * conf, so read by raw key. The conf only governs the destination Arrow IPC root width on - * Spark 4.x, so the value returned here matters only to callers that look it up explicitly. + * conf, so read by raw key. The conf only governs the destination Arrow IPC root width on Spark + * 4.x, so the value returned here matters only to callers that look it up explicitly. */ protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false").toBoolean diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala index 219e0f2a2e..c87e8358f3 100644 --- a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala +++ b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala @@ -26,8 +26,8 @@ trait ShimSQLConf { protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED /** - * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.5 has the typed accessor; - * forward to it. + * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.5 has the typed accessor; forward + * to it. 
*/ protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes } diff --git a/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala index c0a31c6e52..59d8c7f251 100644 --- a/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala +++ b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala @@ -30,14 +30,14 @@ import org.apache.spark.sql.vectorized.ColumnarBatch /** * Spark 3.x stub for the PyArrow UDF acceleration support. * - * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.4 / 3.5 the matchers - * return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` / + * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.4 / 3.5 the + * matchers return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` / * `mapInPandas` unchanged. The runner factory throws; it is never called because the matchers * always return `None`. 3.x support can be added later if there is user demand. * * Shared across spark-3.4 and spark-3.5 because both are identical: 3.4 lacks the modern - * `ArrowPythonRunner` constructor and `arrowUseLargeVarTypes`, and 3.5's `PythonArrowInput` - * trait has a different contract (`writeIteratorToArrowStream` one-shot vs 4.x's + * `ArrowPythonRunner` constructor and `arrowUseLargeVarTypes`, and 3.5's `PythonArrowInput` trait + * has a different contract (`writeIteratorToArrowStream` one-shot vs 4.x's * `writeNextBatchToArrowStream` batch-at-a-time), so neither version can host the columnar input * implementation without a separate rewrite. 
*/ diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala index cf4f324a23..dacf1d1638 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala @@ -43,17 +43,17 @@ import org.apache.comet.vector.CometDecodedVector * * Per batch: walk the destination struct's children, allocate each child sized to match the * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The current path - * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a - * second one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe - * write is structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer - * bytes must reach the pipe at least once. Dropping the first copy by serialising directly - * from Comet's vectors is tracked in #4294; once done, the path is at the single-copy floor. + * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a second + * one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe write is + * structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer bytes must + * reach the pipe at least once. Dropping the first copy by serialising directly from Comet's + * vectors is tracked in #4294; once done, the path is at the single-copy floor. 
* * The cross-allocator constraint on `TransferPair` is independent of the copy count: even after - * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s - * are imported from native via Arrow C Data Interface (their buffers route `release` through - * FFI), while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two - * reference managers cannot share buffers. + * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s are + * imported from native via Arrow C Data Interface (their buffers route `release` through FFI), + * while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two reference + * managers cannot share buffers. */ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] { self: BasePythonRunner[Iterator[ColumnarBatch], _] => diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala index b4f3d64d76..e18e838b29 100644 --- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala @@ -122,18 +122,14 @@ class CometMapInBatchSuite extends CometTestBase { ColumnarToRowExec(cometLeaf), isBarrier = false, profile = None) - val outer = MapInArrowExec( - stubPythonUDF, - cometLeaf.output, - inner, - isBarrier = false, - profile = None) + val outer = + MapInArrowExec(stubPythonUDF, cometLeaf.output, inner, isBarrier = false, profile = None) val rewritten = EliminateRedundantTransitions(spark).apply(outer) val cometOps = rewritten.collect { case op: CometMapInBatchExec => op } assert( cometOps.size == 1, - s"expected the inner MapInArrowExec to be rewritten, but the chain produced " + + "expected the inner MapInArrowExec to be rewritten, but the chain produced " + s"${cometOps.size} 
CometMapInBatchExec(s):\n$rewritten") } } From 2fcd89fac7b0131ea97eb768a707dd7f07891852 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 23 Jun 2026 08:50:03 -0600 Subject: [PATCH 48/54] fix: align PyArrow UDF workflow PySpark with Spark 4.0.2 build The workflow compiles Comet against the spark-4.0 profile (Spark 4.0.2) but ran the pytest against pyspark==4.0.1. The PythonArrowInput trait's private-field mixin is not binary-compatible across that gap, so constructing CometArrowPythonRunner failed with AbstractMethodError on the synthesized arrowSchema setter. Pin pyspark to 4.0.2 to match. --- .github/workflows/pyarrow_udf_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml index e325ab8b6d..f8ab15437e 100644 --- a/.github/workflows/pyarrow_udf_test.yml +++ b/.github/workflows/pyarrow_udf_test.yml @@ -97,7 +97,7 @@ jobs: apt-get install -y --no-install-recommends python3 python3-venv python3-pip python3 -m venv /tmp/venv /tmp/venv/bin/pip install --upgrade pip - /tmp/venv/bin/pip install "pyspark==4.0.1" "pyarrow>=14" pandas pytest + /tmp/venv/bin/pip install "pyspark==4.0.2" "pyarrow>=14" pandas pytest - name: Run PyArrow UDF pytest env: From de6393748dacb9485c362479f8e03e451395fd1b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 23 Jun 2026 11:31:03 -0600 Subject: [PATCH 49/54] fix: run PyArrow UDFs through a self-contained shaded-Arrow runner The runner extended Spark's PythonArrowInput / BasicPythonArrowOutput traits, whose members expose Spark's (unshaded) Arrow types. The packaged comet-spark jar relocates org.apache.arrow to org.apache.comet.shaded.arrow, so the synthetic Arrow members on the generated runner no longer matched Spark's unshaded trait contract, raising AbstractMethodError at runtime (the output path had the same latent break, wrapping Spark's unshaded ArrowColumnVector into a shaded CometVector). 
It only surfaced in the packaged jar, not in tests run from classes, which is why CI failed while local runs passed. Shading must stay (Comet bundles a different Arrow version than Spark), so instead extend only the Arrow-agnostic BasePythonRunner and perform the Arrow IPC exchange directly with Comet's shaded Arrow. The Python worker only ever sees a standard Arrow IPC byte stream, so nothing crosses the shaded/unshaded boundary: input copies each Comet batch into a shaded struct root written with a shaded ArrowStreamWriter; output reads the worker's IPC with a shaded ArrowStreamReader straight into CometVectors, which is what CometMapInBatchExec and downstream native operators already consume. BasePythonRunner has the same shape across Spark 4.0/4.1/4.2, so the IPC logic lives in one shared CometArrowPythonRunnerBase and the per-version runners are thin subclasses. Removes the now-unused CometColumnarPythonInput. --- .../rules/EliminateRedundantTransitions.scala | 2 +- .../spark/sql/comet/CometMapInBatchExec.scala | 24 +- .../python/CometArrowPythonRunner.scala | 32 +- .../python/CometArrowPythonRunner.scala | 30 +- .../python/CometArrowPythonRunner.scala | 26 +- .../python/CometArrowPythonRunnerBase.scala | 303 ++++++++++++++++++ .../python/CometColumnarPythonInput.scala | 196 ----------- 7 files changed, 341 insertions(+), 272 deletions(-) create mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala delete mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index fa42f441e5..95b05dc1a2 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -110,7 +110,7 @@ 
case class EliminateRedundantTransitions(session: SparkSession) // 4.1+ matches the renamed `MapInArrowExec`. // // Falls back to vanilla Spark when `spark.sql.execution.arrow.useLargeVarTypes` is enabled: - // CometColumnarPythonInput.copyVector does raw `setBytes` on each Arrow buffer, but Comet's + // CometArrowPythonRunnerBase.copyVector does raw `setBytes` on each Arrow buffer, but Comet's // source string/binary vectors always use 4-byte offsets while the destination root is // allocated with 8-byte offsets when this conf is on. The buffer counts match but the // offset width does not, so a direct memcpy would corrupt the offsets. diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala index c39eb405ad..8ac9a70de3 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala @@ -30,9 +30,9 @@ import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNo import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.python.PythonSQLMetrics import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} -import org.apache.comet.vector.CometVector +import org.apache.comet.vector.CometStructVector /** * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` / @@ -104,19 +104,13 @@ case class CometMapInBatchExec( context) columnarBatchIter.map { batch => - // Python returns a single struct column; flatten to the user's output columns and - // re-wrap each child as CometVector so consumers that expect Comet's vector hierarchy - // (e.g. 
another CometMapInBatchExec stacked on top, or NativeUtil.exportBatch for a - // downstream native Comet operator) see the right type. Sharing the underlying Arrow - // ValueVector with the original ArrowColumnVector is safe: close() on either ends up - // releasing the same buffers, and arrow-vector's release path is idempotent. - val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] - val outputVectors: Array[ColumnVector] = outputAttrs.indices.map { i => - val childArrow = structVector.getChild(i) - CometVector.getVector( - childArrow.getValueVector, - /* dictionaryProvider */ null) - }.toArray + // Python returns a single struct column; flatten to the user's output columns. The runner + // produces Comet vectors, so the struct's children are already CometVectors that downstream + // consumers (a stacked CometMapInBatchExec, or NativeUtil.exportBatch for a native Comet + // operator) can use directly. + val structVector = batch.column(0).asInstanceOf[CometStructVector] + val outputVectors: Array[ColumnVector] = + outputAttrs.indices.map(i => structVector.getChild(i)).toArray val flattenedBatch = new ColumnarBatch(outputVectors) flattenedBatch.setNumRows(batch.numRows()) numOutputRows += flattenedBatch.numRows() diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala index 63d282e8b9..051fe14638 100644 --- a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -23,23 +23,21 @@ import java.io.DataOutputStream import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions} import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import 
org.apache.spark.sql.vectorized.ColumnarBatch /** - * Comet's Arrow Python runner for Spark 4.0. Extends `BasePythonRunner` directly because Spark - * 4.0's `BaseArrowPythonRunner` is bound to `Iterator[InternalRow]` and mixes in - * `BasicPythonArrowInput`, so we cannot inherit from it. Wires the SQLConf-driven fields that - * `BaseArrowPythonRunner` provides. + * Comet's Arrow Python runner for Spark 4.0. The Arrow IPC exchange lives in + * [[CometArrowPythonRunnerBase]]; this subclass only supplies the Spark 4.0 constructor shape and + * UDF command serialization. */ class CometArrowPythonRunner( funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], - protected override val schema: StructType, - protected override val timeZoneId: String, - protected override val largeVarTypes: Boolean, + schema: StructType, + timeZoneId: String, + largeVarTypes: Boolean, override val workerConf: Map[String, String], override val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String]) @@ -49,23 +47,7 @@ class CometArrowPythonRunner( argOffsets, jobArtifactUUID, pythonMetrics) - with CometColumnarPythonInput - with BasicPythonArrowOutput { - - override val pythonExec: String = - SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head._1.funcs.head.pythonExec) - - override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled - override val idleTimeoutSeconds: Long = SQLConf.get.pythonUDFWorkerIdleTimeoutSeconds - override val errorOnDuplicatedFieldNames: Boolean = true - override val hideTraceback: Boolean = SQLConf.get.pysparkHideTraceback - override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback - - override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize - require( - bufferSize >= 4, - "Pandas execution requires more than 4 bytes. Please set higher buffer. 
" + - s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.") + with CometArrowPythonRunnerBase { override protected def writeUDF(dataOut: DataOutputStream): Unit = PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, jobArtifactUUID) diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala index 7b82b0aed8..8700e282ac 100644 --- a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -21,18 +21,16 @@ package org.apache.spark.sql.execution.python import java.io.DataOutputStream -import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch /** - * Comet's Arrow Python runner for Spark 4.1. Extends `BaseArrowPythonRunner` parameterized over - * `Iterator[ColumnarBatch]` input, and supplies the columnar input via `CometColumnarPythonInput` - * instead of `BasicPythonArrowInput`. - * - * Spark 4.1's `PythonUDFRunner.writeUDFs` takes a `profiler: Option[String]` fourth argument; we - * pass `None` since Comet does not support Python profiling. + * Comet's Arrow Python runner for Spark 4.1. The Arrow IPC exchange lives in + * [[CometArrowPythonRunnerBase]]; this subclass only supplies the Spark 4.1 constructor shape and + * UDF command serialization (`PythonUDFRunner.writeUDFs` takes a `profiler: Option[String]` + * fourth argument, which Comet does not use). 
*/ class CometArrowPythonRunner( funcs: Seq[(ChainedPythonFunctions, Long)], @@ -41,23 +39,17 @@ class CometArrowPythonRunner( schema: StructType, timeZoneId: String, largeVarTypes: Boolean, - workerConf: Map[String, String], - pythonMetrics: Map[String, SQLMetric], + override val workerConf: Map[String, String], + override val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String], sessionUUID: Option[String]) - extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( - funcs, + extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( + funcs.map(_._1), evalType, argOffsets, - schema, - timeZoneId, - largeVarTypes, - workerConf, - pythonMetrics, jobArtifactUUID, - sessionUUID) - with CometColumnarPythonInput - with BasicPythonArrowOutput { + pythonMetrics) + with CometArrowPythonRunnerBase { override protected def writeUDF(dataOut: DataOutputStream): Unit = PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, None) diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala index c9714ce068..09848d602e 100644 --- a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -21,15 +21,15 @@ package org.apache.spark.sql.execution.python import java.io.DataOutputStream -import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch /** - * Comet's Arrow Python runner for Spark 4.2. Spark 4.2's `BaseArrowPythonRunner` no longer - * accepts `workerConf` in its constructor; the subclass overrides `runnerConf` instead. 
- * `PythonUDFRunner.writeUDFs` drops the `profiler` argument compared to 4.1. + * Comet's Arrow Python runner for Spark 4.2. The Arrow IPC exchange lives in + * [[CometArrowPythonRunnerBase]]; this subclass only supplies the Spark 4.2 constructor shape and + * UDF command serialization (`PythonUDFRunner.writeUDFs` drops the `profiler` argument). */ class CometArrowPythonRunner( funcs: Seq[(ChainedPythonFunctions, Long)], @@ -39,24 +39,18 @@ class CometArrowPythonRunner( timeZoneId: String, largeVarTypes: Boolean, pythonRunnerConf: Map[String, String], - pythonMetrics: Map[String, SQLMetric], + override val pythonMetrics: Map[String, SQLMetric], jobArtifactUUID: Option[String], sessionUUID: Option[String]) - extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( - funcs, + extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch]( + funcs.map(_._1), evalType, argOffsets, - schema, - timeZoneId, - largeVarTypes, - pythonMetrics, jobArtifactUUID, - sessionUUID) - with CometColumnarPythonInput - with BasicPythonArrowOutput { + pythonMetrics) + with CometArrowPythonRunnerBase { - override protected def runnerConf: Map[String, String] = - super.runnerConf ++ pythonRunnerConf + override protected def workerConf: Map[String, String] = pythonRunnerConf override protected def writeUDF(dataOut: DataOutputStream): Unit = PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala new file mode 100644 index 0000000000..c05b1aafd3 --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.python + +import java.io.{DataInputStream, DataOutputStream} +import java.nio.channels.Channels +import java.util.concurrent.atomic.AtomicBoolean + +import scala.jdk.CollectionConverters._ + +import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot} +import org.apache.arrow.vector.complex.{LargeListVector, ListVector, StructVector} +import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} +import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType} +import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.api.python.{BasePythonRunner, PythonRDD, PythonWorker, SpecialLengths} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} +import org.apache.spark.unsafe.Platform + +import org.apache.comet.CometArrowAllocator +import org.apache.comet.vector.{CometDecodedVector, CometVector} + +/** + * Shared base for Comet's Arrow Python runners (Spark 4.0 / 4.1 / 4.2). + * + * Unlike a stock `ArrowPythonRunner`, this does not extend Spark's `PythonArrowInput` / + * `BasicPythonArrowOutput` traits. 
Those traits expose Spark's Arrow types (`VectorSchemaRoot`, + * `Schema`) in their members, and the packaged `comet-spark` jar relocates `org.apache.arrow` to + * `org.apache.comet.shaded.arrow`, so mixing them in produces a class whose synthetic Arrow + * members no longer match Spark's unshaded trait contract (an `AbstractMethodError` at runtime). + * + * Instead it extends only the Arrow-agnostic `BasePythonRunner` and performs the Arrow IPC + * exchange itself using Comet's (shaded) Arrow. The Python worker only ever sees a standard Arrow + * IPC byte stream, which is version-neutral, so nothing crosses the shaded/unshaded boundary: + * - Input: each Comet `ColumnarBatch` is copied into a shaded struct root and written to the + * worker with a shaded `ArrowStreamWriter`. + * - Output: the worker's Arrow IPC is read with a shaded `ArrowStreamReader` straight into + * `CometVector`s, which is exactly what `CometMapInBatchExec` and downstream native operators + * consume. + * + * `BasePythonRunner` has the same shape across Spark 4.0/4.1/4.2; only the subclass constructor + * arguments and `writeUDF` differ, so those stay in the per-version subclasses. + */ +private[python] trait CometArrowPythonRunnerBase + extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch] { + + /** Worker configuration written to the Python worker before execution. */ + protected def workerConf: Map[String, String] + + /** Comet's Python SQL metrics (data sent/received, rows). */ + protected def pythonMetrics: Map[String, SQLMetric] + + /** Version-specific UDF command serialization. 
*/ + protected def writeUDF(dataOut: DataOutputStream): Unit + + override val pythonExec: String = + SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head.funcs.head.pythonExec) + + override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled + override val idleTimeoutSeconds: Long = SQLConf.get.pythonUDFWorkerIdleTimeoutSeconds + override val hideTraceback: Boolean = SQLConf.get.pysparkHideTraceback + override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback + + override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize + require( + bufferSize >= 4, + "Pandas execution requires more than 4 bytes. Please set higher buffer. " + + s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.") + + override protected def newWriter( + env: SparkEnv, + worker: PythonWorker, + inputIterator: Iterator[Iterator[ColumnarBatch]], + partitionIndex: Int, + context: TaskContext): Writer = { + new Writer(env, worker, inputIterator, partitionIndex, context) { + + private val allocator = + CometArrowAllocator.newChildAllocator(s"stdout writer for $pythonExec", 0, Long.MaxValue) + private var currentGroup: Iterator[ColumnarBatch] = _ + private var arrowWriter: ArrowStreamWriter = _ + private var writeRoot: VectorSchemaRoot = _ + private var structVec: StructVector = _ + + context.addTaskCompletionListener[Unit] { _ => + if (writeRoot != null) { + writeRoot.close() + } + allocator.close() + } + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + // handleMetadataBeforeExec: write the worker config as key/value string pairs. 
+ dataOut.writeInt(workerConf.size) + for ((k, v) <- workerConf) { + PythonRDD.writeUTF(k, dataOut) + PythonRDD.writeUTF(v, dataOut) + } + writeUDF(dataOut) + } + + override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = { + while (currentGroup == null || !currentGroup.hasNext) { + if (!inputIterator.hasNext) { + if (arrowWriter != null) { + arrowWriter.end() + } + return false + } + currentGroup = inputIterator.next() + } + + val cometBatch = currentGroup.next() + val startData = dataOut.size() + + if (arrowWriter == null) { + // Build the destination struct root once, sized to the first batch's child fields. + // mapInArrow/mapInPandas exchange the columns under a single non-nullable struct. + val childFields = (0 until cometBatch.numCols()).map { i => + cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField + } + val structField = + new Field( + "struct", + new FieldType(false, ArrowType.Struct.INSTANCE, null), + childFields.asJava) + structVec = structField.createVector(allocator).asInstanceOf[StructVector] + writeRoot = new VectorSchemaRoot(Seq[FieldVector](structVec).asJava) + arrowWriter = new ArrowStreamWriter(writeRoot, null, Channels.newChannel(dataOut)) + arrowWriter.start() + } + + var i = 0 + while (i < cometBatch.numCols()) { + val src = cometBatch + .column(i) + .asInstanceOf[CometDecodedVector] + .getValueVector + .asInstanceOf[FieldVector] + val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] + copyVector(src, dst) + i += 1 + } + val numRows = cometBatch.numRows() + structVec.setValueCount(numRows) + // Mark every row of the struct non-null (all-1 validity). The validity buffer is freshly + // allocated and zero-initialised, so without this Python would see an all-null struct. 
+ val validityBytes = (numRows + 7) / 8 + Platform.setMemory( + structVec.getValidityBuffer.memoryAddress(), + 0xff.toByte, + validityBytes) + writeRoot.setRowCount(numRows) + arrowWriter.writeBatch() + + pythonMetrics("pythonDataSent") += dataOut.size() - startData + true + } + } + } + + override protected def newReaderIterator( + stream: DataInputStream, + writer: Writer, + startTime: Long, + env: SparkEnv, + worker: PythonWorker, + pid: Option[Int], + releasedOrClosed: AtomicBoolean, + context: TaskContext): Iterator[ColumnarBatch] = { + new ReaderIterator(stream, writer, startTime, env, worker, pid, releasedOrClosed, context) { + + private val allocator = + CometArrowAllocator.newChildAllocator(s"stdin reader for $pythonExec", 0, Long.MaxValue) + private var reader: ArrowStreamReader = _ + private var root: VectorSchemaRoot = _ + private var batchLoaded = true + + context.addTaskCompletionListener[Unit] { _ => + if (reader != null) { + reader.close(false) + } + allocator.close() + } + + protected override def read(): ColumnarBatch = { + if (writer.exception.isDefined) { + throw writer.exception.get + } + try { + if (reader != null && batchLoaded) { + batchLoaded = reader.loadNextBatch() + if (batchLoaded) { + // Re-wrap the (reloaded) field vectors fresh each batch, mirroring Comet's + // StreamReader, so each ColumnarBatch reflects the current buffers. 
+ val vectors: Array[ColumnVector] = root.getFieldVectors.asScala.map { vector => + CometVector.getVector(vector, null).asInstanceOf[ColumnVector] + }.toArray + val batch = new ColumnarBatch(vectors) + batch.setNumRows(root.getRowCount) + pythonMetrics("pythonNumRowsReceived") += root.getRowCount + batch + } else { + reader.close(false) + allocator.close() + read() + } + } else { + stream.readInt() match { + case SpecialLengths.START_ARROW_STREAM => + reader = new ArrowStreamReader(stream, allocator) + root = reader.getVectorSchemaRoot() + read() + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + handleEndOfDataSection() + null + } + } + } catch handleException + } + } + } + + /** + * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes + * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then + * sets value counts bottom-up so `setValueCount` does not rewrite the offset bytes we just + * copied. Both source and destination are Comet's (shaded) Arrow vectors, so no shaded / + * unshaded type crosses. 
+ */ + private def copyVector(src: FieldVector, dst: FieldVector): Unit = { + val valueCount = src.getValueCount + + dst match { + case bfwv: BaseFixedWidthVector => + bfwv.allocateNew(valueCount) + case bvwv: BaseVariableWidthVector => + bvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount) + case blvwv: BaseLargeVariableWidthVector => + blvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount) + case _ => + dst.setInitialCapacity(valueCount) + dst.allocateNew() + } + + val srcBufs = src.getFieldBuffers + val dstBufs = dst.getFieldBuffers + require( + srcBufs.size == dstBufs.size, + s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}") + var b = 0 + while (b < srcBufs.size) { + val s = srcBufs.get(b) + dstBufs.get(b).setBytes(0, s, 0, s.readableBytes) + b += 1 + } + + val srcChildren = src.getChildrenFromFields + val dstChildren = dst.getChildrenFromFields + require( + srcChildren.size == dstChildren.size, + s"child count mismatch for ${dst.getField}: src=${srcChildren.size}, dst=${dstChildren.size}") + srcChildren.asScala.zip(dstChildren.asScala).foreach { case (sc, dc) => + copyVector(sc.asInstanceOf[FieldVector], dc.asInstanceOf[FieldVector]) + } + + // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list + // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied offset + // bytes are preserved. 
+ dst match { + case v: BaseVariableWidthVector => v.setLastSet(valueCount - 1) + case v: BaseLargeVariableWidthVector => v.setLastSet(valueCount - 1) + case v: ListVector => v.setLastSet(valueCount - 1) + case v: LargeListVector => v.setLastSet(valueCount - 1) + case _ => + } + dst.setValueCount(valueCount) + } +} diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala deleted file mode 100644 index dacf1d1638..0000000000 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.spark.sql.execution.python - -import java.io.DataOutputStream -import java.nio.channels.Channels - -import scala.jdk.CollectionConverters._ - -import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader} -import org.apache.arrow.vector.complex.{LargeListVector, ListVector, StructVector} -import org.apache.arrow.vector.compression.{CompressionCodec, CompressionUtil, NoCompressionCodec} -import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel} -import org.apache.arrow.vector.ipc.message.MessageSerializer -import org.apache.spark.SparkException -import org.apache.spark.api.python.BasePythonRunner -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.unsafe.Platform - -import org.apache.comet.vector.CometDecodedVector - -/** - * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python - * worker as Arrow IPC. - * - * Per batch: walk the destination struct's children, allocate each child sized to match the - * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The current path - * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a second - * one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe write is - * structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer bytes must - * reach the pipe at least once. Dropping the first copy by serialising directly from Comet's - * vectors is tracked in #4294; once done, the path is at the single-copy floor. 
- * - * The cross-allocator constraint on `TransferPair` is independent of the copy count: even after - * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s are - * imported from native via Arrow C Data Interface (their buffers route `release` through FFI), - * while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two reference - * managers cannot share buffers. - */ -private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] { - self: BasePythonRunner[Iterator[ColumnarBatch], _] => - - private var currentGroup: Iterator[ColumnarBatch] = _ - - // Constructed once per task: `root` (the trait's persistent destination IPC root) and - // `cometCodec` are both stable across the partition. `getRecordBatch` reads the current - // contents of `root.getFieldVectors` on every call, so re-using the unloader is safe. - private lazy val batchUnloader: VectorUnloader = - new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true) - - // Read the codec name via raw config key. Spark 4.0.x has no `SQLConf.arrowCompressionCodec` - // accessor at all (it was added after the 4.0 line was cut), so a typed `ShimSQLConf` - // forwarder would still need a stringly-typed fallback for the 4.0 build. The codec instances - // are obtained through `CompressionCodec.Factory` (arrow-vector) rather than importing the - // concrete `Lz4CompressionCodec` / `ZstdCompressionCodec` from the separate - // arrow-compression artifact, which Comet does not depend on. 
- private lazy val cometCodec: CompressionCodec = { - val factory = CompressionCodec.Factory.INSTANCE - SQLConf.get.getConfString("spark.sql.execution.arrow.compression.codec", "none") match { - case "none" => NoCompressionCodec.INSTANCE - case "lz4" => - factory.createCodec(CompressionUtil.CodecType.LZ4_FRAME) - case "zstd" => - val level = - SQLConf.get.getConfString("spark.sql.execution.arrow.compression.zstd.level", "3").toInt - factory.createCodec(CompressionUtil.CodecType.ZSTD, level) - case other => - throw SparkException.internalError( - s"Unsupported Arrow compression codec: $other. Supported values: none, lz4, zstd") - } - } - - override protected def writeNextBatchToArrowStream( - root: VectorSchemaRoot, - writer: ArrowStreamWriter, - dataOut: DataOutputStream, - inputIterator: Iterator[Iterator[ColumnarBatch]]): Boolean = { - - while (currentGroup == null || !currentGroup.hasNext) { - if (!inputIterator.hasNext) { - super[PythonArrowInput].close() - return false - } - currentGroup = inputIterator.next() - } - - val cometBatch = currentGroup.next() - val startData = dataOut.size() - val structVec = root.getVector(0).asInstanceOf[StructVector] - - var i = 0 - while (i < cometBatch.numCols()) { - val src = - cometBatch - .column(i) - .asInstanceOf[CometDecodedVector] - .getValueVector - .asInstanceOf[FieldVector] - val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector] - copyVector(src, dst) - i += 1 - } - val numRows = cometBatch.numRows() - structVec.setValueCount(numRows) - // Mark every row in the struct as non-null (all-1 validity bits). The struct validity - // buffer is freshly allocated (or cleared) and zero-initialised, so without this step - // Python would see an all-null struct column and return null for every output row. 
- val validityBytes = (numRows + 7) / 8 - Platform.setMemory(structVec.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes) - root.setRowCount(numRows) - - val recordBatch = batchUnloader.getRecordBatch - try { - val writeChannel = new WriteChannel(Channels.newChannel(dataOut)) - MessageSerializer.serialize(writeChannel, recordBatch) - } finally { - recordBatch.close() - } - - pythonMetrics("pythonDataSent") += dataOut.size() - startData - true - } - - /** - * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes - * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then - * sets value counts bottom-up so `setValueCount` does not rewrite the offset bytes we just - * copied. - */ - private def copyVector(src: FieldVector, dst: FieldVector): Unit = { - val valueCount = src.getValueCount - - dst match { - case bfwv: BaseFixedWidthVector => - bfwv.allocateNew(valueCount) - case bvwv: BaseVariableWidthVector => - bvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount) - case blvwv: BaseLargeVariableWidthVector => - blvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount) - case _ => - dst.setInitialCapacity(valueCount) - dst.allocateNew() - } - - val srcBufs = src.getFieldBuffers - val dstBufs = dst.getFieldBuffers - require( - srcBufs.size == dstBufs.size, - s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}") - var b = 0 - while (b < srcBufs.size) { - val s = srcBufs.get(b) - dstBufs.get(b).setBytes(0, s, 0, s.readableBytes) - b += 1 - } - - val srcChildren = src.getChildrenFromFields - val dstChildren = dst.getChildrenFromFields - require( - srcChildren.size == dstChildren.size, - s"child count mismatch for ${dst.getField}: src=${srcChildren.size}, dst=${dstChildren.size}") - srcChildren.asScala.zip(dstChildren.asScala).foreach { case (sc, dc) => - copyVector(sc.asInstanceOf[FieldVector], dc.asInstanceOf[FieldVector]) - } - - // For vectors 
that fill offset-buffer "holes" in setValueCount (variable-width and list - // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied - // offset bytes are preserved. - dst match { - case v: BaseVariableWidthVector => v.setLastSet(valueCount - 1) - case v: BaseLargeVariableWidthVector => v.setLastSet(valueCount - 1) - case v: ListVector => v.setLastSet(valueCount - 1) - case v: LargeListVector => v.setLastSet(valueCount - 1) - case _ => - } - dst.setValueCount(valueCount) - } -} From 7a266df65f901a34d992334893d8e480d249ac7d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 26 Jun 2026 08:22:50 -0600 Subject: [PATCH 50/54] fix: restore input field names in CometArrowPythonRunner IPC schema The runner built the destination Arrow struct's child fields straight from Comet's vectors (`getValueVector.getField`). Comet's FFI-imported vectors carry Arrow `Field`s with null names (Comet uses positional schema), so shaded Arrow's `AbstractStructVector.put` rejected them with `NullPointerException: field name cannot be null`, failing every accelerated mapInArrow/mapInPandas query. Wire the already-available input `schema` into the shared base trait and rename each top-level field from it, recursively substituting a placeholder for any null nested name. Field types and child structure are preserved so `copyVector` still walks the source and destination trees in lockstep. This also fixes a latent correctness gap: the Python worker reads columns by name, so the IPC schema must carry the real names rather than anonymous fields. 
--- .../python/CometArrowPythonRunner.scala | 2 +- .../python/CometArrowPythonRunner.scala | 2 +- .../python/CometArrowPythonRunner.scala | 2 +- .../python/CometArrowPythonRunnerBase.scala | 35 ++++++++++++++++++- 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala index 051fe14638..82c9ccd9b5 100644 --- a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala +++ b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -35,7 +35,7 @@ class CometArrowPythonRunner( funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], - schema: StructType, + override val schema: StructType, timeZoneId: String, largeVarTypes: Boolean, override val workerConf: Map[String, String], diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala index 8700e282ac..9e29de64d6 100644 --- a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala +++ b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -36,7 +36,7 @@ class CometArrowPythonRunner( funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], - schema: StructType, + override val schema: StructType, timeZoneId: String, largeVarTypes: Boolean, override val workerConf: Map[String, String], diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala index 09848d602e..02f0531b46 100644 --- 
a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala +++ b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala @@ -35,7 +35,7 @@ class CometArrowPythonRunner( funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], - schema: StructType, + override val schema: StructType, timeZoneId: String, largeVarTypes: Boolean, pythonRunnerConf: Map[String, String], diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala index c05b1aafd3..9b80a2bc5c 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala @@ -33,6 +33,7 @@ import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{BasePythonRunner, PythonRDD, PythonWorker, SpecialLengths} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.unsafe.Platform @@ -72,6 +73,14 @@ private[python] trait CometArrowPythonRunnerBase /** Version-specific UDF command serialization. */ protected def writeUDF(dataOut: DataOutputStream): Unit + /** + * Input schema as Comet hands it to the runner: a single non-nullable struct named "struct" + * whose children are the user's input columns. Comet's FFI-imported vectors carry Arrow + * `Field`s with null names (Comet uses positional schema), so these names are the source of + * truth for the field names written into the IPC stream that the Python worker reads by name. 
+ */ + protected def schema: StructType + override val pythonExec: String = SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head.funcs.head.pythonExec) @@ -135,8 +144,15 @@ private[python] trait CometArrowPythonRunnerBase if (arrowWriter == null) { // Build the destination struct root once, sized to the first batch's child fields. // mapInArrow/mapInPandas exchange the columns under a single non-nullable struct. + // Comet's FFI-imported vectors leave the Arrow Field name null, so restore the real + // column names from the input schema (the worker reads columns by name, and shaded + // Arrow rejects a null field name). The field types and child structure are kept as-is + // so copyVector still walks the source and destination trees in lockstep. + val childNames = schema.head.dataType.asInstanceOf[StructType].fieldNames val childFields = (0 until cometBatch.numCols()).map { i => - cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField + val vecField = + cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField + renamed(vecField, childNames(i)) } val structField = new Field( @@ -245,6 +261,23 @@ private[python] trait CometArrowPythonRunnerBase } } + /** + * Rebuild `field` with `name`, preserving its Arrow type and child structure. Any nested child + * whose name Comet's FFI import left null is given a positional placeholder so shaded Arrow can + * materialize the struct. Keeping the type and structure intact means the destination tree + * still mirrors the Comet source tree for [[copyVector]]. + */ + private def renamed(field: Field, name: String): Field = { + val children = field.getChildren + val newChildren = + if (children.isEmpty) children + else + children.asScala.zipWithIndex.map { case (child, idx) => + renamed(child, if (child.getName == null) s"_$idx" else child.getName) + }.asJava + new Field(name, field.getFieldType, newChildren) + } + /** * Copy a Comet column into the destination FieldVector. 
Walks both trees in lockstep: sizes * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then From 53f29e6ab47f38e2d2fd8aac041d4b0d4bd706f5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 26 Jun 2026 08:22:50 -0600 Subject: [PATCH 51/54] fix: avoid decimal context rounding in precision sweep test The decimal precision sweep negated the maximum value with unary minus (`-largest`). Python's `decimal` applies the default 28-digit context to that operation, rounding the 38-digit maximum up to 1E38 and overflowing Decimal(38, 0) when writing the source parquet, before any UDF runs. Use `copy_negate()`, which flips the sign without applying the context. --- spark/src/test/resources/pyspark/test_pyarrow_udf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 6347411cb7..8c3000fb34 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -376,10 +376,13 @@ def test_map_in_arrow_decimal_precision_sweep( abs_int = (10**integer_digits - 1) if integer_digits > 0 else 0 abs_frac = (10**scale - 1) if scale > 0 else 0 largest = Decimal(f"{abs_int}.{abs_frac:0{scale}d}") if scale else Decimal(abs_int) + # copy_negate() flips the sign without applying the decimal context. Plain `-largest` would + # round to the context's default 28 significant digits, turning the 38-digit maximum into + # 1E38 and overflowing Decimal(38, 0). 
rows = [ (1, Decimal(0)), (2, largest), - (3, -largest), + (3, largest.copy_negate()), (4, None), ] src = str(tmp_path / "src.parquet") From 92ee13b00686ecffbe6fc9a9348321043ad7073a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 26 Jun 2026 09:06:50 -0600 Subject: [PATCH 52/54] fix: pin output Arrow schema in array-transforming UDF test reverse_arrays rebuilt the batch with pa.RecordBatch.from_pandas(pdf), which infers list<int64> from the Python-int lists and mismatches the declared array<int> (list<int32>) output. Spark's int32 projection over the result then called getInt on a long-backed ArrowColumnVector accessor and threw UnsupportedOperationException, failing the test in both modes (vanilla Spark's own output handling rejects the type-mismatched UDF result). Pass schema=batch.schema so the output keeps the int32 element type. --- spark/src/test/resources/pyspark/test_pyarrow_udf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index 8c3000fb34..b24a717e88 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -566,7 +566,11 @@ def reverse_arrays(iterator): pdf["nums"] = pdf["nums"].apply( lambda lst: list(reversed(lst)) if lst is not None else None ) - yield pa.RecordBatch.from_pandas(pdf) + # Pin the output to the incoming Arrow schema. Without it, + # from_pandas infers list<int64> from the Python-int lists, mismatching + # the declared array<int> (list<int32>) output and tripping Spark's
+ yield pa.RecordBatch.from_pandas(pdf, schema=batch.schema) result_df = spark.read.parquet(src).mapInArrow(reverse_arrays, schema_in) _assert_plan_matches_mode(_executed_plan(result_df), accelerated) From 53764446fe4381a339fc823d83a4ddbd35f32589 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 30 Jun 2026 08:27:04 -0600 Subject: [PATCH 53/54] fix: handle large Arrow types, empty input, nullability, and chaining in PyArrow UDFs Fix the PyArrow UDF acceleration test failures: - Read large_string / large_binary output from Python workers. PyArrow (with pandas 3) emits 64-bit offset variants and mapInArrow passes them through untouched. Map LargeUtf8/LargeBinary in Utils.fromArrowType and read 64-bit offsets in CometPlainVector. - Emit a valid empty Arrow IPC stream when an upstream operator produces no input batches, instead of writing nothing and tripping the worker's ArrowStreamReader. - Send input fields to the worker as nullable so columns containing nulls are accepted, while keeping Map entries non-nullable as Arrow requires, and fill an all-valid validity buffer when the source has no nulls. - Rewrite chained mapInArrow operators: the outer operator now consumes the inner CometMapInBatchExec's columnar output directly. Set spark.comet.scan.unsignedSmallIntSafetyCheck=false in the numeric scalar test so the ShortType column does not force the scan to fall back, and update the chained JVM test to expect both operators to be rewritten. 
--- .../apache/comet/vector/CometPlainVector.java | 31 ++++++-- .../rules/EliminateRedundantTransitions.scala | 6 ++ .../apache/spark/sql/comet/util/Utils.scala | 5 ++ .../python/CometArrowPythonRunnerBase.scala | 72 +++++++++++++++---- .../resources/pyspark/test_pyarrow_udf.py | 23 ++++-- .../sql/comet/CometMapInBatchSuite.scala | 24 ++++--- 6 files changed, 128 insertions(+), 33 deletions(-) diff --git a/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java b/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java index 473f30d928..8fbef7a490 100644 --- a/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java +++ b/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java @@ -41,6 +41,9 @@ public class CometPlainVector extends CometDecodedVector { private final long valueBufferAddress; private final long offsetBufferAddress; private final boolean isBaseFixedWidthVector; + // True when the variable-width offsets are 64-bit (LargeVarChar / LargeVarBinary) rather than + // the usual 32-bit. PyArrow UDFs can hand back large_string / large_binary columns. 
+ private final boolean isLargeVarWidth; private byte booleanByteCache; private int booleanByteCacheIndex = -1; @@ -61,8 +64,14 @@ public CometPlainVector(ValueVector vector, boolean isUuid) { if (vector instanceof BaseVariableWidthVector) { this.offsetBufferAddress = ((BaseVariableWidthVector) vector).getOffsetBuffer().memoryAddress(); + this.isLargeVarWidth = false; + } else if (vector instanceof BaseLargeVariableWidthVector) { + this.offsetBufferAddress = + ((BaseLargeVariableWidthVector) vector).getOffsetBuffer().memoryAddress(); + this.isLargeVarWidth = true; } else { this.offsetBufferAddress = -1; + this.isLargeVarWidth = false; } } @@ -115,8 +124,15 @@ public double getDouble(int rowId) { public UTF8String getUTF8String(int rowId) { if (isNullAt(rowId)) return null; if (offsetBufferAddress != -1) { - int offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L); - int length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - offset; + long offset; + int length; + if (isLargeVarWidth) { + offset = Platform.getLong(null, offsetBufferAddress + rowId * 8L); + length = (int) (Platform.getLong(null, offsetBufferAddress + (rowId + 1L) * 8L) - offset); + } else { + offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L); + length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - (int) offset; + } return UTF8String.fromAddress(null, valueBufferAddress + offset, length); } else if (isBaseFixedWidthVector) { BaseFixedWidthVector fixedWidthVector = (BaseFixedWidthVector) valueVector; @@ -139,11 +155,16 @@ public UTF8String getUTF8String(int rowId) { @Override public byte[] getBinary(int rowId) { if (isNullAt(rowId)) return null; - int offset; + long offset; int length; if (offsetBufferAddress != -1) { - offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L); - length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - offset; + if (isLargeVarWidth) { + offset = Platform.getLong(null, 
offsetBufferAddress + rowId * 8L); + length = (int) (Platform.getLong(null, offsetBufferAddress + (rowId + 1L) * 8L) - offset); + } else { + offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L); + length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - (int) offset; + } } else if (valueVector instanceof BaseFixedWidthVector) { BaseFixedWidthVector fixedWidthVector = (BaseFixedWidthVector) valueVector; length = fixedWidthVector.getTypeWidth(); diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala index 95b05dc1a2..477c2afd99 100644 --- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala +++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala @@ -164,6 +164,12 @@ case class EliminateRedundantTransitions(session: SparkSession) private def extractColumnarChild(plan: SparkPlan): Option[SparkPlan] = plan match { case CometColumnarToRowExec(child) => Some(child) case CometNativeColumnarToRowExec(child) => Some(child) + // Chained `mapInArrow(udf1).mapInArrow(udf2)`: by the time the outer operator is visited + // (transformUp is bottom-up) the inner one has already become a `CometMapInBatchExec`, which + // is itself columnar. There is no row transition between them to strip, so consume its + // columnar output directly. Its flattened output vectors are `CometVector`s, exactly what + // `CometMapInBatchExec`'s input path expects. 
+ case child: CometMapInBatchExec => Some(child) case _ => None } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala b/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala index 15e1e2c410..dea872169d 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala @@ -97,7 +97,12 @@ object Utils extends CometTypeShim with Logging { case float: ArrowType.FloatingPoint if float.getPrecision == FloatingPointPrecision.DOUBLE => DoubleType case ArrowType.Utf8.INSTANCE => StringType + // Large (64-bit offset) variants: a PyArrow UDF's Python output may use large_string / + // large_binary (e.g. pandas 3 backs string columns with Arrow large types), and mapInArrow + // passes those types straight through to the JVM. CometPlainVector reads both offset widths. + case ArrowType.LargeUtf8.INSTANCE => StringType case ArrowType.Binary.INSTANCE => BinaryType + case ArrowType.LargeBinary.INSTANCE => BinaryType case _: ArrowType.FixedSizeBinary => BinaryType case d: ArrowType.Decimal => DecimalType(d.getPrecision, d.getScale) case date: ArrowType.Date if date.getUnit == DateUnit.DAY => DateType diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala index 9b80a2bc5c..ee811a5375 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala @@ -31,6 +31,7 @@ import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{BasePythonRunner, PythonRDD, PythonWorker, SpecialLengths} +import 
org.apache.spark.sql.comet.util.Utils import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType @@ -127,12 +128,35 @@ private[python] trait CometArrowPythonRunnerBase writeUDF(dataOut) } + /** Build the destination struct root and start the writer from the given child fields. */ + private def startWriter(childFields: Seq[Field], dataOut: DataOutputStream): Unit = { + val structField = + new Field( + "struct", + new FieldType(false, ArrowType.Struct.INSTANCE, null), + childFields.asJava) + structVec = structField.createVector(allocator).asInstanceOf[StructVector] + writeRoot = new VectorSchemaRoot(Seq[FieldVector](structVec).asJava) + arrowWriter = new ArrowStreamWriter(writeRoot, null, Channels.newChannel(dataOut)) + arrowWriter.start() + } + override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = { while (currentGroup == null || !currentGroup.hasNext) { if (!inputIterator.hasNext) { - if (arrowWriter != null) { - arrowWriter.end() + if (arrowWriter == null) { + // No input batch was ever produced (e.g. an upstream filter removed every row). + // Still emit a valid, empty Arrow IPC stream so the Python worker's + // ArrowStreamReader reads a schema and then sees zero batches, instead of failing + // on an absent stream ("Invalid IPC stream: negative continuation token"). There is + // no sample batch, so derive the schema from the Spark input schema. The timezone is + // irrelevant here because no rows are exchanged. 
+ val inner = schema.head.dataType.asInstanceOf[StructType] + val childFields = inner.fields.toSeq.map(f => + Utils.toArrowField(f.name, f.dataType, nullable = true, "UTC")) + startWriter(childFields, dataOut) } + arrowWriter.end() return false } currentGroup = inputIterator.next() @@ -152,17 +176,9 @@ private[python] trait CometArrowPythonRunnerBase val childFields = (0 until cometBatch.numCols()).map { i => val vecField = cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField - renamed(vecField, childNames(i)) + renamed(vecField, childNames(i), forceNullable = true) } - val structField = - new Field( - "struct", - new FieldType(false, ArrowType.Struct.INSTANCE, null), - childFields.asJava) - structVec = structField.createVector(allocator).asInstanceOf[StructVector] - writeRoot = new VectorSchemaRoot(Seq[FieldVector](structVec).asJava) - arrowWriter = new ArrowStreamWriter(writeRoot, null, Channels.newChannel(dataOut)) - arrowWriter.start() + startWriter(childFields, dataOut) } var i = 0 @@ -267,15 +283,31 @@ private[python] trait CometArrowPythonRunnerBase * materialize the struct. Keeping the type and structure intact means the destination tree * still mirrors the Comet source tree for [[copyVector]]. */ - private def renamed(field: Field, name: String): Field = { + private def renamed(field: Field, name: String, forceNullable: Boolean): Field = { + // A Map's descendants must keep their original nullability: Arrow requires the entries struct + // (and its key) to be non-nullable, and `MapVector.createVector` rejects a nullable entries + // struct. Stop forcing nullable once we enter a Map subtree. 
+ val childrenForceNullable = forceNullable && !field.getType.isInstanceOf[ArrowType.Map] val children = field.getChildren val newChildren = if (children.isEmpty) children else children.asScala.zipWithIndex.map { case (child, idx) => - renamed(child, if (child.getName == null) s"_$idx" else child.getName) + renamed( + child, + if (child.getName == null) s"_$idx" else child.getName, + childrenForceNullable) }.asJava - new Field(name, field.getFieldType, newChildren) + // Force the field nullable where allowed. Comet's FFI-imported vectors may carry a + // non-nullable Arrow `Field` even for columns that contain nulls (Comet uses positional schema + // and does not round-trip Spark's nullability), and the worker rejects a null value under a + // non-nullable field (`from_pandas(pdf, schema=batch.schema)` raises). Marking the field + // nullable is a safe superset; `copyVector` fills an all-valid validity buffer when the source + // has no nulls. + val ft = field.getFieldType + val nullable = forceNullable || ft.isNullable + val newFt = new FieldType(nullable, ft.getType, ft.getDictionary, ft.getMetadata) + new Field(name, newFt, newChildren) } /** @@ -332,5 +364,15 @@ private[python] trait CometArrowPythonRunnerBase case _ => } dst.setValueCount(valueCount) + + // Destination fields are forced nullable except inside a Map subtree (see `renamed`), so the + // worker reads the validity buffer. When the source has no nulls its validity buffer may be + // empty (Comet omits it), leaving the fresh destination validity all-zero so the worker would + // see every value as null. Set all-valid in that case. Done after setValueCount, which can + // rewrite validity, mirroring the struct-level all-valid fill in writeNextInputToStream.
+ if (valueCount > 0 && dst.getField.isNullable && src.getNullCount == 0) { + val validityBytes = (valueCount + 7) / 8 + Platform.setMemory(dst.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes) + } } } diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index b24a717e88..fc5e8053e3 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -688,11 +688,26 @@ def passthrough(iterator): for batch in iterator: yield batch - result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) - _assert_plan_matches_mode(_executed_plan(result_df), accelerated) + # ShortType (the `small` column) forces the Comet scan to fall back to vanilla Spark by + # default: Parquet UINT_8 maps to ShortType and Comet cannot distinguish it from signed + # INT16 (spark.comet.scan.unsignedSmallIntSafetyCheck). Without a Comet scan there is no + # columnar producer for the UDF to consume, so the rewrite cannot fire. This data is signed, + # so allow native execution to exercise the accelerated path. 
+ prev_uint_check = spark.conf.get( + "spark.comet.scan.unsignedSmallIntSafetyCheck", "true" + ) + spark.conf.set("spark.comet.scan.unsignedSmallIntSafetyCheck", "false") + try: + result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in) + _assert_plan_matches_mode(_executed_plan(result_df), accelerated) - out = {(r["id"], r["b"], r["tiny"], r["small"], r["flt"]) for r in result_df.collect()} - assert out == set(rows) + out = { + (r["id"], r["b"], r["tiny"], r["small"], r["flt"]) + for r in result_df.collect() + } + assert out == set(rows) + finally: + spark.conf.set("spark.comet.scan.unsignedSmallIntSafetyCheck", prev_uint_check) def test_map_in_arrow_binary_type(spark, tmp_path, accelerated): diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala index e18e838b29..c9acd2cc3f 100644 --- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala @@ -105,15 +105,15 @@ class CometMapInBatchSuite extends CometTestBase { } } - test("rule handles chained MapInArrowExec without crashing") { + test("rule rewrites chained MapInArrowExec into stacked CometMapInBatchExec") { // df.mapInArrow(...).mapInArrow(...) produces two MapInArrowExec operators. The outer // consumes rows from the inner directly (MapInArrowExec is a row producer), so there is - // no ColumnarToRow between them. After the rule's bottom-up rewrite the inner becomes - // CometMapInBatchExec; the outer keeps its row contract and is satisfied by - // CometMapInBatchExec.doExecute() reintroducing a ColumnarToRow internally. The - // assertion exists mainly to pin the structure: regress this if a future change makes - // both rewrite (the bulk-copy input path would then need to accept a CometVector input - // that did not come from a CometDecodedVector chain). 
+ // no ColumnarToRow between them. The rule rewrites bottom-up: the inner becomes + // CometMapInBatchExec first, then the outer is matched against a child that is already a + // (columnar) CometMapInBatchExec and rewrites too, consuming the inner's columnar output + // directly. Both operators end up native and the chain stays columnar end to end. The + // inner's flattened output vectors are CometVectors, exactly what the outer's bulk-copy + // input path expects. withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") { val cometLeaf = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L)))) val inner = MapInArrowExec( @@ -128,9 +128,15 @@ class CometMapInBatchSuite extends CometTestBase { val rewritten = EliminateRedundantTransitions(spark).apply(outer) val cometOps = rewritten.collect { case op: CometMapInBatchExec => op } assert( - cometOps.size == 1, - "expected the inner MapInArrowExec to be rewritten, but the chain produced " + + cometOps.size == 2, + "expected both MapInArrowExec operators to be rewritten, but the chain produced " + s"${cometOps.size} CometMapInBatchExec(s):\n$rewritten") + assert( + outer.output == cometOps.head.output, + s"expected the outer operator to be rewritten:\n$rewritten") + assert( + cometOps.head.child.isInstanceOf[CometMapInBatchExec], + s"expected the outer CometMapInBatchExec to consume the inner one directly:\n$rewritten") } } From 5802e7ca65c037b9cbeeed66890866159d99aacf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 1 Jul 2026 07:36:42 -0600 Subject: [PATCH 54/54] fix: address PyArrow UDF review feedback Track pythonDataReceived bytes in the reader so the metric matches the vanilla BasicPythonArrowOutput fallback. Hoist the repeated input struct cast to a single lazy val, replace the manual buffer-copy loop with zip/foreach, and document the positional field-name placeholder. 
Correct two test docstrings: the Arrow DecimalVector copy is always 16 bytes wide (the 8-byte long-backed form is Spark's UnsafeRow encoding, not the Arrow path), and the multi-batch test reuses only the struct container, not the leaf buffers (see #4383). The persistent-root reuse, input compression codec, and large-var-type offset widening are deferred to #4383, which removes the copyVector bulk-copy path entirely. --- .../python/CometArrowPythonRunnerBase.scala | 24 ++++++++++++------- .../resources/pyspark/test_pyarrow_udf.py | 17 +++++++------ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala index ee811a5375..4030d26679 100644 --- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala +++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala @@ -111,6 +111,10 @@ private[python] trait CometArrowPythonRunnerBase private var writeRoot: VectorSchemaRoot = _ private var structVec: StructVector = _ + // The runner's input schema is a single struct column ("struct") whose children are the + // user's input columns (see `schema` above). Cast once here rather than at each use site. + private lazy val inputStructType = schema.head.dataType.asInstanceOf[StructType] + context.addTaskCompletionListener[Unit] { _ => if (writeRoot != null) { writeRoot.close() @@ -151,8 +155,7 @@ private[python] trait CometArrowPythonRunnerBase // on an absent stream ("Invalid IPC stream: negative continuation token"). There is // no sample batch, so derive the schema from the Spark input schema. The timezone is // irrelevant here because no rows are exchanged. 
- val inner = schema.head.dataType.asInstanceOf[StructType] - val childFields = inner.fields.toSeq.map(f => + val childFields = inputStructType.fields.toSeq.map(f => Utils.toArrowField(f.name, f.dataType, nullable = true, "UTC")) startWriter(childFields, dataOut) } @@ -172,7 +175,7 @@ private[python] trait CometArrowPythonRunnerBase // column names from the input schema (the worker reads columns by name, and shaded // Arrow rejects a null field name). The field types and child structure are kept as-is // so copyVector still walks the source and destination trees in lockstep. - val childNames = schema.head.dataType.asInstanceOf[StructType].fieldNames + val childNames = inputStructType.fieldNames val childFields = (0 until cometBatch.numCols()).map { i => val vecField = cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField @@ -240,6 +243,7 @@ private[python] trait CometArrowPythonRunnerBase } try { if (reader != null && batchLoaded) { + val bytesReadStart = reader.bytesRead() batchLoaded = reader.loadNextBatch() if (batchLoaded) { // Re-wrap the (reloaded) field vectors fresh each batch, mirroring Comet's @@ -249,6 +253,9 @@ private[python] trait CometArrowPythonRunnerBase }.toArray val batch = new ColumnarBatch(vectors) batch.setNumRows(root.getRowCount) + // Track bytes read so `pythonDataReceived` matches the vanilla fallback path + // (`BasicPythonArrowOutput`), which meters the same delta around `loadNextBatch`. + pythonMetrics("pythonDataReceived") += reader.bytesRead() - bytesReadStart pythonMetrics("pythonNumRowsReceived") += root.getRowCount batch } else { @@ -293,6 +300,10 @@ private[python] trait CometArrowPythonRunnerBase if (children.isEmpty) children else children.asScala.zipWithIndex.map { case (child, idx) => + // Only null-named FFI children get the positional `_$idx` placeholder. This assumes no + // real sibling is literally named `_0`, `_1`, ... 
(which would collide); struct fields + // reaching here carry their real names, so a null name means Comet's FFI import dropped + // it and a synthetic positional name is safe. renamed( child, if (child.getName == null) s"_$idx" else child.getName, @@ -337,11 +348,8 @@ private[python] trait CometArrowPythonRunnerBase require( srcBufs.size == dstBufs.size, s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}") - var b = 0 - while (b < srcBufs.size) { - val s = srcBufs.get(b) - dstBufs.get(b).setBytes(0, s, 0, s.readableBytes) - b += 1 + srcBufs.asScala.zip(dstBufs.asScala).foreach { case (s, d) => + d.setBytes(0, s, 0, s.readableBytes) } val srcChildren = src.getChildrenFromFields diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py index fc5e8053e3..67cdb9f134 100644 --- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py +++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py @@ -361,10 +361,11 @@ def test_map_in_arrow_decimal_precision_sweep( spark, tmp_path, accelerated, precision, scale ): """ - Spark's `BaseFixedWidthVector` handles short decimals (precision <= 18, long-backed) and long - decimals (precision >= 19, 16-byte `FixedSizeBinary`) on different code paths. The 18/19 - boundary is where buffer-width assumptions in `copyVector` can hide bugs. Sweep over - representative precisions and scale extremes (0, half, max). + The Arrow `DecimalVector` that `copyVector` touches is always 16 bytes wide regardless of + precision, so there is no buffer-width boundary on the Arrow path (the 8-byte long-backed form + is Spark's `UnsafeRow` encoding, a layer this Arrow buffer copy never sees). This sweep instead + guards the precision/scale extremes and the 18/19 point where Spark's own decimal handling + changes representation, keeping the round trip value-exact. Scale extremes: 0, half, max. 
""" schema_in = T.StructType( [ @@ -436,9 +437,11 @@ def passthrough(iterator): def test_map_in_arrow_multi_batch_per_partition(spark, tmp_path, accelerated): """ - Force many small batches in a single partition so the writer/unloader exercises the - persistent destination IPC root over multiple batches. Catches buffer-reuse bugs and - variable-width data-buffer growth across batches that single-batch tests miss. + Force many small batches in a single partition so the writer runs its per-batch + allocate/copy/write loop hundreds of times against a reused struct container (the leaf + buffers are reallocated each batch today; see #4383). Catches errors that only appear across + the batch boundary: stale value counts, offset/validity sizing on the second and later + batches, and variable-width data-buffer sizing as row content changes batch to batch. """ schema_in = T.StructType( [