From a454360378ba872f76969ca2800165baec211e76 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:06:09 -0600
Subject: [PATCH 01/54] feat: add optimized PyArrow UDF execution
 (CometPythonMapInArrowExec)

When Comet operators produce Arrow columnar data and the next operator
is a Python UDF (mapInArrow/mapInPandas), Spark currently inserts an
unnecessary ColumnarToRow transition. The Python runner then converts
those rows back to Arrow to send to Python, creating a wasteful
Arrow->Row->Arrow round-trip.

This adds CometPythonMapInArrowExec which:
- Accepts columnar input directly from Comet operators
- Uses lightweight batch.rowIterator() instead of UnsafeProjection
- Keeps the Python output as ColumnarBatch (no output row conversion)

The rewrite is applied by the EliminateRedundantTransitions rule and is
controlled by spark.comet.exec.pythonMapInArrow.enabled (default: true).
---
 .../scala/org/apache/comet/CometConf.scala    |  10 +
 .../rules/EliminateRedundantTransitions.scala |  42 +++-
 .../sql/comet/CometPythonMapInArrowExec.scala | 143 ++++++++++++++
 .../resources/pyspark/test_pyarrow_udf.py     | 183 ++++++++++++++++++
 .../exec/CometPythonMapInArrowSuite.scala     |  68 +++++++
 5 files changed, 445 insertions(+), 1 deletion(-)
 create mode 100644 spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
 create mode 100644 spark/src/test/resources/pyspark/test_pyarrow_udf.py
 create mode 100644 spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala

diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
index d3f51dfbe2..a06cd896ec 100644
--- a/common/src/main/scala/org/apache/comet/CometConf.scala
+++ b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -314,6 +314,16 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
+  val COMET_PYTHON_MAP_IN_ARROW_ENABLED: ConfigEntry[Boolean] =
+    conf("spark.comet.exec.pythonMapInArrow.enabled")
+      .category(CATEGORY_EXEC)
+      .doc(
+        "Whether to enable optimized execution of PyArrow UDFs (mapInArrow/mapInPandas). " +
+          "When enabled, Comet passes Arrow columnar data directly to Python UDFs without " +
+          "the intermediate Arrow-to-Row-to-Arrow conversion that Spark normally performs.")
+      .booleanConf
+      .createWithDefault(true)
+
   val COMET_TRACING_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.tracing.enabled")
     .category(CATEGORY_TUNING)
     .doc(s"Enable fine-grained tracing of events and memory usage. $TRACING_GUIDE.")
diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index 7402a83248..272ef76484 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -20,13 +20,15 @@
 package org.apache.comet.rules
 
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.sideBySide
-import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometSparkToColumnarExec}
+import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometPythonMapInArrowExec, CometSparkToColumnarExec}
 import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec}
 import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.QueryStageExec
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
+import org.apache.spark.sql.execution.python.{MapInPandasExec, PythonMapInArrowExec}
 
 import org.apache.comet.CometConf
 
@@ -98,6 +100,32 @@ case class EliminateRedundantTransitions(session: SparkSession) extends Rule[Spa
       case CometNativeColumnarToRowExec(sparkToColumnar: CometSparkToColumnarExec) =>
         sparkToColumnar.child
       case CometSparkToColumnarExec(child: CometSparkToColumnarExec) => child
+      // Replace MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that has a
+      // ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary
+      // Arrow->Row->Arrow round-trip.
+      case p: PythonMapInArrowExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() =>
+        extractColumnarChild(p.child)
+          .map { columnarChild =>
+            CometPythonMapInArrowExec(
+              p.func,
+              p.output,
+              columnarChild,
+              p.isBarrier,
+              p.func.asInstanceOf[PythonUDF].evalType)
+          }
+          .getOrElse(p)
+      case p: MapInPandasExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() =>
+        extractColumnarChild(p.child)
+          .map { columnarChild =>
+            CometPythonMapInArrowExec(
+              p.func,
+              p.output,
+              columnarChild,
+              p.isBarrier,
+              p.func.asInstanceOf[PythonUDF].evalType)
+          }
+          .getOrElse(p)
+
       // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the
       // shuffle takes row-based input.
       case s @ CometShuffleExchangeExec(
@@ -130,6 +158,18 @@ case class EliminateRedundantTransitions(session: SparkSession) extends Rule[Spa
     }
   }
 
+  /**
+   * If the given plan is a ColumnarToRow transition wrapping a columnar child, returns that
+   * columnar child. Used to detect and eliminate unnecessary transitions before Python UDF
+   * operators.
+   */
+  private def extractColumnarChild(plan: SparkPlan): Option[SparkPlan] = plan match {
+    case ColumnarToRowExec(child) if child.supportsColumnar => Some(child)
+    case CometColumnarToRowExec(child) => Some(child)
+    case CometNativeColumnarToRowExec(child) => Some(child)
+    case _ => None
+  }
+
   /**
    * Creates an appropriate columnar to row transition operator.
    *
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
new file mode 100644
index 0000000000..84b3c31113
--- /dev/null
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet
+
+import scala.collection.JavaConverters._
+
+import org.apache.spark.{ContextAwareIterator, TaskContext}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
+import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode}
+import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, PythonSQLMetrics}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
+
+/**
+ * An optimized version of Spark's MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that
+ * accepts columnar input directly from Comet operators, avoiding unnecessary Arrow -> Row ->
+ * Arrow conversions.
+ *
+ * Normal Spark flow: CometNativeExec (Arrow) -> ColumnarToRow -> PythonMapInArrowExec
+ * (internally: rows -> Arrow -> Python -> Arrow -> rows)
+ *
+ * Optimized flow: CometNativeExec (Arrow) -> CometPythonMapInArrowExec (batch.rowIterator() ->
+ * Arrow -> Python -> Arrow columnar output)
+ *
+ * This eliminates:
+ *   - The UnsafeProjection in ColumnarToRow (expensive copy)
+ *   - The output Arrow->Row conversion (keeps Python output as ColumnarBatch)
+ */
+case class CometPythonMapInArrowExec(
+    func: Expression,
+    output: Seq[Attribute],
+    child: SparkPlan,
+    isBarrier: Boolean,
+    pythonEvalType: Int)
+    extends UnaryExecNode
+    with PythonSQLMetrics {
+
+  override def supportsColumnar: Boolean = true
+
+  override def producedAttributes: AttributeSet = AttributeSet(output)
+
+  override def outputPartitioning: Partitioning = child.outputPartitioning
+
+  override lazy val metrics: Map[String, SQLMetric] = Map(
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
+    "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"),
+    "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows")) ++
+    pythonMetrics
+
+  override def doExecute(): RDD[InternalRow] = {
+    ColumnarToRowExec(this).doExecute()
+  }
+
+  override def doExecuteColumnar(): RDD[ColumnarBatch] = {
+    val numOutputRows = longMetric("numOutputRows")
+    val numOutputBatches = longMetric("numOutputBatches")
+    val numInputRows = longMetric("numInputRows")
+
+    val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf)
+    val pythonFunction = func.asInstanceOf[PythonUDF].func
+    val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonFunction)))
+    val localOutput = output
+    val localChildSchema = child.schema
+    val batchSize = conf.arrowMaxRecordsPerBatch
+    val sessionLocalTimeZone = conf.sessionLocalTimeZone
+    val largeVarTypes = conf.arrowUseLargeVarTypes
+    val localPythonEvalType = pythonEvalType
+    val localPythonMetrics = pythonMetrics
+    val jobArtifactUUID =
+      org.apache.spark.JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+    val inputRDD = child.executeColumnar()
+
+    inputRDD.mapPartitionsInternal { batches =>
+      val context = TaskContext.get()
+      val argOffsets = Array(Array(0))
+
+      // Convert columnar batches to rows using lightweight rowIterator
+      // (avoids UnsafeProjection copy that ColumnarToRow would do)
+      val rowIter = batches.flatMap { batch =>
+        numInputRows += batch.numRows()
+        batch.rowIterator().asScala
+      }
+
+      val contextAwareIterator = new ContextAwareIterator(context, rowIter)
+
+      // Wrap rows as a struct, matching MapInBatchEvaluatorFactory behavior
+      val wrappedIter = contextAwareIterator.map(InternalRow(_))
+
+      val batchIter =
+        if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
+
+      val columnarBatchIter = new ArrowPythonRunner(
+        chainedFunc,
+        localPythonEvalType,
+        argOffsets,
+        org.apache.spark.sql.types
+          .StructType(Array(org.apache.spark.sql.types.StructField("struct", localChildSchema))),
+        sessionLocalTimeZone,
+        largeVarTypes,
+        pythonRunnerConf,
+        localPythonMetrics,
+        jobArtifactUUID).compute(batchIter, context.partitionId(), context)
+
+      columnarBatchIter.map { batch =>
+        // Python returns a StructType column; flatten to individual columns
+        val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
+        val outputVectors = localOutput.indices.map(structVector.getChild)
+        val flattenedBatch = new ColumnarBatch(outputVectors.toArray)
+        flattenedBatch.setNumRows(batch.numRows())
+        numOutputRows += flattenedBatch.numRows()
+        numOutputBatches += 1
+        flattenedBatch
+      }
+    }
+  }
+
+  override protected def withNewChildInternal(newChild: SparkPlan): CometPythonMapInArrowExec =
+    copy(child = newChild)
+}
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
new file mode 100644
index 0000000000..04b83fe66b
--- /dev/null
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Integration test for CometPythonMapInArrowExec.
+
+This test verifies that Comet's optimized PyArrow UDF execution works correctly
+by checking:
+1. The plan uses CometPythonMapInArrowExec instead of PythonMapInArrow + ColumnarToRow
+2. The UDF produces correct results
+3. Performance improvement by eliminating unnecessary Arrow->Row->Arrow conversions
+
+Usage:
+    # Build Comet first: make release
+    # Then run with PySpark:
+    spark-submit --jars spark/target/comet-spark-spark3.5_2.12-*.jar \
+        --conf spark.plugins=org.apache.comet.CometSparkSessionExtensions \
+        --conf spark.comet.enabled=true \
+        --conf spark.comet.exec.enabled=true \
+        --conf spark.comet.exec.pythonMapInArrow.enabled=true \
+        --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
+        --conf spark.memory.offHeap.enabled=true \
+        --conf spark.memory.offHeap.size=2g \
+        spark/src/test/resources/pyspark/test_pyarrow_udf.py
+"""
+
+import sys
+import pyarrow as pa
+from pyspark.sql import SparkSession
+from pyspark.sql import types as T
+
+
+def test_map_in_arrow_basic():
+    """Test basic mapInArrow with Comet optimization."""
+    spark = SparkSession.builder.getOrCreate()
+
+    # Create test data
+    data = [(i, float(i * 1.5), f"name_{i}") for i in range(100)]
+    df = spark.createDataFrame(data, ["id", "value", "name"])
+
+    # Write to parquet so CometScan can read it
+    df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data")
+    test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data")
+
+    # Define a PyArrow UDF that doubles the value column
+    def double_value(batch: pa.RecordBatch) -> pa.RecordBatch:
+        pdf = batch.to_pandas()
+        pdf["value"] = pdf["value"] * 2
+        return pa.RecordBatch.from_pandas(pdf)
+
+    output_schema = T.StructType([
+        T.StructField("id", T.LongType()),
+        T.StructField("value", T.DoubleType()),
+        T.StructField("name", T.StringType()),
+    ])
+
+    # Apply mapInArrow
+    result_df = test_df.mapInArrow(double_value, output_schema)
+
+    # Check the explain plan
+    print("=" * 60)
+    print("PHYSICAL PLAN:")
+    print("=" * 60)
+    result_df.explain(mode="extended")
+    print("=" * 60)
+
+    plan_str = result_df.queryExecution.executedPlan.toString()
+    print(f"\nPlan string:\n{plan_str}\n")
+
+    # Verify CometPythonMapInArrowExec is in the plan (if Comet is active)
+    if "CometPythonMapInArrowExec" in plan_str:
+        print("SUCCESS: CometPythonMapInArrowExec is in the plan!")
+    elif "CometScan" in plan_str and "ColumnarToRow" in plan_str:
+        print("WARNING: CometScan present but still using ColumnarToRow before Python UDF")
+    elif "CometScan" not in plan_str:
+        print("INFO: Comet is not active for this query (CometScan not found)")
+    else:
+        print("INFO: Plan does not contain CometPythonMapInArrowExec")
+
+    # Verify correctness
+    result = result_df.orderBy("id").collect()
+    expected_first = data[0]
+    actual_first = result[0]
+
+    assert actual_first["id"] == expected_first[0], \
+        f"ID mismatch: {actual_first['id']} != {expected_first[0]}"
+    assert abs(actual_first["value"] - expected_first[1] * 2) < 0.001, \
+        f"Value mismatch: {actual_first['value']} != {expected_first[1] * 2}"
+    assert actual_first["name"] == expected_first[2], \
+        f"Name mismatch: {actual_first['name']} != {expected_first[2]}"
+
+    print(f"\nFirst row: {actual_first}")
+    print(f"Expected value (doubled): {expected_first[1] * 2}")
+    print("CORRECTNESS: PASSED")
+
+    # Verify all rows
+    for i, row in enumerate(result):
+        expected_val = data[i][1] * 2
+        assert abs(row["value"] - expected_val) < 0.001, \
+            f"Row {i}: expected value {expected_val}, got {row['value']}"
+
+    print(f"All {len(result)} rows verified correctly.")
+    return True
+
+
+def test_map_in_arrow_type_change():
+    """Test mapInArrow that changes the schema."""
+    spark = SparkSession.builder.getOrCreate()
+
+    data = [(i, float(i)) for i in range(50)]
+    df = spark.createDataFrame(data, ["id", "value"])
+    df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data2")
+    test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data2")
+
+    def add_computed_column(batch: pa.RecordBatch) -> pa.RecordBatch:
+        pdf = batch.to_pandas()
+        pdf["squared"] = pdf["value"] ** 2
+        pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}")
+        return pa.RecordBatch.from_pandas(pdf)
+
+    output_schema = T.StructType([
+        T.StructField("id", T.LongType()),
+        T.StructField("value", T.DoubleType()),
+        T.StructField("squared", T.DoubleType()),
+        T.StructField("label", T.StringType()),
+    ])
+
+    result_df = test_df.mapInArrow(add_computed_column, output_schema)
+    result = result_df.orderBy("id").collect()
+
+    assert len(result) == 50
+    for i, row in enumerate(result):
+        assert abs(row["squared"] - float(i) ** 2) < 0.001
+        assert row["label"] == f"item_{i}"
+
+    print("test_map_in_arrow_type_change: PASSED")
+    return True
+
+
+if __name__ == "__main__":
+    print("Running PyArrow UDF integration tests for Comet...")
+    print()
+
+    tests = [
+        ("test_map_in_arrow_basic", test_map_in_arrow_basic),
+        ("test_map_in_arrow_type_change", test_map_in_arrow_type_change),
+    ]
+
+    passed = 0
+    failed = 0
+    for name, test_fn in tests:
+        print(f"\n{'=' * 60}")
+        print(f"Running: {name}")
+        print(f"{'=' * 60}")
+        try:
+            test_fn()
+            passed += 1
+        except Exception as e:
+            print(f"FAILED: {e}")
+            import traceback
+            traceback.print_exc()
+            failed += 1
+
+    print(f"\n{'=' * 60}")
+    print(f"Results: {passed} passed, {failed} failed")
+    print(f"{'=' * 60}")
+
+    sys.exit(0 if failed == 0 else 1)
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
new file mode 100644
index 0000000000..94145cea2b
--- /dev/null
+++ b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.exec
+
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.comet.CometPythonMapInArrowExec
+import org.apache.spark.sql.execution.ColumnarToRowExec
+import org.apache.spark.sql.execution.python.PythonMapInArrowExec
+
+import org.apache.comet.CometConf
+
+class CometPythonMapInArrowSuite extends CometTestBase {
+
+  test("plan with CometScan has columnar support for Python UDF optimization") {
+    withSQLConf(
+      CometConf.COMET_ENABLED.key -> "true",
+      CometConf.COMET_EXEC_ENABLED.key -> "true",
+      CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "true") {
+      withParquetTable(
+        (1 to 10).map(i => (i.toDouble, s"str_$i")),
+        "testTable",
+        withDictionary = false) {
+        val df = spark.sql("SELECT * FROM testTable")
+        val plan = df.queryExecution.executedPlan
+        val cometScans = plan.collect { case s if s.supportsColumnar => s }
+        assert(cometScans.nonEmpty, "Expected columnar operators that can feed Python UDFs")
+      }
+    }
+  }
+
+  test("config disables Python map in arrow optimization") {
+    withSQLConf(
+      CometConf.COMET_ENABLED.key -> "true",
+      CometConf.COMET_EXEC_ENABLED.key -> "true",
+      CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "false") {
+      withParquetTable(
+        (1 to 10).map(i => (i.toDouble, s"str_$i")),
+        "testTable",
+        withDictionary = false) {
+        val df = spark.sql("SELECT * FROM testTable")
+        val plan = df.queryExecution.executedPlan
+        // With the feature disabled, no CometPythonMapInArrowExec should appear
+        val cometPythonExecs =
+          plan.collect { case e: CometPythonMapInArrowExec => e }
+        assert(
+          cometPythonExecs.isEmpty,
+          "CometPythonMapInArrowExec should not appear when disabled")
+      }
+    }
+  }
+}

From 84aec8406f093abff96ca916ca9c4602065f9019 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:11:52 -0600
Subject: [PATCH 02/54] docs: add PyArrow UDF acceleration user guide page

Documents the CometPythonMapInArrowExec optimization, including
supported APIs, configuration, usage example, and how to verify
the optimization is active in query plans.
---
 docs/source/user-guide/latest/index.rst       |   1 +
 docs/source/user-guide/latest/pyarrow-udfs.md | 132 ++++++++++++++++++
 .../resources/pyspark/test_pyarrow_udf.py     |   3 +-
 3 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/user-guide/latest/pyarrow-udfs.md

diff --git a/docs/source/user-guide/latest/index.rst b/docs/source/user-guide/latest/index.rst
index 480ec4f702..c96dea7750 100644
--- a/docs/source/user-guide/latest/index.rst
+++ b/docs/source/user-guide/latest/index.rst
@@ -38,5 +38,6 @@ Comet $COMET_VERSION User Guide
    Understanding Comet Plans <understanding-comet-plans>
    Tuning Guide <tuning>
    Metrics Guide <metrics>
+   PyArrow UDF Acceleration <pyarrow-udfs>
    Iceberg Guide <iceberg>
    Kubernetes Guide <kubernetes>
diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
new file mode 100644
index 0000000000..71701960cd
--- /dev/null
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -0,0 +1,132 @@
+<!---
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# PyArrow UDF Acceleration
+
+Comet can accelerate Python UDFs that use PyArrow-backed batch processing, such as `mapInArrow` and `mapInPandas`.
+These APIs are commonly used for ML inference, feature engineering, and data transformation workloads.
+
+## Background
+
+Spark's `mapInArrow` and `mapInPandas` APIs allow users to apply Python functions that operate on Arrow
+RecordBatches or Pandas DataFrames. Under the hood, Spark communicates with the Python worker process
+using the Arrow IPC format.
+
+Without Comet, the execution path for these UDFs involves unnecessary data conversions:
+
+1. Comet reads data in Arrow columnar format (via CometScan)
+2. Spark inserts a ColumnarToRow transition (converts Arrow to UnsafeRow)
+3. The Python runner converts those rows back to Arrow to send to Python
+4. Python executes the UDF on Arrow batches
+5. Results are returned as Arrow and then converted back to rows
+
+Steps 2 and 3 are redundant since the data starts and ends in Arrow format.
+
+## How Comet Optimizes This
+
+When enabled, Comet detects `PythonMapInArrowExec` and `MapInPandasExec` operators whose input is a
+columnar-to-row transition over a columnar operator and replaces them with `CometPythonMapInArrowExec`, which:
+
+- Reads Arrow columnar batches directly from the upstream Comet operator
+- Feeds them to the Python runner without the expensive UnsafeProjection copy
+- Keeps the Python output in columnar format for downstream operators
+
+This eliminates the ColumnarToRow transition and the output row conversion, reducing CPU overhead
+and memory allocations.
+
+## Configuration
+
+The optimization is controlled by:
+
+```
+spark.comet.exec.pythonMapInArrow.enabled=true  (default)
+```
+
+It is enabled by default when Comet execution is active.
+
+## Supported APIs
+
+| PySpark API | Spark Plan Node | Supported |
+|-------------|-----------------|-----------|
+| `df.mapInArrow(func, schema)` | `PythonMapInArrowExec` | Yes |
+| `df.mapInPandas(func, schema)` | `MapInPandasExec` | Yes |
+| `@pandas_udf` (scalar) | `ArrowEvalPythonExec` | Not yet |
+| `df.applyInPandas(func, schema)` | `FlatMapGroupsInPandasExec` | Not yet |
+
+## Example
+
+```python
+import pyarrow as pa
+from pyspark.sql import SparkSession, types as T
+
+spark = SparkSession.builder \
+    .config("spark.plugins", "org.apache.spark.CometPlugin") \
+    .config("spark.comet.enabled", "true") \
+    .config("spark.comet.exec.enabled", "true") \
+    .config("spark.comet.exec.pythonMapInArrow.enabled", "true") \
+    .config("spark.memory.offHeap.enabled", "true") \
+    .config("spark.memory.offHeap.size", "2g") \
+    .getOrCreate()
+
+df = spark.read.parquet("data.parquet")
+
+def transform(batches):
+    for batch in batches:  # mapInArrow passes an iterator of RecordBatches; transform and yield each
+        table = batch.to_pandas()
+        table["new_col"] = table["value"] * 2
+        yield pa.RecordBatch.from_pandas(table)
+
+output_schema = T.StructType([
+    T.StructField("value", T.DoubleType()),
+    T.StructField("new_col", T.DoubleType()),
+])
+
+result = df.mapInArrow(transform, output_schema)
+```
+
+## Verifying the Optimization
+
+Use `explain()` to verify that `CometPythonMapInArrow` (the plan name of `CometPythonMapInArrowExec`) appears in your plan:
+
+```python
+result.explain(mode="extended")
+```
+
+You should see:
+```
+CometPythonMapInArrow ...
++- CometNativeExec ...
+   +- CometScan ...
+```
+
+Instead of the unoptimized plan:
+```
+PythonMapInArrow ...
++- ColumnarToRow
+   +- CometNativeExec ...
+      +- CometScan ...
+```
+
+## Limitations
+
+- The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs
+  (`@pandas_udf`) and grouped operations (`applyInPandas`) are not yet supported.
+- The internal row-to-Arrow conversion inside the Python runner is still present in this version.
+  A future optimization will write Arrow batches directly to the Python IPC stream, achieving
+  near zero-copy data transfer.
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 04b83fe66b..1993f29f9f 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -26,10 +26,11 @@
 3. Performance improvement by eliminating unnecessary Arrow->Row->Arrow conversions
 
 Usage:
+    # Requires Python 3.11 or 3.12 (PySpark 3.5 does not support 3.13+)
     # Build Comet first: make release
     # Then run with PySpark:
     spark-submit --jars spark/target/comet-spark-spark3.5_2.12-*.jar \
-        --conf spark.plugins=org.apache.comet.CometSparkSessionExtensions \
+        --conf spark.plugins=org.apache.spark.CometPlugin \
         --conf spark.comet.enabled=true \
         --conf spark.comet.exec.enabled=true \
         --conf spark.comet.exec.pythonMapInArrow.enabled=true \

From af98fbba92faed24484ae32504218821b4eb59d7 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:38:38 -0600
Subject: [PATCH 03/54] fix(test): correct PyArrow UDF integration test
 signatures and assertions

Fix three issues that prevented test_pyarrow_udf.py from running:

1. mapInArrow callbacks must accept Iterator[pa.RecordBatch] and yield
   batches. The previous single-batch signatures crashed with
   "'map' object has no attribute 'to_pandas'".
2. PySpark DataFrame has no `queryExecution` attribute. Use
   `_jdf.queryExecution().executedPlan().toString()` instead.
3. Replace soft plan-string heuristics with assertions that fail loudly
   if the optimization regresses. Match on `CometPythonMapInArrow` (no
   `Exec` suffix in the plan toString) and assert no `ColumnarToRow`
   transition is present.
---
 .../resources/pyspark/test_pyarrow_udf.py     | 42 ++++++++++---------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 1993f29f9f..6acac6a912 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -58,11 +58,13 @@ def test_map_in_arrow_basic():
     df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data")
     test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data")
 
-    # Define a PyArrow UDF that doubles the value column
-    def double_value(batch: pa.RecordBatch) -> pa.RecordBatch:
-        pdf = batch.to_pandas()
-        pdf["value"] = pdf["value"] * 2
-        return pa.RecordBatch.from_pandas(pdf)
+    # Define a PyArrow UDF that doubles the value column.
+    # mapInArrow callbacks receive an iterator of RecordBatches and must yield batches.
+    def double_value(iterator):
+        for batch in iterator:
+            pdf = batch.to_pandas()
+            pdf["value"] = pdf["value"] * 2
+            yield pa.RecordBatch.from_pandas(pdf)
 
     output_schema = T.StructType([
         T.StructField("id", T.LongType()),
@@ -80,18 +82,17 @@ def double_value(batch: pa.RecordBatch) -> pa.RecordBatch:
     result_df.explain(mode="extended")
     print("=" * 60)
 
-    plan_str = result_df.queryExecution.executedPlan.toString()
+    plan_str = result_df._jdf.queryExecution().executedPlan().toString()
     print(f"\nPlan string:\n{plan_str}\n")
 
-    # Verify CometPythonMapInArrowExec is in the plan (if Comet is active)
-    if "CometPythonMapInArrowExec" in plan_str:
-        print("SUCCESS: CometPythonMapInArrowExec is in the plan!")
-    elif "CometScan" in plan_str and "ColumnarToRow" in plan_str:
-        print("WARNING: CometScan present but still using ColumnarToRow before Python UDF")
-    elif "CometScan" not in plan_str:
-        print("INFO: Comet is not active for this query (CometScan not found)")
-    else:
-        print("INFO: Plan does not contain CometPythonMapInArrowExec")
+    # Verify the optimized Comet operator is in the plan. The toString form is
+    # "CometPythonMapInArrow" (no Exec suffix) and the upstream scan prints as
+    # "CometNativeScan".
+    assert "CometPythonMapInArrow" in plan_str, \
+        f"CometPythonMapInArrow missing from plan:\n{plan_str}"
+    assert "ColumnarToRow" not in plan_str, \
+        f"Unexpected ColumnarToRow in optimized plan:\n{plan_str}"
+    print("SUCCESS: CometPythonMapInArrow is in the plan with no ColumnarToRow transition.")
 
     # Verify correctness
     result = result_df.orderBy("id").collect()
@@ -128,11 +129,12 @@ def test_map_in_arrow_type_change():
     df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data2")
     test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data2")
 
-    def add_computed_column(batch: pa.RecordBatch) -> pa.RecordBatch:
-        pdf = batch.to_pandas()
-        pdf["squared"] = pdf["value"] ** 2
-        pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}")
-        return pa.RecordBatch.from_pandas(pdf)
+    def add_computed_column(iterator):
+        for batch in iterator:
+            pdf = batch.to_pandas()
+            pdf["squared"] = pdf["value"] ** 2
+            pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}")
+            yield pa.RecordBatch.from_pandas(pdf)
 
     output_schema = T.StructType([
         T.StructField("id", T.LongType()),

From f29cb2f53f5437edcfc906129a8ca3253fb0b0ea Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:43:55 -0600
Subject: [PATCH 04/54] test: convert PyArrow UDF script to pytest and add CI
 coverage

- Rewrite test_pyarrow_udf.py as a pytest module. A session-scoped
  SparkSession fixture builds the Comet-enabled session once and a
  parametrized `accelerated` fixture toggles
  spark.comet.exec.pythonMapInArrow.enabled per test, so each case runs
  under both the optimized and fallback paths and asserts the expected
  plan operator (`CometPythonMapInArrow` vs vanilla `PythonMapInArrow`).
  The jar is auto-discovered from spark/target by matching the installed
  pyspark version, or taken from the COMET_JAR env var.
- Add a dedicated `PyArrow UDF Tests` workflow that builds Comet against
  Spark 3.5 / Scala 2.12, installs pyspark/pyarrow/pandas/pytest, and
  runs the new pytest module.
- Add CometPythonMapInArrowSuite to the `exec` suite list in both
  pr_build_linux.yml and pr_build_macos.yml so the JVM-side suite is
  exercised on every PR.
---
 .github/workflows/pr_build_linux.yml          |   1 +
 .github/workflows/pr_build_macos.yml          |   1 +
 .github/workflows/pyarrow_udf_test.yml        |  96 ++++++
 .../resources/pyspark/test_pyarrow_udf.py     | 299 +++++++++---------
 4 files changed, 256 insertions(+), 141 deletions(-)
 create mode 100644 .github/workflows/pyarrow_udf_test.yml

diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index b0f09bc43b..b62a000f6c 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -354,6 +354,7 @@ jobs:
               org.apache.comet.exec.CometGenerateExecSuite
               org.apache.comet.exec.CometWindowExecSuite
               org.apache.comet.exec.CometJoinSuite
+              org.apache.comet.exec.CometPythonMapInArrowSuite
               org.apache.comet.CometNativeSuite
               org.apache.comet.CometSparkSessionExtensionsSuite
               org.apache.spark.CometPluginsSuite
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index c743d1888a..fe972818e6 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -193,6 +193,7 @@ jobs:
               org.apache.comet.exec.CometGenerateExecSuite
               org.apache.comet.exec.CometWindowExecSuite
               org.apache.comet.exec.CometJoinSuite
+              org.apache.comet.exec.CometPythonMapInArrowSuite
               org.apache.comet.CometNativeSuite
               org.apache.comet.CometSparkSessionExtensionsSuite
               org.apache.spark.CometPluginsSuite
diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
new file mode 100644
index 0000000000..0779f092a4
--- /dev/null
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -0,0 +1,96 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: PyArrow UDF Tests
+
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala"
+      - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala"
+      - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala"
+      - "spark/src/test/resources/pyspark/test_pyarrow_udf.py"
+      - ".github/workflows/pyarrow_udf_test.yml"
+      - "native/**"
+  pull_request:
+    paths:
+      - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala"
+      - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala"
+      - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala"
+      - "spark/src/test/resources/pyspark/test_pyarrow_udf.py"
+      - ".github/workflows/pyarrow_udf_test.yml"
+      - "native/**"
+  workflow_dispatch:
+
+env:
+  RUST_VERSION: stable
+  RUST_BACKTRACE: 1
+  RUSTFLAGS: "-Clink-arg=-fuse-ld=bfd"
+
+jobs:
+  pyarrow-udf:
+    name: PyArrow UDF (Spark 3.5, JDK 17, Python 3.11)
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+      env:
+        JAVA_TOOL_OPTIONS: "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED"
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{ env.RUST_VERSION }}
+          jdk-version: 17
+
+      - name: Cache Maven dependencies
+        uses: actions/cache@v5
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}-pyarrow-udf
+          restore-keys: |
+            ${{ runner.os }}-java-maven-
+
+      - name: Build Comet (release, Spark 3.5 / Scala 2.12)
+        run: |
+          cd native && cargo build --release
+          cd .. && ./mvnw -B -Prelease install -DskipTests -Pspark-3.5 -Pscala-2.12
+
+      - name: Install Python 3.11 and pip
+        run: |
+          apt-get update
+          apt-get install -y --no-install-recommends python3.11 python3.11-venv python3-pip
+          python3.11 -m venv /tmp/venv
+          /tmp/venv/bin/pip install --upgrade pip
+          /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest
+
+      - name: Run PyArrow UDF pytest
+        run: |
+          jar=$(ls "$PWD"/spark/target/comet-spark-spark3.5_2.12-*-SNAPSHOT.jar \
+                | grep -v sources | grep -v tests | head -n1)
+          echo "Using $jar"
+          COMET_JAR="$jar" /tmp/venv/bin/python -m pytest -v \
+            spark/src/test/resources/pyspark/test_pyarrow_udf.py
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 6acac6a912..462f4efdc6 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -17,117 +17,165 @@
 # under the License.
 
 """
-Integration test for CometPythonMapInArrowExec.
+Pytest-driven integration tests for Comet's PyArrow UDF acceleration.
 
-This test verifies that Comet's optimized PyArrow UDF execution works correctly
-by checking:
-1. The plan uses CometPythonMapInArrowExec instead of PythonMapInArrow + ColumnarToRow
-2. The UDF produces correct results
-3. Performance improvement by eliminating unnecessary Arrow->Row->Arrow conversions
+Each test runs against two execution paths:
+  - "accelerated": spark.comet.exec.pythonMapInArrow.enabled=true
+                   (plan should contain CometPythonMapInArrow and no ColumnarToRow)
+  - "fallback":    spark.comet.exec.pythonMapInArrow.enabled=false
+                   (plan should contain vanilla PythonMapInArrow)
 
 Usage:
-    # Requires Python 3.11 or 3.12 (PySpark 3.5 does not support 3.13+)
-    # Build Comet first: make release
-    # Then run with PySpark:
-    spark-submit --jars spark/target/comet-spark-spark3.5_2.12-*.jar \
-        --conf spark.plugins=org.apache.spark.CometPlugin \
-        --conf spark.comet.enabled=true \
-        --conf spark.comet.exec.enabled=true \
-        --conf spark.comet.exec.pythonMapInArrow.enabled=true \
-        --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
-        --conf spark.memory.offHeap.enabled=true \
-        --conf spark.memory.offHeap.size=2g \
-        spark/src/test/resources/pyspark/test_pyarrow_udf.py
-"""
+    # Build Comet first:
+    make release
 
-import sys
-import pyarrow as pa
-from pyspark.sql import SparkSession
-from pyspark.sql import types as T
+    # Then either let the test discover the jar from spark/target, or pass it
+    # explicitly via COMET_JAR:
+    export COMET_JAR=$PWD/spark/target/comet-spark-spark3.5_2.12-0.16.0-SNAPSHOT.jar
 
+    pip install pyspark==3.5.8 pyarrow pandas pytest
+    pytest -v spark/src/test/resources/pyspark/test_pyarrow_udf.py
+"""
 
-def test_map_in_arrow_basic():
-    """Test basic mapInArrow with Comet optimization."""
-    spark = SparkSession.builder.getOrCreate()
+import glob
+import os
 
-    # Create test data
+import pyarrow as pa
+import pytest
+from pyspark.sql import SparkSession, types as T
+
+
+REPO_ROOT = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")
+)
+
+
+def _resolve_comet_jar() -> str:
+    explicit = os.environ.get("COMET_JAR")
+    if explicit:
+        if any(ch in explicit for ch in "*?["):
+            matches = sorted(glob.glob(explicit))
+            if not matches:
+                raise FileNotFoundError(
+                    f"COMET_JAR pattern matched nothing: {explicit}"
+                )
+            return matches[-1]
+        return explicit
+
+    # Pick the jar that matches the installed pyspark major.minor version. The
+    # Comet jars are published per Spark version (e.g., comet-spark-spark3.5_2.12-*.jar);
+    # using the wrong one yields ClassNotFoundException on Scala stdlib classes.
+    import pyspark
+
+    major_minor = ".".join(pyspark.__version__.split(".")[:2])
+    spark_tag = f"spark{major_minor}"
+    scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13"
+    pattern = os.path.join(
+        REPO_ROOT,
+        f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar",
+    )
+    candidates = [
+        m
+        for m in sorted(glob.glob(pattern))
+        if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m)
+    ]
+    if not candidates:
+        raise FileNotFoundError(
+            "Comet jar not found. Set COMET_JAR or run `make release`. "
+            f"Looked under {pattern}."
+        )
+    return candidates[-1]
+
+
+@pytest.fixture(scope="session")
+def spark():
+    jar = _resolve_comet_jar()
+    # PYSPARK_SUBMIT_ARGS is consumed when pyspark launches its JVM. Setting
+    # --jars puts the Comet jar on both driver and executor classpaths so the
+    # CometPlugin can be loaded.
+    os.environ["PYSPARK_SUBMIT_ARGS"] = (
+        f"--jars {jar} --driver-class-path {jar} pyspark-shell"
+    )
+    session = (
+        SparkSession.builder.master("local[2]")
+        .appName("comet-pyarrow-udf-tests")
+        .config("spark.plugins", "org.apache.spark.CometPlugin")
+        .config("spark.comet.enabled", "true")
+        .config("spark.comet.exec.enabled", "true")
+        .config("spark.memory.offHeap.enabled", "true")
+        .config("spark.memory.offHeap.size", "2g")
+        .getOrCreate()
+    )
+    try:
+        yield session
+    finally:
+        session.stop()
+
+
+@pytest.fixture(params=[True, False], ids=["accelerated", "fallback"])
+def accelerated(request, spark) -> bool:
+    spark.conf.set(
+        "spark.comet.exec.pythonMapInArrow.enabled",
+        "true" if request.param else "false",
+    )
+    return request.param
+
+
+def _executed_plan(df) -> str:
+    return df._jdf.queryExecution().executedPlan().toString()
+
+
+def _assert_plan_matches_mode(plan: str, accelerated: bool) -> None:
+    if accelerated:
+        assert "CometPythonMapInArrow" in plan, (
+            f"expected CometPythonMapInArrow in accelerated plan, got:\n{plan}"
+        )
+        assert "ColumnarToRow" not in plan, (
+            f"unexpected ColumnarToRow in accelerated plan:\n{plan}"
+        )
+    else:
+        assert "CometPythonMapInArrow" not in plan, (
+            f"unexpected CometPythonMapInArrow in fallback plan:\n{plan}"
+        )
+        assert "PythonMapInArrow" in plan, (
+            f"expected PythonMapInArrow in fallback plan, got:\n{plan}"
+        )
+
+
+def test_map_in_arrow_doubles_value(spark, tmp_path, accelerated):
     data = [(i, float(i * 1.5), f"name_{i}") for i in range(100)]
-    df = spark.createDataFrame(data, ["id", "value", "name"])
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(data, ["id", "value", "name"]).write.parquet(src)
 
-    # Write to parquet so CometScan can read it
-    df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data")
-    test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data")
-
-    # Define a PyArrow UDF that doubles the value column.
-    # mapInArrow callbacks receive an iterator of RecordBatches and must yield batches.
     def double_value(iterator):
         for batch in iterator:
             pdf = batch.to_pandas()
             pdf["value"] = pdf["value"] * 2
             yield pa.RecordBatch.from_pandas(pdf)
 
-    output_schema = T.StructType([
-        T.StructField("id", T.LongType()),
-        T.StructField("value", T.DoubleType()),
-        T.StructField("name", T.StringType()),
-    ])
-
-    # Apply mapInArrow
-    result_df = test_df.mapInArrow(double_value, output_schema)
-
-    # Check the explain plan
-    print("=" * 60)
-    print("PHYSICAL PLAN:")
-    print("=" * 60)
-    result_df.explain(mode="extended")
-    print("=" * 60)
-
-    plan_str = result_df._jdf.queryExecution().executedPlan().toString()
-    print(f"\nPlan string:\n{plan_str}\n")
-
-    # Verify the optimized Comet operator is in the plan. The toString form is
-    # "CometPythonMapInArrow" (no Exec suffix) and the upstream scan prints as
-    # "CometNativeScan".
-    assert "CometPythonMapInArrow" in plan_str, \
-        f"CometPythonMapInArrow missing from plan:\n{plan_str}"
-    assert "ColumnarToRow" not in plan_str, \
-        f"Unexpected ColumnarToRow in optimized plan:\n{plan_str}"
-    print("SUCCESS: CometPythonMapInArrow is in the plan with no ColumnarToRow transition.")
-
-    # Verify correctness
-    result = result_df.orderBy("id").collect()
-    expected_first = data[0]
-    actual_first = result[0]
-
-    assert actual_first["id"] == expected_first[0], \
-        f"ID mismatch: {actual_first['id']} != {expected_first[0]}"
-    assert abs(actual_first["value"] - expected_first[1] * 2) < 0.001, \
-        f"Value mismatch: {actual_first['value']} != {expected_first[1] * 2}"
-    assert actual_first["name"] == expected_first[2], \
-        f"Name mismatch: {actual_first['name']} != {expected_first[2]}"
-
-    print(f"\nFirst row: {actual_first}")
-    print(f"Expected value (doubled): {expected_first[1] * 2}")
-    print("CORRECTNESS: PASSED")
-
-    # Verify all rows
-    for i, row in enumerate(result):
-        expected_val = data[i][1] * 2
-        assert abs(row["value"] - expected_val) < 0.001, \
-            f"Row {i}: expected value {expected_val}, got {row['value']}"
-
-    print(f"All {len(result)} rows verified correctly.")
-    return True
-
-
-def test_map_in_arrow_type_change():
-    """Test mapInArrow that changes the schema."""
-    spark = SparkSession.builder.getOrCreate()
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+            T.StructField("name", T.StringType()),
+        ]
+    )
+    result_df = spark.read.parquet(src).mapInArrow(double_value, schema)
+
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    rows = result_df.orderBy("id").collect()
+    assert len(rows) == len(data)
+    for row, original in zip(rows, data):
+        assert row["id"] == original[0]
+        assert abs(row["value"] - original[1] * 2) < 1e-6
+        assert row["name"] == original[2]
+
 
+def test_map_in_arrow_changes_schema(spark, tmp_path, accelerated):
     data = [(i, float(i)) for i in range(50)]
-    df = spark.createDataFrame(data, ["id", "value"])
-    df.write.mode("overwrite").parquet("/tmp/comet_pyarrow_test_data2")
-    test_df = spark.read.parquet("/tmp/comet_pyarrow_test_data2")
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(data, ["id", "value"]).write.parquet(src)
 
     def add_computed_column(iterator):
         for batch in iterator:
@@ -136,51 +184,20 @@ def add_computed_column(iterator):
             pdf["label"] = pdf["id"].apply(lambda x: f"item_{x}")
             yield pa.RecordBatch.from_pandas(pdf)
 
-    output_schema = T.StructType([
-        T.StructField("id", T.LongType()),
-        T.StructField("value", T.DoubleType()),
-        T.StructField("squared", T.DoubleType()),
-        T.StructField("label", T.StringType()),
-    ])
-
-    result_df = test_df.mapInArrow(add_computed_column, output_schema)
-    result = result_df.orderBy("id").collect()
-
-    assert len(result) == 50
-    for i, row in enumerate(result):
-        assert abs(row["squared"] - float(i) ** 2) < 0.001
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+            T.StructField("squared", T.DoubleType()),
+            T.StructField("label", T.StringType()),
+        ]
+    )
+    result_df = spark.read.parquet(src).mapInArrow(add_computed_column, schema)
+
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    rows = result_df.orderBy("id").collect()
+    assert len(rows) == 50
+    for i, row in enumerate(rows):
+        assert abs(row["squared"] - float(i) ** 2) < 1e-6
         assert row["label"] == f"item_{i}"
-
-    print("test_map_in_arrow_type_change: PASSED")
-    return True
-
-
-if __name__ == "__main__":
-    print("Running PyArrow UDF integration tests for Comet...")
-    print()
-
-    tests = [
-        ("test_map_in_arrow_basic", test_map_in_arrow_basic),
-        ("test_map_in_arrow_type_change", test_map_in_arrow_type_change),
-    ]
-
-    passed = 0
-    failed = 0
-    for name, test_fn in tests:
-        print(f"\n{'=' * 60}")
-        print(f"Running: {name}")
-        print(f"{'=' * 60}")
-        try:
-            test_fn()
-            passed += 1
-        except Exception as e:
-            print(f"FAILED: {e}")
-            import traceback
-            traceback.print_exc()
-            failed += 1
-
-    print(f"\n{'=' * 60}")
-    print(f"Results: {passed} passed, {failed} failed")
-    print(f"{'=' * 60}")
-
-    sys.exit(0 if failed == 0 else 1)

From f7515397e4aada8fc956552b9042d3ce00ceb039 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:43:57 -0600
Subject: [PATCH 05/54] docs: run prettier on pyarrow-udfs user guide page

---
 docs/source/user-guide/latest/pyarrow-udfs.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 71701960cd..2d555cedc4 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -62,12 +62,12 @@ It is enabled by default when Comet execution is active.
 
 ## Supported APIs
 
-| PySpark API | Spark Plan Node | Supported |
-|-------------|-----------------|-----------|
-| `df.mapInArrow(func, schema)` | `PythonMapInArrowExec` | Yes |
-| `df.mapInPandas(func, schema)` | `MapInPandasExec` | Yes |
-| `@pandas_udf` (scalar) | `ArrowEvalPythonExec` | Not yet |
-| `df.applyInPandas(func, schema)` | `FlatMapGroupsInPandasExec` | Not yet |
+| PySpark API                      | Spark Plan Node             | Supported |
+| -------------------------------- | --------------------------- | --------- |
+| `df.mapInArrow(func, schema)`    | `PythonMapInArrowExec`      | Yes       |
+| `df.mapInPandas(func, schema)`   | `MapInPandasExec`           | Yes       |
+| `@pandas_udf` (scalar)           | `ArrowEvalPythonExec`       | Not yet   |
+| `df.applyInPandas(func, schema)` | `FlatMapGroupsInPandasExec` | Not yet   |
 
 ## Example
 
@@ -109,6 +109,7 @@ result.explain(mode="extended")
 ```
 
 You should see:
+
 ```
 CometPythonMapInArrowExec ...
 +- CometNativeExec ...
@@ -116,6 +117,7 @@ CometPythonMapInArrowExec ...
 ```
 
 Instead of the unoptimized plan:
+
 ```
 PythonMapInArrow ...
 +- ColumnarToRow

From b14fbfb58adaf3b9219e8f06171f450ef7fd1deb Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:46:57 -0600
Subject: [PATCH 06/54] style: apply spotless formatting

---
 .../org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala  | 1 -
 .../org/apache/comet/exec/CometPythonMapInArrowSuite.scala      | 2 --
 2 files changed, 3 deletions(-)

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
index 84b3c31113..223153d7d8 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
@@ -31,7 +31,6 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, PythonSQLMetrics}
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
 /**
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
index 94145cea2b..7b1e17c4ed 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
@@ -21,8 +21,6 @@ package org.apache.comet.exec
 
 import org.apache.spark.sql.CometTestBase
 import org.apache.spark.sql.comet.CometPythonMapInArrowExec
-import org.apache.spark.sql.execution.ColumnarToRowExec
-import org.apache.spark.sql.execution.python.PythonMapInArrowExec
 
 import org.apache.comet.CometConf
 

From ca0bbbf50892860e7e103af8c016163d9d4310ef Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:46:57 -0600
Subject: [PATCH 07/54] ci: broaden pyarrow_udf_test triggers to match
 pr_build_linux

Replace the narrow paths allowlist with the same paths-ignore list used
by pr_build_linux.yml so the workflow runs on any source change that
could affect Comet's PyArrow UDF execution path, not just the few files
explicitly named.
---
 .github/workflows/pyarrow_udf_test.yml | 34 +++++++++++++++-----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index 0779f092a4..46c5fbe079 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -25,21 +25,27 @@ on:
   push:
     branches:
       - main
-    paths:
-      - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala"
-      - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala"
-      - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala"
-      - "spark/src/test/resources/pyspark/test_pyarrow_udf.py"
-      - ".github/workflows/pyarrow_udf_test.yml"
-      - "native/**"
+    paths-ignore:
+      - "benchmarks/**"
+      - "doc/**"
+      - "docs/**"
+      - "**.md"
+      - "dev/changelog/*.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+      - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
   pull_request:
-    paths:
-      - "spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala"
-      - "spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala"
-      - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala"
-      - "spark/src/test/resources/pyspark/test_pyarrow_udf.py"
-      - ".github/workflows/pyarrow_udf_test.yml"
-      - "native/**"
+    paths-ignore:
+      - "benchmarks/**"
+      - "doc/**"
+      - "docs/**"
+      - "**.md"
+      - "dev/changelog/*.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+      - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
   workflow_dispatch:
 
 env:

From 55c28c32a187cce9bdf6b49a2b4113e845ed1d44 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 18:47:48 -0600
Subject: [PATCH 08/54] ci: restrict GITHUB_TOKEN to contents:read in
 pyarrow_udf_test

---
 .github/workflows/pyarrow_udf_test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index 46c5fbe079..0740842413 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -48,6 +48,9 @@ on:
       - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
   workflow_dispatch:
 
+permissions:
+  contents: read
+
 env:
   RUST_VERSION: stable
   RUST_BACKTRACE: 1

From 05b1e7afd38437c9eb72309ac2f4f5f764a97adc Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 22:19:01 -0600
Subject: [PATCH 09/54] fix: shim CometPythonMapInArrowExec for cross-version
 Spark builds

The PR's `CometPythonMapInArrowExec` and `EliminateRedundantTransitions`
rule directly reference Spark 3.5 APIs that differ across supported
Spark versions: the `ArrowPythonRunner` constructor (4 distinct
signatures across 3.4/3.5/4.0/4.1/4.2), `arrowUseLargeVarTypes`,
`JobArtifactSet`, `MapInBatchExec.isBarrier`, and the `PythonMapInArrowExec`
type itself (renamed to `MapInArrowExec` in 4.0+). This breaks compile
on every profile other than 3.5.

Introduce a per-version `ShimCometPythonMapInArrow` trait under
`org.apache.spark.sql.comet.shims` (placed in the spark namespace so
it can reach `private[spark]` members) that:

* matches the Spark-version-specific MapInArrow / MapInPandas exec types
  and exposes their `(func, output, child, isBarrier, evalType)` tuple,
* constructs the right `ArrowPythonRunner` for the version,
* hides `arrowUseLargeVarTypes` / `JobArtifactSet` / `getPythonRunnerConfMap`
  behind helper methods.

Spark 3.4 lacks the prerequisite APIs (no `isBarrier`, no `JobArtifactSet`,
no `arrowUseLargeVarTypes`), so its shim returns `None` from the matchers
and the optimization is a no-op there.
---
 .../rules/EliminateRedundantTransitions.scala | 41 ++++-----
 .../sql/comet/CometPythonMapInArrowExec.scala | 32 +++----
 .../shims/ShimCometPythonMapInArrow.scala     | 68 +++++++++++++++
 .../shims/ShimCometPythonMapInArrow.scala     | 84 ++++++++++++++++++
 .../shims/ShimCometPythonMapInArrow.scala     | 86 ++++++++++++++++++
 .../shims/ShimCometPythonMapInArrow.scala     | 87 +++++++++++++++++++
 .../shims/ShimCometPythonMapInArrow.scala     | 86 ++++++++++++++++++
 7 files changed, 446 insertions(+), 38 deletions(-)
 create mode 100644 spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala

diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index 272ef76484..e7218ab935 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -20,15 +20,14 @@
 package org.apache.comet.rules
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.sideBySide
 import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometPythonMapInArrowExec, CometSparkToColumnarExec}
 import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec}
+import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow
 import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.QueryStageExec
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
-import org.apache.spark.sql.execution.python.{MapInPandasExec, PythonMapInArrowExec}
 
 import org.apache.comet.CometConf
 
@@ -53,7 +52,9 @@ import org.apache.comet.CometConf
 // various reasons) or Spark requests row-based output such as a `collect` call. Spark will adds
 // another `ColumnarToRowExec` on top of `CometSparkToColumnarExec`. In this case, the pair could
 // be removed.
-case class EliminateRedundantTransitions(session: SparkSession) extends Rule[SparkPlan] {
+case class EliminateRedundantTransitions(session: SparkSession)
+    extends Rule[SparkPlan]
+    with ShimCometPythonMapInArrow {
 
   private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get()
 
@@ -100,29 +101,23 @@ case class EliminateRedundantTransitions(session: SparkSession) extends Rule[Spa
       case CometNativeColumnarToRowExec(sparkToColumnar: CometSparkToColumnarExec) =>
         sparkToColumnar.child
       case CometSparkToColumnarExec(child: CometSparkToColumnarExec) => child
-      // Replace MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that has a
-      // ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary
-      // Arrow->Row->Arrow round-trip.
-      case p: PythonMapInArrowExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() =>
-        extractColumnarChild(p.child)
+      // Replace MapInBatchExec (PythonMapInArrowExec / MapInArrowExec / MapInPandasExec) that has
+      // a ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary
+      // Arrow->Row->Arrow round-trip. The matchers are version-shimmed: Spark 3.4 returns None
+      // (it lacks the required APIs) and Spark 4.0+ matches the renamed `MapInArrowExec`.
+      case p: SparkPlan
+          if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() &&
+            matchMapInArrow(p).orElse(matchMapInPandas(p)).isDefined =>
+        val (mapFunc, mapOutput, mapChild, mapIsBarrier, mapEvalType) =
+          matchMapInArrow(p).orElse(matchMapInPandas(p)).get
+        extractColumnarChild(mapChild)
           .map { columnarChild =>
             CometPythonMapInArrowExec(
-              p.func,
-              p.output,
+              mapFunc,
+              mapOutput,
               columnarChild,
-              p.isBarrier,
-              p.func.asInstanceOf[PythonUDF].evalType)
-          }
-          .getOrElse(p)
-      case p: MapInPandasExec if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() =>
-        extractColumnarChild(p.child)
-          .map { columnarChild =>
-            CometPythonMapInArrowExec(
-              p.func,
-              p.output,
-              columnarChild,
-              p.isBarrier,
-              p.func.asInstanceOf[PythonUDF].evalType)
+              mapIsBarrier,
+              mapEvalType)
           }
           .getOrElse(p)
 
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
index 223153d7d8..9b3e820023 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
@@ -22,15 +22,16 @@ package org.apache.spark.sql.comet
 import scala.collection.JavaConverters._
 
 import org.apache.spark.{ContextAwareIterator, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow
 import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
-import org.apache.spark.sql.execution.python.{ArrowPythonRunner, BatchIterator, PythonSQLMetrics}
+import org.apache.spark.sql.execution.python.{BatchIterator, PythonSQLMetrics}
+import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
 /**
@@ -55,7 +56,8 @@ case class CometPythonMapInArrowExec(
     isBarrier: Boolean,
     pythonEvalType: Int)
     extends UnaryExecNode
-    with PythonSQLMetrics {
+    with PythonSQLMetrics
+    with ShimCometPythonMapInArrow {
 
   override def supportsColumnar: Boolean = true
 
@@ -78,18 +80,16 @@ case class CometPythonMapInArrowExec(
     val numOutputBatches = longMetric("numOutputBatches")
     val numInputRows = longMetric("numInputRows")
 
-    val pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf)
-    val pythonFunction = func.asInstanceOf[PythonUDF].func
-    val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonFunction)))
+    val pythonUDF = func.asInstanceOf[PythonUDF]
     val localOutput = output
     val localChildSchema = child.schema
     val batchSize = conf.arrowMaxRecordsPerBatch
     val sessionLocalTimeZone = conf.sessionLocalTimeZone
-    val largeVarTypes = conf.arrowUseLargeVarTypes
+    val useLargeVarTypes = largeVarTypes(conf)
+    val pythonRunnerConf = getPythonRunnerConfMap(conf)
     val localPythonEvalType = pythonEvalType
     val localPythonMetrics = pythonMetrics
-    val jobArtifactUUID =
-      org.apache.spark.JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+    val jobArtifactUUID = currentJobArtifactUUID()
 
     val inputRDD = child.executeColumnar()
 
@@ -112,17 +112,19 @@ case class CometPythonMapInArrowExec(
       val batchIter =
         if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
 
-      val columnarBatchIter = new ArrowPythonRunner(
-        chainedFunc,
+      val columnarBatchIter = computeArrowPython(
+        pythonUDF,
         localPythonEvalType,
         argOffsets,
-        org.apache.spark.sql.types
-          .StructType(Array(org.apache.spark.sql.types.StructField("struct", localChildSchema))),
+        StructType(Array(StructField("struct", localChildSchema))),
         sessionLocalTimeZone,
-        largeVarTypes,
+        useLargeVarTypes,
         pythonRunnerConf,
         localPythonMetrics,
-        jobArtifactUUID).compute(batchIter, context.partitionId(), context)
+        jobArtifactUUID,
+        batchIter,
+        context.partitionId(),
+        context)
 
       columnarBatchIter.map { batch =>
         // Python returns a StructType column; flatten to individual columns
diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
new file mode 100644
index 0000000000..30736d99b3
--- /dev/null
+++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.TaskContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+/**
+ * Spark 3.4 shim for the PyArrow UDF acceleration support.
+ *
+ * Spark 3.4 lacks several APIs that the optimization relies on (`isBarrier` on `MapInBatchExec`,
+ * `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor), so the
+ * matchers return `None` and the runner factory throws. The optimization is effectively a no-op
+ * on Spark 3.4.
+ */
+trait ShimCometPythonMapInArrow {
+
+  protected def matchMapInArrow(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None
+
+  protected def matchMapInPandas(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None
+
+  protected def currentJobArtifactUUID(): Option[String] = None
+
+  protected def largeVarTypes(conf: SQLConf): Boolean = false
+
+  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = Map.empty
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      pythonMetrics: Map[String, SQLMetric],
+      jobArtifactUUID: Option[String],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] =
+    throw new UnsupportedOperationException(
+      "CometPythonMapInArrowExec is not supported on Spark 3.4")
+}
diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
new file mode 100644
index 0000000000..f7c8221d9e
--- /dev/null
+++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInPandasExec, PythonMapInArrowExec}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometPythonMapInArrow {
+
+  protected def matchMapInArrow(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: PythonMapInArrowExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def matchMapInPandas(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInPandasExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def currentJobArtifactUUID(): Option[String] =
+    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
+
+  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
+    ArrowPythonRunner.getPythonRunnerConfMap(conf)
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      pythonMetrics: Map[String, SQLMetric],
+      jobArtifactUUID: Option[String],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func)))
+    new ArrowPythonRunner(
+      chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      timeZoneId,
+      largeVarTypes,
+      pythonRunnerConf,
+      pythonMetrics,
+      jobArtifactUUID).compute(batchIter, partitionId, context)
+  }
+}
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
new file mode 100644
index 0000000000..78935f54c5
--- /dev/null
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometPythonMapInArrow {
+
+  protected def matchMapInArrow(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInArrowExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def matchMapInPandas(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInPandasExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def currentJobArtifactUUID(): Option[String] =
+    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
+
+  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
+    ArrowPythonRunner.getPythonRunnerConfMap(conf)
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      pythonMetrics: Map[String, SQLMetric],
+      jobArtifactUUID: Option[String],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val chainedFunc =
+      Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id))
+    new ArrowPythonRunner(
+      chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      timeZoneId,
+      largeVarTypes,
+      pythonRunnerConf,
+      pythonMetrics,
+      jobArtifactUUID,
+      None).compute(batchIter, partitionId, context)
+  }
+}
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
new file mode 100644
index 0000000000..f7f775b1fa
--- /dev/null
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometPythonMapInArrow {
+
+  protected def matchMapInArrow(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInArrowExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def matchMapInPandas(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInPandasExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def currentJobArtifactUUID(): Option[String] =
+    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
+
+  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
+    ArrowPythonRunner.getPythonRunnerConfMap(conf)
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      pythonMetrics: Map[String, SQLMetric],
+      jobArtifactUUID: Option[String],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val chainedFunc =
+      Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id))
+    new ArrowPythonRunner(
+      chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      timeZoneId,
+      largeVarTypes,
+      pythonRunnerConf,
+      pythonMetrics,
+      jobArtifactUUID,
+      None,
+      None).compute(batchIter, partitionId, context)
+  }
+}
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
new file mode 100644
index 0000000000..78935f54c5
--- /dev/null
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometPythonMapInArrow {
+
+  protected def matchMapInArrow(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInArrowExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def matchMapInPandas(
+      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+    plan match {
+      case p: MapInPandasExec =>
+        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+      case _ => None
+    }
+
+  protected def currentJobArtifactUUID(): Option[String] =
+    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+
+  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
+
+  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
+    ArrowPythonRunner.getPythonRunnerConfMap(conf)
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      pythonMetrics: Map[String, SQLMetric],
+      jobArtifactUUID: Option[String],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val chainedFunc =
+      Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id))
+    new ArrowPythonRunner(
+      chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      timeZoneId,
+      largeVarTypes,
+      pythonRunnerConf,
+      pythonMetrics,
+      jobArtifactUUID,
+      None).compute(batchIter, partitionId, context)
+  }
+}

From 66eb246d3cb9f04b6b25878a02a32f6a2007b669 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 5 May 2026 22:19:07 -0600
Subject: [PATCH 10/54] ci: switch pyarrow_udf_test container to rust:bookworm

The default `amd64/rust` image is Debian 13 (trixie), where the system
`python3` is 3.13 and there is no `python3.11` apt package. The workflow
installed `python3.11` explicitly, which fails on trixie with `Unable to
locate package python3.11`.

Switching to `rust:bookworm` gives a Debian 12 base where `python3` is
3.11, matching the job name and pyspark 3.5.x's supported runtime.
---
 .github/workflows/pyarrow_udf_test.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index 0740842413..622ee59fd0 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -61,7 +61,10 @@ jobs:
     name: PyArrow UDF (Spark 3.5, JDK 17, Python 3.11)
     runs-on: ubuntu-latest
     container:
-      image: amd64/rust
+      # Pinned to the Debian 12 (bookworm) base so the system `python3` is 3.11. The default
+      # `amd64/rust` image is Debian 13 (trixie) which ships Python 3.13 and no python3.11 apt
+      # package, breaking `apt-get install python3.11`.
+      image: rust:bookworm
       env:
         JAVA_TOOL_OPTIONS: "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED"
     steps:
@@ -91,8 +94,8 @@ jobs:
       - name: Install Python 3.11 and pip
         run: |
           apt-get update
-          apt-get install -y --no-install-recommends python3.11 python3.11-venv python3-pip
-          python3.11 -m venv /tmp/venv
+          apt-get install -y --no-install-recommends python3 python3-venv python3-pip
+          python3 -m venv /tmp/venv
           /tmp/venv/bin/pip install --upgrade pip
           /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest
 

From ec6fa783ed9bb9495dfd709159f5c10cdf37a60b Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 6 May 2026 06:23:52 -0600
Subject: [PATCH 11/54] ci: set PYSPARK_PYTHON to venv python for
 pyarrow_udf_test

Spark launches Python workers in fresh subprocesses that look up python3
on PATH. Without PYSPARK_PYTHON, workers use the system python (no pyarrow
installed) and UDF execution fails with ModuleNotFoundError. Point both
PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON at /tmp/venv/bin/python so workers
inherit the same interpreter that pytest uses.
---
 .github/workflows/pyarrow_udf_test.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index 622ee59fd0..e8018889cc 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -100,6 +100,13 @@ jobs:
           /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest
 
       - name: Run PyArrow UDF pytest
+        env:
+          # Spark launches Python workers in a fresh subprocess and looks up `python3`
+          # on PATH unless PYSPARK_PYTHON is set. Without this, workers use the system
+          # python which has no pyarrow installed and UDF execution fails with
+          # ModuleNotFoundError.
+          PYSPARK_PYTHON: /tmp/venv/bin/python
+          PYSPARK_DRIVER_PYTHON: /tmp/venv/bin/python
         run: |
           jar=$(ls "$PWD"/spark/target/comet-spark-spark3.5_2.12-*-SNAPSHOT.jar \
                 | grep -v sources | grep -v tests | head -n1)

From 1de2c2f815607115c03cc3075200ec4bc28d8223 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 6 May 2026 07:44:54 -0600
Subject: [PATCH 12/54] feat: default-disable PyArrow UDF optimization while
 experimental

Flip spark.comet.exec.pythonMapInArrow.enabled default from true to false
and prefix the config doc with "Experimental:" so the default matches the
"[experimental]" label on the feature. Update the user guide to instruct
users to opt in explicitly.
---
 common/src/main/scala/org/apache/comet/CometConf.scala | 10 ++++++----
 docs/source/user-guide/latest/pyarrow-udfs.md          |  6 +++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
index a06cd896ec..675e872b6e 100644
--- a/common/src/main/scala/org/apache/comet/CometConf.scala
+++ b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -318,11 +318,13 @@ object CometConf extends ShimCometConf {
     conf("spark.comet.exec.pythonMapInArrow.enabled")
       .category(CATEGORY_EXEC)
       .doc(
-        "Whether to enable optimized execution of PyArrow UDFs (mapInArrow/mapInPandas). " +
-          "When enabled, Comet passes Arrow columnar data directly to Python UDFs without " +
-          "the intermediate Arrow-to-Row-to-Arrow conversion that Spark normally performs.")
+        "Experimental: whether to enable optimized execution of PyArrow UDFs " +
+          "(mapInArrow/mapInPandas). When enabled, Comet passes Arrow columnar data " +
+          "directly to Python UDFs without the intermediate Arrow-to-Row-to-Arrow " +
+          "conversion that Spark normally performs. Disabled by default while the " +
+          "feature stabilizes.")
       .booleanConf
-      .createWithDefault(true)
+      .createWithDefault(false)
 
   val COMET_TRACING_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.tracing.enabled")
     .category(CATEGORY_TUNING)
diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 2d555cedc4..374948c039 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -52,13 +52,13 @@ and memory allocations.
 
 ## Configuration
 
-The optimization is controlled by:
+The optimization is experimental and disabled by default. Enable it with:
 
 ```
-spark.comet.exec.pythonMapInArrow.enabled=true  (default)
+spark.comet.exec.pythonMapInArrow.enabled=true
 ```
 
-It is enabled by default when Comet execution is active.
+The default is `false` while the feature stabilizes.
 
 ## Supported APIs
 

From 3f68cbeb56f6be4a3235b73113630d9b9a928249 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 6 May 2026 07:45:02 -0600
Subject: [PATCH 13/54] test: expand PyArrow UDF pytest coverage

Add coverage for cases that the original pytest module did not exercise:

- mapInPandas (claimed supported, previously zero coverage)
- Null preservation across long and string columns via Arrow passthrough
- Empty input from a CometScan via filter pushdown
- Python exception propagation (sentinel must surface in driver-side error)
- DecimalType(18,6), DateType, TimestampType round-trip with nulls
- ArrayType<Int> and nested StructType, including null arrays/structs and
  arrays containing null elements
- repartition between scan and UDF (correctness only; the optimization
  itself does not fire across a vanilla Exchange and is documented as
  such in the test)

Generalize _assert_plan_matches_mode to take the vanilla node name so the
fallback assertion can match either PythonMapInArrow or MapInPandas.
---
 .../resources/pyspark/test_pyarrow_udf.py     | 280 +++++++++++++++++-
 1 file changed, 277 insertions(+), 3 deletions(-)

diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 462f4efdc6..b62db73be1 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -37,8 +37,10 @@
     pytest -v spark/src/test/resources/pyspark/test_pyarrow_udf.py
 """
 
+import datetime as dt
 import glob
 import os
+from decimal import Decimal
 
 import pyarrow as pa
 import pytest
@@ -125,7 +127,9 @@ def _executed_plan(df) -> str:
     return df._jdf.queryExecution().executedPlan().toString()
 
 
-def _assert_plan_matches_mode(plan: str, accelerated: bool) -> None:
+def _assert_plan_matches_mode(
+    plan: str, accelerated: bool, vanilla_node: str = "PythonMapInArrow"
+) -> None:
     if accelerated:
         assert "CometPythonMapInArrow" in plan, (
             f"expected CometPythonMapInArrow in accelerated plan, got:\n{plan}"
@@ -137,8 +141,8 @@ def _assert_plan_matches_mode(plan: str, accelerated: bool) -> None:
         assert "CometPythonMapInArrow" not in plan, (
             f"unexpected CometPythonMapInArrow in fallback plan:\n{plan}"
         )
-        assert "PythonMapInArrow" in plan, (
-            f"expected PythonMapInArrow in fallback plan, got:\n{plan}"
+        assert vanilla_node in plan, (
+            f"expected {vanilla_node} in fallback plan, got:\n{plan}"
         )
 
 
@@ -201,3 +205,273 @@ def add_computed_column(iterator):
     for i, row in enumerate(rows):
         assert abs(row["squared"] - float(i) ** 2) < 1e-6
         assert row["label"] == f"item_{i}"
+
+
+def test_map_in_pandas_doubles_value(spark, tmp_path, accelerated):
+    data = [(i, float(i * 1.5)) for i in range(100)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(data, ["id", "value"]).write.parquet(src)
+
+    def double_value(iterator):
+        for pdf in iterator:
+            pdf = pdf.copy()
+            pdf["value"] = pdf["value"] * 2
+            yield pdf
+
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+        ]
+    )
+    result_df = spark.read.parquet(src).mapInPandas(double_value, schema)
+
+    _assert_plan_matches_mode(
+        _executed_plan(result_df), accelerated, vanilla_node="MapInPandas"
+    )
+
+    rows = result_df.orderBy("id").collect()
+    assert len(rows) == len(data)
+    for row, original in zip(rows, data):
+        assert row["id"] == original[0]
+        assert abs(row["value"] - original[1] * 2) < 1e-6
+
+
+def test_map_in_pandas_changes_schema(spark, tmp_path, accelerated):
+    data = [(i, float(i)) for i in range(50)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(data, ["id", "value"]).write.parquet(src)
+
+    def add_squared(iterator):
+        for pdf in iterator:
+            pdf = pdf.copy()
+            pdf["squared"] = pdf["value"] ** 2
+            yield pdf
+
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+            T.StructField("squared", T.DoubleType()),
+        ]
+    )
+    result_df = spark.read.parquet(src).mapInPandas(add_squared, schema)
+
+    _assert_plan_matches_mode(
+        _executed_plan(result_df), accelerated, vanilla_node="MapInPandas"
+    )
+
+    rows = result_df.orderBy("id").collect()
+    assert len(rows) == 50
+    for i, row in enumerate(rows):
+        assert abs(row["squared"] - float(i) ** 2) < 1e-6
+
+
+def test_map_in_arrow_preserves_nulls(spark, tmp_path, accelerated):
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("name", T.StringType()),
+        ]
+    )
+    rows = [
+        (1, "a"),
+        (2, None),
+        (None, "c"),
+        (None, None),
+        (5, "e"),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        # Pure Arrow passthrough so nulls survive without a pandas roundtrip
+        # (pandas would coerce null longs to NaN floats).
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], r["name"]) for r in result_df.collect()}
+    assert out == set(rows)
+
+
+def test_map_in_arrow_empty_input(spark, tmp_path, accelerated):
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+        ]
+    )
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame([(1, 1.0), (2, 2.0)], schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    # Filter all rows out so the operator sees an empty stream from CometScan.
+    result_df = (
+        spark.read.parquet(src).where("id < 0").mapInArrow(passthrough, schema_in)
+    )
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    assert result_df.count() == 0
+
+
+def test_map_in_arrow_python_exception_propagates(spark, tmp_path, accelerated):
+    schema_in = T.StructType([T.StructField("id", T.LongType())])
+    data = [(i,) for i in range(10)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(data, schema_in).write.parquet(src)
+
+    sentinel = "boom-from-pyarrow-udf"
+
+    def boom(iterator):
+        for _batch in iterator:
+            raise ValueError(sentinel)
+        # Not reached on non-empty input; the yield makes boom the generator mapInArrow needs.
+        yield  # pragma: no cover
+
+    result_df = spark.read.parquet(src).mapInArrow(boom, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    with pytest.raises(Exception) as exc_info:
+        result_df.collect()
+    assert sentinel in str(exc_info.value), (
+        f"expected sentinel {sentinel!r} in exception, got: {exc_info.value}"
+    )
+
+
+def test_map_in_arrow_decimal_type(spark, tmp_path, accelerated):
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("amount", T.DecimalType(18, 6)),
+        ]
+    )
+    rows = [
+        (1, Decimal("123.456789")),
+        (2, Decimal("0.000001")),
+        (3, Decimal("-99999999.999999")),
+        (4, None),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], r["amount"]) for r in result_df.collect()}
+    assert out == set(rows)
+
+
+def test_map_in_arrow_date_and_timestamp(spark, tmp_path, accelerated):
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("d", T.DateType()),
+            T.StructField("ts", T.TimestampType()),
+        ]
+    )
+    rows = [
+        (1, dt.date(2024, 1, 1), dt.datetime(2024, 1, 1, 12, 30, 45)),
+        (2, dt.date(1999, 12, 31), dt.datetime(2000, 6, 15, 0, 0, 0)),
+        (3, None, None),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], r["d"], r["ts"]) for r in result_df.collect()}
+    assert out == set(rows)
+
+
+def test_map_in_arrow_array_and_struct(spark, tmp_path, accelerated):
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("nums", T.ArrayType(T.IntegerType())),
+            T.StructField(
+                "addr",
+                T.StructType(
+                    [
+                        T.StructField("city", T.StringType()),
+                        T.StructField("zip", T.IntegerType()),
+                    ]
+                ),
+            ),
+        ]
+    )
+    rows = [
+        (1, [1, 2, 3], ("Berlin", 10115)),
+        (2, [], ("NYC", 10001)),
+        (3, None, None),
+        (4, [None, 5], ("Tokyo", None)),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    def _normalize(row):
+        nums = tuple(row["nums"]) if row["nums"] is not None else None
+        addr = row["addr"]
+        addr_tuple = (addr["city"], addr["zip"]) if addr is not None else None
+        return (row["id"], nums, addr_tuple)
+
+    out = {_normalize(r) for r in result_df.collect()}
+    expected = {
+        (r[0], tuple(r[1]) if r[1] is not None else None, r[2]) for r in rows
+    }
+    assert out == expected
+
+
+def test_map_in_arrow_after_shuffle(spark, tmp_path, accelerated):
+    """
+    Verifies correctness when a shuffle sits between the Comet scan and the
+    Python UDF. Without `spark.shuffle.manager` configured at session startup
+    the shuffle stays a vanilla `Exchange`, which is not columnar, so the
+    optimization does not fire across it today. This test does not assert on
+    the plan; it only ensures the path produces correct results in both modes
+    so a future change that wires Comet shuffle into the optimization does
+    not silently break correctness.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+        ]
+    )
+    rows = [(i, float(i)) for i in range(50)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = (
+        spark.read.parquet(src)
+        .repartition(4, "id")
+        .mapInArrow(passthrough, schema_in)
+    )
+
+    out = sorted((r["id"], r["value"]) for r in result_df.collect())
+    assert out == sorted(rows)

From e2ca2d2d91e5a10a829bd3793cac31727c27f6d4 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 6 May 2026 07:45:52 -0600
Subject: [PATCH 14/54] docs: document PyArrow UDF limitations and AQE explain
 quirk

Expand the user guide with the limitations a user should know before
enabling the experimental optimization:

- The remaining row-to-Arrow round-trip inside the Python runner is
  documented more precisely (the input goes through ColumnarBatch.rowIterator
  to feed ArrowPythonRunner, which re-encodes to Arrow IPC).
- A vanilla Spark Exchange between the Comet scan and the UDF prevents
  the optimization from firing. Users must configure Comet's native
  shuffle manager at session startup to keep the data columnar.
- Spark 3.4 lacks the prerequisite APIs and the feature is a no-op there.
- isBarrier is captured by the operator constructor but not yet
  propagated to the Python runner.

Also explain the AQE display quirk: with AQE on and a shuffle present,
the pre-execution plan shows the unoptimized form because the rule
only sees the materialized subplan after stage execution. Running an
action and re-inspecting explain() reveals the optimized plan.
---
 docs/source/user-guide/latest/pyarrow-udfs.md | 37 +++++++++++++++++--
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 374948c039..08a731e5de 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -102,7 +102,7 @@ result = df.mapInArrow(transform, output_schema)
 
 ## Verifying the Optimization
 
-Use `explain()` to verify that `CometPythonMapInArrowExec` appears in your plan:
+Use `explain()` to verify that `CometPythonMapInArrow` appears in your plan:
 
 ```python
 result.explain(mode="extended")
@@ -111,7 +111,7 @@ result.explain(mode="extended")
 You should see:
 
 ```
-CometPythonMapInArrowExec ...
+CometPythonMapInArrow ...
 +- CometNativeExec ...
    +- CometScan ...
 ```
@@ -125,10 +125,39 @@ PythonMapInArrow ...
       +- CometScan ...
 ```
 
+When AQE is enabled (the Spark default) and the query contains a shuffle, the
+optimization is applied during stage materialization. Calling `explain()` before
+running an action will show the unoptimized plan:
+
+```
+AdaptiveSparkPlan isFinalPlan=false
++- PythonMapInArrow ...
+   +- CometExchange ...
+```
+
+To see the optimized plan, run an action first (for example `result.collect()` or
+`result.cache(); result.count()`) and then call `explain()`. The post-execution
+plan shows the materialized stages and includes `CometPythonMapInArrow` if the
+optimization fired.
+
 ## Limitations
 
 - The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs
   (`@pandas_udf`) and grouped operations (`applyInPandas`) are not yet supported.
 - The internal row-to-Arrow conversion inside the Python runner is still present in this version.
-  A future optimization will write Arrow batches directly to the Python IPC stream, achieving
-  near zero-copy data transfer.
+  Comet currently routes columnar input through `ColumnarBatch.rowIterator()` so that the existing
+  `ArrowPythonRunner` can re-encode the rows back to Arrow IPC. A future optimization will write
+  Arrow batches directly to the Python IPC stream, eliminating the remaining round-trip and
+  achieving near zero-copy data transfer.
+- The optimization requires Arrow data on the input side. If a shuffle sits between the upstream
+  Comet operator and the Python UDF, you need Comet's native shuffle for the optimization to
+  apply. Set `spark.shuffle.manager` to
+  `org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager` and enable
+  `spark.comet.exec.shuffle.enabled=true` at session startup. With a vanilla Spark `Exchange`
+  in the plan the data leaves the shuffle as rows and the optimization cannot fire.
+- Spark 3.4 lacks several APIs the optimization depends on (`MapInBatchExec.isBarrier`,
+  `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor). On
+  Spark 3.4 the feature is a no-op even when enabled. Spark 3.5+ is required.
+- The `isBarrier` flag on `mapInArrow` / `mapInPandas` is currently captured but not propagated
+  through to the Python runner. If your job depends on barrier-execution semantics, leave the
+  optimization disabled until this is fixed.

From f4b5c3274cc45400fe5e7102b72e2fea96ed8496 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 6 May 2026 07:57:26 -0600
Subject: [PATCH 15/54] bench: add Python end-to-end benchmark for PyArrow UDF
 acceleration

Standalone Python script that times df.mapInArrow(passthrough).count()
and the equivalent mapInPandas query with the optimization toggled on
and off. Numbers are wall-clock seconds, so they include the Python
worker, Arrow IPC, and downstream count() costs. That is the right
unit for a feature whose user surface is Python: it shows what
fraction of end-to-end time the optimization shaves off, not just the
JVM-side delta in isolation.

Three workloads vary the input schema, the dimension along which the
optimization's benefit differs most:

- narrow primitives (long, int, double)
- mixed with strings (variable-length encoding)
- wide rows (50 columns, projection cost scales with column count)

Local smoke run with 200k rows shows 1.17x to 1.45x speedup across
mapInArrow and mapInPandas, narrow/wide schemas. The script is
configurable via BENCHMARK_ROWS / BENCHMARK_WARMUP / BENCHMARK_ITERS
env vars for users who want longer or shorter runs.
---
 .../pyspark/benchmark_pyarrow_udf.py          | 235 ++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py

diff --git a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
new file mode 100644
index 0000000000..8a3b4333c4
--- /dev/null
+++ b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+End-to-end wall-clock benchmark for Comet's PyArrow UDF acceleration.
+
+Times `df.mapInArrow(passthrough, schema).count()` and the equivalent
+`mapInPandas` query with `spark.comet.exec.pythonMapInArrow.enabled` set
+to false (vanilla Spark path) and true (Comet's optimized path). Both
+modes run the same Python worker, so the measured delta covers what the
+optimization actually changes for users:
+
+  * vanilla:   CometScan -> ColumnarToRow + UnsafeProjection -> ArrowPythonRunner
+  * optimized: CometScan -> rowIterator -> ArrowPythonRunner (same runner;
+              no UnsafeProjection, output kept as ColumnarBatch)
+
+Results are wall-clock seconds, so they include Python interpreter,
+Arrow IPC, and downstream count() costs. That's intentional: the
+optimization's user-visible value is what fraction of end-to-end time
+it shaves off, not the JVM-side delta in isolation.
+
+Usage:
+    # Build Comet (release for representative numbers):
+    make release
+
+    pip install pyspark==3.5.8 pyarrow pandas
+
+    python3 spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
+
+Override defaults via environment variables:
+    COMET_JAR=/path/to/comet.jar          path to the Comet jar
+    BENCHMARK_ROWS=2000000                rows per run (default: 1048576)
+    BENCHMARK_WARMUP=2                    warmup iterations per case (default: 2)
+    BENCHMARK_ITERS=5                     measured iterations per case (default: 5)
+"""
+
+import contextlib
+import glob
+import os
+import statistics
+import tempfile
+import time
+
+from pyspark.sql import SparkSession
+
+
+REPO_ROOT = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")
+)
+
+
+def _resolve_comet_jar() -> str:
+    explicit = os.environ.get("COMET_JAR")
+    if explicit:
+        return explicit
+    import pyspark
+
+    major_minor = ".".join(pyspark.__version__.split(".")[:2])
+    spark_tag = f"spark{major_minor}"
+    scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13"
+    pattern = os.path.join(
+        REPO_ROOT,
+        f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar",
+    )
+    candidates = [
+        m
+        for m in sorted(glob.glob(pattern))
+        if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m)
+    ]
+    if not candidates:
+        raise FileNotFoundError(
+            "Comet jar not found. Set COMET_JAR or run `make release`. "
+            f"Looked under {pattern}."
+        )
+    return candidates[-1]
+
+
+def _build_spark() -> SparkSession:
+    jar = _resolve_comet_jar()
+    os.environ["PYSPARK_SUBMIT_ARGS"] = (
+        f"--jars {jar} --driver-class-path {jar} pyspark-shell"
+    )
+    return (
+        SparkSession.builder.master("local[2]")
+        .appName("comet-pyarrow-udf-benchmark")
+        .config("spark.plugins", "org.apache.spark.CometPlugin")
+        .config("spark.comet.enabled", "true")
+        .config("spark.comet.exec.enabled", "true")
+        .config("spark.memory.offHeap.enabled", "true")
+        .config("spark.memory.offHeap.size", "4g")
+        .config("spark.driver.memory", "4g")
+        # Pin AQE off so the explain output and plan structure are stable
+        # across iterations. AQE doesn't change the optimization's behavior;
+        # it just makes plan inspection harder.
+        .config("spark.sql.adaptive.enabled", "false")
+        .getOrCreate()
+    )
+
+
+def _passthrough_arrow(iterator):
+    for batch in iterator:
+        yield batch
+
+
+def _passthrough_pandas(iterator):
+    for pdf in iterator:
+        yield pdf
+
+
+def _narrow_primitives(spark: SparkSession, n: int):
+    return spark.range(n).selectExpr(
+        "id as id_long",
+        "cast(id as int) as id_int",
+        "cast(id as double) as id_double",
+    )
+
+
+def _mixed_with_strings(spark: SparkSession, n: int):
+    return spark.range(n).selectExpr(
+        "id as id_long",
+        "cast(id as int) as id_int",
+        "cast(id as double) as id_double",
+        "concat('row_', cast(id as string)) as id_str",
+        "cast(id % 2 as boolean) as id_bool",
+    )
+
+
+def _wide_rows(spark: SparkSession, n: int):
+    types = ["int", "long", "double"]
+    cols = [
+        f"cast(id + {i} as {types[i % len(types)]}) as col_{i}" for i in range(50)
+    ]
+    return spark.range(n).selectExpr(*cols)
+
+
+WORKLOADS = [
+    ("narrow primitives", _narrow_primitives),
+    ("mixed with strings", _mixed_with_strings),
+    ("wide rows (50 cols)", _wide_rows),
+]
+
+
+@contextlib.contextmanager
+def _temp_parquet(spark: SparkSession, build_df, n: int):
+    with tempfile.TemporaryDirectory() as d:
+        path = os.path.join(d, "src.parquet")
+        build_df(spark, n).write.parquet(path)
+        yield path
+
+
+def _time_run(spark: SparkSession, parquet_path: str, accelerate: bool, api: str) -> float:
+    spark.conf.set(
+        "spark.comet.exec.pythonMapInArrow.enabled",
+        "true" if accelerate else "false",
+    )
+    df = spark.read.parquet(parquet_path)
+    schema = df.schema
+    if api == "mapInArrow":
+        df = df.mapInArrow(_passthrough_arrow, schema)
+    else:
+        df = df.mapInPandas(_passthrough_pandas, schema)
+    t0 = time.perf_counter()
+    df.count()
+    return time.perf_counter() - t0
+
+
+def main() -> None:
+    rows = int(os.environ.get("BENCHMARK_ROWS", 1024 * 1024))
+    warmup = int(os.environ.get("BENCHMARK_WARMUP", 2))
+    iters = int(os.environ.get("BENCHMARK_ITERS", 5))
+
+    spark = _build_spark()
+    spark.sparkContext.setLogLevel("WARN")
+
+    print(f"\nrows per run: {rows:,}")
+    print(f"warmup iters: {warmup}, measured iters: {iters}")
+    print(f"jar: {_resolve_comet_jar()}\n")
+
+    header = "  {:<14} {:<10} {:>10} {:>10} {:>10} {:>13} {:>9}".format(
+        "api", "mode", "min (s)", "median (s)", "max (s)", "rows/s", "speedup"
+    )
+    print(header)
+    print("  " + "-" * (len(header) - 2))
+
+    for name, build_df in WORKLOADS:
+        print(f"\n=== {name} ===")
+        with _temp_parquet(spark, build_df, rows) as parquet_path:
+            for api in ("mapInArrow", "mapInPandas"):
+                samples_by_mode = {}
+                for mode, accelerate in (("vanilla", False), ("optimized", True)):
+                    for _ in range(warmup):
+                        _time_run(spark, parquet_path, accelerate, api)
+                    samples = [
+                        _time_run(spark, parquet_path, accelerate, api)
+                        for _ in range(iters)
+                    ]
+                    samples_by_mode[mode] = samples
+                    median = statistics.median(samples)
+                    speedup = ""
+                    if mode == "optimized":
+                        speedup = "{:.2f}x".format(
+                            statistics.median(samples_by_mode["vanilla"]) / median
+                        )
+                    print(
+                        "  {:<14} {:<10} {:>10} {:>10} {:>10} {:>13} {:>9}".format(
+                            api,
+                            mode,
+                            "{:.3f}".format(min(samples)),
+                            "{:.3f}".format(median),
+                            "{:.3f}".format(max(samples)),
+                            "{:,.0f}".format(rows / median),
+                            speedup,
+                        )
+                    )
+
+    spark.stop()
+
+
+if __name__ == "__main__":
+    main()

From 3822ed7d90368a0746a577a681d2bf56efccb087 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 6 May 2026 08:17:49 -0600
Subject: [PATCH 16/54] fix: propagate isBarrier through
 CometPythonMapInArrowExec

The operator captured isBarrier in its constructor but always called
inputRDD.mapPartitionsInternal, dropping the barrier execution mode
semantics that mapInArrow(..., barrier=True) requests. Stages running
under the optimization lost gang scheduling and the BarrierTaskContext
APIs the UDF expects.

Branch on isBarrier and route through inputRDD.barrier().mapPartitions
in the barrier case, matching what Spark's MapInBatchExec.doExecute
does. Add a pytest case that calls BarrierTaskContext.get() inside the
UDF, which raises if the task is not running in a barrier stage; runs
in both vanilla and optimized modes. Drop the isBarrier limitation
note from the user guide.
---
 docs/source/user-guide/latest/pyarrow-udfs.md |  3 --
 .../sql/comet/CometPythonMapInArrowExec.scala | 15 ++++++--
 .../resources/pyspark/test_pyarrow_udf.py     | 38 +++++++++++++++++++
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 08a731e5de..6a95fbac0d 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -158,6 +158,3 @@ optimization fired.
 - Spark 3.4 lacks several APIs the optimization depends on (`MapInBatchExec.isBarrier`,
   `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor). On
   Spark 3.4 the feature is a no-op even when enabled. Spark 3.5+ is required.
-- The `isBarrier` flag on `mapInArrow` / `mapInPandas` is currently captured but not propagated
-  through to the Python runner. If your job depends on barrier-execution semantics, leave the
-  optimization disabled until this is fixed.
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
index 9b3e820023..68e27b9355 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
@@ -93,12 +93,13 @@ case class CometPythonMapInArrowExec(
 
     val inputRDD = child.executeColumnar()
 
-    inputRDD.mapPartitionsInternal { batches =>
+    // Run on every partition. Identical to what MapInBatchExec does, except the input
+    // is columnar; we intentionally avoid the UnsafeProjection copy that ColumnarToRow
+    // would do.
+    def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = {
       val context = TaskContext.get()
       val argOffsets = Array(Array(0))
 
-      // Convert columnar batches to rows using lightweight rowIterator
-      // (avoids UnsafeProjection copy that ColumnarToRow would do)
       val rowIter = batches.flatMap { batch =>
         numInputRows += batch.numRows()
         batch.rowIterator().asScala
@@ -137,6 +138,14 @@ case class CometPythonMapInArrowExec(
         flattenedBatch
       }
     }
+
+    // Preserve isBarrier semantics: when set, run inside a barrier stage so all tasks
+    // are gang-scheduled and BarrierTaskContext.barrier() works inside the UDF.
+    if (isBarrier) {
+      inputRDD.barrier().mapPartitions(processPartition)
+    } else {
+      inputRDD.mapPartitionsInternal(processPartition)
+    }
   }
 
   override protected def withNewChildInternal(newChild: SparkPlan): CometPythonMapInArrowExec =
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index b62db73be1..ea72436841 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -475,3 +475,41 @@ def passthrough(iterator):
 
     out = sorted((r["id"], r["value"]) for r in result_df.collect())
     assert out == sorted(rows)
+
+
+def test_map_in_arrow_barrier_mode(spark, tmp_path, accelerated):
+    """
+    `mapInArrow(..., barrier=True)` runs the stage in barrier execution mode
+    (gang scheduling, all-or-nothing failure semantics, BarrierTaskContext
+    available inside the UDF). The optimization captures isBarrier in the
+    operator constructor and must propagate it through to RDD.barrier();
+    otherwise the runtime context the UDF sees changes when the optimization
+    fires and any code calling BarrierTaskContext APIs breaks.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+        ]
+    )
+    rows = [(i, float(i)) for i in range(20)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def assert_barrier_context(iterator):
+        from pyspark import BarrierTaskContext
+
+        # Will raise if the task is not running inside a barrier stage.
+        BarrierTaskContext.get()
+        for batch in iterator:
+            yield batch
+
+    result_df = (
+        spark.read.parquet(src).mapInArrow(
+            assert_barrier_context, schema_in, barrier=True
+        )
+    )
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = sorted((r["id"], r["value"]) for r in result_df.collect())
+    assert out == sorted(rows)

From 24dc84b86368a73cf0a509a457cfab0af15cfda7 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 8 May 2026 04:56:15 -0600
Subject: [PATCH 17/54] refactor: address PR review feedback for PyArrow UDF
 acceleration

Conf and operator rename:
- spark.comet.exec.pythonMapInArrow.enabled -> spark.comet.exec.pyarrowUdf.enabled
- CometPythonMapInArrowExec -> CometMapInBatchExec, matching Spark's MapInBatchExec
  parent class and reflecting that this op handles MapInArrow + MapInPandas

Shim and rule cleanup:
- Mix CometPlan into the operator
- Use PythonEvalType.SQL_MAP_ARROW_ITER_UDF / SQL_MAP_PANDAS_ITER_UDF directly in
  the matchers instead of dereferencing PythonUDF.evalType
- Replace 5-tuple shim return with named MapInBatchInfo case class
- Collapse double matcher evaluation in the rule into a single match
- Remove unreachable ColumnarToRowExec branch in extractColumnarChild
- Reduce computeArrowPython parameter count by passing SQLConf and deriving
  timeZoneId / largeVarTypes / pythonRunnerConf / jobArtifactUUID inside
- Add a why-comment to the doExecute fallback
- Drop comments that restate the code

Spark 4.x shim consolidation:
- Move shared 4.x matchers and runnerInputs helper into spark-4.x/
  Spark4xMapInBatchSupport, leaving each minor's ShimCometMapInBatch as a small
  ArrowPythonRunner constructor factory

CI workflow:
- Switch pyarrow_udf_test workflow from Spark 3.5 to Spark 4.0 to cover the 4.x
  shim path; build in debug mode (no -Prelease, no cargo --release)

Tests:
- Replace the no-op CometPythonMapInArrowSuite with per-Spark-version
  CometMapInBatchSuite under spark/src/test/spark-{3.5,4.x} that constructs a
  PythonMapInArrowExec / MapInArrowExec over a stub CometPlan leaf and verifies
  EliminateRedundantTransitions rewrites it to CometMapInBatchExec (and does
  not when the conf is disabled)
- Consolidate jar resolution into spark/src/test/resources/pyspark/conftest.py;
  pytest and the benchmark script both import resolve_comet_jar from there,
  and the workflow no longer needs an inline ls/grep
- Update plan-string assertions to look for CometMapInBatch and for the
  substring MapInArrow, which is shared by Spark 3.5's PythonMapInArrowExec and
  Spark 4.x's MapInArrowExec node names

Docs:
- Rename references in the user guide and add a barrier=True section noting
  that isBarrier is propagated through RDD.barrier()
---
 .github/workflows/pyarrow_udf_test.yml        |  15 +--
 .../scala/org/apache/comet/CometConf.scala    |   4 +-
 docs/source/user-guide/latest/pyarrow-udfs.md |  44 +++++--
 .../rules/EliminateRedundantTransitions.scala |  52 +++++----
 ...owExec.scala => CometMapInBatchExec.scala} |  62 +++++-----
 .../sql/comet/shims/MapInBatchInfo.scala      |  36 ++++++
 ...nArrow.scala => ShimCometMapInBatch.scala} |  24 +---
 ...nArrow.scala => ShimCometMapInBatch.scala} |  48 ++++----
 .../sql/comet/shims/ShimCometMapInBatch.scala |  56 +++++++++
 .../shims/ShimCometPythonMapInArrow.scala     |  86 --------------
 .../sql/comet/shims/ShimCometMapInBatch.scala |  57 ++++++++++
 .../shims/ShimCometPythonMapInArrow.scala     |  87 --------------
 .../sql/comet/shims/ShimCometMapInBatch.scala |  56 +++++++++
 .../shims/ShimCometPythonMapInArrow.scala     |  86 --------------
 .../shims/Spark4xMapInBatchSupport.scala      |  81 +++++++++++++
 .../pyspark/benchmark_pyarrow_udf.py          |  48 +++-----
 spark/src/test/resources/pyspark/conftest.py  |  73 ++++++++++++
 .../resources/pyspark/test_pyarrow_udf.py     |  72 +++---------
 .../exec/CometPythonMapInArrowSuite.scala     |  66 -----------
 .../sql/comet/CometMapInBatchSuite.scala      | 106 +++++++++++++++++
 .../sql/comet/CometMapInBatchSuite.scala      | 107 ++++++++++++++++++
 21 files changed, 730 insertions(+), 536 deletions(-)
 rename spark/src/main/scala/org/apache/spark/sql/comet/{CometPythonMapInArrowExec.scala => CometMapInBatchExec.scala} (71%)
 create mode 100644 spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala
 rename spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/{ShimCometPythonMapInArrow.scala => ShimCometMapInBatch.scala} (69%)
 rename spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/{ShimCometPythonMapInArrow.scala => ShimCometMapInBatch.scala} (64%)
 create mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
 delete mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
 delete mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
 delete mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
 create mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala
 create mode 100644 spark/src/test/resources/pyspark/conftest.py
 delete mode 100644 spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
 create mode 100644 spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
 create mode 100644 spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index e8018889cc..211a9bd23a 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -58,7 +58,7 @@ env:
 
 jobs:
   pyarrow-udf:
-    name: PyArrow UDF (Spark 3.5, JDK 17, Python 3.11)
+    name: PyArrow UDF (Spark 4.0, JDK 17, Python 3.11)
     runs-on: ubuntu-latest
     container:
       # Pinned to the Debian 12 (bookworm) base so the system `python3` is 3.11. The default
@@ -86,10 +86,10 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-java-maven-
 
-      - name: Build Comet (release, Spark 3.5 / Scala 2.12)
+      - name: Build Comet (debug, Spark 4.0 / Scala 2.13)
         run: |
-          cd native && cargo build --release
-          cd .. && ./mvnw -B -Prelease install -DskipTests -Pspark-3.5 -Pscala-2.12
+          cd native && cargo build
+          cd .. && ./mvnw -B install -DskipTests -Pspark-4.0 -Pscala-2.13
 
       - name: Install Python 3.11 and pip
         run: |
@@ -97,7 +97,7 @@ jobs:
           apt-get install -y --no-install-recommends python3 python3-venv python3-pip
           python3 -m venv /tmp/venv
           /tmp/venv/bin/pip install --upgrade pip
-          /tmp/venv/bin/pip install "pyspark==3.5.8" "pyarrow>=14" pandas pytest
+          /tmp/venv/bin/pip install "pyspark==4.0.1" "pyarrow>=14" pandas pytest
 
       - name: Run PyArrow UDF pytest
         env:
@@ -108,8 +108,5 @@ jobs:
           PYSPARK_PYTHON: /tmp/venv/bin/python
           PYSPARK_DRIVER_PYTHON: /tmp/venv/bin/python
         run: |
-          jar=$(ls "$PWD"/spark/target/comet-spark-spark3.5_2.12-*-SNAPSHOT.jar \
-                | grep -v sources | grep -v tests | head -n1)
-          echo "Using $jar"
-          COMET_JAR="$jar" /tmp/venv/bin/python -m pytest -v \
+          /tmp/venv/bin/python -m pytest -v \
             spark/src/test/resources/pyspark/test_pyarrow_udf.py
diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
index 675e872b6e..0bdc35d3ce 100644
--- a/common/src/main/scala/org/apache/comet/CometConf.scala
+++ b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -314,8 +314,8 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
-  val COMET_PYTHON_MAP_IN_ARROW_ENABLED: ConfigEntry[Boolean] =
-    conf("spark.comet.exec.pythonMapInArrow.enabled")
+  val COMET_PYARROW_UDF_ENABLED: ConfigEntry[Boolean] =
+    conf("spark.comet.exec.pyarrowUdf.enabled")
       .category(CATEGORY_EXEC)
       .doc(
         "Experimental: whether to enable optimized execution of PyArrow UDFs " +
diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 6a95fbac0d..23ef50e79c 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -40,22 +40,43 @@ Steps 2 and 3 are redundant since the data starts and ends in Arrow format.
 
 ## How Comet Optimizes This
 
-When enabled, Comet detects `PythonMapInArrowExec` and `MapInPandasExec` operators in the physical plan
-and replaces them with `CometPythonMapInArrowExec`, which:
+When enabled, Comet detects `PythonMapInArrowExec` / `MapInArrowExec` and `MapInPandasExec`
+operators in the physical plan and replaces them with `CometMapInBatchExec`, which:
 
 - Reads Arrow columnar batches directly from the upstream Comet operator
 - Feeds them to the Python runner without the expensive UnsafeProjection copy
 - Keeps the Python output in columnar format for downstream operators
 
 This eliminates the ColumnarToRow transition and the output row conversion, reducing CPU overhead
-and memory allocations.
+and memory allocations. The internal row-to-Arrow IPC re-encoding inside Spark's
+`ArrowPythonRunner` is unchanged in this version; full round-trip elimination is tracked in
+[#4240](https://github.com/apache/datafusion-comet/issues/4240).
+
+### Plan flow
+
+Without Comet's optimization:
+
+```
+PythonMapInArrow / MapInArrow / MapInPandas
++- ColumnarToRow         <- Arrow -> Row copy
+   +- CometNativeExec    <- Arrow batch
+      +- CometScan
+```
+
+With the optimization enabled:
+
+```
+CometMapInBatch          <- Arrow batch in/out, Python runner attached
++- CometNativeExec
+   +- CometScan
+```
 
 ## Configuration
 
 The optimization is experimental and disabled by default. Enable it with:
 
 ```
-spark.comet.exec.pythonMapInArrow.enabled=true
+spark.comet.exec.pyarrowUdf.enabled=true
 ```
 
 The default is `false` while the feature stabilizes.
@@ -79,7 +100,7 @@ spark = SparkSession.builder \
     .config("spark.plugins", "org.apache.spark.CometPlugin") \
     .config("spark.comet.enabled", "true") \
     .config("spark.comet.exec.enabled", "true") \
-    .config("spark.comet.exec.pythonMapInArrow.enabled", "true") \
+    .config("spark.comet.exec.pyarrowUdf.enabled", "true") \
     .config("spark.memory.offHeap.enabled", "true") \
     .config("spark.memory.offHeap.size", "2g") \
     .getOrCreate()
@@ -102,7 +123,7 @@ result = df.mapInArrow(transform, output_schema)
 
 ## Verifying the Optimization
 
-Use `explain()` to verify that `CometPythonMapInArrow` appears in your plan:
+Use `explain()` to verify that `CometMapInBatch` appears in your plan:
 
 ```python
 result.explain(mode="extended")
@@ -111,7 +132,7 @@ result.explain(mode="extended")
 You should see:
 
 ```
-CometPythonMapInArrow ...
+CometMapInBatch ...
 +- CometNativeExec ...
    +- CometScan ...
 ```
@@ -137,9 +158,16 @@ AdaptiveSparkPlan isFinalPlan=false
 
 To see the optimized plan, run an action first (for example `result.collect()` or
 `result.cache(); result.count()`) and then call `explain()`. The post-execution
-plan shows the materialized stages and includes `CometPythonMapInArrow` if the
+plan shows the materialized stages and includes `CometMapInBatch` if the
 optimization fired.
 
+## Barrier execution
+
+`mapInArrow(..., barrier=True)` and `mapInPandas(..., barrier=True)` are honored: the
+optimized operator propagates `isBarrier` through `RDD.barrier()`, so all tasks are
+gang-scheduled and `BarrierTaskContext.barrier()` works inside the UDF the same way it does
+on the unoptimized path.
+
 ## Limitations
 
 - The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs
diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index e7218ab935..24c969c173 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -22,9 +22,9 @@ package org.apache.comet.rules
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.sideBySide
-import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometPythonMapInArrowExec, CometSparkToColumnarExec}
+import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometMapInBatchExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometSparkToColumnarExec}
 import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec}
-import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow
+import org.apache.spark.sql.comet.shims.ShimCometMapInBatch
 import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.QueryStageExec
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
@@ -54,7 +54,7 @@ import org.apache.comet.CometConf
 // be removed.
 case class EliminateRedundantTransitions(session: SparkSession)
     extends Rule[SparkPlan]
-    with ShimCometPythonMapInArrow {
+    with ShimCometMapInBatch {
 
   private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get()
 
@@ -102,24 +102,25 @@ case class EliminateRedundantTransitions(session: SparkSession)
         sparkToColumnar.child
       case CometSparkToColumnarExec(child: CometSparkToColumnarExec) => child
       // Replace MapInBatchExec (PythonMapInArrowExec / MapInArrowExec / MapInPandasExec) that has
-      // a ColumnarToRow child with CometPythonMapInArrowExec to avoid the unnecessary
-      // Arrow->Row->Arrow round-trip. The matchers are version-shimmed: Spark 3.4 returns None
-      // (it lacks the required APIs) and Spark 4.1+ matches the renamed `MapInArrowExec`.
-      case p: SparkPlan
-          if CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.get() &&
-            matchMapInArrow(p).orElse(matchMapInPandas(p)).isDefined =>
-        val (mapFunc, mapOutput, mapChild, mapIsBarrier, mapEvalType) =
-          matchMapInArrow(p).orElse(matchMapInPandas(p)).get
-        extractColumnarChild(mapChild)
-          .map { columnarChild =>
-            CometPythonMapInArrowExec(
-              mapFunc,
-              mapOutput,
-              columnarChild,
-              mapIsBarrier,
-              mapEvalType)
-          }
-          .getOrElse(p)
+      // a ColumnarToRow child with CometMapInBatchExec, eliminating the input and output
+      // UnsafeProjection copies and keeping the stage columnar. The matchers are
+      // version-shimmed: Spark 3.4 returns None (it lacks the required APIs) and Spark 4.1+
+      // matches the renamed `MapInArrowExec`.
+      // The matcher must be part of the guard: an arm guarded by the config flag alone would
+      // capture every node and return it unchanged, leaving the arms below unreachable
+      // whenever the flag is on.
+      case p: SparkPlan
+          if CometConf.COMET_PYARROW_UDF_ENABLED.get() &&
+            matchMapInArrow(p).orElse(matchMapInPandas(p)).isDefined =>
+        matchMapInArrow(p).orElse(matchMapInPandas(p)).flatMap { info =>
+          extractColumnarChild(info.child).map(
+            CometMapInBatchExec(
+              info.func,
+              info.output,
+              _,
+              info.isBarrier,
+              info.pythonEvalType))
+        }.getOrElse(p)
 
       // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the
       // shuffle takes row-based input.
@@ -154,12 +155,13 @@ case class EliminateRedundantTransitions(session: SparkSession)
   }
 
   /**
-   * If the given plan is a ColumnarToRow transition wrapping a columnar child, returns that
-   * columnar child. Used to detect and eliminate unnecessary transitions before Python UDF
-   * operators.
+   * If the given plan is a Comet ColumnarToRow transition, returns the columnar child the Python
+   * UDF operator can consume directly. By the time this rule runs the earlier
+   * `hasCometNativeChild` arm has already rewritten any `ColumnarToRowExec` over a Comet columnar
+   * source to one of the Comet variants, so vanilla `ColumnarToRowExec` cannot reach here on a
+   * Comet-driven plan and is intentionally not handled.
    */
   private def extractColumnarChild(plan: SparkPlan): Option[SparkPlan] = plan match {
-    case ColumnarToRowExec(child) if child.supportsColumnar => Some(child)
     case CometColumnarToRowExec(child) => Some(child)
     case CometNativeColumnarToRowExec(child) => Some(child)
     case _ => None
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
similarity index 71%
rename from spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
rename to spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
index 68e27b9355..77dbfff7ce 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometPythonMapInArrowExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
-import org.apache.spark.sql.comet.shims.ShimCometPythonMapInArrow
+import org.apache.spark.sql.comet.shims.ShimCometMapInBatch
 import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.execution.python.{BatchIterator, PythonSQLMetrics}
@@ -35,29 +35,26 @@ import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
 /**
- * An optimized version of Spark's MapInBatchExec (PythonMapInArrowExec / MapInPandasExec) that
- * accepts columnar input directly from Comet operators, avoiding unnecessary Arrow -> Row ->
- * Arrow conversions.
+ * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` /
+ * `MapInArrowExec` in 4.1+ / `MapInPandasExec`). Accepts columnar input directly from a Comet
+ * child instead of going through the per-row `UnsafeProjection` that `ColumnarToRowExec` applies,
+ * and keeps the Python runner output as `ColumnarBatch` so downstream Comet operators consume it
+ * natively.
  *
- * Normal Spark flow: CometNativeExec (Arrow) -> ColumnarToRow -> PythonMapInArrowExec
- * (internally: rows -> Arrow -> Python -> Arrow -> rows)
- *
- * Optimized flow: CometNativeExec (Arrow) -> CometPythonMapInArrowExec (batch.rowIterator() ->
- * Arrow -> Python -> Arrow columnar output)
- *
- * This eliminates:
- *   1. The UnsafeProjection in ColumnarToRow (expensive copy) 2. The output Arrow->Row conversion
- *      (keeps Python output as ColumnarBatch)
+ * What this eliminates: two `UnsafeProjection` copies (input and output) and the row transition
+ * between Comet and the Python operator. The internal row-to-Arrow IPC re-encoding inside
+ * `ArrowPythonRunner` is unchanged; full round-trip elimination is tracked in #4240.
  */
-case class CometPythonMapInArrowExec(
+case class CometMapInBatchExec(
     func: Expression,
     output: Seq[Attribute],
     child: SparkPlan,
     isBarrier: Boolean,
     pythonEvalType: Int)
     extends UnaryExecNode
+    with CometPlan
     with PythonSQLMetrics
-    with ShimCometPythonMapInArrow {
+    with ShimCometMapInBatch {
 
   override def supportsColumnar: Boolean = true
 
@@ -71,6 +68,9 @@ case class CometPythonMapInArrowExec(
     "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows")) ++
     pythonMetrics
 
+  // Fallback for row-consuming parents (e.g. a top-level `collect()` that produces rows).
+  // Wraps this columnar exec in `ColumnarToRowExec`, reintroducing exactly the row transition
+  // this operator otherwise eliminates. Only fires when nothing downstream consumes columnar.
   override def doExecute(): RDD[InternalRow] = {
     ColumnarToRowExec(this).doExecute()
   }
@@ -81,21 +81,15 @@ case class CometPythonMapInArrowExec(
     val numInputRows = longMetric("numInputRows")
 
     val pythonUDF = func.asInstanceOf[PythonUDF]
-    val localOutput = output
-    val localChildSchema = child.schema
+    val outputAttrs = output
+    val childSchema = child.schema
     val batchSize = conf.arrowMaxRecordsPerBatch
-    val sessionLocalTimeZone = conf.sessionLocalTimeZone
-    val useLargeVarTypes = largeVarTypes(conf)
-    val pythonRunnerConf = getPythonRunnerConfMap(conf)
-    val localPythonEvalType = pythonEvalType
-    val localPythonMetrics = pythonMetrics
-    val jobArtifactUUID = currentJobArtifactUUID()
+    val evalType = pythonEvalType
+    val sqlConf = conf
+    val metricsCopy = pythonMetrics
 
     val inputRDD = child.executeColumnar()
 
-    // Run on every partition. Identical to what MapInBatchExec does, except the input
-    // is columnar; we intentionally avoid the UnsafeProjection copy that ColumnarToRow
-    // would do.
     def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = {
       val context = TaskContext.get()
       val argOffsets = Array(Array(0))
@@ -115,22 +109,18 @@ case class CometPythonMapInArrowExec(
 
       val columnarBatchIter = computeArrowPython(
         pythonUDF,
-        localPythonEvalType,
+        evalType,
         argOffsets,
-        StructType(Array(StructField("struct", localChildSchema))),
-        sessionLocalTimeZone,
-        useLargeVarTypes,
-        pythonRunnerConf,
-        localPythonMetrics,
-        jobArtifactUUID,
+        StructType(Array(StructField("struct", childSchema))),
+        sqlConf,
+        metricsCopy,
         batchIter,
         context.partitionId(),
         context)
 
       columnarBatchIter.map { batch =>
-        // Python returns a StructType column; flatten to individual columns
         val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
-        val outputVectors = localOutput.indices.map(structVector.getChild)
+        val outputVectors = outputAttrs.indices.map(structVector.getChild)
         val flattenedBatch = new ColumnarBatch(outputVectors.toArray)
         flattenedBatch.setNumRows(batch.numRows())
         numOutputRows += flattenedBatch.numRows()
@@ -148,6 +138,6 @@ case class CometPythonMapInArrowExec(
     }
   }
 
-  override protected def withNewChildInternal(newChild: SparkPlan): CometPythonMapInArrowExec =
+  override protected def withNewChildInternal(newChild: SparkPlan): CometMapInBatchExec =
     copy(child = newChild)
 }
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala b/spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala
new file mode 100644
index 0000000000..f610c575b1
--- /dev/null
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
+import org.apache.spark.sql.execution.SparkPlan
+
+/**
+ * Spark-version-agnostic view of the `MapInBatchExec` fields (`PythonMapInArrowExec`,
+ * `MapInArrowExec`, or `MapInPandasExec`) that the Comet rewrite needs. It is defined in the
+ * shared source tree rather than in a per-version shim so the Comet planner can consume it
+ * without depending on which concrete Spark class was matched.
+ */
+case class MapInBatchInfo(
+    func: Expression,
+    output: Seq[Attribute],
+    child: SparkPlan,
+    isBarrier: Boolean,
+    pythonEvalType: Int)
diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
similarity index 69%
rename from spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
rename to spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index 30736d99b3..c7d6ae2f97 100644
--- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
+++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -21,7 +21,7 @@ package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.internal.SQLConf
@@ -36,33 +36,21 @@ import org.apache.spark.sql.vectorized.ColumnarBatch
  * matchers return `None` and the runner factory throws. The optimization is effectively a no-op
  * on Spark 3.4.
  */
-trait ShimCometPythonMapInArrow {
+trait ShimCometMapInBatch {
 
-  protected def matchMapInArrow(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None
+  protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = None
 
-  protected def matchMapInPandas(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] = None
-
-  protected def currentJobArtifactUUID(): Option[String] = None
-
-  protected def largeVarTypes(conf: SQLConf): Boolean = false
-
-  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] = Map.empty
+  protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None
 
   protected def computeArrowPython(
       pythonUDF: PythonUDF,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      timeZoneId: String,
-      largeVarTypes: Boolean,
-      pythonRunnerConf: Map[String, String],
+      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
-      jobArtifactUUID: Option[String],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
-    throw new UnsupportedOperationException(
-      "CometPythonMapInArrowExec is not supported on Spark 3.4")
+    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4")
 }
diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
similarity index 64%
rename from spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
rename to spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index f7c8221d9e..42d66465f4 100644
--- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
+++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -20,9 +20,9 @@
 package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.{JobArtifactSet, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInPandasExec, PythonMapInArrowExec}
@@ -30,54 +30,54 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
-trait ShimCometPythonMapInArrow {
+trait ShimCometMapInBatch {
 
-  protected def matchMapInArrow(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+  protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] =
     plan match {
       case p: PythonMapInArrowExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+        Some(
+          MapInBatchInfo(
+            p.func,
+            p.output,
+            p.child,
+            p.isBarrier,
+            PythonEvalType.SQL_MAP_ARROW_ITER_UDF))
       case _ => None
     }
 
-  protected def matchMapInPandas(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
+  protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] =
     plan match {
       case p: MapInPandasExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
+        Some(
+          MapInBatchInfo(
+            p.func,
+            p.output,
+            p.child,
+            p.isBarrier,
+            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF))
       case _ => None
     }
 
-  protected def currentJobArtifactUUID(): Option[String] =
-    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
-
-  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
-
-  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
-    ArrowPythonRunner.getPythonRunnerConfMap(conf)
-
   protected def computeArrowPython(
       pythonUDF: PythonUDF,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      timeZoneId: String,
-      largeVarTypes: Boolean,
-      pythonRunnerConf: Map[String, String],
+      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
-      jobArtifactUUID: Option[String],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] = {
     val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func)))
+    val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
     new ArrowPythonRunner(
       chainedFunc,
       evalType,
       argOffsets,
       schema,
-      timeZoneId,
-      largeVarTypes,
-      pythonRunnerConf,
+      conf.sessionLocalTimeZone,
+      conf.arrowUseLargeVarTypes,
+      ArrowPythonRunner.getPythonRunnerConfMap(conf),
       pythonMetrics,
       jobArtifactUUID).compute(batchIter, partitionId, context)
   }
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
new file mode 100644
index 0000000000..0c21cb3738
--- /dev/null
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.TaskContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.ArrowPythonRunner
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      conf: SQLConf,
+      pythonMetrics: Map[String, SQLMetric],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val r = runnerInputs(pythonUDF, conf)
+    new ArrowPythonRunner(
+      r.chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      r.timeZoneId,
+      r.largeVarTypes,
+      r.pythonRunnerConf,
+      pythonMetrics,
+      r.jobArtifactUUID,
+      None).compute(batchIter, partitionId, context)
+  }
+}
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
deleted file mode 100644
index 78935f54c5..0000000000
--- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.comet.shims
-
-import org.apache.spark.{JobArtifactSet, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.vectorized.ColumnarBatch
-
-trait ShimCometPythonMapInArrow {
-
-  protected def matchMapInArrow(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
-    plan match {
-      case p: MapInArrowExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
-      case _ => None
-    }
-
-  protected def matchMapInPandas(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
-    plan match {
-      case p: MapInPandasExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
-      case _ => None
-    }
-
-  protected def currentJobArtifactUUID(): Option[String] =
-    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
-
-  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
-
-  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
-    ArrowPythonRunner.getPythonRunnerConfMap(conf)
-
-  protected def computeArrowPython(
-      pythonUDF: PythonUDF,
-      evalType: Int,
-      argOffsets: Array[Array[Int]],
-      schema: StructType,
-      timeZoneId: String,
-      largeVarTypes: Boolean,
-      pythonRunnerConf: Map[String, String],
-      pythonMetrics: Map[String, SQLMetric],
-      jobArtifactUUID: Option[String],
-      batchIter: Iterator[Iterator[InternalRow]],
-      partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val chainedFunc =
-      Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id))
-    new ArrowPythonRunner(
-      chainedFunc,
-      evalType,
-      argOffsets,
-      schema,
-      timeZoneId,
-      largeVarTypes,
-      pythonRunnerConf,
-      pythonMetrics,
-      jobArtifactUUID,
-      None).compute(batchIter, partitionId, context)
-  }
-}
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
new file mode 100644
index 0000000000..e73748aafe
--- /dev/null
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.TaskContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.ArrowPythonRunner
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      conf: SQLConf,
+      pythonMetrics: Map[String, SQLMetric],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val r = runnerInputs(pythonUDF, conf)
+    new ArrowPythonRunner(
+      r.chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      r.timeZoneId,
+      r.largeVarTypes,
+      r.pythonRunnerConf,
+      pythonMetrics,
+      r.jobArtifactUUID,
+      None,
+      None).compute(batchIter, partitionId, context)
+  }
+}
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
deleted file mode 100644
index f7f775b1fa..0000000000
--- a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.comet.shims
-
-import org.apache.spark.{JobArtifactSet, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.vectorized.ColumnarBatch
-
-trait ShimCometPythonMapInArrow {
-
-  protected def matchMapInArrow(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
-    plan match {
-      case p: MapInArrowExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
-      case _ => None
-    }
-
-  protected def matchMapInPandas(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
-    plan match {
-      case p: MapInPandasExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
-      case _ => None
-    }
-
-  protected def currentJobArtifactUUID(): Option[String] =
-    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
-
-  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
-
-  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
-    ArrowPythonRunner.getPythonRunnerConfMap(conf)
-
-  protected def computeArrowPython(
-      pythonUDF: PythonUDF,
-      evalType: Int,
-      argOffsets: Array[Array[Int]],
-      schema: StructType,
-      timeZoneId: String,
-      largeVarTypes: Boolean,
-      pythonRunnerConf: Map[String, String],
-      pythonMetrics: Map[String, SQLMetric],
-      jobArtifactUUID: Option[String],
-      batchIter: Iterator[Iterator[InternalRow]],
-      partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val chainedFunc =
-      Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id))
-    new ArrowPythonRunner(
-      chainedFunc,
-      evalType,
-      argOffsets,
-      schema,
-      timeZoneId,
-      largeVarTypes,
-      pythonRunnerConf,
-      pythonMetrics,
-      jobArtifactUUID,
-      None,
-      None).compute(batchIter, partitionId, context)
-  }
-}
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
new file mode 100644
index 0000000000..0c21cb3738
--- /dev/null
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.TaskContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.ArrowPythonRunner
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
+
+  protected def computeArrowPython(
+      pythonUDF: PythonUDF,
+      evalType: Int,
+      argOffsets: Array[Array[Int]],
+      schema: StructType,
+      conf: SQLConf,
+      pythonMetrics: Map[String, SQLMetric],
+      batchIter: Iterator[Iterator[InternalRow]],
+      partitionId: Int,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    val r = runnerInputs(pythonUDF, conf)
+    new ArrowPythonRunner(
+      r.chainedFunc,
+      evalType,
+      argOffsets,
+      schema,
+      r.timeZoneId,
+      r.largeVarTypes,
+      r.pythonRunnerConf,
+      pythonMetrics,
+      r.jobArtifactUUID,
+      None).compute(batchIter, partitionId, context)
+  }
+}
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
deleted file mode 100644
index 78935f54c5..0000000000
--- a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometPythonMapInArrow.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.comet.shims
-
-import org.apache.spark.{JobArtifactSet, TaskContext}
-import org.apache.spark.api.python.ChainedPythonFunctions
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF}
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.vectorized.ColumnarBatch
-
-trait ShimCometPythonMapInArrow {
-
-  protected def matchMapInArrow(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
-    plan match {
-      case p: MapInArrowExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
-      case _ => None
-    }
-
-  protected def matchMapInPandas(
-      plan: SparkPlan): Option[(Expression, Seq[Attribute], SparkPlan, Boolean, Int)] =
-    plan match {
-      case p: MapInPandasExec =>
-        Some((p.func, p.output, p.child, p.isBarrier, p.func.asInstanceOf[PythonUDF].evalType))
-      case _ => None
-    }
-
-  protected def currentJobArtifactUUID(): Option[String] =
-    JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
-
-  protected def largeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
-
-  protected def getPythonRunnerConfMap(conf: SQLConf): Map[String, String] =
-    ArrowPythonRunner.getPythonRunnerConfMap(conf)
-
-  protected def computeArrowPython(
-      pythonUDF: PythonUDF,
-      evalType: Int,
-      argOffsets: Array[Array[Int]],
-      schema: StructType,
-      timeZoneId: String,
-      largeVarTypes: Boolean,
-      pythonRunnerConf: Map[String, String],
-      pythonMetrics: Map[String, SQLMetric],
-      jobArtifactUUID: Option[String],
-      batchIter: Iterator[Iterator[InternalRow]],
-      partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val chainedFunc =
-      Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id))
-    new ArrowPythonRunner(
-      chainedFunc,
-      evalType,
-      argOffsets,
-      schema,
-      timeZoneId,
-      largeVarTypes,
-      pythonRunnerConf,
-      pythonMetrics,
-      jobArtifactUUID,
-      None).compute(batchIter, partitionId, context)
-  }
-}
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala
new file mode 100644
index 0000000000..78672aea5e
--- /dev/null
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.PythonUDF
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+/**
+ * Shared 4.x bits for `ShimCometMapInBatch`. The matchers and `runnerInputs` helper are
+ * identical across 4.0/4.1/4.2; only the `ArrowPythonRunner` constructor parameter list differs
+ * per minor, so each minor's `ShimCometMapInBatch` provides only `computeArrowPython`.
+ */
+trait Spark4xMapInBatchSupport {
+
+  protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] =
+    plan match {
+      case p: MapInArrowExec =>
+        Some(
+          MapInBatchInfo(
+            p.func,
+            p.output,
+            p.child,
+            p.isBarrier,
+            PythonEvalType.SQL_MAP_ARROW_ITER_UDF))
+      case _ => None
+    }
+
+  protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] =
+    plan match {
+      case p: MapInPandasExec =>
+        Some(
+          MapInBatchInfo(
+            p.func,
+            p.output,
+            p.child,
+            p.isBarrier,
+            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF))
+      case _ => None
+    }
+
+  /** Inputs every 4.x `ArrowPythonRunner` constructor needs in the same shape. */
+  protected case class RunnerInputs(
+      chainedFunc: Seq[(ChainedPythonFunctions, Long)],
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      jobArtifactUUID: Option[String])
+
+  protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
+    RunnerInputs(
+      chainedFunc = Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)),
+      timeZoneId = conf.sessionLocalTimeZone,
+      largeVarTypes = conf.arrowUseLargeVarTypes,
+      pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf),
+      jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid))
+}
diff --git a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
index 8a3b4333c4..49574130c0 100644
--- a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
@@ -20,7 +20,7 @@
 End-to-end wall-clock benchmark for Comet's PyArrow UDF acceleration.
 
 Times `df.mapInArrow(passthrough, schema).count()` and the equivalent
-`mapInPandas` query with `spark.comet.exec.pythonMapInArrow.enabled` set
+`mapInPandas` query with `spark.comet.exec.pyarrowUdf.enabled` set
 to false (vanilla Spark path) and true (Comet's optimized path). Both
 modes run the same Python worker, so the measured delta covers what the
 optimization actually changes for users:
@@ -34,6 +34,12 @@
 optimization's user-visible value is what fraction of end-to-end time
 it shaves off, not the JVM-side delta in isolation.
 
+Caveat: the workload here is `passthrough_udf` + `count()` on `local[2]`,
+so most of the wall time is Spark's Python fork/IPC overhead with very
+little real Python work. Real UDFs (PyArrow compute, pandas ops, model
+inference) increase the per-row Python cost, which dilutes the JVM-side
+savings and shrinks the speedup ratio relative to what you see here.
+
 Usage:
     # Build Comet (release for representative numbers):
     make release
@@ -50,48 +56,20 @@
 """
 
 import contextlib
-import glob
 import os
 import statistics
+import sys
 import tempfile
 import time
 
 from pyspark.sql import SparkSession
 
-
-REPO_ROOT = os.path.abspath(
-    os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")
-)
-
-
-def _resolve_comet_jar() -> str:
-    explicit = os.environ.get("COMET_JAR")
-    if explicit:
-        return explicit
-    import pyspark
-
-    major_minor = ".".join(pyspark.__version__.split(".")[:2])
-    spark_tag = f"spark{major_minor}"
-    scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13"
-    pattern = os.path.join(
-        REPO_ROOT,
-        f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar",
-    )
-    candidates = [
-        m
-        for m in sorted(glob.glob(pattern))
-        if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m)
-    ]
-    if not candidates:
-        raise FileNotFoundError(
-            "Comet jar not found. Set COMET_JAR or run `make release`. "
-            f"Looked under {pattern}."
-        )
-    return candidates[-1]
+sys.path.insert(0, os.path.dirname(__file__))
+from conftest import resolve_comet_jar
 
 
 def _build_spark() -> SparkSession:
-    jar = _resolve_comet_jar()
+    jar = resolve_comet_jar()
     os.environ["PYSPARK_SUBMIT_ARGS"] = (
         f"--jars {jar} --driver-class-path {jar} pyspark-shell"
     )
@@ -165,7 +143,7 @@ def _temp_parquet(spark: SparkSession, build_df, n: int):
 
 def _time_run(spark: SparkSession, parquet_path: str, accelerate: bool, api: str) -> float:
     spark.conf.set(
-        "spark.comet.exec.pythonMapInArrow.enabled",
+        "spark.comet.exec.pyarrowUdf.enabled",
         "true" if accelerate else "false",
     )
     df = spark.read.parquet(parquet_path)
@@ -189,7 +167,7 @@ def main() -> None:
 
     print(f"\nrows per run: {rows:,}")
     print(f"warmup iters: {warmup}, measured iters: {iters}")
-    print(f"jar: {_resolve_comet_jar()}\n")
+    print(f"jar: {resolve_comet_jar()}\n")
 
     header = "  {:<14} {:<10} {:>10} {:>10} {:>10} {:>13} {:>9}".format(
         "api", "mode", "min (s)", "median (s)", "max (s)", "rows/s", "speedup"
diff --git a/spark/src/test/resources/pyspark/conftest.py b/spark/src/test/resources/pyspark/conftest.py
new file mode 100644
index 0000000000..35d6d85191
--- /dev/null
+++ b/spark/src/test/resources/pyspark/conftest.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Shared helpers for the pytest modules under this directory and for the
+benchmark scripts that import them.
+
+`resolve_comet_jar` returns the path to the Comet jar a Spark session needs.
+Resolution order: the `COMET_JAR` env var (expanded as a glob if it contains
+`*`, `?` or `[`, taken verbatim otherwise), then `<repo>/spark/target` matched
+against the installed pyspark major.minor version.
+"""
+
+import glob
+import os
+
+
+REPO_ROOT = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")
+)
+
+
+def resolve_comet_jar() -> str:
+    explicit = os.environ.get("COMET_JAR")
+    if explicit:
+        if any(ch in explicit for ch in "*?["):
+            matches = sorted(glob.glob(explicit))
+            if not matches:
+                raise FileNotFoundError(
+                    f"COMET_JAR pattern matched nothing: {explicit}"
+                )
+            return matches[-1]
+        return explicit
+
+    # Pick the jar that matches the installed pyspark major.minor version. The
+    # Comet jars are published per Spark version (e.g.
+    # comet-spark-spark3.5_2.12-*.jar); using the wrong one yields
+    # ClassNotFoundException on Scala stdlib classes.
+    import pyspark
+
+    major_minor = ".".join(pyspark.__version__.split(".")[:2])
+    spark_tag = f"spark{major_minor}"
+    scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13"
+    pattern = os.path.join(
+        REPO_ROOT,
+        f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar",
+    )
+    candidates = [
+        m
+        for m in sorted(glob.glob(pattern))
+        if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m)
+    ]
+    if not candidates:
+        raise FileNotFoundError(
+            "Comet jar not found. Set COMET_JAR or run `make release`. "
+            f"Looked under {pattern}."
+        )
+    return candidates[-1]
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index ea72436841..87558ec057 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -20,14 +20,14 @@
 Pytest-driven integration tests for Comet's PyArrow UDF acceleration.
 
 Each test runs against two execution paths:
-  - "accelerated": spark.comet.exec.pythonMapInArrow.enabled=true
-                   (plan should contain CometPythonMapInArrow and no ColumnarToRow)
-  - "fallback":    spark.comet.exec.pythonMapInArrow.enabled=false
-                   (plan should contain vanilla PythonMapInArrow)
+  - "accelerated": spark.comet.exec.pyarrowUdf.enabled=true
+                   (plan should contain CometMapInBatch and no ColumnarToRow)
+  - "fallback":    spark.comet.exec.pyarrowUdf.enabled=false
+                   (plan should contain vanilla PythonMapInArrow / MapInArrow)
 
 Usage:
     # Build Comet first:
-    make release
+    make
 
     # Then either let the test discover the jar from spark/target, or pass it
     # explicitly via COMET_JAR:
@@ -38,7 +38,6 @@
 """
 
 import datetime as dt
-import glob
 import os
 from decimal import Decimal
 
@@ -46,52 +45,12 @@
 import pytest
 from pyspark.sql import SparkSession, types as T
 
-
-REPO_ROOT = os.path.abspath(
-    os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")
-)
-
-
-def _resolve_comet_jar() -> str:
-    explicit = os.environ.get("COMET_JAR")
-    if explicit:
-        if any(ch in explicit for ch in "*?["):
-            matches = sorted(glob.glob(explicit))
-            if not matches:
-                raise FileNotFoundError(
-                    f"COMET_JAR pattern matched nothing: {explicit}"
-                )
-            return matches[-1]
-        return explicit
-
-    # Pick the jar that matches the installed pyspark major.minor version. The
-    # Comet jars are published per Spark version (e.g., comet-spark-spark3.5_2.12-*.jar);
-    # using the wrong one yields ClassNotFoundException on Scala stdlib classes.
-    import pyspark
-
-    major_minor = ".".join(pyspark.__version__.split(".")[:2])
-    spark_tag = f"spark{major_minor}"
-    scala_tag = "_2.12" if major_minor.startswith("3.") else "_2.13"
-    pattern = os.path.join(
-        REPO_ROOT,
-        f"spark/target/comet-spark-{spark_tag}{scala_tag}-*-SNAPSHOT.jar",
-    )
-    candidates = [
-        m
-        for m in sorted(glob.glob(pattern))
-        if "sources" not in os.path.basename(m) and "tests" not in os.path.basename(m)
-    ]
-    if not candidates:
-        raise FileNotFoundError(
-            "Comet jar not found. Set COMET_JAR or run `make release`. "
-            f"Looked under {pattern}."
-        )
-    return candidates[-1]
+from conftest import resolve_comet_jar
 
 
 @pytest.fixture(scope="session")
 def spark():
-    jar = _resolve_comet_jar()
+    jar = resolve_comet_jar()
     # PYSPARK_SUBMIT_ARGS is consumed when pyspark launches its JVM. Setting
     # --jars puts the Comet jar on both driver and executor classpaths so the
     # CometPlugin can be loaded.
@@ -117,7 +76,7 @@ def spark():
 @pytest.fixture(params=[True, False], ids=["accelerated", "fallback"])
 def accelerated(request, spark) -> bool:
     spark.conf.set(
-        "spark.comet.exec.pythonMapInArrow.enabled",
+        "spark.comet.exec.pyarrowUdf.enabled",
         "true" if request.param else "false",
     )
     return request.param
@@ -128,18 +87,18 @@ def _executed_plan(df) -> str:
 
 
 def _assert_plan_matches_mode(
-    plan: str, accelerated: bool, vanilla_node: str = "PythonMapInArrow"
+    plan: str, accelerated: bool, vanilla_node: str = "MapInArrow"
 ) -> None:
     if accelerated:
-        assert "CometPythonMapInArrow" in plan, (
-            f"expected CometPythonMapInArrow in accelerated plan, got:\n{plan}"
+        assert "CometMapInBatch" in plan, (
+            f"expected CometMapInBatch in accelerated plan, got:\n{plan}"
         )
         assert "ColumnarToRow" not in plan, (
             f"unexpected ColumnarToRow in accelerated plan:\n{plan}"
         )
     else:
-        assert "CometPythonMapInArrow" not in plan, (
-            f"unexpected CometPythonMapInArrow in fallback plan:\n{plan}"
+        assert "CometMapInBatch" not in plan, (
+            f"unexpected CometMapInBatch in fallback plan:\n{plan}"
         )
         assert vanilla_node in plan, (
             f"expected {vanilla_node} in fallback plan, got:\n{plan}"
@@ -176,6 +135,11 @@ def double_value(iterator):
         assert row["name"] == original[2]
 
 
+# Tests use the default `vanilla_node="MapInArrow"` unless noted; the mapInPandas tests below
+# pass `MapInPandas` explicitly. "MapInArrow" matches the fallback node on both Spark 3.5
+# (PythonMapInArrowExec) and Spark 4.x (MapInArrowExec) because it is a substring of both.
+
+
 def test_map_in_arrow_changes_schema(spark, tmp_path, accelerated):
     data = [(i, float(i)) for i in range(50)]
     src = str(tmp_path / "src.parquet")
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
deleted file mode 100644
index 7b1e17c4ed..0000000000
--- a/spark/src/test/scala/org/apache/comet/exec/CometPythonMapInArrowSuite.scala
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.comet.exec
-
-import org.apache.spark.sql.CometTestBase
-import org.apache.spark.sql.comet.CometPythonMapInArrowExec
-
-import org.apache.comet.CometConf
-
-class CometPythonMapInArrowSuite extends CometTestBase {
-
-  test("plan with CometScan has columnar support for Python UDF optimization") {
-    withSQLConf(
-      CometConf.COMET_ENABLED.key -> "true",
-      CometConf.COMET_EXEC_ENABLED.key -> "true",
-      CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "true") {
-      withParquetTable(
-        (1 to 10).map(i => (i.toDouble, s"str_$i")),
-        "testTable",
-        withDictionary = false) {
-        val df = spark.sql("SELECT * FROM testTable")
-        val plan = df.queryExecution.executedPlan
-        val cometScans = plan.collect { case s if s.supportsColumnar => s }
-        assert(cometScans.nonEmpty, "Expected columnar operators that can feed Python UDFs")
-      }
-    }
-  }
-
-  test("config disables Python map in arrow optimization") {
-    withSQLConf(
-      CometConf.COMET_ENABLED.key -> "true",
-      CometConf.COMET_EXEC_ENABLED.key -> "true",
-      CometConf.COMET_PYTHON_MAP_IN_ARROW_ENABLED.key -> "false") {
-      withParquetTable(
-        (1 to 10).map(i => (i.toDouble, s"str_$i")),
-        "testTable",
-        withDictionary = false) {
-        val df = spark.sql("SELECT * FROM testTable")
-        val plan = df.queryExecution.executedPlan
-        // With the feature disabled, no CometPythonMapInArrowExec should appear
-        val cometPythonExecs =
-          plan.collect { case e: CometPythonMapInArrowExec => e }
-        assert(
-          cometPythonExecs.isEmpty,
-          "CometPythonMapInArrowExec should not appear when disabled")
-      }
-    }
-  }
-}
diff --git a/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
new file mode 100644
index 0000000000..af960c5c97
--- /dev/null
+++ b/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet
+
+import org.apache.spark.api.python.{PythonAccumulatorV2, PythonBroadcast, PythonEvalType, PythonFunction}
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId, PythonUDF}
+import org.apache.spark.sql.execution.{ColumnarToRowExec, LeafExecNode}
+import org.apache.spark.sql.execution.python.PythonMapInArrowExec
+import org.apache.spark.sql.types.{LongType, StructField, StructType}
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import org.apache.comet.CometConf
+import org.apache.comet.rules.EliminateRedundantTransitions
+
+/** Minimal columnar `CometPlan` leaf that anchors the rule's transform; executing it throws. */
+private case class StubCometLeaf(override val output: Seq[Attribute])
+    extends LeafExecNode
+    with CometPlan {
+  override def supportsColumnar: Boolean = true // so ColumnarToRowExec can wrap it
+  override protected def doExecute(): RDD[InternalRow] =
+    throw new UnsupportedOperationException // plan-only stub: never meant to run
+  override protected def doExecuteColumnar(): RDD[ColumnarBatch] =
+    throw new UnsupportedOperationException // plan-only stub: never meant to run
+}
+
+/**
+ * Plan-rule tests for the `EliminateRedundantTransitions` rewrite that produces
+ * `CometMapInBatchExec`, run with the feature flag on and off. Python execution is covered by
+ * the pytest module `test_pyarrow_udf.py`; this suite checks the JVM-side rule without Python.
+ *
+ * Lives under `org.apache.spark.sql.comet` so it can reference Spark's `private[spark]`
+ * `PythonFunction` / `PythonAccumulatorV2` / `PythonBroadcast` classes when fabricating a stub
+ * `PythonUDF` for `PythonMapInArrowExec` to wrap.
+ */
+class CometMapInBatchSuite extends CometTestBase {
+
+  private def stubPythonUDF: PythonUDF = {
+    val pyFunc = new PythonFunction {
+      override val command: Seq[Byte] = Seq.empty[Byte] // empty payload; never executed here
+      override val envVars: java.util.Map[String, String] =
+        new java.util.HashMap[String, String]()
+      override val pythonIncludes: java.util.List[String] =
+        java.util.Collections.emptyList[String]()
+      override val pythonExec: String = "python3"
+      override val pythonVer: String = "3"
+      override val broadcastVars: java.util.List[Broadcast[PythonBroadcast]] =
+        java.util.Collections.emptyList[Broadcast[PythonBroadcast]]()
+      override val accumulator: PythonAccumulatorV2 = null // stub; no Python worker is started
+    }
+    PythonUDF(
+      name = "test_udf",
+      func = pyFunc,
+      dataType = StructType(Seq(StructField("id", LongType))),
+      children = Seq(AttributeReference("id", LongType)(ExprId(0L))), // same id as leaf output
+      evalType = PythonEvalType.SQL_MAP_ARROW_ITER_UDF,
+      udfDeterministic = true)
+  }
+
+  private def buildPlan(): PythonMapInArrowExec = {
+    val cometChild = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L))))
+    PythonMapInArrowExec(
+      stubPythonUDF,
+      cometChild.output,
+      ColumnarToRowExec(cometChild), // the transition the rule is expected to eliminate
+      isBarrier = false)
+  }
+
+  test("rule rewrites PythonMapInArrowExec over Comet to CometMapInBatchExec") {
+    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") {
+      val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan())
+      assert(
+        rewritten.exists(_.isInstanceOf[CometMapInBatchExec]),
+        s"expected CometMapInBatchExec in rewritten plan:\n$rewritten")
+    }
+  }
+
+  test("rule does not rewrite when feature is disabled") {
+    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") {
+      val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan())
+      assert(
+        !rewritten.exists(_.isInstanceOf[CometMapInBatchExec]),
+        s"unexpected CometMapInBatchExec when disabled:\n$rewritten")
+    }
+  }
+}
diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
new file mode 100644
index 0000000000..5ab0b927a2
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet
+
+import org.apache.spark.api.python.{PythonAccumulatorV2, PythonBroadcast, PythonEvalType, PythonFunction}
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId, PythonUDF}
+import org.apache.spark.sql.execution.{ColumnarToRowExec, LeafExecNode}
+import org.apache.spark.sql.execution.python.MapInArrowExec
+import org.apache.spark.sql.types.{LongType, StructField, StructType}
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import org.apache.comet.CometConf
+import org.apache.comet.rules.EliminateRedundantTransitions
+
+/** Minimal columnar `CometPlan` leaf that anchors the rule's transform; executing it throws. */
+private case class StubCometLeaf(override val output: Seq[Attribute])
+    extends LeafExecNode
+    with CometPlan {
+  override def supportsColumnar: Boolean = true // so ColumnarToRowExec can wrap it
+  override protected def doExecute(): RDD[InternalRow] =
+    throw new UnsupportedOperationException // plan-only stub: never meant to run
+  override protected def doExecuteColumnar(): RDD[ColumnarBatch] =
+    throw new UnsupportedOperationException // plan-only stub: never meant to run
+}
+
+/**
+ * Plan-rule tests for the `EliminateRedundantTransitions` rewrite that produces
+ * `CometMapInBatchExec`, run with the feature flag on and off. Python execution is covered by
+ * the pytest module `test_pyarrow_udf.py`; this suite checks the JVM-side rule without Python.
+ *
+ * Lives under `org.apache.spark.sql.comet` so it can reference Spark's `private[spark]`
+ * `PythonFunction` / `PythonAccumulatorV2` / `PythonBroadcast` classes when fabricating a stub
+ * `PythonUDF` for `MapInArrowExec` to wrap.
+ */
+class CometMapInBatchSuite extends CometTestBase {
+
+  private def stubPythonUDF: PythonUDF = {
+    val pyFunc = new PythonFunction {
+      override val command: Seq[Byte] = Seq.empty[Byte] // empty payload; never executed here
+      override val envVars: java.util.Map[String, String] =
+        new java.util.HashMap[String, String]()
+      override val pythonIncludes: java.util.List[String] =
+        java.util.Collections.emptyList[String]()
+      override val pythonExec: String = "python3"
+      override val pythonVer: String = "3"
+      override val broadcastVars: java.util.List[Broadcast[PythonBroadcast]] =
+        java.util.Collections.emptyList[Broadcast[PythonBroadcast]]()
+      override val accumulator: PythonAccumulatorV2 = null // stub; no Python worker is started
+    }
+    PythonUDF(
+      name = "test_udf",
+      func = pyFunc,
+      dataType = StructType(Seq(StructField("id", LongType))),
+      children = Seq(AttributeReference("id", LongType)(ExprId(0L))), // same id as leaf output
+      evalType = PythonEvalType.SQL_MAP_ARROW_ITER_UDF,
+      udfDeterministic = true)
+  }
+
+  private def buildPlan(): MapInArrowExec = {
+    val cometChild = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L))))
+    MapInArrowExec(
+      stubPythonUDF,
+      cometChild.output,
+      ColumnarToRowExec(cometChild), // the transition the rule is expected to eliminate
+      isBarrier = false,
+      profile = None)
+  }
+
+  test("rule rewrites MapInArrowExec over Comet to CometMapInBatchExec") {
+    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") {
+      val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan())
+      assert(
+        rewritten.exists(_.isInstanceOf[CometMapInBatchExec]),
+        s"expected CometMapInBatchExec in rewritten plan:\n$rewritten")
+    }
+  }
+
+  test("rule does not rewrite when feature is disabled") {
+    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") {
+      val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan())
+      assert(
+        !rewritten.exists(_.isInstanceOf[CometMapInBatchExec]),
+        s"unexpected CometMapInBatchExec when disabled:\n$rewritten")
+    }
+  }
+}

From 6f5aca3b30d6d84d06db7f6dc1556ab3ad50a3e8 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 18:40:12 -0600
Subject: [PATCH 18/54] fix: resolve ArrowPythonRunner inputs on driver, not in
 task closure

CometMapInBatchExec previously captured `SQLConf` in the partition closure
and resolved `sessionLocalTimeZone` / `arrowUseLargeVarTypes` /
`getPythonRunnerConfMap` on the executor inside `runnerInputs`. SQLConf
reads from a thread-local ConfigReader that only exists on the driver, so
this NPEs on the executor (reported by wForget on #4234).

Move the `runnerInputs(...)` call to the driver in `doExecuteColumnar` and
pass the resolved primitives into `computeArrowPython` as a serializable
`RunnerInputs` case class. The per-minor shims now take `RunnerInputs`
instead of `(PythonUDF, SQLConf)`.

Also drop now-unused imports from `Spark4xMapInBatchSupport` which were
flagged by scalafix on the Spark 4.0 lint job.
---
 .../spark/sql/comet/CometMapInBatchExec.scala | 10 +++--
 .../sql/comet/shims/ShimCometMapInBatch.scala |  9 ++++-
 .../sql/comet/shims/ShimCometMapInBatch.scala | 39 +++++++++++++------
 .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++-----
 .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++-----
 .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++-----
 .../shims/Spark4xMapInBatchSupport.scala      | 17 ++++----
 7 files changed, 71 insertions(+), 61 deletions(-)

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
index 77dbfff7ce..233df4d0dc 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
@@ -80,14 +80,17 @@ case class CometMapInBatchExec(
     val numOutputBatches = longMetric("numOutputBatches")
     val numInputRows = longMetric("numInputRows")
 
-    val pythonUDF = func.asInstanceOf[PythonUDF]
     val outputAttrs = output
     val childSchema = child.schema
     val batchSize = conf.arrowMaxRecordsPerBatch
     val evalType = pythonEvalType
-    val sqlConf = conf
     val metricsCopy = pythonMetrics
 
+    // Resolve every `SQLConf`-derived input on the driver. `SQLConf.get` reads from a thread-local
+    // `ConfigReader` that only exists on the driver, so dereferencing `conf` from inside the task
+    // closure NPEs (see #4234 review).
+    val resolvedRunnerInputs = runnerInputs(func.asInstanceOf[PythonUDF], conf)
+
     val inputRDD = child.executeColumnar()
 
     def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = {
@@ -108,11 +111,10 @@ case class CometMapInBatchExec(
         if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
 
       val columnarBatchIter = computeArrowPython(
-        pythonUDF,
+        resolvedRunnerInputs,
         evalType,
         argOffsets,
         StructType(Array(StructField("struct", childSchema))),
-        sqlConf,
         metricsCopy,
         batchIter,
         context.partitionId(),
diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index c7d6ae2f97..1bde7ca094 100644
--- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -42,12 +42,17 @@ trait ShimCometMapInBatch {
 
   protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None
 
+  /** Stub; never constructed on Spark 3.4 because the matchers always return `None`. */
+  protected case class RunnerInputs()
+
+  protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
+    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4")
+
   protected def computeArrowPython(
-      pythonUDF: PythonUDF,
+      runnerInputs: RunnerInputs,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index 42d66465f4..a04681044c 100644
--- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -58,27 +58,44 @@ trait ShimCometMapInBatch {
       case _ => None
     }
 
+  /** Inputs Spark 3.5's `ArrowPythonRunner` constructor needs. */
+  protected case class RunnerInputs(
+      chainedFunc: Seq[ChainedPythonFunctions],
+      timeZoneId: String,
+      largeVarTypes: Boolean,
+      pythonRunnerConf: Map[String, String],
+      jobArtifactUUID: Option[String])
+
+  /**
+   * Resolves the `SQLConf`-derived inputs the `ArrowPythonRunner` needs. Must be called on the
+   * driver: `conf.sessionLocalTimeZone` etc. read from a thread-local `ConfigReader` that only
+   * exists on the driver, so dereferencing them from a task closure NPEs.
+   */
+  protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
+    RunnerInputs(
+      chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))),
+      timeZoneId = conf.sessionLocalTimeZone,
+      largeVarTypes = conf.arrowUseLargeVarTypes,
+      pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf),
+      jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid))
+
   protected def computeArrowPython(
-      pythonUDF: PythonUDF,
+      runnerInputs: RunnerInputs,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func)))
-    val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)
+      context: TaskContext): Iterator[ColumnarBatch] =
     new ArrowPythonRunner(
-      chainedFunc,
+      runnerInputs.chainedFunc,
       evalType,
       argOffsets,
       schema,
-      conf.sessionLocalTimeZone,
-      conf.arrowUseLargeVarTypes,
-      ArrowPythonRunner.getPythonRunnerConfMap(conf),
+      runnerInputs.timeZoneId,
+      runnerInputs.largeVarTypes,
+      runnerInputs.pythonRunnerConf,
       pythonMetrics,
-      jobArtifactUUID).compute(batchIter, partitionId, context)
-  }
+      runnerInputs.jobArtifactUUID).compute(batchIter, partitionId, context)
 }
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index 0c21cb3738..fdc9a03e14 100644
--- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -21,36 +21,31 @@ package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.ArrowPythonRunner
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
 
   protected def computeArrowPython(
-      pythonUDF: PythonUDF,
+      runnerInputs: RunnerInputs,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val r = runnerInputs(pythonUDF, conf)
+      context: TaskContext): Iterator[ColumnarBatch] =
     new ArrowPythonRunner(
-      r.chainedFunc,
+      runnerInputs.chainedFunc,
       evalType,
       argOffsets,
       schema,
-      r.timeZoneId,
-      r.largeVarTypes,
-      r.pythonRunnerConf,
+      runnerInputs.timeZoneId,
+      runnerInputs.largeVarTypes,
+      runnerInputs.pythonRunnerConf,
       pythonMetrics,
-      r.jobArtifactUUID,
+      runnerInputs.jobArtifactUUID,
       None).compute(batchIter, partitionId, context)
-  }
 }
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index e73748aafe..b0e6ecc3a0 100644
--- a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -21,37 +21,32 @@ package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.ArrowPythonRunner
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
 
   protected def computeArrowPython(
-      pythonUDF: PythonUDF,
+      runnerInputs: RunnerInputs,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val r = runnerInputs(pythonUDF, conf)
+      context: TaskContext): Iterator[ColumnarBatch] =
     new ArrowPythonRunner(
-      r.chainedFunc,
+      runnerInputs.chainedFunc,
       evalType,
       argOffsets,
       schema,
-      r.timeZoneId,
-      r.largeVarTypes,
-      r.pythonRunnerConf,
+      runnerInputs.timeZoneId,
+      runnerInputs.largeVarTypes,
+      runnerInputs.pythonRunnerConf,
       pythonMetrics,
-      r.jobArtifactUUID,
+      runnerInputs.jobArtifactUUID,
       None,
       None).compute(batchIter, partitionId, context)
-  }
 }
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index 0c21cb3738..fdc9a03e14 100644
--- a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -21,36 +21,31 @@ package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.ArrowPythonRunner
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
 
   protected def computeArrowPython(
-      pythonUDF: PythonUDF,
+      runnerInputs: RunnerInputs,
       evalType: Int,
       argOffsets: Array[Array[Int]],
       schema: StructType,
-      conf: SQLConf,
       pythonMetrics: Map[String, SQLMetric],
       batchIter: Iterator[Iterator[InternalRow]],
       partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] = {
-    val r = runnerInputs(pythonUDF, conf)
+      context: TaskContext): Iterator[ColumnarBatch] =
     new ArrowPythonRunner(
-      r.chainedFunc,
+      runnerInputs.chainedFunc,
       evalType,
       argOffsets,
       schema,
-      r.timeZoneId,
-      r.largeVarTypes,
-      r.pythonRunnerConf,
+      runnerInputs.timeZoneId,
+      runnerInputs.largeVarTypes,
+      runnerInputs.pythonRunnerConf,
       pythonMetrics,
-      r.jobArtifactUUID,
+      runnerInputs.jobArtifactUUID,
       None).compute(batchIter, partitionId, context)
-  }
 }
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala
index 78672aea5e..bfb56427cf 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala
@@ -19,21 +19,17 @@
 
 package org.apache.spark.sql.comet.shims
 
-import org.apache.spark.{JobArtifactSet, TaskContext}
+import org.apache.spark.JobArtifactSet
 import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInArrowExec, MapInPandasExec}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.vectorized.ColumnarBatch
 
 /**
- * Shared 4.x bits for `ShimCometMapInBatch`. The matchers and `getRunnerInputs` helper are
- * identical across 4.0/4.1/4.2; only the `ArrowPythonRunner` constructor parameter list differs
- * per minor, so each minor's `ShimCometMapInBatch` provides only `computeArrowPython`.
+ * Shared 4.x bits for `ShimCometMapInBatch`. The matchers and `runnerInputs` helper are identical
+ * across 4.0/4.1/4.2; only the `ArrowPythonRunner` constructor parameter list differs per minor,
+ * so each minor's `ShimCometMapInBatch` provides only `computeArrowPython`.
  */
 trait Spark4xMapInBatchSupport {
 
@@ -71,6 +67,11 @@ trait Spark4xMapInBatchSupport {
       pythonRunnerConf: Map[String, String],
       jobArtifactUUID: Option[String])
 
+  /**
+   * Resolves the `SQLConf`-derived inputs the `ArrowPythonRunner` needs. Must be called on the
+   * driver: `SQLConf.get` reads from a thread-local `ConfigReader` that only exists on the
+   * driver, so dereferencing `conf.sessionLocalTimeZone` etc. from a task closure NPEs.
+   */
   protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
     RunnerInputs(
       chainedFunc = Seq((ChainedPythonFunctions(Seq(pythonUDF.func)), pythonUDF.resultId.id)),

From 213e96cd5ca21c3274e5db1a7161c1f73a7e9ca2 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 18:40:17 -0600
Subject: [PATCH 19/54] ci: register CometMapInBatchSuite in pr_build_linux
 suite list

The refactor commit renamed `CometPythonMapInArrowSuite` to
`CometMapInBatchSuite` and moved it under `org.apache.spark.sql.comet`,
but the pr_build_linux workflow still referenced the old FQN, so
`check-missing-suites` failed.
---
 .github/workflows/pr_build_linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index b62a000f6c..d40226c941 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -354,7 +354,7 @@ jobs:
               org.apache.comet.exec.CometGenerateExecSuite
               org.apache.comet.exec.CometWindowExecSuite
               org.apache.comet.exec.CometJoinSuite
-              org.apache.comet.exec.CometPythonMapInArrowSuite
+              org.apache.spark.sql.comet.CometMapInBatchSuite
               org.apache.comet.CometNativeSuite
               org.apache.comet.CometSparkSessionExtensionsSuite
               org.apache.spark.CometPluginsSuite

From a7e3fa39f8028d960a6bb580f805a2b5d75d1b2a Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 18:40:19 -0600
Subject: [PATCH 20/54] ci: tighten pyarrow_udf_test triggers to
 feature-specific paths

Switch from `paths-ignore` to an explicit allowlist anchored on the files
that actually affect the feature. The previous filter re-ran the
15-minute workflow on any unrelated Rust or Scala change.

Per review feedback on #4234.
---
 .github/workflows/pyarrow_udf_test.yml | 40 +++++++++++++-------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index 211a9bd23a..e325ab8b6d 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -25,27 +25,27 @@ on:
   push:
     branches:
       - main
-    paths-ignore:
-      - "benchmarks/**"
-      - "doc/**"
-      - "docs/**"
-      - "**.md"
-      - "dev/changelog/*.md"
-      - "native/core/benches/**"
-      - "native/spark-expr/benches/**"
-      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
-      - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
+    paths: &feature-paths
+      - "pom.xml"
+      - "common/pom.xml"
+      - "common/src/main/scala/org/apache/comet/CometConf.scala"
+      - "spark/pom.xml"
+      - "spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala"
+      - "spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala"
+      - "spark/src/main/scala/org/apache/spark/sql/comet/shims/MapInBatchInfo.scala"
+      - "spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala"
+      - "spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala"
+      - "spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala"
+      - "spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala"
+      - "spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala"
+      - "spark/src/main/spark-4.x/org/apache/spark/sql/comet/shims/Spark4xMapInBatchSupport.scala"
+      - "spark/src/test/resources/pyspark/conftest.py"
+      - "spark/src/test/resources/pyspark/test_pyarrow_udf.py"
+      - "spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala"
+      - "spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala"
+      - ".github/workflows/pyarrow_udf_test.yml"
   pull_request:
-    paths-ignore:
-      - "benchmarks/**"
-      - "doc/**"
-      - "docs/**"
-      - "**.md"
-      - "dev/changelog/*.md"
-      - "native/core/benches/**"
-      - "native/spark-expr/benches/**"
-      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
-      - "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
+    paths: *feature-paths
   workflow_dispatch:
 
 permissions:

From 338dcc1e945af7738c2757aa4c0a84523453792a Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 18:40:21 -0600
Subject: [PATCH 21/54] docs: clarify pyarrowUdf conf vs Spark's PySpark Arrow
 conversion conf

Address review question on #4234: `spark.comet.exec.pyarrowUdf.enabled` is
distinct from `spark.sql.execution.arrow.pyspark.enabled` (which controls
`toPandas()` / `createDataFrame(pandas_df)` conversion only, not
`mapInArrow` / `mapInPandas`). Add a short section to the user guide so
readers don't conflate the two.
---
 docs/source/user-guide/latest/pyarrow-udfs.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 23ef50e79c..8495184812 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -81,6 +81,15 @@ spark.comet.exec.pyarrowUdf.enabled=true
 
 The default is `false` while the feature stabilizes.
 
+### Relationship to Spark's PySpark Arrow conversion conf
+
+`spark.comet.exec.pyarrowUdf.enabled` is **not** the same as PySpark's
+[`spark.sql.execution.arrow.pyspark.enabled`](https://spark.apache.org/docs/latest/api/python/tutorial/sql/arrow_pandas.html#enabling-for-conversion-to-from-pandas).
+That conf controls whether Spark uses Arrow when materializing a DataFrame to a Pandas DataFrame
+(`toPandas()`) or building a Spark DataFrame from a Pandas one (`createDataFrame(pandas_df)`). The
+Comet conf controls a planner rewrite for `mapInArrow` / `mapInPandas`, and only affects how
+Comet's columnar batches feed the Python worker. Both confs can be set independently.
+
 ## Supported APIs
 
 | PySpark API                      | Spark Plan Node             | Supported |

From 46a238ed9ad76d44d8494d61d5b40d4ccc9a82de Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 18:44:59 -0600
Subject: [PATCH 22/54] ci: register CometMapInBatchSuite in pr_build_macos
 suite list

Mirror the pr_build_linux fix: the macos workflow also still referenced the old CometPythonMapInArrowSuite FQN.
---
 .github/workflows/pr_build_macos.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index 6f2d2fb41b..0faca13269 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -194,7 +194,7 @@ jobs:
               org.apache.comet.exec.CometGenerateExecSuite
               org.apache.comet.exec.CometWindowExecSuite
               org.apache.comet.exec.CometJoinSuite
-              org.apache.comet.exec.CometPythonMapInArrowSuite
+              org.apache.spark.sql.comet.CometMapInBatchSuite
               org.apache.comet.CometNativeSuite
               org.apache.comet.CometSetOpWithGroupBySuite
               org.apache.comet.CometSparkSessionExtensionsSuite

From 5fed7342f4841df881e739be028059115659756e Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 19:38:58 -0600
Subject: [PATCH 23/54] test: add cross-allocator transfer probe for pyarrow
 UDF runner [skip ci]

---
 .../CometColumnarPythonInputProbeSuite.scala  | 135 ++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala

diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala
new file mode 100644
index 0000000000..2ee43b5982
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.util.concurrent.atomic.AtomicReference
+
+import org.apache.arrow.vector.{FieldVector, VectorSchemaRoot}
+import org.apache.arrow.vector.complex.StructVector
+import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.comet.util.Utils
+import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
+
+/**
+ * Probe test for the invariant `CometColumnarPythonInput` relies on: `makeTransferPair` between a
+ * Comet `ColumnarBatch`'s field vectors and a `VectorSchemaRoot` allocated from
+ * `ArrowUtils.rootAllocator` must succeed, and the resulting Arrow IPC stream must round-trip to
+ * equivalent row counts and values.
+ *
+ * If this test starts failing because of allocator changes, `CometColumnarPythonInput` must grow
+ * a per-buffer copy fallback before the regression lands on main.
+ */
+class CometColumnarPythonInputProbeSuite extends CometTestBase {
+
+  test("Comet vectors transfer-pair into ArrowUtils.rootAllocator VSR and round-trip via IPC") {
+    withTempPath { path =>
+      val pathStr = path.getCanonicalPath
+      spark
+        .range(0, 1000, 1, 1)
+        .selectExpr("id AS id", "CAST(id AS DOUBLE) * 1.5 AS value")
+        .write
+        .mode("overwrite")
+        .parquet(pathStr)
+
+      val df = spark.read.parquet(pathStr)
+      val childSchema = df.schema
+      val wireSchema = StructType(Array(StructField("struct", childSchema)))
+      // Capture timezone on the driver before entering the partition closure.
+      val timeZoneId = conf.sessionLocalTimeZone
+
+      // ColumnarBatch is not serializable, so we can't use take()/collect(). Run all Arrow
+      // operations inside foreachPartition. In local mode this executes in the same JVM.
+      // A shared AtomicReference carries any failure back to the test thread.
+      val failureRef = new AtomicReference[Throwable](null)
+
+      df.queryExecution.executedPlan
+        .collectFirst { case n: CometNativeExec => n }
+        .getOrElse(fail("Expected CometNativeExec in plan"))
+        .executeColumnar()
+        .foreachPartition { (batches: Iterator[ColumnarBatch]) =>
+          if (batches.hasNext) {
+            val batch = batches.next()
+            try {
+              val arrowSchema = Utils.toArrowSchema(wireSchema, timeZoneId)
+              val rootAlloc = org.apache.spark.sql.util.ArrowUtils.rootAllocator
+              val allocator = rootAlloc.newChildAllocator("probe-allocator", 0, Long.MaxValue)
+              val root = VectorSchemaRoot.create(arrowSchema, allocator)
+              try {
+                val structVec = root.getVector(0).asInstanceOf[StructVector]
+                var i = 0
+                while (i < batch.numCols()) {
+                  val src = batch.column(i).asInstanceOf[ArrowColumnVector].getValueVector
+                  val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
+                  src.makeTransferPair(dst).transfer()
+                  i += 1
+                }
+                structVec.setValueCount(batch.numRows())
+                root.setRowCount(batch.numRows())
+
+                val baos = new ByteArrayOutputStream()
+                val writer = new ArrowStreamWriter(root, null, baos)
+                writer.start()
+                writer.writeBatch()
+                writer.end()
+
+                val readAllocator =
+                  rootAlloc.newChildAllocator("probe-read", 0, Long.MaxValue)
+                try {
+                  val reader = new ArrowStreamReader(
+                    new ByteArrayInputStream(baos.toByteArray),
+                    readAllocator)
+                  try {
+                    if (!reader.loadNextBatch()) {
+                      failureRef.set(
+                        new AssertionError("IPC round-trip: expected at least one record batch"))
+                    } else {
+                      val readRoot = reader.getVectorSchemaRoot
+                      if (readRoot.getRowCount != batch.numRows()) {
+                        failureRef.set(
+                          new AssertionError(
+                            s"row count mismatch: read=${readRoot.getRowCount}, " +
+                              s"expected=${batch.numRows()}"))
+                      }
+                    }
+                  } finally {
+                    reader.close()
+                  }
+                } finally {
+                  readAllocator.close()
+                }
+              } finally {
+                root.close()
+                allocator.close()
+                batch.close()
+              }
+            } catch {
+              case t: Throwable => failureRef.set(t)
+            }
+          }
+        }
+
+      val failure = failureRef.get()
+      if (failure != null) throw failure
+    }
+  }
+}

From 03f74898e07f0ab104c289405759ceddead2cee2 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 20:34:26 -0600
Subject: [PATCH 24/54] Revert "test: add cross-allocator transfer probe for
 pyarrow UDF runner" [skip ci]

The probe's cross-allocator transfer invariant turned out to be false:
Comet's Parquet readers each construct their own RootAllocator, separate
from ArrowUtils.rootAllocator. The original probe also had a silent-pass
bug: it recorded failures in an AtomicReference captured by the
foreachPartition closure, and that reference doesn't cross Spark task
boundaries. The redesigned trait uses a per-buffer byte copy instead of
TransferPair, so the probe is no longer load-bearing.
---
 .../CometColumnarPythonInputProbeSuite.scala  | 135 ------------------
 1 file changed, 135 deletions(-)
 delete mode 100644 spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala

diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala
deleted file mode 100644
index 2ee43b5982..0000000000
--- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometColumnarPythonInputProbeSuite.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.comet
-
-import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
-import java.util.concurrent.atomic.AtomicReference
-
-import org.apache.arrow.vector.{FieldVector, VectorSchemaRoot}
-import org.apache.arrow.vector.complex.StructVector
-import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
-import org.apache.spark.sql.CometTestBase
-import org.apache.spark.sql.comet.util.Utils
-import org.apache.spark.sql.types.{StructField, StructType}
-import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
-
-/**
- * Probe test for the invariant `CometColumnarPythonInput` relies on: `makeTransferPair` between a
- * Comet `ColumnarBatch`'s field vectors and a `VectorSchemaRoot` allocated from
- * `ArrowUtils.rootAllocator` must succeed, and the resulting Arrow IPC stream must round-trip to
- * equivalent row counts and values.
- *
- * If this test starts failing because of allocator changes, `CometColumnarPythonInput` must grow
- * a per-buffer copy fallback before the regression lands on main.
- */
-class CometColumnarPythonInputProbeSuite extends CometTestBase {
-
-  test("Comet vectors transfer-pair into ArrowUtils.rootAllocator VSR and round-trip via IPC") {
-    withTempPath { path =>
-      val pathStr = path.getCanonicalPath
-      spark
-        .range(0, 1000, 1, 1)
-        .selectExpr("id AS id", "CAST(id AS DOUBLE) * 1.5 AS value")
-        .write
-        .mode("overwrite")
-        .parquet(pathStr)
-
-      val df = spark.read.parquet(pathStr)
-      val childSchema = df.schema
-      val wireSchema = StructType(Array(StructField("struct", childSchema)))
-      // Capture timezone on the driver before entering the partition closure.
-      val timeZoneId = conf.sessionLocalTimeZone
-
-      // ColumnarBatch is not serializable, so we can't use take()/collect(). Run all Arrow
-      // operations inside foreachPartition. In local mode this executes in the same JVM.
-      // A shared AtomicReference carries any failure back to the test thread.
-      val failureRef = new AtomicReference[Throwable](null)
-
-      df.queryExecution.executedPlan
-        .collectFirst { case n: CometNativeExec => n }
-        .getOrElse(fail("Expected CometNativeExec in plan"))
-        .executeColumnar()
-        .foreachPartition { (batches: Iterator[ColumnarBatch]) =>
-          if (batches.hasNext) {
-            val batch = batches.next()
-            try {
-              val arrowSchema = Utils.toArrowSchema(wireSchema, timeZoneId)
-              val rootAlloc = org.apache.spark.sql.util.ArrowUtils.rootAllocator
-              val allocator = rootAlloc.newChildAllocator("probe-allocator", 0, Long.MaxValue)
-              val root = VectorSchemaRoot.create(arrowSchema, allocator)
-              try {
-                val structVec = root.getVector(0).asInstanceOf[StructVector]
-                var i = 0
-                while (i < batch.numCols()) {
-                  val src = batch.column(i).asInstanceOf[ArrowColumnVector].getValueVector
-                  val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
-                  src.makeTransferPair(dst).transfer()
-                  i += 1
-                }
-                structVec.setValueCount(batch.numRows())
-                root.setRowCount(batch.numRows())
-
-                val baos = new ByteArrayOutputStream()
-                val writer = new ArrowStreamWriter(root, null, baos)
-                writer.start()
-                writer.writeBatch()
-                writer.end()
-
-                val readAllocator =
-                  rootAlloc.newChildAllocator("probe-read", 0, Long.MaxValue)
-                try {
-                  val reader = new ArrowStreamReader(
-                    new ByteArrayInputStream(baos.toByteArray),
-                    readAllocator)
-                  try {
-                    if (!reader.loadNextBatch()) {
-                      failureRef.set(
-                        new AssertionError("IPC round-trip: expected at least one record batch"))
-                    } else {
-                      val readRoot = reader.getVectorSchemaRoot
-                      if (readRoot.getRowCount != batch.numRows()) {
-                        failureRef.set(
-                          new AssertionError(
-                            s"row count mismatch: read=${readRoot.getRowCount}, " +
-                              s"expected=${batch.numRows()}"))
-                      }
-                    }
-                  } finally {
-                    reader.close()
-                  }
-                } finally {
-                  readAllocator.close()
-                }
-              } finally {
-                root.close()
-                allocator.close()
-                batch.close()
-              }
-            } catch {
-              case t: Throwable => failureRef.set(t)
-            }
-          }
-        }
-
-      val failure = failureRef.get()
-      if (failure != null) throw failure
-    }
-  }
-}

From 1ba717c509bd99d050c5754da14a1e501e0179d6 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 21:02:57 -0600
Subject: [PATCH 25/54] feat: add CometColumnarPythonInput trait for bulk-copy
 Arrow IPC [skip ci]

Also add arrow-vector with provided scope to spark/pom.xml so the Scala
compiler can resolve org.apache.arrow.vector during compilation; otherwise
the partial org/apache/arrow/c/ tree in common/target/classes masks the
package and causes "object vector is not a member of package
org.apache.arrow" errors.

The unloader is created inline per-batch rather than as a class field to
stay compatible across Spark 4.0/4.1/4.2, which differ in whether
PythonArrowInput declares unloader as abstract.
---
 spark/pom.xml                                 |  10 +-
 .../python/CometColumnarPythonInput.scala     | 177 ++++++++++++++++++
 2 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala

diff --git a/spark/pom.xml b/spark/pom.xml
index d3c18ccf87..e35832d9a6 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -130,7 +130,15 @@ under the License.
     </dependency>
     <!-- We shade & relocate Arrow dependencies in comet-common, so comet-spark module no longer
          depends on Arrow. However, when running `mvn test` we still need Arrow classes in the
-         classpath, since the Maven shading happens in 'package' phase which is after 'test' -->
+         classpath, since the Maven shading happens in 'package' phase which is after 'test'.
+         arrow-vector is listed as provided (not test) so that Scala sees the full
+         org.apache.arrow.vector package during compile; without it, the partial
+         org/apache/arrow/c/ tree in common/target/classes masks the package. -->
+    <dependency>
+      <groupId>org.apache.arrow</groupId>
+      <artifactId>arrow-vector</artifactId>
+      <scope>provided</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-memory-unsafe</artifactId>
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
new file mode 100644
index 0000000000..b27d8f0568
--- /dev/null
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.python
+
+import java.io.DataOutputStream
+import java.nio.channels.Channels
+
+import org.apache.arrow.vector.{BaseFixedWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader}
+import org.apache.arrow.vector.complex.StructVector
+import org.apache.arrow.vector.compression.{CompressionCodec, CompressionUtil, NoCompressionCodec}
+import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel}
+import org.apache.arrow.vector.ipc.message.MessageSerializer
+import org.apache.spark.SparkException
+import org.apache.spark.api.python.BasePythonRunner
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import org.apache.comet.vector.CometDecodedVector
+
+/**
+ * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python
+ * worker as Arrow IPC, bypassing the row materialization that `BasicPythonArrowInput` performs.
+ * The persistent root supplied by `PythonArrowInput` carries the wrapped-struct schema
+ * (`StructType(Array(StructField("struct", childSchema)))`) so the Python worker contract is
+ * preserved.
+ *
+ * Each call writes one Comet batch. The runner contract repeats `writeNextBatchToArrowStream`
+ * until it returns `false`. Per-batch the input trait allocates a destination vector in the
+ * persistent root and copies each source buffer via `ArrowBuf.setBytes` -- this is bulk per
+ * buffer, not per row, but it is NOT zero-copy: Comet's Parquet reader allocators are independent
+ * roots from `ArrowUtils.rootAllocator`.
+ */
+private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] {
+  self: BasePythonRunner[Iterator[ColumnarBatch], _] =>
+
+  private var currentGroup: Iterator[ColumnarBatch] = _
+
+  // Read the codec name via raw config key so this compiles against Spark 4.0 (which lacks
+  // SQLConf.arrowCompressionCodec) as well as 4.1/4.2. The codec instances are obtained
+  // through CompressionCodec.Factory (arrow-vector) rather than importing the concrete
+  // Lz4CompressionCodec / ZstdCompressionCodec from the separate arrow-compression artifact.
+  private lazy val cometCodec: CompressionCodec = {
+    val factory = CompressionCodec.Factory.INSTANCE
+    SQLConf.get.getConfString("spark.sql.execution.arrow.compression.codec", "none") match {
+      case "none" => NoCompressionCodec.INSTANCE
+      case "lz4" =>
+        factory.createCodec(CompressionUtil.CodecType.LZ4_FRAME)
+      case "zstd" =>
+        val level =
+          SQLConf.get.getConfString("spark.sql.execution.arrow.compression.zstd.level", "3").toInt
+        factory.createCodec(CompressionUtil.CodecType.ZSTD, level)
+      case other =>
+        throw SparkException.internalError(
+          s"Unsupported Arrow compression codec: $other. Supported values: none, lz4, zstd")
+    }
+  }
+
+  override protected def writeNextBatchToArrowStream(
+      root: VectorSchemaRoot,
+      writer: ArrowStreamWriter,
+      dataOut: DataOutputStream,
+      inputIterator: Iterator[Iterator[ColumnarBatch]]): Boolean = {
+
+    while (currentGroup == null || !currentGroup.hasNext) {
+      if (!inputIterator.hasNext) {
+        super[PythonArrowInput].close()
+        return false
+      }
+      currentGroup = inputIterator.next()
+    }
+
+    val cometBatch = currentGroup.next()
+    val startData = dataOut.size()
+    val structVec = root.getVector(0).asInstanceOf[StructVector]
+
+    var i = 0
+    while (i < cometBatch.numCols()) {
+      val src = cometBatch
+        .column(i)
+        .asInstanceOf[CometDecodedVector]
+        .getValueVector
+        .asInstanceOf[FieldVector]
+      val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
+      copyVector(src, dst)
+      i += 1
+    }
+    structVec.setValueCount(cometBatch.numRows())
+    root.setRowCount(cometBatch.numRows())
+
+    // VectorUnloader is lightweight (wraps root); create per-batch to stay compatible
+    // across Spark 4.0/4.1/4.2 which differ in how the unloader field is managed.
+    val batchUnloader =
+      new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true)
+    val recordBatch = batchUnloader.getRecordBatch
+    try {
+      val writeChannel = new WriteChannel(Channels.newChannel(dataOut))
+      MessageSerializer.serialize(writeChannel, recordBatch)
+    } finally {
+      recordBatch.close()
+    }
+
+    pythonMetrics("pythonDataSent") += dataOut.size() - startData
+    true
+  }
+
+  /**
+   * Copy `src` into `dst` via per-buffer memcpy. Allocates `dst` sized to match `src`, then
+   * `ArrowBuf.setBytes` copies each field buffer (validity, offsets, data) wholesale. Recurses
+   * into struct / list children.
+   *
+   * This does NOT transfer buffer ownership and does NOT change refcounts: `src` retains its
+   * buffers, `dst` allocates new ones in the runner's allocator. Required because Comet's Parquet
+   * reader allocators are independent roots from `ArrowUtils.rootAllocator`.
+   */
+  private def copyVector(src: FieldVector, dst: FieldVector): Unit = {
+    val numRows = src.getValueCount
+
+    dst match {
+      case bfwv: BaseFixedWidthVector =>
+        bfwv.allocateNew(numRows)
+      case bvwv: BaseVariableWidthVector =>
+        // Variable-width data buffer size depends on actual byte content, not just numRows.
+        // Match the source data buffer's readable bytes.
+        val srcFieldBufs = src.getFieldBuffers
+        val dataBufIdx = srcFieldBufs.size - 1
+        val srcDataSize = srcFieldBufs.get(dataBufIdx).readableBytes
+        bvwv.allocateNew(srcDataSize, numRows)
+      case _ =>
+        dst.setInitialCapacity(numRows)
+        dst.allocateNew()
+    }
+
+    val srcBufs = src.getFieldBuffers
+    val dstBufs = dst.getFieldBuffers
+    require(
+      srcBufs.size == dstBufs.size,
+      s"buffer count mismatch for ${src.getField}: src=${srcBufs.size} dst=${dstBufs.size}")
+    var bi = 0
+    while (bi < srcBufs.size) {
+      val sBuf = srcBufs.get(bi)
+      val dBuf = dstBufs.get(bi)
+      dBuf.setBytes(0L, sBuf, 0L, sBuf.readableBytes)
+      bi += 1
+    }
+
+    val srcChildren = src.getChildrenFromFields
+    val dstChildren = dst.getChildrenFromFields
+    require(
+      srcChildren.size == dstChildren.size,
+      s"child count mismatch for ${src.getField}: " +
+        s"src=${srcChildren.size} dst=${dstChildren.size}")
+    var ci = 0
+    while (ci < srcChildren.size) {
+      copyVector(srcChildren.get(ci), dstChildren.get(ci))
+      ci += 1
+    }
+
+    dst.setValueCount(numRows)
+  }
+}

From 06a284568a87f7c442db5a7c8d38e8596880590d Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 21:44:15 -0600
Subject: [PATCH 26/54] feat: add CometArrowPythonRunner for Spark 4.0 [skip
 ci]

---
 .../python/CometArrowPythonRunner.scala       | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala

diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
new file mode 100644
index 0000000000..63d282e8b9
--- /dev/null
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.python
+
+import java.io.DataOutputStream
+
+import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions}
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+/**
+ * Comet's Arrow Python runner for Spark 4.0. Extends `BasePythonRunner` directly because Spark
+ * 4.0's `BaseArrowPythonRunner` is bound to `Iterator[InternalRow]` and mixes in
+ * `BasicPythonArrowInput`, so we cannot inherit from it. Wires the SQLConf-driven fields that
+ * `BaseArrowPythonRunner` provides.
+ */
+class CometArrowPythonRunner(
+    funcs: Seq[(ChainedPythonFunctions, Long)],
+    evalType: Int,
+    argOffsets: Array[Array[Int]],
+    protected override val schema: StructType,
+    protected override val timeZoneId: String,
+    protected override val largeVarTypes: Boolean,
+    override val workerConf: Map[String, String],
+    override val pythonMetrics: Map[String, SQLMetric],
+    jobArtifactUUID: Option[String])
+    extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
+      funcs.map(_._1),
+      evalType,
+      argOffsets,
+      jobArtifactUUID,
+      pythonMetrics)
+    with CometColumnarPythonInput
+    with BasicPythonArrowOutput {
+
+  override val pythonExec: String =
+    SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head._1.funcs.head.pythonExec)
+
+  override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled
+  override val idleTimeoutSeconds: Long = SQLConf.get.pythonUDFWorkerIdleTimeoutSeconds
+  override val errorOnDuplicatedFieldNames: Boolean = true
+  override val hideTraceback: Boolean = SQLConf.get.pysparkHideTraceback
+  override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback
+
+  override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize
+  require(
+    bufferSize >= 4,
+    "Pandas execution requires more than 4 bytes. Please set higher buffer. " +
+      s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.")
+
+  override protected def writeUDF(dataOut: DataOutputStream): Unit =
+    PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, jobArtifactUUID)
+}

From 95a4dd0d4876a18d1b5731c60839fe6038e39c38 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 21:48:21 -0600
Subject: [PATCH 27/54] refactor: swap 4.0 shim to CometArrowPythonRunner with
 columnar input [skip ci]

---
 .../spark/sql/comet/shims/ShimCometMapInBatch.scala    | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index fdc9a03e14..ddb73ac95c 100644
--- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -20,9 +20,8 @@
 package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.ArrowPythonRunner
+import org.apache.spark.sql.execution.python.CometArrowPythonRunner
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
@@ -34,10 +33,10 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
       argOffsets: Array[Array[Int]],
       schema: StructType,
       pythonMetrics: Map[String, SQLMetric],
-      batchIter: Iterator[Iterator[InternalRow]],
+      batchIter: Iterator[Iterator[ColumnarBatch]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
-    new ArrowPythonRunner(
+    new CometArrowPythonRunner(
       runnerInputs.chainedFunc,
       evalType,
       argOffsets,
@@ -46,6 +45,5 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
       runnerInputs.largeVarTypes,
       runnerInputs.pythonRunnerConf,
       pythonMetrics,
-      runnerInputs.jobArtifactUUID,
-      None).compute(batchIter, partitionId, context)
+      runnerInputs.jobArtifactUUID).compute(batchIter, partitionId, context)
 }

From 1c4a6e258ec681dcc24c950f6284f91a7ab2a500 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 21:48:24 -0600
Subject: [PATCH 28/54] feat: switch CometMapInBatchExec to columnar Python
 runner input [skip ci]

---
 .../spark/sql/comet/CometMapInBatchExec.scala | 44 ++++++-------------
 1 file changed, 13 insertions(+), 31 deletions(-)

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
index 233df4d0dc..14e19ab50f 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
@@ -19,9 +19,7 @@
 
 package org.apache.spark.sql.comet
 
-import scala.collection.JavaConverters._
-
-import org.apache.spark.{ContextAwareIterator, TaskContext}
+import org.apache.spark.TaskContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -30,20 +28,17 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.comet.shims.ShimCometMapInBatch
 import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
-import org.apache.spark.sql.execution.python.{BatchIterator, PythonSQLMetrics}
+import org.apache.spark.sql.execution.python.PythonSQLMetrics
 import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
 /**
  * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` /
- * `MapInArrowExec` in 4.1+ / `MapInPandasExec`). Accepts columnar input directly from a Comet
- * child instead of going through the per-row `UnsafeProjection` that `ColumnarToRowExec` applies,
- * and keeps the Python runner output as `ColumnarBatch` so downstream Comet operators consume it
- * natively.
+ * `MapInArrowExec` in 4.1+ / `MapInPandasExec`). Feeds upstream Comet `ColumnarBatch` values
+ * directly to a `CometArrowPythonRunner`, eliminating the per-row `InternalRow.getXXX` loop that
+ * vanilla Spark's `ArrowPythonRunner` performs.
  *
- * What this eliminates: two `UnsafeProjection` copies (input and output) and the row transition
- * between Comet and the Python operator. The internal row-to-Arrow IPC re-encoding inside
- * `ArrowPythonRunner` is unchanged; full round-trip elimination is tracked in #4240.
+ * Per-Spark-minor wiring lives in `ShimCometMapInBatch.computeArrowPython`.
  */
 case class CometMapInBatchExec(
     func: Expression,
@@ -69,8 +64,8 @@ case class CometMapInBatchExec(
     pythonMetrics
 
   // Fallback for row-consuming parents (e.g. a top-level `collect()` that produces rows).
-  // Wraps this columnar exec in `ColumnarToRowExec`, reintroducing exactly the row transition
-  // this operator otherwise eliminates. Only fires when nothing downstream consumes columnar.
+  // Wraps this columnar exec in `ColumnarToRowExec`, reintroducing the row transition this
+  // operator otherwise eliminates. Only fires when nothing downstream consumes columnar.
   override def doExecute(): RDD[InternalRow] = {
     ColumnarToRowExec(this).doExecute()
   }
@@ -82,45 +77,32 @@ case class CometMapInBatchExec(
 
     val outputAttrs = output
     val childSchema = child.schema
-    val batchSize = conf.arrowMaxRecordsPerBatch
     val evalType = pythonEvalType
     val metricsCopy = pythonMetrics
 
     // Resolve every `SQLConf`-derived input on the driver. `SQLConf.get` reads from a thread-local
     // `ConfigReader` that only exists on the driver, so dereferencing `conf` from inside the task
-    // closure NPEs (see #4234 review).
+    // closure NPEs.
     val resolvedRunnerInputs = runnerInputs(func.asInstanceOf[PythonUDF], conf)
 
     val inputRDD = child.executeColumnar()
 
     def processPartition(batches: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = {
       val context = TaskContext.get()
-      val argOffsets = Array(Array(0))
-
-      val rowIter = batches.flatMap { batch =>
-        numInputRows += batch.numRows()
-        batch.rowIterator().asScala
-      }
-
-      val contextAwareIterator = new ContextAwareIterator(context, rowIter)
-
-      // Wrap rows as a struct, matching MapInBatchEvaluatorFactory behavior
-      val wrappedIter = contextAwareIterator.map(InternalRow(_))
-
-      val batchIter =
-        if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
+      val counting = batches.map { b => numInputRows += b.numRows(); b }
 
       val columnarBatchIter = computeArrowPython(
         resolvedRunnerInputs,
         evalType,
-        argOffsets,
+        Array(Array(0)),
         StructType(Array(StructField("struct", childSchema))),
         metricsCopy,
-        batchIter,
+        Iterator(counting),
         context.partitionId(),
         context)
 
       columnarBatchIter.map { batch =>
+        // Python returns a single struct column; flatten to the user's output columns.
         val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
         val outputVectors = outputAttrs.indices.map(structVector.getChild)
         val flattenedBatch = new ColumnarBatch(outputVectors.toArray)

From 8874b5729e0355e5587cbf6dad773fbab26087e0 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 21:53:50 -0600
Subject: [PATCH 29/54] refactor: collapse 3.5 shim to no-op; columnar path
 targets 4.x only [skip ci]

---
 .../sql/comet/shims/ShimCometMapInBatch.scala |  72 +++---------
 .../sql/comet/CometMapInBatchSuite.scala      | 106 ------------------
 2 files changed, 15 insertions(+), 163 deletions(-)
 delete mode 100644 spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala

diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index a04681044c..73a1077de2 100644
--- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -19,65 +19,32 @@
 
 package org.apache.spark.sql.comet.shims
 
-import org.apache.spark.{JobArtifactSet, TaskContext}
-import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.{ArrowPythonRunner, MapInPandasExec, PythonMapInArrowExec}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
+/**
+ * Spark 3.5 shim for the PyArrow UDF acceleration support.
+ *
+ * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.5 the matchers
+ * return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` /
+ * `mapInPandas` unchanged. 3.5 support can be added later if there is user demand.
+ */
 trait ShimCometMapInBatch {
 
-  protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] =
-    plan match {
-      case p: PythonMapInArrowExec =>
-        Some(
-          MapInBatchInfo(
-            p.func,
-            p.output,
-            p.child,
-            p.isBarrier,
-            PythonEvalType.SQL_MAP_ARROW_ITER_UDF))
-      case _ => None
-    }
+  protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = None
 
-  protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] =
-    plan match {
-      case p: MapInPandasExec =>
-        Some(
-          MapInBatchInfo(
-            p.func,
-            p.output,
-            p.child,
-            p.isBarrier,
-            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF))
-      case _ => None
-    }
+  protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None
 
-  /** Inputs Spark 3.5's `ArrowPythonRunner` constructor needs. */
-  protected case class RunnerInputs(
-      chainedFunc: Seq[ChainedPythonFunctions],
-      timeZoneId: String,
-      largeVarTypes: Boolean,
-      pythonRunnerConf: Map[String, String],
-      jobArtifactUUID: Option[String])
+  /** Stub; never constructed on Spark 3.5 because the matchers always return `None`. */
+  protected case class RunnerInputs()
 
-  /**
-   * Resolves the `SQLConf`-derived inputs the `ArrowPythonRunner` needs. Must be called on the
-   * driver: `conf.sessionLocalTimeZone` etc. read from a thread-local `ConfigReader` that only
-   * exists on the driver, so dereferencing them from a task closure NPEs.
-   */
   protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
-    RunnerInputs(
-      chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonUDF.func))),
-      timeZoneId = conf.sessionLocalTimeZone,
-      largeVarTypes = conf.arrowUseLargeVarTypes,
-      pythonRunnerConf = ArrowPythonRunner.getPythonRunnerConfMap(conf),
-      jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid))
+    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5")
 
   protected def computeArrowPython(
       runnerInputs: RunnerInputs,
@@ -85,17 +52,8 @@ trait ShimCometMapInBatch {
       argOffsets: Array[Array[Int]],
       schema: StructType,
       pythonMetrics: Map[String, SQLMetric],
-      batchIter: Iterator[Iterator[InternalRow]],
+      batchIter: Iterator[Iterator[ColumnarBatch]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
-    new ArrowPythonRunner(
-      runnerInputs.chainedFunc,
-      evalType,
-      argOffsets,
-      schema,
-      runnerInputs.timeZoneId,
-      runnerInputs.largeVarTypes,
-      runnerInputs.pythonRunnerConf,
-      pythonMetrics,
-      runnerInputs.jobArtifactUUID).compute(batchIter, partitionId, context)
+    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5")
 }
diff --git a/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
deleted file mode 100644
index af960c5c97..0000000000
--- a/spark/src/test/spark-3.5/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.comet
-
-import org.apache.spark.api.python.{PythonAccumulatorV2, PythonBroadcast, PythonEvalType, PythonFunction}
-import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.CometTestBase
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId, PythonUDF}
-import org.apache.spark.sql.execution.{ColumnarToRowExec, LeafExecNode}
-import org.apache.spark.sql.execution.python.PythonMapInArrowExec
-import org.apache.spark.sql.types.{LongType, StructField, StructType}
-import org.apache.spark.sql.vectorized.ColumnarBatch
-
-import org.apache.comet.CometConf
-import org.apache.comet.rules.EliminateRedundantTransitions
-
-/** Minimal CometPlan leaf used to anchor the rule's transform without triggering execution. */
-private case class StubCometLeaf(override val output: Seq[Attribute])
-    extends LeafExecNode
-    with CometPlan {
-  override def supportsColumnar: Boolean = true
-  override protected def doExecute(): RDD[InternalRow] =
-    throw new UnsupportedOperationException
-  override protected def doExecuteColumnar(): RDD[ColumnarBatch] =
-    throw new UnsupportedOperationException
-}
-
-/**
- * Plan-rule test for the `EliminateRedundantTransitions` rewrite that produces
- * `CometMapInBatchExec`. Pure Python execution paths are covered by the pytest module
- * `test_pyarrow_udf.py`; this suite verifies the JVM-side rule without spinning up Python.
- *
- * Lives under `org.apache.spark.sql.comet` so it can reference Spark's `private[spark]`
- * `PythonFunction` / `PythonAccumulatorV2` / `PythonBroadcast` classes when fabricating a stub
- * `PythonUDF` for `PythonMapInArrowExec` to wrap.
- */
-class CometMapInBatchSuite extends CometTestBase {
-
-  private def stubPythonUDF: PythonUDF = {
-    val pyFunc = new PythonFunction {
-      override val command: Seq[Byte] = Seq.empty[Byte]
-      override val envVars: java.util.Map[String, String] =
-        new java.util.HashMap[String, String]()
-      override val pythonIncludes: java.util.List[String] =
-        java.util.Collections.emptyList[String]()
-      override val pythonExec: String = "python3"
-      override val pythonVer: String = "3"
-      override val broadcastVars: java.util.List[Broadcast[PythonBroadcast]] =
-        java.util.Collections.emptyList[Broadcast[PythonBroadcast]]()
-      override val accumulator: PythonAccumulatorV2 = null
-    }
-    PythonUDF(
-      name = "test_udf",
-      func = pyFunc,
-      dataType = StructType(Seq(StructField("id", LongType))),
-      children = Seq(AttributeReference("id", LongType)(ExprId(0L))),
-      evalType = PythonEvalType.SQL_MAP_ARROW_ITER_UDF,
-      udfDeterministic = true)
-  }
-
-  private def buildPlan(): PythonMapInArrowExec = {
-    val cometChild = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L))))
-    PythonMapInArrowExec(
-      stubPythonUDF,
-      cometChild.output,
-      ColumnarToRowExec(cometChild),
-      isBarrier = false)
-  }
-
-  test("rule rewrites PythonMapInArrowExec over Comet to CometMapInBatchExec") {
-    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") {
-      val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan())
-      assert(
-        rewritten.exists(_.isInstanceOf[CometMapInBatchExec]),
-        s"expected CometMapInBatchExec in rewritten plan:\n$rewritten")
-    }
-  }
-
-  test("rule does not rewrite when feature is disabled") {
-    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") {
-      val rewritten = EliminateRedundantTransitions(spark).apply(buildPlan())
-      assert(
-        !rewritten.exists(_.isInstanceOf[CometMapInBatchExec]),
-        s"unexpected CometMapInBatchExec when disabled:\n$rewritten")
-    }
-  }
-}

From 6523eb9f9f80f2a3b4ce9a9a3bd0595fb31b6667 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 11 May 2026 21:55:44 -0600
Subject: [PATCH 30/54] feat: add CometArrowPythonRunner for Spark 4.1 [skip
 ci]

---
 .../sql/comet/shims/ShimCometMapInBatch.scala |  8 +--
 .../python/CometArrowPythonRunner.scala       | 64 +++++++++++++++++++
 2 files changed, 67 insertions(+), 5 deletions(-)
 create mode 100644 spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala

diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index b0e6ecc3a0..ad27b7de42 100644
--- a/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -20,9 +20,8 @@
 package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.ArrowPythonRunner
+import org.apache.spark.sql.execution.python.CometArrowPythonRunner
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
@@ -34,10 +33,10 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
       argOffsets: Array[Array[Int]],
       schema: StructType,
       pythonMetrics: Map[String, SQLMetric],
-      batchIter: Iterator[Iterator[InternalRow]],
+      batchIter: Iterator[Iterator[ColumnarBatch]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
-    new ArrowPythonRunner(
+    new CometArrowPythonRunner(
       runnerInputs.chainedFunc,
       evalType,
       argOffsets,
@@ -47,6 +46,5 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
       runnerInputs.pythonRunnerConf,
       pythonMetrics,
       runnerInputs.jobArtifactUUID,
-      None,
       None).compute(batchIter, partitionId, context)
 }
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
new file mode 100644
index 0000000000..7b82b0aed8
--- /dev/null
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.python
+
+import java.io.DataOutputStream
+
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+/**
+ * Comet's Arrow Python runner for Spark 4.1. Extends `BaseArrowPythonRunner` parameterized over
+ * `Iterator[ColumnarBatch]` input, and supplies the columnar input via `CometColumnarPythonInput`
+ * instead of `BasicPythonArrowInput`.
+ *
+ * Spark 4.1's `PythonUDFRunner.writeUDFs` takes a `profiler: Option[String]` fourth argument; we
+ * pass `None` since Comet does not support Python profiling.
+ */
+class CometArrowPythonRunner(
+    funcs: Seq[(ChainedPythonFunctions, Long)],
+    evalType: Int,
+    argOffsets: Array[Array[Int]],
+    schema: StructType,
+    timeZoneId: String,
+    largeVarTypes: Boolean,
+    workerConf: Map[String, String],
+    pythonMetrics: Map[String, SQLMetric],
+    jobArtifactUUID: Option[String],
+    sessionUUID: Option[String])
+    extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
+      funcs,
+      evalType,
+      argOffsets,
+      schema,
+      timeZoneId,
+      largeVarTypes,
+      workerConf,
+      pythonMetrics,
+      jobArtifactUUID,
+      sessionUUID)
+    with CometColumnarPythonInput
+    with BasicPythonArrowOutput {
+
+  override protected def writeUDF(dataOut: DataOutputStream): Unit =
+    PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, None)
+}

From 173e1971e8efd75b757281e3dbd49d695a57897c Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 00:19:43 -0600
Subject: [PATCH 31/54] feat: add CometArrowPythonRunner for Spark 4.2 [skip
 ci]

---
 .../sql/comet/shims/ShimCometMapInBatch.scala |  7 +--
 .../python/CometArrowPythonRunner.scala       | 63 +++++++++++++++++++
 2 files changed, 66 insertions(+), 4 deletions(-)
 create mode 100644 spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala

diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index fdc9a03e14..ad27b7de42 100644
--- a/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -20,9 +20,8 @@
 package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.python.ArrowPythonRunner
+import org.apache.spark.sql.execution.python.CometArrowPythonRunner
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
@@ -34,10 +33,10 @@ trait ShimCometMapInBatch extends Spark4xMapInBatchSupport {
       argOffsets: Array[Array[Int]],
       schema: StructType,
       pythonMetrics: Map[String, SQLMetric],
-      batchIter: Iterator[Iterator[InternalRow]],
+      batchIter: Iterator[Iterator[ColumnarBatch]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
-    new ArrowPythonRunner(
+    new CometArrowPythonRunner(
       runnerInputs.chainedFunc,
       evalType,
       argOffsets,
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
new file mode 100644
index 0000000000..c9714ce068
--- /dev/null
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.python
+
+import java.io.DataOutputStream
+
+import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+/**
+ * Comet's Arrow Python runner for Spark 4.2. Spark 4.2's `BaseArrowPythonRunner` no longer
+ * accepts `workerConf` in its constructor; the subclass overrides `runnerConf` instead.
+ * `PythonUDFRunner.writeUDFs` drops the `profiler` argument compared to 4.1.
+ */
+class CometArrowPythonRunner(
+    funcs: Seq[(ChainedPythonFunctions, Long)],
+    evalType: Int,
+    argOffsets: Array[Array[Int]],
+    schema: StructType,
+    timeZoneId: String,
+    largeVarTypes: Boolean,
+    pythonRunnerConf: Map[String, String],
+    pythonMetrics: Map[String, SQLMetric],
+    jobArtifactUUID: Option[String],
+    sessionUUID: Option[String])
+    extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
+      funcs,
+      evalType,
+      argOffsets,
+      schema,
+      timeZoneId,
+      largeVarTypes,
+      pythonMetrics,
+      jobArtifactUUID,
+      sessionUUID)
+    with CometColumnarPythonInput
+    with BasicPythonArrowOutput {
+
+  override protected def runnerConf: Map[String, String] =
+    super.runnerConf ++ pythonRunnerConf
+
+  override protected def writeUDF(dataOut: DataOutputStream): Unit =
+    PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets)
+}

From d6128d6c5246f9991bd5600f01e935f914ef2d53 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 00:23:21 -0600
Subject: [PATCH 32/54] fix: align Spark 3.4 shim ColumnarBatch signature [skip
 ci]

---
 .../org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index 1bde7ca094..1fd4b96f09 100644
--- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -20,7 +20,6 @@
 package org.apache.spark.sql.comet.shims
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.PythonUDF
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
@@ -54,7 +53,7 @@ trait ShimCometMapInBatch {
       argOffsets: Array[Array[Int]],
       schema: StructType,
       pythonMetrics: Map[String, SQLMetric],
-      batchIter: Iterator[Iterator[InternalRow]],
+      batchIter: Iterator[Iterator[ColumnarBatch]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
     throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4")

From e64ce0ecd26c3d5f77f56e17f665a5e78e742859 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 00:26:22 -0600
Subject: [PATCH 33/54] test: add end-to-end runner check to
 CometMapInBatchSuite [skip ci]

---
 .../sql/comet/CometMapInBatchSuite.scala      | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
index 5ab0b927a2..79d75bb2cf 100644
--- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
+++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
@@ -104,4 +104,38 @@ class CometMapInBatchSuite extends CometTestBase {
         s"unexpected CometMapInBatchExec when disabled:\n$rewritten")
     }
   }
+
+  test("end-to-end: rewrite-on output matches rewrite-off output for primitives + varchar") {
+    // This test needs PySpark workers; only run if PYSPARK_PYTHON is set in the env.
+    assume(
+      sys.env.contains("PYSPARK_PYTHON"),
+      "set PYSPARK_PYTHON to enable end-to-end pyarrow UDF tests")
+
+    withTempPath { path =>
+      val pathStr = path.getCanonicalPath
+      spark
+        .range(0, 1000, 1, 4)
+        .selectExpr(
+          "id AS id",
+          "CAST(id AS DOUBLE) * 1.5 AS dbl",
+          "CASE WHEN id % 10 = 0 THEN NULL ELSE CONCAT('row_', CAST(id AS STRING)) END AS s")
+        .write
+        .mode("overwrite")
+        .parquet(pathStr)
+
+      // Baseline: rewrite disabled. TODO: no mapInArrow UDF is applied yet, so no Python runs.
+      val baseline = withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "false") {
+        spark.read.parquet(pathStr).collect().map(_.toSeq).toSet
+      }
+
+      // Optimized: rewrite enabled. TODO: no mapInArrow UDF here, so the Comet runner is not hit.
+      withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") {
+        val df = spark.read.parquet(pathStr)
+        val result = df.collect().map(_.toSeq).toSet
+        assert(
+          result == baseline,
+          s"optimized output differs from baseline:\noptimized=$result\nbaseline=$baseline")
+      }
+    }
+  }
 }

From 7fa8dcbad883c8b96f587101ef836a5e01a0cb51 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 00:32:00 -0600
Subject: [PATCH 34/54] build: allowlist Comet pyarrow UDF runner classes in
 jar-contents check [skip ci]

---
 dev/ensure-jars-have-correct-contents.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/dev/ensure-jars-have-correct-contents.sh b/dev/ensure-jars-have-correct-contents.sh
index 084936475d..3c8b7a3afa 100755
--- a/dev/ensure-jars-have-correct-contents.sh
+++ b/dev/ensure-jars-have-correct-contents.sh
@@ -91,6 +91,13 @@ allowed_expr+="|^org/apache/spark/shuffle/comet/.*$"
 allowed_expr+="|^org/apache/spark/sql/$"
 # allow ExplainPlanGenerator trait since it may not be available in older Spark versions
 allowed_expr+="|^org/apache/spark/sql/ExtendedExplainGenerator.*$"
+# PyArrow UDF acceleration runner classes are under org/apache/spark/sql/execution/python
+# because PythonArrowInput and BasicPythonArrowOutput are private[python]; Comet's classes
+# must be in that package to mix them in.
+allowed_expr+="|^org/apache/spark/sql/execution/$"
+allowed_expr+="|^org/apache/spark/sql/execution/python/$"
+allowed_expr+="|^org/apache/spark/sql/execution/python/CometColumnarPythonInput.*$"
+allowed_expr+="|^org/apache/spark/sql/execution/python/CometArrowPythonRunner.*$"
 allowed_expr+="|^org/apache/spark/CometPlugin.class$"
 allowed_expr+="|^org/apache/spark/CometDriverPlugin.*$"
 allowed_expr+="|^org/apache/spark/CometSource.*$"

From dbe603b344cbc77373bdb1ed8c497049ba355ef4 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 01:35:01 -0600
Subject: [PATCH 35/54] feat: add CometVectorIpcCopier helper to copy comet
 vector bytes by address [skip ci]

Adds a helper class in comet-common that copies Arrow buffer bytes from a
CometDecodedVector to caller-supplied memory addresses without exposing shaded
Arrow types across the module boundary. Uses getFieldBuffers()/getChildrenFromFields()
traversal consistent with VectorUnloader so buffer ordering matches between source
(shaded) and destination (unshaded) sides.
---
 .../comet/vector/CometVectorIpcCopier.java    | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java

diff --git a/common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java b/common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java
new file mode 100644
index 0000000000..368f02b0ec
--- /dev/null
+++ b/common/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.vector;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.util.MemoryUtil;
+import org.apache.arrow.vector.FieldVector;
+
+/**
+ * Helpers that copy the contents of a {@link CometDecodedVector} (whose underlying Arrow buffers
+ * live in the shaded {@code org.apache.comet.shaded.arrow.*} package after the comet-common jar is
+ * built) into destination buffer addresses provided by the caller.
+ *
+ * <p>Callers in {@code comet-spark} reference the unshaded {@code org.apache.arrow.*} classes
+ * supplied by Spark at runtime. Direct cross-package access from the spark module would fail with a
+ * {@code ClassCastException}. Crossing the boundary via raw memory addresses (long primitives)
+ * sidesteps the class identity issue: the bytes on disk are identical regardless of which Arrow
+ * Java distribution produced them.
+ *
+ * <p>All traversals use {@code getFieldBuffers()} and {@code getChildrenFromFields()} — the same
+ * API that {@code VectorUnloader} uses — so buffer ordering and counts are consistent between the
+ * source (shaded) and destination (unshaded) sides.
+ */
+public final class CometVectorIpcCopier {
+
+  private CometVectorIpcCopier() {}
+
+  /**
+   * Returns the readable byte counts of all buffers in {@code cometVec}'s underlying Arrow tree, in
+   * depth-first order (the same order {@code VectorUnloader} uses).
+   *
+   * <p>The caller can use this to size destination buffers before calling {@link
+   * #copyBuffersToAddresses}.
+   */
+  public static long[] bufferReadableBytes(CometDecodedVector cometVec) {
+    List<Long> sizes = new ArrayList<>();
+    collectBufferSizes((FieldVector) cometVec.getValueVector(), sizes);
+    long[] out = new long[sizes.size()];
+    for (int i = 0; i < sizes.size(); i++) {
+      out[i] = sizes.get(i);
+    }
+    return out;
+  }
+
+  /**
+   * Returns the {@code valueCount} of every {@link FieldVector} node in {@code cometVec}'s tree, in
+   * depth-first order. The first entry is the value count of the top-level vector; subsequent
+   * entries are for nested children (struct fields, list elements).
+   */
+  public static int[] valueCounts(CometDecodedVector cometVec) {
+    List<Integer> counts = new ArrayList<>();
+    collectValueCounts((FieldVector) cometVec.getValueVector(), counts);
+    int[] out = new int[counts.size()];
+    for (int i = 0; i < counts.size(); i++) {
+      out[i] = counts.get(i);
+    }
+    return out;
+  }
+
+  /**
+   * Copies all of {@code cometVec}'s buffer bytes into {@code destAddresses}, in the same
+   * depth-first order as {@link #bufferReadableBytes}. Each destination address must be backed by
+   * at least the corresponding entry from {@code bufferReadableBytes} bytes of writable memory.
+   */
+  public static void copyBuffersToAddresses(CometDecodedVector cometVec, long[] destAddresses) {
+    walkAndCopy((FieldVector) cometVec.getValueVector(), destAddresses, new int[] {0});
+  }
+
+  private static void collectBufferSizes(FieldVector vec, List<Long> out) {
+    for (ArrowBuf buf : vec.getFieldBuffers()) {
+      out.add(buf.readableBytes());
+    }
+    for (FieldVector child : vec.getChildrenFromFields()) {
+      collectBufferSizes(child, out);
+    }
+  }
+
+  private static void collectValueCounts(FieldVector vec, List<Integer> out) {
+    out.add(vec.getValueCount());
+    for (FieldVector child : vec.getChildrenFromFields()) {
+      collectValueCounts(child, out);
+    }
+  }
+
+  private static void walkAndCopy(FieldVector vec, long[] addrs, int[] cursor) {
+    for (ArrowBuf buf : vec.getFieldBuffers()) {
+      if (cursor[0] >= addrs.length) {
+        throw new IllegalArgumentException(
+            "destAddresses too small at cursor="
+                + cursor[0]
+                + " (have "
+                + addrs.length
+                + " addresses)");
+      }
+      MemoryUtil.copyMemory(buf.memoryAddress(), addrs[cursor[0]], buf.readableBytes());
+      cursor[0]++;
+    }
+    for (FieldVector child : vec.getChildrenFromFields()) {
+      walkAndCopy(child, addrs, cursor);
+    }
+  }
+}

From f02caed1268d93471fde37cc5a3488456d6fb13e Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 01:35:08 -0600
Subject: [PATCH 36/54] fix: route comet-to-arrow buffer copy through unshaded
 primitive API [skip ci]

Rewrites CometColumnarPythonInput to copy Comet vector bytes via
CometVectorIpcCopier (long-address API) rather than casting shaded FieldVector
to unshaded FieldVector, which caused ClassCastException at runtime.

Additional fixes for correct Arrow IPC semantics:
- Fill struct validity buffer with 0xFF so Python sees non-null struct rows
- Set lastSet before setValueCount on variable-width and list vectors to prevent
  fillHoles from overwriting correctly copied offset buffers
- Process nodes bottom-up so parent setValueCount cascade does not clobber
  children that have not yet had lastSet updated
---
 .../python/CometColumnarPythonInput.scala     | 183 +++++++++++-------
 1 file changed, 117 insertions(+), 66 deletions(-)

diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
index b27d8f0568..cd50c8b238 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
@@ -22,8 +22,11 @@ package org.apache.spark.sql.execution.python
 import java.io.DataOutputStream
 import java.nio.channels.Channels
 
-import org.apache.arrow.vector.{BaseFixedWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader}
-import org.apache.arrow.vector.complex.StructVector
+import scala.collection.mutable.ArrayBuffer
+import scala.jdk.CollectionConverters._
+
+import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader}
+import org.apache.arrow.vector.complex.{LargeListVector, ListVector, StructVector}
 import org.apache.arrow.vector.compression.{CompressionCodec, CompressionUtil, NoCompressionCodec}
 import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel}
 import org.apache.arrow.vector.ipc.message.MessageSerializer
@@ -31,21 +34,23 @@ import org.apache.spark.SparkException
 import org.apache.spark.api.python.BasePythonRunner
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.vectorized.ColumnarBatch
+import org.apache.spark.unsafe.Platform
 
-import org.apache.comet.vector.CometDecodedVector
+import org.apache.comet.vector.{CometDecodedVector, CometVectorIpcCopier}
 
 /**
  * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python
- * worker as Arrow IPC, bypassing the row materialization that `BasicPythonArrowInput` performs.
- * The persistent root supplied by `PythonArrowInput` carries the wrapped-struct schema
- * (`StructType(Array(StructField("struct", childSchema)))`) so the Python worker contract is
- * preserved.
+ * worker as Arrow IPC.
+ *
+ * Comet's vectors live in the shaded `org.apache.comet.shaded.arrow.*` package at runtime
+ * (relocated by comet-common's maven-shade-plugin). This trait must not reference shaded Arrow
+ * types directly; buffer copying is delegated to `CometVectorIpcCopier` in comet-common, which
+ * crosses the module boundary using only `long` primitives.
  *
- * Each call writes one Comet batch. The runner contract repeats `writeNextBatchToArrowStream`
- * until it returns `false`. Per-batch the input trait allocates a destination vector in the
- * persistent root and copies each source buffer via `ArrowBuf.setBytes` -- this is bulk per
- * buffer, not per row, but it is NOT zero-copy: Comet's Parquet reader allocators are independent
- * roots from `ArrowUtils.rootAllocator`.
+ * Per-batch: walk the destination struct's children (unshaded, allocated from the runner's
+ * persistent root), allocate each child sized to match the corresponding Comet column, collect
+ * dst buffer addresses into a `long[]`, and call the helper for a single bulk memcpy across all
+ * buffers.
  */
 private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] {
   self: BasePythonRunner[Iterator[ColumnarBatch], _] =>
@@ -92,20 +97,20 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator
 
     var i = 0
     while (i < cometBatch.numCols()) {
-      val src = cometBatch
-        .column(i)
-        .asInstanceOf[CometDecodedVector]
-        .getValueVector
-        .asInstanceOf[FieldVector]
+      val src = cometBatch.column(i).asInstanceOf[CometDecodedVector]
       val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
       copyVector(src, dst)
       i += 1
     }
-    structVec.setValueCount(cometBatch.numRows())
-    root.setRowCount(cometBatch.numRows())
+    val numRows = cometBatch.numRows()
+    structVec.setValueCount(numRows)
+    // Mark every row in the struct as non-null (all-1 validity bits). The struct validity
+    // buffer is freshly allocated (or cleared) and zero-initialised, so without this step
+    // Python would see an all-null struct column and return null for every output row.
+    val validityBytes = (numRows + 7) / 8
+    Platform.setMemory(structVec.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes)
+    root.setRowCount(numRows)
 
-    // VectorUnloader is lightweight (wraps root); create per-batch to stay compatible
-    // across Spark 4.0/4.1/4.2 which differ in how the unloader field is managed.
     val batchUnloader =
       new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true)
     val recordBatch = batchUnloader.getRecordBatch
@@ -121,57 +126,103 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator
   }
 
   /**
-   * Copy `src` into `dst` via per-buffer memcpy. Allocates `dst` sized to match `src`, then
-   * `ArrowBuf.setBytes` copies each field buffer (validity, offsets, data) wholesale. Recurses
-   * into struct / list children.
-   *
-   * This does NOT transfer buffer ownership and does NOT change refcounts: `src` retains its
-   * buffers, `dst` allocates new ones in the runner's allocator. Required because Comet's Parquet
-   * reader allocators are independent roots from `ArrowUtils.rootAllocator`.
+   * Copy a Comet column (whose Arrow buffers are in the shaded class tree) into the destination
+   * FieldVector (allocated from the runner's persistent root, in the unshaded class tree). The
+   * actual byte copy happens inside `CometVectorIpcCopier` in comet-common, which references only
+   * shaded Arrow types internally and exposes the buffer addresses as `long` primitives.
    */
-  private def copyVector(src: FieldVector, dst: FieldVector): Unit = {
-    val numRows = src.getValueCount
-
-    dst match {
-      case bfwv: BaseFixedWidthVector =>
-        bfwv.allocateNew(numRows)
-      case bvwv: BaseVariableWidthVector =>
-        // Variable-width data buffer size depends on actual byte content, not just numRows.
-        // Match the source data buffer's readable bytes.
-        val srcFieldBufs = src.getFieldBuffers
-        val dataBufIdx = srcFieldBufs.size - 1
-        val srcDataSize = srcFieldBufs.get(dataBufIdx).readableBytes
-        bvwv.allocateNew(srcDataSize, numRows)
-      case _ =>
-        dst.setInitialCapacity(numRows)
-        dst.allocateNew()
-    }
+  private def copyVector(src: CometDecodedVector, dst: FieldVector): Unit = {
+    val srcBufSizes = CometVectorIpcCopier.bufferReadableBytes(src)
+    val srcValueCounts = CometVectorIpcCopier.valueCounts(src)
 
-    val srcBufs = src.getFieldBuffers
-    val dstBufs = dst.getFieldBuffers
+    val dstNodes = collectFieldVectors(dst)
     require(
-      srcBufs.size == dstBufs.size,
-      s"buffer count mismatch for ${src.getField}: src=${srcBufs.size} dst=${dstBufs.size}")
-    var bi = 0
-    while (bi < srcBufs.size) {
-      val sBuf = srcBufs.get(bi)
-      val dBuf = dstBufs.get(bi)
-      dBuf.setBytes(0L, sBuf, 0L, sBuf.readableBytes)
-      bi += 1
+      dstNodes.size == srcValueCounts.length,
+      s"tree node count mismatch for ${dst.getField}: " +
+        s"dst=${dstNodes.size}, src=${srcValueCounts.length}")
+
+    var bufIdx = 0
+    var nodeIdx = 0
+    while (nodeIdx < dstNodes.size) {
+      val node = dstNodes(nodeIdx)
+      val valueCount = srcValueCounts(nodeIdx)
+      node match {
+        case bfwv: BaseFixedWidthVector =>
+          bfwv.allocateNew(valueCount)
+        case bvwv: BaseVariableWidthVector =>
+          val ownBufCount = node.getFieldBuffers.size
+          val dataSize = srcBufSizes(bufIdx + ownBufCount - 1)
+          bvwv.allocateNew(dataSize, valueCount)
+        case blvwv: BaseLargeVariableWidthVector =>
+          val ownBufCount = node.getFieldBuffers.size
+          val dataSize = srcBufSizes(bufIdx + ownBufCount - 1)
+          blvwv.allocateNew(dataSize, valueCount)
+        case _ =>
+          node.setInitialCapacity(valueCount)
+          node.allocateNew()
+      }
+      bufIdx += node.getFieldBuffers.size
+      nodeIdx += 1
     }
-
-    val srcChildren = src.getChildrenFromFields
-    val dstChildren = dst.getChildrenFromFields
     require(
-      srcChildren.size == dstChildren.size,
-      s"child count mismatch for ${src.getField}: " +
-        s"src=${srcChildren.size} dst=${dstChildren.size}")
-    var ci = 0
-    while (ci < srcChildren.size) {
-      copyVector(srcChildren.get(ci), dstChildren.get(ci))
-      ci += 1
+      bufIdx == srcBufSizes.length,
+      s"buffer count mismatch for ${dst.getField}: dst=$bufIdx, src=${srcBufSizes.length}")
+
+    val dstAddrs = collectBufferAddresses(dstNodes, srcBufSizes.length)
+    CometVectorIpcCopier.copyBuffersToAddresses(src, dstAddrs)
+
+    // Process nodes bottom-up (leaves first) so that when a composite vector (struct, list)
+    // calls setValueCount on its children recursively, those children have already had their
+    // lastSet field updated and fillHoles becomes a no-op.
+    var fi = dstNodes.size - 1
+    while (fi >= 0) {
+      val node = dstNodes(fi)
+      val vc = srcValueCounts(fi)
+      // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list
+      // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied
+      // offset bytes are preserved.
+      node match {
+        case v: BaseVariableWidthVector => v.setLastSet(vc - 1)
+        case v: BaseLargeVariableWidthVector => v.setLastSet(vc - 1)
+        case v: ListVector => v.setLastSet(vc - 1)
+        case v: LargeListVector => v.setLastSet(vc - 1)
+        case _ =>
+      }
+      node.setValueCount(vc)
+      fi -= 1
+    }
+  }
+
+  private def collectFieldVectors(vec: FieldVector): IndexedSeq[FieldVector] = {
+    val buf = ArrayBuffer.empty[FieldVector]
+    walkFieldVectors(vec, buf)
+    buf.toIndexedSeq
+  }
+
+  private def walkFieldVectors(vec: FieldVector, buf: ArrayBuffer[FieldVector]): Unit = {
+    buf += vec
+    vec.getChildrenFromFields.asScala.foreach { child =>
+      walkFieldVectors(child.asInstanceOf[FieldVector], buf)
     }
+  }
 
-    dst.setValueCount(numRows)
+  private def collectBufferAddresses(
+      nodes: IndexedSeq[FieldVector],
+      expected: Int): Array[Long] = {
+    val addrs = new Array[Long](expected)
+    var idx = 0
+    var ni = 0
+    while (ni < nodes.size) {
+      val bufs = nodes(ni).getFieldBuffers
+      var bi = 0
+      while (bi < bufs.size) {
+        addrs(idx) = bufs.get(bi).memoryAddress()
+        idx += 1
+        bi += 1
+      }
+      ni += 1
+    }
+    require(idx == expected, s"collected $idx addresses, expected $expected")
+    addrs
   }
 }

From 0a98c7debc2210df9d8b0924933285db23dbe627 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 01:41:01 -0600
Subject: [PATCH 37/54] test: refresh pyarrow UDF benchmark header for
 bulk-copy path [skip ci]

---
 .../src/test/resources/pyspark/benchmark_pyarrow_udf.py  | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
index 49574130c0..08f2c6540f 100644
--- a/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/benchmark_pyarrow_udf.py
@@ -19,6 +19,9 @@
 """
 End-to-end wall-clock benchmark for Comet's PyArrow UDF acceleration.
 
+Requires PySpark 4.0.1+ (Comet's columnar runner targets Spark 4.0+ only;
+3.5 and 3.4 are documented no-ops).
+
 Times `df.mapInArrow(passthrough, schema).count()` and the equivalent
 `mapInPandas` query with `spark.comet.exec.pyarrowUdf.enabled` set
 to false (vanilla Spark path) and true (Comet's optimized path). Both
@@ -26,8 +29,10 @@
 optimization actually changes for users:
 
   * vanilla:   CometScan -> ColumnarToRow + UnsafeProjection -> ArrowPythonRunner
-  * optimized: CometScan -> rowIterator -> ArrowPythonRunner (same runner;
-              no UnsafeProjection, output kept as ColumnarBatch)
+              (per-row InternalRow.getXXX() loop inside ArrowWriter.write)
+  * optimized: CometScan -> CometMapInBatchExec -> CometArrowPythonRunner
+              (per-buffer Unsafe.copyMemory from Comet's vectors into the
+              runner's persistent VectorSchemaRoot; no row materialization)
 
 Results are wall-clock seconds, so they include Python interpreter,
 Arrow IPC, and downstream count() costs. That's intentional: the

From 7e4ace85d76595a31ef2401edac881bb7d007afb Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 01:41:29 -0600
Subject: [PATCH 38/54] docs: bump pyarrow UDF user guide to Spark 4.0+; note
 buffer-copy boundary

---
 docs/source/user-guide/latest/pyarrow-udfs.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 8495184812..f22d926541 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -181,17 +181,18 @@ on the unoptimized path.
 
 - The optimization currently applies only to `mapInArrow` and `mapInPandas`. Scalar pandas UDFs
   (`@pandas_udf`) and grouped operations (`applyInPandas`) are not yet supported.
-- The internal row-to-Arrow conversion inside the Python runner is still present in this version.
-  Comet currently routes columnar input through `ColumnarBatch.rowIterator()` so that the existing
-  `ArrowPythonRunner` can re-encode the rows back to Arrow IPC. A future optimization will write
-  Arrow batches directly to the Python IPC stream, eliminating the remaining round-trip and
-  achieving near zero-copy data transfer.
 - The optimization requires Arrow data on the input side. If a shuffle sits between the upstream
   Comet operator and the Python UDF, you need Comet's native shuffle for the optimization to
   apply. Set `spark.shuffle.manager` to
   `org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager` and enable
   `spark.comet.exec.shuffle.enabled=true` at session startup. With a vanilla Spark `Exchange`
   in the plan the data leaves the shuffle as rows and the optimization cannot fire.
-- Spark 3.4 lacks several APIs the optimization depends on (`MapInBatchExec.isBarrier`,
-  `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor). On
-  Spark 3.4 the feature is a no-op even when enabled. Spark 3.5+ is required.
+- Spark 4.0 or newer is required. On Spark 3.4 and 3.5 the optimization is a no-op even when
+  enabled; vanilla `PythonMapInArrowExec` / `MapInPandasExec` handle the operation. The Spark 3.5
+  `PythonArrowInput` trait has a different contract than 4.x and a separate implementation has
+  not been written. Track 3.5 support as a future follow-on if there is user demand.
+- The current implementation copies Comet's vector buffers into Spark's allocator via
+  `Unsafe.copyMemory` (one bulk memcpy per buffer per column). True zero-copy via
+  `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator`
+  (rather than each reader constructing its own independent `RootAllocator`). A future PR that
+  unifies the allocator parent would unlock zero-copy.

From e4dfd2a64a3b5e41cda0591bdb14c99486c0a154 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 12 May 2026 02:01:38 -0600
Subject: [PATCH 39/54] docs: link pyarrow UDF zero-copy follow-on issue
 (#4294)

---
 docs/source/user-guide/latest/pyarrow-udfs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index f22d926541..42130c18eb 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -194,5 +194,5 @@ on the unoptimized path.
 - The current implementation copies Comet's vector buffers into Spark's allocator via
   `Unsafe.copyMemory` (one bulk memcpy per buffer per column). True zero-copy via
   `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator`
-  (rather than each reader constructing its own independent `RootAllocator`). A future PR that
-  unifies the allocator parent would unlock zero-copy.
+  (rather than each reader constructing its own independent `RootAllocator`). Tracked in
+  [#4294](https://github.com/apache/datafusion-comet/issues/4294).

From 35c03b3c00a5a61a6274bfab22741ce520f80a9b Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 20 May 2026 12:16:47 -0600
Subject: [PATCH 40/54] refactor: remove Arrow shading workaround from pyarrow
 UDF input trait
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The long[]-address indirection through CometVectorIpcCopier existed because
comet-common shaded org.apache.arrow.* into org.apache.comet.shaded.arrow.*,
making source vectors and Spark's IPC root different JVM types. After #4325
moved most JVM code into comet-spark and dropped the shading, both sides see
the same Arrow classes — the helper is no longer needed.

Replace with a direct walk of the source/destination FieldVector trees using
ArrowBuf.setBytes for the buffer copy. Same per-buffer memcpy semantics; the
cross-RootAllocator constraint that blocks true zero-copy is independent of
shading and still tracked in #4294.
---
 docs/source/user-guide/latest/pyarrow-udfs.md |   8 +-
 .../comet/vector/CometVectorIpcCopier.java    | 122 --------------
 .../python/CometColumnarPythonInput.scala     | 159 +++++++-----------
 3 files changed, 61 insertions(+), 228 deletions(-)
 delete mode 100644 spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 42130c18eb..68b3ba15c3 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -191,8 +191,8 @@ on the unoptimized path.
   enabled; vanilla `PythonMapInArrowExec` / `MapInPandasExec` handle the operation. The Spark 3.5
   `PythonArrowInput` trait has a different contract than 4.x and a separate implementation has
   not been written. Track 3.5 support as a future follow-on if there is user demand.
-- The current implementation copies Comet's vector buffers into Spark's allocator via
-  `Unsafe.copyMemory` (one bulk memcpy per buffer per column). True zero-copy via
-  `TransferPair` is blocked on Comet's Parquet readers allocating from `ArrowUtils.rootAllocator`
-  (rather than each reader constructing its own independent `RootAllocator`). Tracked in
+- The current implementation copies Comet's vector buffers into Spark's allocator one
+  buffer at a time. True zero-copy via `TransferPair` is blocked on Comet's Parquet
+  readers allocating from `ArrowUtils.rootAllocator` (rather than each reader
+  constructing its own independent `RootAllocator`). Tracked in
   [#4294](https://github.com/apache/datafusion-comet/issues/4294).
diff --git a/spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java b/spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java
deleted file mode 100644
index 368f02b0ec..0000000000
--- a/spark/src/main/java/org/apache/comet/vector/CometVectorIpcCopier.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.comet.vector;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.arrow.memory.ArrowBuf;
-import org.apache.arrow.memory.util.MemoryUtil;
-import org.apache.arrow.vector.FieldVector;
-
-/**
- * Helpers that copy the contents of a {@link CometDecodedVector} (whose underlying Arrow buffers
- * live in the shaded {@code org.apache.comet.shaded.arrow.*} package after the comet-common jar is
- * built) into destination buffer addresses provided by the caller.
- *
- * <p>Callers in {@code comet-spark} reference the unshaded {@code org.apache.arrow.*} classes
- * supplied by Spark at runtime. Direct cross-package access from the spark module would fail with a
- * {@code ClassCastException}. Crossing the boundary via raw memory addresses (long primitives)
- * sidesteps the class identity issue: the bytes on disk are identical regardless of which Arrow
- * Java distribution produced them.
- *
- * <p>All traversals use {@code getFieldBuffers()} and {@code getChildrenFromFields()} — the same
- * API that {@code VectorUnloader} uses — so buffer ordering and counts are consistent between the
- * source (shaded) and destination (unshaded) sides.
- */
-public final class CometVectorIpcCopier {
-
-  private CometVectorIpcCopier() {}
-
-  /**
-   * Returns the readable byte counts of all buffers in {@code cometVec}'s underlying Arrow tree, in
-   * depth-first order (the same order {@code VectorUnloader} uses).
-   *
-   * <p>The caller can use this to size destination buffers before calling {@link
-   * #copyBuffersToAddresses}.
-   */
-  public static long[] bufferReadableBytes(CometDecodedVector cometVec) {
-    List<Long> sizes = new ArrayList<>();
-    collectBufferSizes((FieldVector) cometVec.getValueVector(), sizes);
-    long[] out = new long[sizes.size()];
-    for (int i = 0; i < sizes.size(); i++) {
-      out[i] = sizes.get(i);
-    }
-    return out;
-  }
-
-  /**
-   * Returns the {@code valueCount} of every {@link FieldVector} node in {@code cometVec}'s tree, in
-   * depth-first order. The first entry is the value count of the top-level vector; subsequent
-   * entries are for nested children (struct fields, list elements).
-   */
-  public static int[] valueCounts(CometDecodedVector cometVec) {
-    List<Integer> counts = new ArrayList<>();
-    collectValueCounts((FieldVector) cometVec.getValueVector(), counts);
-    int[] out = new int[counts.size()];
-    for (int i = 0; i < counts.size(); i++) {
-      out[i] = counts.get(i);
-    }
-    return out;
-  }
-
-  /**
-   * Copies all of {@code cometVec}'s buffer bytes into {@code destAddresses}, in the same
-   * depth-first order as {@link #bufferReadableBytes}. Each destination address must be backed by
-   * at least the corresponding entry from {@code bufferReadableBytes} bytes of writable memory.
-   */
-  public static void copyBuffersToAddresses(CometDecodedVector cometVec, long[] destAddresses) {
-    walkAndCopy((FieldVector) cometVec.getValueVector(), destAddresses, new int[] {0});
-  }
-
-  private static void collectBufferSizes(FieldVector vec, List<Long> out) {
-    for (ArrowBuf buf : vec.getFieldBuffers()) {
-      out.add(buf.readableBytes());
-    }
-    for (FieldVector child : vec.getChildrenFromFields()) {
-      collectBufferSizes(child, out);
-    }
-  }
-
-  private static void collectValueCounts(FieldVector vec, List<Integer> out) {
-    out.add(vec.getValueCount());
-    for (FieldVector child : vec.getChildrenFromFields()) {
-      collectValueCounts(child, out);
-    }
-  }
-
-  private static void walkAndCopy(FieldVector vec, long[] addrs, int[] cursor) {
-    for (ArrowBuf buf : vec.getFieldBuffers()) {
-      if (cursor[0] >= addrs.length) {
-        throw new IllegalArgumentException(
-            "destAddresses too small at cursor="
-                + cursor[0]
-                + " (have "
-                + addrs.length
-                + " addresses)");
-      }
-      MemoryUtil.copyMemory(buf.memoryAddress(), addrs[cursor[0]], buf.readableBytes());
-      cursor[0]++;
-    }
-    for (FieldVector child : vec.getChildrenFromFields()) {
-      walkAndCopy(child, addrs, cursor);
-    }
-  }
-}
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
index cd50c8b238..5cce2aaf28 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
@@ -22,7 +22,6 @@ package org.apache.spark.sql.execution.python
 import java.io.DataOutputStream
 import java.nio.channels.Channels
 
-import scala.collection.mutable.ArrayBuffer
 import scala.jdk.CollectionConverters._
 
 import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader}
@@ -36,21 +35,18 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.unsafe.Platform
 
-import org.apache.comet.vector.{CometDecodedVector, CometVectorIpcCopier}
+import org.apache.comet.vector.CometDecodedVector
 
 /**
  * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python
  * worker as Arrow IPC.
  *
- * Comet's vectors live in the shaded `org.apache.comet.shaded.arrow.*` package at runtime
- * (relocated by comet-common's maven-shade-plugin). This trait must not reference shaded Arrow
- * types directly; buffer copying is delegated to `CometVectorIpcCopier` in comet-common, which
- * crosses the module boundary using only `long` primitives.
- *
- * Per-batch: walk the destination struct's children (unshaded, allocated from the runner's
- * persistent root), allocate each child sized to match the corresponding Comet column, collect
- * dst buffer addresses into a `long[]`, and call the helper for a single bulk memcpy across all
- * buffers.
+ * Per batch: walk the destination struct's children, allocate each child sized to match the
+ * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The source (Comet's
+ * vectors) and the destination (Spark's persistent IPC root) live in different `RootAllocator`
+ * trees, so `TransferPair` / `VectorLoader.load` cannot rebind buffers across the boundary;
+ * per-buffer memcpy is the available alternative until the readers share a parent allocator
+ * (tracked in #4294).
  */
 private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] {
   self: BasePythonRunner[Iterator[ColumnarBatch], _] =>
@@ -97,7 +93,12 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator
 
     var i = 0
     while (i < cometBatch.numCols()) {
-      val src = cometBatch.column(i).asInstanceOf[CometDecodedVector]
+      val src =
+        cometBatch
+          .column(i)
+          .asInstanceOf[CometDecodedVector]
+          .getValueVector
+          .asInstanceOf[FieldVector]
       val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
       copyVector(src, dst)
       i += 1
@@ -126,103 +127,57 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator
   }
 
   /**
-   * Copy a Comet column (whose Arrow buffers are in the shaded class tree) into the destination
-   * FieldVector (allocated from the runner's persistent root, in the unshaded class tree). The
-   * actual byte copy happens inside `CometVectorIpcCopier` in comet-common, which references only
-   * shaded Arrow types internally and exposes the buffer addresses as `long` primitives.
+   * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes
+   * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then
+   * sets value counts bottom-up so `setValueCount` does not rewrite the offset bytes we just
+   * copied.
    */
-  private def copyVector(src: CometDecodedVector, dst: FieldVector): Unit = {
-    val srcBufSizes = CometVectorIpcCopier.bufferReadableBytes(src)
-    val srcValueCounts = CometVectorIpcCopier.valueCounts(src)
-
-    val dstNodes = collectFieldVectors(dst)
-    require(
-      dstNodes.size == srcValueCounts.length,
-      s"tree node count mismatch for ${dst.getField}: " +
-        s"dst=${dstNodes.size}, src=${srcValueCounts.length}")
-
-    var bufIdx = 0
-    var nodeIdx = 0
-    while (nodeIdx < dstNodes.size) {
-      val node = dstNodes(nodeIdx)
-      val valueCount = srcValueCounts(nodeIdx)
-      node match {
-        case bfwv: BaseFixedWidthVector =>
-          bfwv.allocateNew(valueCount)
-        case bvwv: BaseVariableWidthVector =>
-          val ownBufCount = node.getFieldBuffers.size
-          val dataSize = srcBufSizes(bufIdx + ownBufCount - 1)
-          bvwv.allocateNew(dataSize, valueCount)
-        case blvwv: BaseLargeVariableWidthVector =>
-          val ownBufCount = node.getFieldBuffers.size
-          val dataSize = srcBufSizes(bufIdx + ownBufCount - 1)
-          blvwv.allocateNew(dataSize, valueCount)
-        case _ =>
-          node.setInitialCapacity(valueCount)
-          node.allocateNew()
-      }
-      bufIdx += node.getFieldBuffers.size
-      nodeIdx += 1
+  private def copyVector(src: FieldVector, dst: FieldVector): Unit = {
+    val valueCount = src.getValueCount
+
+    dst match {
+      case bfwv: BaseFixedWidthVector =>
+        bfwv.allocateNew(valueCount)
+      case bvwv: BaseVariableWidthVector =>
+        bvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount)
+      case blvwv: BaseLargeVariableWidthVector =>
+        blvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount)
+      case _ =>
+        dst.setInitialCapacity(valueCount)
+        dst.allocateNew()
     }
+
+    val srcBufs = src.getFieldBuffers
+    val dstBufs = dst.getFieldBuffers
     require(
-      bufIdx == srcBufSizes.length,
-      s"buffer count mismatch for ${dst.getField}: dst=$bufIdx, src=${srcBufSizes.length}")
-
-    val dstAddrs = collectBufferAddresses(dstNodes, srcBufSizes.length)
-    CometVectorIpcCopier.copyBuffersToAddresses(src, dstAddrs)
-
-    // Process nodes bottom-up (leaves first) so that when a composite vector (struct, list)
-    // calls setValueCount on its children recursively, those children have already had their
-    // lastSet field updated and fillHoles becomes a no-op.
-    var fi = dstNodes.size - 1
-    while (fi >= 0) {
-      val node = dstNodes(fi)
-      val vc = srcValueCounts(fi)
-      // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list
-      // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied
-      // offset bytes are preserved.
-      node match {
-        case v: BaseVariableWidthVector => v.setLastSet(vc - 1)
-        case v: BaseLargeVariableWidthVector => v.setLastSet(vc - 1)
-        case v: ListVector => v.setLastSet(vc - 1)
-        case v: LargeListVector => v.setLastSet(vc - 1)
-        case _ =>
-      }
-      node.setValueCount(vc)
-      fi -= 1
+      srcBufs.size == dstBufs.size,
+      s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}")
+    var b = 0
+    while (b < srcBufs.size) {
+      val s = srcBufs.get(b)
+      dstBufs.get(b).setBytes(0, s, 0, s.readableBytes)
+      b += 1
     }
-  }
-
-  private def collectFieldVectors(vec: FieldVector): IndexedSeq[FieldVector] = {
-    val buf = ArrayBuffer.empty[FieldVector]
-    walkFieldVectors(vec, buf)
-    buf.toIndexedSeq
-  }
 
-  private def walkFieldVectors(vec: FieldVector, buf: ArrayBuffer[FieldVector]): Unit = {
-    buf += vec
-    vec.getChildrenFromFields.asScala.foreach { child =>
-      walkFieldVectors(child.asInstanceOf[FieldVector], buf)
+    val srcChildren = src.getChildrenFromFields
+    val dstChildren = dst.getChildrenFromFields
+    require(
+      srcChildren.size == dstChildren.size,
+      s"child count mismatch for ${dst.getField}: src=${srcChildren.size}, dst=${dstChildren.size}")
+    srcChildren.asScala.zip(dstChildren.asScala).foreach { case (sc, dc) =>
+      copyVector(sc.asInstanceOf[FieldVector], dc.asInstanceOf[FieldVector])
     }
-  }
 
-  private def collectBufferAddresses(
-      nodes: IndexedSeq[FieldVector],
-      expected: Int): Array[Long] = {
-    val addrs = new Array[Long](expected)
-    var idx = 0
-    var ni = 0
-    while (ni < nodes.size) {
-      val bufs = nodes(ni).getFieldBuffers
-      var bi = 0
-      while (bi < bufs.size) {
-        addrs(idx) = bufs.get(bi).memoryAddress()
-        idx += 1
-        bi += 1
-      }
-      ni += 1
+    // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list
+    // types), set lastSet = valueCount - 1 first so fillHoles is a no-op and the already-copied
+    // offset bytes are preserved.
+    dst match {
+      case v: BaseVariableWidthVector => v.setLastSet(valueCount - 1)
+      case v: BaseLargeVariableWidthVector => v.setLastSet(valueCount - 1)
+      case v: ListVector => v.setLastSet(valueCount - 1)
+      case v: LargeListVector => v.setLastSet(valueCount - 1)
+      case _ =>
     }
-    require(idx == expected, s"collected $idx addresses, expected $expected")
-    addrs
+    dst.setValueCount(valueCount)
   }
 }

From 693afef91b514fff870d3f05acf30257844b231e Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 20 May 2026 15:39:15 -0600
Subject: [PATCH 41/54] test: expand pyarrow UDF type coverage and fall back on
 useLargeVarTypes

Adds pytest cases for the data-type branches in CometColumnarPythonInput
that were previously unexercised: numeric scalars (boolean/byte/short/float),
binary, timestamp NTZ, map, and a deeply nested array/struct combination.

Falls back to vanilla Spark when spark.sql.execution.arrow.useLargeVarTypes
is enabled. With that conf on, Spark widens StringType/BinaryType to
8-byte-offset variants in the destination IPC root while Comet's source
vectors keep 4-byte offsets, so the per-buffer memcpy in copyVector would
corrupt the offset buffer.

While narrowing the rule to gate on largeVarTypes, also fix a pre-existing
greedy match: the MapInBatch case used `p: SparkPlan` with the pyarrow conf
as a guard, which matched every plan when the conf was on and consumed the
later CometShuffleExchangeExec arm. The case now gates on a structural check
via eligibleMapInBatchInfo so unrelated plans flow through.
---
 docs/source/user-guide/latest/pyarrow-udfs.md |   5 +
 .../rules/EliminateRedundantTransitions.scala |  61 +++-
 .../resources/pyspark/test_pyarrow_udf.py     | 318 ++++++++++++++++++
 3 files changed, 368 insertions(+), 16 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 68b3ba15c3..0b2cd9aebb 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -191,6 +191,11 @@ on the unoptimized path.
   enabled; vanilla `PythonMapInArrowExec` / `MapInPandasExec` handle the operation. The Spark 3.5
   `PythonArrowInput` trait has a different contract than 4.x and a separate implementation has
   not been written. Track 3.5 support as a future follow-on if there is user demand.
+- `spark.sql.execution.arrow.useLargeVarTypes=true` is not supported. With this conf enabled,
+  Spark widens `StringType` and `BinaryType` to Arrow's 8-byte-offset variants in the
+  destination IPC root, while Comet's source vectors always use 4-byte offsets. The buffer-copy
+  path cannot bridge that mismatch, so `EliminateRedundantTransitions` skips the rewrite and
+  vanilla Spark handles the operation.
 - The current implementation copies Comet's vector buffers into Spark's allocator one
   buffer at a time. True zero-copy via `TransferPair` is blocked on Comet's Parquet
   readers allocating from `ArrowUtils.rootAllocator` (rather than each reader
diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index 24c969c173..ee7a9e085b 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.sideBySide
 import org.apache.spark.sql.comet.{CometCollectLimitExec, CometColumnarToRowExec, CometMapInBatchExec, CometNativeColumnarToRowExec, CometNativeWriteExec, CometPlan, CometSparkToColumnarExec}
 import org.apache.spark.sql.comet.execution.shuffle.{CometColumnarShuffle, CometShuffleExchangeExec}
-import org.apache.spark.sql.comet.shims.ShimCometMapInBatch
+import org.apache.spark.sql.comet.shims.{MapInBatchInfo, ShimCometMapInBatch}
 import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.QueryStageExec
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
@@ -106,21 +106,29 @@ case class EliminateRedundantTransitions(session: SparkSession)
       // UnsafeProjection copies and keeping the stage columnar. The matchers are
       // version-shimmed: Spark 3.4 returns None (it lacks the required APIs) and Spark 4.1+
       // matches the renamed `MapInArrowExec`.
-      case p: SparkPlan if CometConf.COMET_PYARROW_UDF_ENABLED.get() =>
-        matchMapInArrow(p).orElse(matchMapInPandas(p)) match {
-          case Some(info) =>
-            extractColumnarChild(info.child)
-              .map { columnarChild =>
-                CometMapInBatchExec(
-                  info.func,
-                  info.output,
-                  columnarChild,
-                  info.isBarrier,
-                  info.pythonEvalType)
-              }
-              .getOrElse(p)
-          case None => p
-        }
+      //
+      // Falls back to vanilla Spark when `spark.sql.execution.arrow.useLargeVarTypes` is enabled:
+      // CometColumnarPythonInput.copyVector does raw `setBytes` on each Arrow buffer, but Comet's
+      // source string/binary vectors always use 4-byte offsets while the destination root is
+      // allocated with 8-byte offsets when this conf is on. The buffer counts match but the
+      // offset width does not, so a direct memcpy would corrupt the offsets.
+      //
+      // The guard runs `eligibleMapInBatchInfo` so this case only matches actual MapInArrow /
+      // MapInPandas operators. Without the structural check the case would match every
+      // `SparkPlan` whenever the pyarrow conf is on, short-circuiting the
+      // `CometShuffleExchangeExec` arm below.
+      case p if eligibleMapInBatchInfo(p).isDefined =>
+        val info = eligibleMapInBatchInfo(p).get
+        extractColumnarChild(info.child)
+          .map { columnarChild =>
+            CometMapInBatchExec(
+              info.func,
+              info.output,
+              columnarChild,
+              info.isBarrier,
+              info.pythonEvalType)
+          }
+          .getOrElse(p)
 
       // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the
       // shuffle takes row-based input.
@@ -167,6 +175,27 @@ case class EliminateRedundantTransitions(session: SparkSession)
     case _ => None
   }
 
+  /**
+   * Returns `Some(info)` only when this rule should attempt to rewrite `plan` to
+   * `CometMapInBatchExec`, i.e. when the pyarrow UDF conf is on, the largeVarTypes fallback does
+   * not apply, and the plan is one of the version-shimmed MapInArrow / MapInPandas operators.
+   * Used in the pattern guard so the case only fires for plans we actually want to rewrite;
+   * without that narrowing, the `case` would match every `SparkPlan` whenever the conf is on and
+   * shadow the later `CometShuffleExchangeExec` arm. The largeVarTypes conf is read via its raw
+   * key string so this compiles against Spark 3.4, which lacks `SQLConf.arrowUseLargeVarTypes`.
+   */
+  private def eligibleMapInBatchInfo(plan: SparkPlan): Option[MapInBatchInfo] = {
+    if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) {
+      None
+    } else if (plan.conf
+        .getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false")
+        .toBoolean) {
+      None
+    } else {
+      matchMapInArrow(plan).orElse(matchMapInPandas(plan))
+    }
+  }
+
   /**
    * Creates an appropriate columnar to row transition operator.
    *
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 87558ec057..fc671053a6 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -407,6 +407,324 @@ def _normalize(row):
     assert out == expected
 
 
+def test_map_in_arrow_numeric_scalars(spark, tmp_path, accelerated):
+    """
+    Covers the BaseFixedWidthVector branch in CometColumnarPythonInput.copyVector for
+    every fixed-width primitive Comet's scan supports beyond the long/double/int already
+    exercised by other tests: boolean, byte, short, float. Each has a distinct buffer
+    size, and the validity bit handling is independent per column.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("b", T.BooleanType()),
+            T.StructField("tiny", T.ByteType()),
+            T.StructField("small", T.ShortType()),
+            T.StructField("flt", T.FloatType()),
+        ]
+    )
+    rows = [
+        (1, True, 1, 1000, 1.5),
+        (2, False, -128, -32768, -3.25),
+        (3, True, 127, 32767, float("inf")),
+        (4, None, None, None, None),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], r["b"], r["tiny"], r["small"], r["flt"]) for r in result_df.collect()}
+    assert out == set(rows)
+
+
+def test_map_in_arrow_binary_type(spark, tmp_path, accelerated):
+    """
+    BinaryType is the BaseVariableWidthVector path with non-string content. StringType
+    already exercises that path for utf-8 data; binary covers the case where the data
+    buffer can hold arbitrary bytes (including null bytes mid-string).
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("payload", T.BinaryType()),
+        ]
+    )
+    rows = [
+        (1, b"\x00\x01\x02\x03"),
+        (2, b""),
+        (3, b"\xff" * 64),
+        (4, None),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], bytes(r["payload"]) if r["payload"] is not None else None)
+           for r in result_df.collect()}
+    expected = set(rows)
+    assert out == expected
+
+
+def test_map_in_arrow_timestamp_ntz(spark, tmp_path, accelerated):
+    """
+    TimestampNTZType is a separate Arrow type from TimestampType (no timezone) and goes
+    through a different ArrowType.Timestamp(..., tz=None) on the wire.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("ts_ntz", T.TimestampNTZType()),
+        ]
+    )
+    rows = [
+        (1, dt.datetime(2024, 1, 1, 12, 30, 45)),
+        (2, dt.datetime(1970, 1, 1, 0, 0, 0)),
+        (3, None),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], r["ts_ntz"]) for r in result_df.collect()}
+    assert out == set(rows)
+
+
+def test_map_in_arrow_map_type(spark, tmp_path, accelerated):
+    """
+    MapType is encoded in Arrow as a List<Struct<key, value>> with extra metadata. The
+    buffer layout (offsets + struct child + key/value children) is distinct from a plain
+    list, and CometMapVector is a separate vector class from CometListVector. Without
+    this test the recursive copy path through map-typed columns is unexercised.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField(
+                "attrs", T.MapType(T.StringType(), T.IntegerType(), valueContainsNull=True)
+            ),
+        ]
+    )
+    rows = [
+        (1, {"a": 1, "b": 2}),
+        (2, {}),
+        (3, None),
+        (4, {"only": None}),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    def _normalize(row):
+        attrs = row["attrs"]
+        attrs_norm = (
+            tuple(sorted(attrs.items(), key=lambda kv: kv[0]))
+            if attrs is not None
+            else None
+        )
+        return (row["id"], attrs_norm)
+
+    out = {_normalize(r) for r in result_df.collect()}
+    expected = {
+        (
+            r[0],
+            tuple(sorted(r[1].items(), key=lambda kv: kv[0])) if r[1] is not None else None,
+        )
+        for r in rows
+    }
+    assert out == expected
+
+
+def test_map_in_arrow_deeply_nested(spark, tmp_path, accelerated):
+    """
+    Exercises the recursive descent in CometColumnarPythonInput.copyVector at depth > 1,
+    in every nesting combination: array-of-array, array-of-struct, struct-of-array,
+    struct-of-struct. Single-level nesting is covered by test_map_in_arrow_array_and_struct;
+    the bug surface here is that setLastSet / setValueCount must be applied bottom-up
+    correctly at every level.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("matrix", T.ArrayType(T.ArrayType(T.IntegerType()))),
+            T.StructField(
+                "people",
+                T.ArrayType(
+                    T.StructType(
+                        [
+                            T.StructField("name", T.StringType()),
+                            T.StructField("age", T.IntegerType()),
+                        ]
+                    )
+                ),
+            ),
+            T.StructField(
+                "config",
+                T.StructType(
+                    [
+                        T.StructField("flags", T.ArrayType(T.StringType())),
+                        T.StructField(
+                            "limits",
+                            T.StructType(
+                                [
+                                    T.StructField("min", T.IntegerType()),
+                                    T.StructField("max", T.IntegerType()),
+                                ]
+                            ),
+                        ),
+                    ]
+                ),
+            ),
+        ]
+    )
+    rows = [
+        (
+            1,
+            [[1, 2], [3, 4, 5]],
+            [("alice", 30), ("bob", 25)],
+            (["x", "y"], (0, 100)),
+        ),
+        (
+            2,
+            [[], [None, 7]],
+            [("solo", None)],
+            ([], (None, None)),
+        ),
+        (3, None, None, None),
+        (4, [None, [9]], [None, ("ghost", 0)], (None, None)),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    def _norm_array(a):
+        return tuple(a) if a is not None else None
+
+    def _norm_matrix(m):
+        return tuple(_norm_array(inner) for inner in m) if m is not None else None
+
+    def _norm_people(p):
+        if p is None:
+            return None
+        return tuple(
+            (item["name"], item["age"]) if item is not None else None for item in p
+        )
+
+    def _norm_config(c):
+        if c is None:
+            return None
+        flags = _norm_array(c["flags"])
+        limits = c["limits"]
+        limits_norm = (limits["min"], limits["max"]) if limits is not None else None
+        return (flags, limits_norm)
+
+    def _norm_row(r):
+        return (
+            r["id"],
+            _norm_matrix(r["matrix"]),
+            _norm_people(r["people"]),
+            _norm_config(r["config"]),
+        )
+
+    def _norm_input_people(p):
+        if p is None:
+            return None
+        return tuple(item if item is not None else None for item in p)
+
+    def _norm_input_config(c):
+        if c is None:
+            return None
+        flags, limits = c
+        return (_norm_array(flags), limits)
+
+    out = {_norm_row(r) for r in result_df.collect()}
+    expected = {
+        (
+            r[0],
+            _norm_matrix(r[1]),
+            _norm_input_people(r[2]),
+            _norm_input_config(r[3]),
+        )
+        for r in rows
+    }
+    assert out == expected
+
+
+def test_map_in_arrow_falls_back_when_use_large_var_types(spark, tmp_path):
+    """
+    `spark.sql.execution.arrow.useLargeVarTypes=true` widens StringType / BinaryType to
+    LargeUtf8 / LargeBinary in the destination IPC root (8-byte offsets). Comet's source
+    vectors always use 4-byte offsets; CometColumnarPythonInput.copyVector does a raw
+    setBytes per buffer and would corrupt the offset buffer in this configuration.
+    EliminateRedundantTransitions must skip the rewrite in that case so vanilla Spark
+    handles the operation. This test does not use the `accelerated` fixture: it sets
+    pyarrowUdf.enabled=true AND useLargeVarTypes=true and asserts the plan still falls
+    back to vanilla MapInArrow.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("name", T.StringType()),
+        ]
+    )
+    rows = [(i, f"name_{i}") for i in range(20)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    prev_pyarrow = spark.conf.get("spark.comet.exec.pyarrowUdf.enabled", "false")
+    prev_large = spark.conf.get("spark.sql.execution.arrow.useLargeVarTypes", "false")
+    spark.conf.set("spark.comet.exec.pyarrowUdf.enabled", "true")
+    spark.conf.set("spark.sql.execution.arrow.useLargeVarTypes", "true")
+    try:
+        result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+        plan = _executed_plan(result_df)
+        assert "CometMapInBatch" not in plan, (
+            f"useLargeVarTypes=true should force fallback, but plan has "
+            f"CometMapInBatch:\n{plan}"
+        )
+        assert "MapInArrow" in plan, (
+            f"expected vanilla MapInArrow in fallback plan, got:\n{plan}"
+        )
+        out = sorted((r["id"], r["name"]) for r in result_df.collect())
+        assert out == sorted(rows)
+    finally:
+        spark.conf.set("spark.comet.exec.pyarrowUdf.enabled", prev_pyarrow)
+        spark.conf.set("spark.sql.execution.arrow.useLargeVarTypes", prev_large)
+
+
 def test_map_in_arrow_after_shuffle(spark, tmp_path, accelerated):
     """
     Verifies correctness when a shuffle sits between the Comet scan and the

From f79e905334773d467409ac2d3a2d638658f241ed Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 21 May 2026 09:05:37 -0600
Subject: [PATCH 42/54] fix: emit CometVector from CometMapInBatchExec so
 downstream Comet consumers work

Wrap each ArrowColumnVector from the Python output as a CometVector via
CometVector.getVector before emitting the batch. This makes the operator a
proper Comet columnar producer:

- a CometMapInBatchExec stacked above another one can cast the input column
  to CometDecodedVector (as CometColumnarPythonInput already does)
- NativeUtil.exportBatch's case match handles the output (CometVector arm,
  not the SparkException-throwing fallthrough), so a Comet native aggregate
  or join probe over the UDF output does not blow up at FFI handoff

Adds pytest cases that exercise the consumer shapes (chained mapInArrow,
filter on UDF output, groupBy/agg on UDF output) plus a Scala plan-level
test pinning the chained-rewrite structure.

Addresses mbutrovich's correctness comment on #4234.
---
 .../spark/sql/comet/CometMapInBatchExec.scala |  21 +++-
 .../resources/pyspark/test_pyarrow_udf.py     | 113 ++++++++++++++++++
 .../sql/comet/CometMapInBatchSuite.scala      |  33 +++++
 3 files changed, 163 insertions(+), 4 deletions(-)

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
index 14e19ab50f..4c40e68809 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
@@ -30,7 +30,9 @@ import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNo
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.execution.python.PythonSQLMetrics
 import org.apache.spark.sql.types.{StructField, StructType}
-import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
+import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector}
+
+import org.apache.comet.vector.CometVector
 
 /**
  * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` /
@@ -102,10 +104,21 @@ case class CometMapInBatchExec(
         context)
 
       columnarBatchIter.map { batch =>
-        // Python returns a single struct column; flatten to the user's output columns.
+        // Python returns a single struct column; flatten to the user's output columns and
+        // re-wrap each child as CometVector so consumers that expect Comet's vector hierarchy
+        // (e.g. another CometMapInBatchExec stacked on top, or NativeUtil.exportBatch for a
+        // downstream native Comet operator) see the right type. Sharing the underlying Arrow
+        // ValueVector with the original ArrowColumnVector is safe: close() on either ends up
+        // releasing the same buffers, and arrow-vector's release path is idempotent.
         val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
-        val outputVectors = outputAttrs.indices.map(structVector.getChild)
-        val flattenedBatch = new ColumnarBatch(outputVectors.toArray)
+        val outputVectors: Array[ColumnVector] = outputAttrs.indices.map { i =>
+          val childArrow = structVector.getChild(i)
+          CometVector.getVector(
+            childArrow.getValueVector,
+            /* useDecimal128 */ true,
+            /* dictionaryProvider */ null)
+        }.toArray
+        val flattenedBatch = new ColumnarBatch(outputVectors)
         flattenedBatch.setNumRows(batch.numRows())
         numOutputRows += flattenedBatch.numRows()
         numOutputBatches += 1
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index fc671053a6..4567bedf6d 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -759,6 +759,119 @@ def passthrough(iterator):
     assert out == sorted(rows)
 
 
+def test_chained_map_in_arrow(spark, tmp_path, accelerated):
+    """
+    `df.mapInArrow(udf1).mapInArrow(udf2)` stacks two operators. With the rewrite
+    enabled both become `CometMapInBatchExec`, so the inner one's output feeds
+    the outer one's input. The outer operator's input path expects vectors of
+    `CometDecodedVector` type: if the inner's output is plain `ArrowColumnVector`
+    the outer throws `ClassCastException` on the first batch.
+    """
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.DoubleType()),
+        ]
+    )
+    rows = [(i, float(i)) for i in range(50)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema).write.parquet(src)
+
+    def add_one(iterator):
+        for batch in iterator:
+            pdf = batch.to_pandas()
+            pdf["value"] = pdf["value"] + 1.0
+            yield pa.RecordBatch.from_pandas(pdf)
+
+    def double_value(iterator):
+        for batch in iterator:
+            pdf = batch.to_pandas()
+            pdf["value"] = pdf["value"] * 2.0
+            yield pa.RecordBatch.from_pandas(pdf)
+
+    result_df = (
+        spark.read.parquet(src)
+        .mapInArrow(add_one, schema)
+        .mapInArrow(double_value, schema)
+    )
+
+    if accelerated:
+        plan = _executed_plan(result_df)
+        assert plan.count("CometMapInBatch") >= 2, (
+            f"expected two CometMapInBatch operators in accelerated plan, got:\n{plan}"
+        )
+
+    out = sorted((r["id"], r["value"]) for r in result_df.collect())
+    expected = sorted((i, (float(i) + 1.0) * 2.0) for i in range(50))
+    assert out == expected
+
+
+def test_filter_on_map_in_arrow_output(spark, tmp_path, accelerated):
+    """
+    A filter on the UDF output column is a downstream Comet operator (when Comet's
+    native filter applies) reading from `CometMapInBatchExec`'s output. If the
+    output were plain `ArrowColumnVector`, NativeUtil.exportBatch's case match
+    would fall to the `case c =>` arm and throw SparkException.
+    """
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.LongType()),
+        ]
+    )
+    rows = [(i, i * 2) for i in range(100)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = (
+        spark.read.parquet(src).mapInArrow(passthrough, schema).filter("value > 50")
+    )
+
+    out = sorted((r["id"], r["value"]) for r in result_df.collect())
+    expected = sorted((i, i * 2) for i in range(100) if i * 2 > 50)
+    assert out == expected
+
+
+def test_aggregate_on_map_in_arrow_output(spark, tmp_path, accelerated):
+    """
+    `mapInArrow(...).groupBy(...).agg(...)` puts an aggregate over the UDF output.
+    The aggregate is a Comet operator and reads from `CometMapInBatchExec`'s
+    output via NativeUtil.exportBatch when promoted to the native pipeline. If
+    the output were ArrowColumnVector, exportBatch would throw on every batch.
+    """
+    schema = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("grp", T.LongType()),
+            T.StructField("value", T.LongType()),
+        ]
+    )
+    rows = [(i, i % 5, i) for i in range(100)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = (
+        spark.read.parquet(src)
+        .mapInArrow(passthrough, schema)
+        .groupBy("grp")
+        .agg({"value": "sum"})
+    )
+
+    out = {r["grp"]: r["sum(value)"] for r in result_df.collect()}
+    expected = {}
+    for i in range(100):
+        expected[i % 5] = expected.get(i % 5, 0) + i
+    assert out == expected
+
+
 def test_map_in_arrow_barrier_mode(spark, tmp_path, accelerated):
     """
     `mapInArrow(..., barrier=True)` runs the stage in barrier execution mode
diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
index 79d75bb2cf..b4f3d64d76 100644
--- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
+++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
@@ -105,6 +105,39 @@ class CometMapInBatchSuite extends CometTestBase {
     }
   }
 
+  test("rule handles chained MapInArrowExec without crashing") {
+    // df.mapInArrow(...).mapInArrow(...) produces two MapInArrowExec operators. The outer
+    // consumes rows from the inner directly (MapInArrowExec is a row producer), so there is
+    // no ColumnarToRow between them. After the rule's bottom-up rewrite the inner becomes
+    // CometMapInBatchExec; the outer keeps its row contract and is satisfied by
+    // CometMapInBatchExec.doExecute() reintroducing a ColumnarToRow internally. The
+    // assertion exists mainly to pin the structure: revisit it if a future change makes both
+    // operators rewrite (the bulk-copy input path would then need to accept a CometVector input
+    // that did not come from a CometDecodedVector chain).
+    withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") {
+      val cometLeaf = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L))))
+      val inner = MapInArrowExec(
+        stubPythonUDF,
+        cometLeaf.output,
+        ColumnarToRowExec(cometLeaf),
+        isBarrier = false,
+        profile = None)
+      val outer = MapInArrowExec(
+        stubPythonUDF,
+        cometLeaf.output,
+        inner,
+        isBarrier = false,
+        profile = None)
+
+      val rewritten = EliminateRedundantTransitions(spark).apply(outer)
+      val cometOps = rewritten.collect { case op: CometMapInBatchExec => op }
+      assert(
+        cometOps.size == 1,
+        s"expected the inner MapInArrowExec to be rewritten, but the chain produced " +
+          s"${cometOps.size} CometMapInBatchExec(s):\n$rewritten")
+    }
+  }
+
   test("end-to-end: rewrite-on output matches rewrite-off output for primitives + varchar") {
     // This test needs PySpark workers; only run if PYSPARK_PYTHON is set in the env.
     assume(

From 2b11f4612d3d8dbf3d9b8b2f4de05f669e4fd4ba Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 21 May 2026 09:17:28 -0600
Subject: [PATCH 43/54] refactor: tighten EliminateRedundantTransitions arm and
 dedupe 3.x stub

- Move the 3.4 / 3.5 ShimCometMapInBatch stubs into a single spark-3.x shim
  (they were byte-identical). The matchers still return None on both versions
  so the rule is a no-op on Spark 3.x.

- Replace the eligibleMapInBatchInfo guard + .get unpack with an
  EligibleMapInBatch extractor that runs the matchers and conf reads once
  per visited plan.

- Add arrowUseLargeVarTypes(conf) to ShimSQLConf so the rule no longer reads
  the conf stringly. 4.x and 3.5 forward to the typed accessor; 3.4 falls
  back to getConfString because that version has no accessor.

- Hoist the per-batch VectorUnloader in CometColumnarPythonInput to a lazy
  val. getRecordBatch reads root.getFieldVectors on every call so reuse is
  safe; this drops one allocation per batch.

- Clarify the comment on cometCodec: 4.0.x has no
  SQLConf.arrowCompressionCodec accessor (added after 4.0 branch was cut),
  so a typed ShimSQLConf forwarder would still need a stringly fallback for
  the 4.0 build. The 4.1+ codec instances live in the separate
  arrow-compression artifact, which Comet does not depend on; the
  CompressionCodec.Factory path keeps that dependency contained.

Addresses mbutrovich's items 5, 6, 8, 10 on #4234.
---
 .../rules/EliminateRedundantTransitions.scala | 63 ++++++++-----------
 .../org/apache/comet/shims/ShimSQLConf.scala  |  9 +++
 .../sql/comet/shims/ShimCometMapInBatch.scala | 60 ------------------
 .../org/apache/comet/shims/ShimSQLConf.scala  |  8 ++-
 .../sql/comet/shims/ShimCometMapInBatch.scala | 19 ++++--
 .../org/apache/comet/shims/ShimSQLConf.scala  | 10 ++-
 .../python/CometColumnarPythonInput.scala     | 18 ++++--
 7 files changed, 77 insertions(+), 110 deletions(-)
 delete mode 100644 spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
 rename spark/src/main/{spark-3.5 => spark-3.x}/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala (72%)

diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index ee7a9e085b..ce3b78a9fa 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.execution.adaptive.QueryStageExec
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
 
 import org.apache.comet.CometConf
+import org.apache.comet.shims.ShimSQLConf
 
 // This rule is responsible for eliminating redundant transitions between row-based and
 // columnar-based operators for Comet. Currently, three potential redundant transitions are:
@@ -54,7 +55,8 @@ import org.apache.comet.CometConf
 // be removed.
 case class EliminateRedundantTransitions(session: SparkSession)
     extends Rule[SparkPlan]
-    with ShimCometMapInBatch {
+    with ShimCometMapInBatch
+    with ShimSQLConf {
 
   private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get()
 
@@ -104,31 +106,21 @@ case class EliminateRedundantTransitions(session: SparkSession)
       // Replace MapInBatchExec (PythonMapInArrowExec / MapInArrowExec / MapInPandasExec) that has
       // a ColumnarToRow child with CometMapInBatchExec, eliminating the input and output
       // UnsafeProjection copies and keeping the stage columnar. The matchers are
-      // version-shimmed: Spark 3.4 returns None (it lacks the required APIs) and Spark 4.1+
-      // matches the renamed `MapInArrowExec`.
+      // version-shimmed: Spark 3.4 / 3.5 return None (they lack the required APIs) and Spark
+      // 4.1+ matches the renamed `MapInArrowExec`.
       //
       // Falls back to vanilla Spark when `spark.sql.execution.arrow.useLargeVarTypes` is enabled:
       // CometColumnarPythonInput.copyVector does raw `setBytes` on each Arrow buffer, but Comet's
       // source string/binary vectors always use 4-byte offsets while the destination root is
       // allocated with 8-byte offsets when this conf is on. The buffer counts match but the
       // offset width does not, so a direct memcpy would corrupt the offsets.
-      //
-      // The guard runs `eligibleMapInBatchInfo` so this case only matches actual MapInArrow /
-      // MapInPandas operators. Without the structural check the case would match every
-      // `SparkPlan` whenever the pyarrow conf is on, short-circuiting the
-      // `CometShuffleExchangeExec` arm below.
-      case p if eligibleMapInBatchInfo(p).isDefined =>
-        val info = eligibleMapInBatchInfo(p).get
-        extractColumnarChild(info.child)
-          .map { columnarChild =>
-            CometMapInBatchExec(
-              info.func,
-              info.output,
-              columnarChild,
-              info.isBarrier,
-              info.pythonEvalType)
-          }
-          .getOrElse(p)
+      case EligibleMapInBatch(info, columnarChild) =>
+        CometMapInBatchExec(
+          info.func,
+          info.output,
+          columnarChild,
+          info.isBarrier,
+          info.pythonEvalType)
 
       // Spark adds `RowToColumnar` under Comet columnar shuffle. But it's redundant as the
       // shuffle takes row-based input.
@@ -176,23 +168,22 @@ case class EliminateRedundantTransitions(session: SparkSession)
   }
 
   /**
-   * Returns `Some(info)` only when this rule should attempt to rewrite `plan` to
-   * `CometMapInBatchExec`, i.e. when the conf is on, the largeVarTypes fallback does not apply,
-   * and the plan is one of the version-shimmed MapInArrow / MapInPandas operators. Used in the
-   * pattern guard so the case only fires for plans we actually want to rewrite - without that
-   * narrowing, the `case` would match every `SparkPlan` whenever the conf is on and consume the
-   * later `CometShuffleExchangeExec` arm. Read the conf via the raw key string so this compiles
-   * against Spark 3.4, which lacks `SQLConf.arrowUseLargeVarTypes`.
+   * Matches the plans this rule should rewrite to `CometMapInBatchExec`. Single extractor used in
+   * the `transformUp` arm above so the matchers and conf reads run once per visited plan. Returns
+   * `(info, columnarChild)` where `columnarChild` is the Comet columnar producer that
+   * `CometMapInBatchExec` will consume directly. Returns `None` (and the arm misses) when the
+   * conf is off, when `useLargeVarTypes` forces the fallback, when the plan is not one of the
+   * version-shimmed MapInArrow / MapInPandas operators, or when the child is not a Comet
+   * columnar-to-row transition we can strip.
    */
-  private def eligibleMapInBatchInfo(plan: SparkPlan): Option[MapInBatchInfo] = {
-    if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) {
-      None
-    } else if (plan.conf
-        .getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false")
-        .toBoolean) {
-      None
-    } else {
-      matchMapInArrow(plan).orElse(matchMapInPandas(plan))
+  private object EligibleMapInBatch {
+    def unapply(plan: SparkPlan): Option[(MapInBatchInfo, SparkPlan)] = {
+      if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) None
+      else if (arrowUseLargeVarTypes(plan.conf)) None
+      else
+        matchMapInArrow(plan)
+          .orElse(matchMapInPandas(plan))
+          .flatMap(info => extractColumnarChild(info.child).map(child => (info, child)))
     }
   }
 
diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala
index 0bff426c21..e809e33904 100644
--- a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala
+++ b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala
@@ -19,9 +19,18 @@
 
 package org.apache.comet.shims
 
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
 
 trait ShimSQLConf {
   protected val LEGACY = LegacyBehaviorPolicy.LEGACY
   protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED
+
+  /**
+   * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.4 has no typed accessor for this
+   * conf, so read by raw key. The conf only governs the destination Arrow IPC root width on
+   * Spark 4.x, so the value returned here matters only to callers that look it up explicitly.
+   */
+  protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean =
+    conf.getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false").toBoolean
 }
diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
deleted file mode 100644
index 1fd4b96f09..0000000000
--- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.comet.shims
-
-import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.expressions.PythonUDF
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.vectorized.ColumnarBatch
-
-/**
- * Spark 3.4 shim for the PyArrow UDF acceleration support.
- *
- * Spark 3.4 lacks several APIs that the optimization relies on (`isBarrier` on `MapInBatchExec`,
- * `arrowUseLargeVarTypes`, `JobArtifactSet`, the modern `ArrowPythonRunner` constructor), so the
- * matchers return `None` and the runner factory throws. The optimization is effectively a no-op
- * on Spark 3.4.
- */
-trait ShimCometMapInBatch {
-
-  protected def matchMapInArrow(plan: SparkPlan): Option[MapInBatchInfo] = None
-
-  protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None
-
-  /** Stub; never constructed on Spark 3.4 because the matchers always return `None`. */
-  protected case class RunnerInputs()
-
-  protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
-    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4")
-
-  protected def computeArrowPython(
-      runnerInputs: RunnerInputs,
-      evalType: Int,
-      argOffsets: Array[Array[Int]],
-      schema: StructType,
-      pythonMetrics: Map[String, SQLMetric],
-      batchIter: Iterator[Iterator[ColumnarBatch]],
-      partitionId: Int,
-      context: TaskContext): Iterator[ColumnarBatch] =
-    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.4")
-}
diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala
index bdb2739460..219e0f2a2e 100644
--- a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala
+++ b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala
@@ -19,9 +19,15 @@
 
 package org.apache.comet.shims
 
-import org.apache.spark.sql.internal.LegacyBehaviorPolicy
+import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf}
 
 trait ShimSQLConf {
   protected val LEGACY = LegacyBehaviorPolicy.LEGACY
   protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED
+
+  /**
+   * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.5 has the typed accessor;
+   * forward to it.
+   */
+  protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
 }
diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
similarity index 72%
rename from spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
rename to spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index 73a1077de2..c0a31c6e52 100644
--- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -28,11 +28,18 @@ import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 /**
- * Spark 3.5 shim for the PyArrow UDF acceleration support.
+ * Spark 3.x stub for the PyArrow UDF acceleration support.
  *
- * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.5 the matchers
+ * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.4 / 3.5 the matchers
  * return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` /
- * `mapInPandas` unchanged. 3.5 support can be added later if there is user demand.
+ * `mapInPandas` unchanged. The runner factory throws; it is never called because the matchers
+ * always return `None`. 3.x support can be added later if there is user demand.
+ *
+ * Shared across spark-3.4 and spark-3.5 because both are identical: 3.4 lacks the modern
+ * `ArrowPythonRunner` constructor and `arrowUseLargeVarTypes`, and 3.5's `PythonArrowInput`
+ * trait has a different contract (`writeIteratorToArrowStream` one-shot vs 4.x's
+ * `writeNextBatchToArrowStream` batch-at-a-time), so neither version can host the columnar input
+ * implementation without a separate rewrite.
  */
 trait ShimCometMapInBatch {
 
@@ -40,11 +47,11 @@ trait ShimCometMapInBatch {
 
   protected def matchMapInPandas(plan: SparkPlan): Option[MapInBatchInfo] = None
 
-  /** Stub; never constructed on Spark 3.5 because the matchers always return `None`. */
+  /** Stub; never constructed on Spark 3.x because the matchers always return `None`. */
   protected case class RunnerInputs()
 
   protected def runnerInputs(pythonUDF: PythonUDF, conf: SQLConf): RunnerInputs =
-    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5")
+    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.x")
 
   protected def computeArrowPython(
       runnerInputs: RunnerInputs,
@@ -55,5 +62,5 @@ trait ShimCometMapInBatch {
       batchIter: Iterator[Iterator[ColumnarBatch]],
       partitionId: Int,
       context: TaskContext): Iterator[ColumnarBatch] =
-    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.5")
+    throw new UnsupportedOperationException("CometMapInBatchExec is not supported on Spark 3.x")
 }
diff --git a/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala
index bdb2739460..3157889b43 100644
--- a/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala
+++ b/spark/src/main/spark-4.x/org/apache/comet/shims/ShimSQLConf.scala
@@ -19,9 +19,17 @@
 
 package org.apache.comet.shims
 
-import org.apache.spark.sql.internal.LegacyBehaviorPolicy
+import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf}
 
 trait ShimSQLConf {
   protected val LEGACY = LegacyBehaviorPolicy.LEGACY
   protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED
+
+  /**
+   * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 4.x exposes a typed accessor, as
+   * does 3.5 (whose shim forwards to it the same way); only 3.4 lacks it, so the 3.4 shim reads
+   * the conf by raw key. Forward to the accessor here so callers do not depend on which version
+   * they're compiled against.
+   */
+  protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
 }
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
index 5cce2aaf28..ac821901c0 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
@@ -53,10 +53,18 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator
 
   private var currentGroup: Iterator[ColumnarBatch] = _
 
-  // Read the codec name via raw config key so this compiles against Spark 4.0 (which lacks
-  // SQLConf.arrowCompressionCodec) as well as 4.1/4.2. The codec instances are obtained
-  // through CompressionCodec.Factory (arrow-vector) rather than importing the concrete
-  // Lz4CompressionCodec / ZstdCompressionCodec from the separate arrow-compression artifact.
+  // Constructed once per task: `root` (the trait's persistent destination IPC root) and
+  // `cometCodec` are both stable across the partition. `getRecordBatch` reads the current
+  // contents of `root.getFieldVectors` on every call, so re-using the unloader is safe.
+  private lazy val batchUnloader: VectorUnloader =
+    new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true)
+
+  // Read the codec name via raw config key. Spark 4.0.x has no `SQLConf.arrowCompressionCodec`
+  // accessor at all (it was added after the 4.0 line was cut), so a typed `ShimSQLConf`
+  // forwarder would still need a stringly-typed fallback for the 4.0 build. The codec instances
+  // are obtained through `CompressionCodec.Factory` (arrow-vector) rather than importing the
+  // concrete `Lz4CompressionCodec` / `ZstdCompressionCodec` from the separate
+  // arrow-compression artifact, which Comet does not depend on.
   private lazy val cometCodec: CompressionCodec = {
     val factory = CompressionCodec.Factory.INSTANCE
     SQLConf.get.getConfString("spark.sql.execution.arrow.compression.codec", "none") match {
@@ -112,8 +120,6 @@ private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator
     Platform.setMemory(structVec.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes)
     root.setRowCount(numRows)
 
-    val batchUnloader =
-      new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true)
     val recordBatch = batchUnloader.getRecordBatch
     try {
       val writeChannel = new WriteChannel(Channels.newChannel(dataOut))

From b69da2ebe64211df47e24e08bbb7fc19bbea18a7 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 21 May 2026 09:18:24 -0600
Subject: [PATCH 44/54] docs: ground pyarrow UDF allocator framing in actual
 code

The PR description, CometColumnarPythonInput header, and pyarrow-udfs.md all
blamed the per-buffer copy on 'Comet's Parquet readers each constructing
their own RootAllocator'. The repo only has one process-wide RootAllocator
(CometArrowAllocator), and native scan does Parquet reading on the Rust
side: arrow buffers cross the boundary via Arrow C Data Interface, not a
JVM allocator.

The actual blocker on TransferPair is that imported buffers carry a
ReferenceManager whose release routes through FFI, while Spark's
destination IPC root is a child of ArrowUtils.rootAllocator. The two
reference managers cannot share buffers.

Reframe the per-batch work as 'two copies, one structural':
- copy 1 (Comet -> destination IPC root) is droppable, tracked in #4294
- copy 2 (root -> pipe via VectorUnloader / MessageSerializer) is the
  structural floor; Spark's transport to Python is fork + pipe + Arrow
  IPC, so the bytes must reach the pipe at least once

Addresses mbutrovich's items 1 and 3 (framing) on #4234. The PR
description update is a separate step.
---
 docs/source/user-guide/latest/pyarrow-udfs.md   | 16 +++++++++++-----
 .../python/CometColumnarPythonInput.scala       | 17 ++++++++++++-----
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/docs/source/user-guide/latest/pyarrow-udfs.md b/docs/source/user-guide/latest/pyarrow-udfs.md
index 0b2cd9aebb..d0e2346998 100644
--- a/docs/source/user-guide/latest/pyarrow-udfs.md
+++ b/docs/source/user-guide/latest/pyarrow-udfs.md
@@ -196,8 +196,14 @@ on the unoptimized path.
   destination IPC root, while Comet's source vectors always use 4-byte offsets. The buffer-copy
   path cannot bridge that mismatch, so `EliminateRedundantTransitions` skips the rewrite and
   vanilla Spark handles the operation.
-- The current implementation copies Comet's vector buffers into Spark's allocator one
-  buffer at a time. True zero-copy via `TransferPair` is blocked on Comet's Parquet
-  readers allocating from `ArrowUtils.rootAllocator` (rather than each reader
-  constructing its own independent `RootAllocator`). Tracked in
-  [#4294](https://github.com/apache/datafusion-comet/issues/4294).
+- Each batch is copied twice on the JVM side: once from Comet's vectors into Spark's
+  destination IPC root (per-buffer `setBytes`), and a second time inside the IPC writer when
+  `VectorUnloader` / `MessageSerializer.serialize` walks the root and writes bytes to the
+  pipe to the Python worker. The pipe write is structural (Spark's transport to Python is
+  fork + pipe + Arrow IPC, so the buffer bytes must reach the pipe at least once); dropping
+  the first copy by serialising directly from Comet's vectors is tracked in
+  [#4294](https://github.com/apache/datafusion-comet/issues/4294). Even after that,
+  true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s are
+  imported from native via Arrow C Data Interface (their buffers route `release` through FFI),
+  while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two
+  reference managers cannot share buffers via `TransferPair`.
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
index ac821901c0..cf4f324a23 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
@@ -42,11 +42,18 @@ import org.apache.comet.vector.CometDecodedVector
  * worker as Arrow IPC.
  *
  * Per batch: walk the destination struct's children, allocate each child sized to match the
- * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The source (Comet's
- * vectors) and the destination (Spark's persistent IPC root) live in different `RootAllocator`
- * trees, so `TransferPair` / `VectorLoader.load` cannot rebind buffers across the boundary;
- * per-buffer memcpy is the available alternative until the readers share a parent allocator
- * (tracked in #4294).
+ * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The current path
+ * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a
+ * second one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe
+ * write is structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer
+ * bytes must reach the pipe at least once. Dropping the first copy by serialising directly
+ * from Comet's vectors is tracked in #4294; once done, the path is at the single-copy floor.
+ *
+ * The cross-allocator constraint on `TransferPair` is independent of the copy count: even after
+ * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s
+ * are imported from native via Arrow C Data Interface (their buffers route `release` through
+ * FFI), while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two
+ * reference managers cannot share buffers.
  */
 private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] {
   self: BasePythonRunner[Iterator[ColumnarBatch], _] =>

From a520321765a44fd8809c272ae50fc27c13963bce Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 21 May 2026 09:19:45 -0600
Subject: [PATCH 45/54] test: expand pyarrow UDF coverage for vector-copy edge
 cases

Hand-written cases that pin the boundaries mbutrovich called out as gaps:

- decimal precision sweep (1, 9, 17, 18, 19, 28, 38; scale 0/half/max)
  covering the short-decimal (long-backed) and long-decimal (16-byte
  FixedSizeBinary) paths and the 18/19 boundary
- null density sweep (0, 0.01, 0.5, 0.99, 1.0) for validity-buffer memcpy
- multi-batch per partition (batch size 16, 4000 rows in 1 partition) so
  the persistent destination IPC root is exercised across many batches
- wide schema (50 cols, mixed primitives + strings + booleans) for the
  flattened-tree address arithmetic
- mid-stream zero-row batch so setValueCount(0) + validity sizing is
  hit while the iterator continues
- transforming array UDF (reverse each list) to catch symmetric
  encode/decode mistakes that a passthrough would invert

A randomised fuzz harness (analogous to CometCodegenFuzzSuite) is the
right next step for the recursive vector-tree walk; deferred to a
separate follow-on.
---
 .../resources/pyspark/test_pyarrow_udf.py     | 239 ++++++++++++++++++
 1 file changed, 239 insertions(+)

diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 4567bedf6d..24e3b65049 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -335,6 +335,245 @@ def passthrough(iterator):
     assert out == set(rows)
 
 
+@pytest.mark.parametrize(
+    "precision,scale",
+    [
+        (1, 0),
+        (9, 0),
+        (9, 4),
+        (17, 8),
+        (18, 0),
+        (18, 18),
+        (19, 0),
+        (28, 14),
+        (38, 0),
+        (38, 18),
+        (38, 38),
+    ],
+)
+def test_map_in_arrow_decimal_precision_sweep(
+    spark, tmp_path, accelerated, precision, scale
+):
+    """
+    Round-trip DECIMAL(precision, scale) through a passthrough `mapInArrow`: zero, the largest
+    positive and negative values the type can hold, and null. Precisions straddle 18/19: 18 is
+    the last precision whose unscaled value always fits in a signed 64-bit long, which is where
+    buffer-width assumptions in `copyVector` can hide bugs. Scales cover 0, about half, and max.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("amount", T.DecimalType(precision, scale)),
+        ]
+    )
+    integer_digits = precision - scale
+    abs_int = (10**integer_digits - 1) if integer_digits > 0 else 0
+    abs_frac = (10**scale - 1) if scale > 0 else 0
+    largest = Decimal(f"{abs_int}.{abs_frac:0{scale}d}") if scale else Decimal(abs_int)
+    rows = [
+        (1, Decimal(0)),
+        (2, largest),
+        (3, -largest),
+        (4, None),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = {(r["id"], r["amount"]) for r in result_df.collect()}
+    assert out == set(rows)
+
+
+@pytest.mark.parametrize("null_fraction", [0.0, 0.01, 0.5, 0.99, 1.0])
+def test_map_in_arrow_null_density_sweep(
+    spark, tmp_path, accelerated, null_fraction
+):
+    """
+    Validity-buffer memcpy is where Arrow Java vector copies historically break. Sweep null
+    density across the corner cases: all-non-null, sparse-null, half-null, sparse-non-null,
+    all-null. Catches off-by-one in validity packing and edge cases where source/destination
+    null counts diverge.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.LongType()),
+        ]
+    )
+    n = 256
+    # 9973 is coprime with 100, so the residue spreads ids evenly over 0..99. Use round(), not
+    # int(): float products such as 0.29 * 100 land just below the integer and would truncate.
+    null_cutoff = round(null_fraction * 100)
+    rows = [(i, None if (i * 9973) % 100 < null_cutoff else i * 2) for i in range(n)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = sorted((r["id"], r["value"]) for r in result_df.collect())
+    assert out == sorted(rows)
+
+
+def test_map_in_arrow_multi_batch_per_partition(spark, tmp_path, accelerated):
+    """
+    Force many small batches in a single partition so the persistent destination IPC root is
+    reused across batches. Catches buffer-reuse bugs and variable-width data-buffer growth that
+    single-batch tests miss. The session's maxRecordsPerBatch is restored when the test ends.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("s", T.StringType()),
+        ]
+    )
+    n = 4000
+    rows = [(i, f"row_{i}" if i % 7 != 0 else None) for i in range(n)]
+    src = str(tmp_path / "src.parquet")
+    # coalesce(1) writes one file; at the 16-row limit set below, 4000 rows is ~250 batches.
+    spark.createDataFrame(rows, schema_in).coalesce(1).write.parquet(src)
+
+    prev_records = spark.conf.get("spark.sql.execution.arrow.maxRecordsPerBatch")
+    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "16")
+    try:
+
+        def passthrough(iterator):
+            for batch in iterator:
+                yield batch
+
+        result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+        _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+        out = sorted((r["id"], r["s"]) for r in result_df.collect())
+        assert out == sorted(rows)
+    finally:
+        spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", prev_records)
+
+
+def test_map_in_arrow_wide_schema(spark, tmp_path, accelerated):
+    """
+    50-column schema (id + 15 int, 15 double, 15 string, 4 boolean), nulls in every nullable
+    group. Targets the flattened addresses[] walk over the vector tree, where off-by-ones surface.
+    """
+    fields = [T.StructField("id", T.LongType())]
+    for i in range(15):
+        fields.append(T.StructField(f"i{i}", T.IntegerType()))
+    for i in range(15):
+        fields.append(T.StructField(f"d{i}", T.DoubleType()))
+    for i in range(15):
+        fields.append(T.StructField(f"s{i}", T.StringType()))
+    for i in range(4):
+        fields.append(T.StructField(f"b{i}", T.BooleanType()))
+    assert len(fields) == 50
+    schema_in = T.StructType(fields)
+
+    rows = []
+    for i in range(60):
+        row = [i]
+        row += [i + k if k % 3 != 0 else None for k in range(15)]
+        row += [float(i + k) * 0.5 if k % 4 != 0 else None for k in range(15)]
+        row += [f"s{i}_{k}" if k % 5 != 0 else None for k in range(15)]
+        row += [bool((i + k) % 2) for k in range(4)]
+        rows.append(tuple(row))
+
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def passthrough(iterator):
+        for batch in iterator:
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = sorted(tuple(r[name] for name in schema_in.names) for r in result_df.collect())
+    assert out == sorted(rows)
+
+
+def test_map_in_arrow_zero_row_batch_in_stream(spark, tmp_path, accelerated):
+    """
+    The UDF emits a 0-row batch ahead of every real batch, so the output stream opens with an
+    empty batch and still carries data afterwards. The existing empty-input test filters
+    everything out so the operator sees zero batches; here a 0-row batch must be handled and the
+    stream must continue. setValueCount(0) + validity buffer sizing are what can break here.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("value", T.LongType()),
+        ]
+    )
+    rows = [(i, i * 3) for i in range(50)]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).coalesce(1).write.parquet(src)
+
+    def emit_with_empty(iterator):
+        for batch in iterator:
+            # slice(0, 0) is a 0-row batch with the same schema; emit it before the real batch.
+            yield batch.slice(0, 0)
+            yield batch
+
+    result_df = spark.read.parquet(src).mapInArrow(emit_with_empty, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    out = sorted((r["id"], r["value"]) for r in result_df.collect())
+    assert out == sorted(rows)
+
+
+def test_map_in_arrow_transforming_array(spark, tmp_path, accelerated):
+    """
+    Mutating UDF over a complex type: reverse each array. Catches symmetric encode/decode
+    mistakes that a passthrough UDF would invert and hide.
+    """
+    schema_in = T.StructType(
+        [
+            T.StructField("id", T.LongType()),
+            T.StructField("nums", T.ArrayType(T.IntegerType())),
+        ]
+    )
+    rows = [
+        (1, [1, 2, 3, 4]),
+        (2, [None, 5, None]),
+        (3, []),
+        (4, None),
+        (5, [42]),
+    ]
+    src = str(tmp_path / "src.parquet")
+    spark.createDataFrame(rows, schema_in).write.parquet(src)
+
+    def reverse_arrays(iterator):
+        for batch in iterator:
+            # Keep the input Arrow type: a pandas round-trip turns nullable int lists into doubles.
+            nums = batch.column("nums")
+            rev = [None if v is None else v[::-1] for v in nums.to_pylist()]
+            flipped = pa.array(rev, type=nums.type)
+            yield pa.RecordBatch.from_arrays([batch.column("id"), flipped], schema=batch.schema)
+
+    result_df = spark.read.parquet(src).mapInArrow(reverse_arrays, schema_in)
+    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+
+    def _norm(row):
+        nums = row["nums"]
+        return (row["id"], None if nums is None else tuple(nums))
+
+    out = {_norm(r) for r in result_df.collect()}
+    expected = set()
+    for id_, nums in rows:
+        rev = None if nums is None else tuple(reversed(nums))
+        expected.add((id_, rev))
+    assert out == expected
+
+
 def test_map_in_arrow_date_and_timestamp(spark, tmp_path, accelerated):
     schema_in = T.StructType(
         [

From 1c00e1b1db28a9b3e2ba32dcca56673fcebcb10b Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 21 May 2026 09:29:08 -0600
Subject: [PATCH 46/54] test: disable Comet shuffle in pyarrow UDF pytest
 session

CometSparkSessionExtensions.isCometLoaded short-circuits the whole
extension (returning false; no rules registered) when
spark.comet.exec.shuffle.enabled is true but spark.shuffle.manager is
not Comet's manager. The pytest conftest only sets the basic Comet
configs, so this guard fired and CometScanRule never ran. The plan
stayed vanilla Parquet, the rewrite chain never had a Comet columnar
producer to match, and every [accelerated] assertion that checks for
CometMapInBatch failed.

These tests do not exercise shuffle, so disable Comet shuffle in the
session. Comet's scan and exec rules then run normally and the rewrite
fires.

Diagnoses the wholesale PyArrow UDF Spark 4.0 CI failure on #4234.
---
 spark/src/test/resources/pyspark/test_pyarrow_udf.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 24e3b65049..6347411cb7 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -63,6 +63,12 @@ def spark():
         .config("spark.plugins", "org.apache.spark.CometPlugin")
         .config("spark.comet.enabled", "true")
         .config("spark.comet.exec.enabled", "true")
+        # spark.comet.exec.shuffle.enabled defaults to true, and
+        # CometSparkSessionExtensions.isCometLoaded refuses to register Comet's rules
+        # at all when shuffle is on but spark.shuffle.manager is not the Comet manager.
+        # These tests do not need Comet shuffle, so disable it explicitly to keep
+        # Comet's scan and exec rules active without configuring shuffle.
+        .config("spark.comet.exec.shuffle.enabled", "false")
         .config("spark.memory.offHeap.enabled", "true")
         .config("spark.memory.offHeap.size", "2g")
         .getOrCreate()

From c43b203dd20a99c2ed9aed3cb45a658fa07cb9ef Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Mon, 22 Jun 2026 17:06:47 -0600
Subject: [PATCH 47/54] fix: resolve build failures after merging main into
 pyarrow-udf

Drop the removed useDecimal128 argument from the CometVector.getVector
call in CometMapInBatchExec, which no longer compiles after main removed
that parameter. Add braces to the EligibleMapInBatch if/else to satisfy
scalastyle, remove a redundant string interpolator flagged by scalafix,
and apply spotless formatting.
---
 .../rules/EliminateRedundantTransitions.scala  |  9 ++++++---
 .../spark/sql/comet/CometMapInBatchExec.scala  |  1 -
 .../org/apache/comet/shims/ShimSQLConf.scala   |  4 ++--
 .../org/apache/comet/shims/ShimSQLConf.scala   |  4 ++--
 .../sql/comet/shims/ShimCometMapInBatch.scala  |  8 ++++----
 .../python/CometColumnarPythonInput.scala      | 18 +++++++++---------
 .../spark/sql/comet/CometMapInBatchSuite.scala | 10 +++-------
 7 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index ce3b78a9fa..fa42f441e5 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -178,12 +178,15 @@ case class EliminateRedundantTransitions(session: SparkSession)
    */
   private object EligibleMapInBatch {
     def unapply(plan: SparkPlan): Option[(MapInBatchInfo, SparkPlan)] = {
-      if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) None
-      else if (arrowUseLargeVarTypes(plan.conf)) None
-      else
+      if (!CometConf.COMET_PYARROW_UDF_ENABLED.get()) {
+        None
+      } else if (arrowUseLargeVarTypes(plan.conf)) {
+        None
+      } else {
         matchMapInArrow(plan)
           .orElse(matchMapInPandas(plan))
           .flatMap(info => extractColumnarChild(info.child).map(child => (info, child)))
+      }
     }
   }
 
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
index 4c40e68809..c39eb405ad 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
@@ -115,7 +115,6 @@ case class CometMapInBatchExec(
           val childArrow = structVector.getChild(i)
           CometVector.getVector(
             childArrow.getValueVector,
-            /* useDecimal128 */ true,
             /* dictionaryProvider */ null)
         }.toArray
         val flattenedBatch = new ColumnarBatch(outputVectors)
diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala
index e809e33904..3ab5e5cbac 100644
--- a/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala
+++ b/spark/src/main/spark-3.4/org/apache/comet/shims/ShimSQLConf.scala
@@ -28,8 +28,8 @@ trait ShimSQLConf {
 
   /**
    * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.4 has no typed accessor for this
-   * conf, so read by raw key. The conf only governs the destination Arrow IPC root width on
-   * Spark 4.x, so the value returned here matters only to callers that look it up explicitly.
+   * conf, so read by raw key. The conf only governs the destination Arrow IPC root width on Spark
+   * 4.x, so the value returned here matters only to callers that look it up explicitly.
    */
   protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean =
     conf.getConfString("spark.sql.execution.arrow.useLargeVarTypes", "false").toBoolean
diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala
index 219e0f2a2e..c87e8358f3 100644
--- a/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala
+++ b/spark/src/main/spark-3.5/org/apache/comet/shims/ShimSQLConf.scala
@@ -26,8 +26,8 @@ trait ShimSQLConf {
   protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED
 
   /**
-   * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.5 has the typed accessor;
-   * forward to it.
+   * Reads `spark.sql.execution.arrow.useLargeVarTypes`. Spark 3.5 has the typed accessor; forward
+   * to it.
    */
   protected def arrowUseLargeVarTypes(conf: SQLConf): Boolean = conf.arrowUseLargeVarTypes
 }
diff --git a/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
index c0a31c6e52..59d8c7f251 100644
--- a/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
+++ b/spark/src/main/spark-3.x/org/apache/spark/sql/comet/shims/ShimCometMapInBatch.scala
@@ -30,14 +30,14 @@ import org.apache.spark.sql.vectorized.ColumnarBatch
 /**
  * Spark 3.x stub for the PyArrow UDF acceleration support.
  *
- * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.4 / 3.5 the matchers
- * return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` /
+ * The columnar runner introduced in #4234 only targets Spark 4.0+. On Spark 3.4 / 3.5 the
+ * matchers return `None`, the rewrite does not fire, and vanilla Spark handles `mapInArrow` /
  * `mapInPandas` unchanged. The runner factory throws; it is never called because the matchers
  * always return `None`. 3.x support can be added later if there is user demand.
  *
  * Shared across spark-3.4 and spark-3.5 because both are identical: 3.4 lacks the modern
- * `ArrowPythonRunner` constructor and `arrowUseLargeVarTypes`, and 3.5's `PythonArrowInput`
- * trait has a different contract (`writeIteratorToArrowStream` one-shot vs 4.x's
+ * `ArrowPythonRunner` constructor and `arrowUseLargeVarTypes`, and 3.5's `PythonArrowInput` trait
+ * has a different contract (`writeIteratorToArrowStream` one-shot vs 4.x's
  * `writeNextBatchToArrowStream` batch-at-a-time), so neither version can host the columnar input
  * implementation without a separate rewrite.
  */
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
index cf4f324a23..dacf1d1638 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
@@ -43,17 +43,17 @@ import org.apache.comet.vector.CometDecodedVector
  *
  * Per batch: walk the destination struct's children, allocate each child sized to match the
  * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The current path
- * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a
- * second one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe
- * write is structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer
- * bytes must reach the pipe at least once. Dropping the first copy by serialising directly
- * from Comet's vectors is tracked in #4294; once done, the path is at the single-copy floor.
+ * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a second
+ * one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe write is
+ * structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer bytes must
+ * reach the pipe at least once. Dropping the first copy by serialising directly from Comet's
+ * vectors is tracked in #4294; once done, the path is at the single-copy floor.
  *
  * The cross-allocator constraint on `TransferPair` is independent of the copy count: even after
- * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s
- * are imported from native via Arrow C Data Interface (their buffers route `release` through
- * FFI), while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two
- * reference managers cannot share buffers.
+ * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s are
+ * imported from native via Arrow C Data Interface (their buffers route `release` through FFI),
+ * while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two reference
+ * managers cannot share buffers.
  */
 private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] {
   self: BasePythonRunner[Iterator[ColumnarBatch], _] =>
diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
index b4f3d64d76..e18e838b29 100644
--- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
+++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
@@ -122,18 +122,14 @@ class CometMapInBatchSuite extends CometTestBase {
         ColumnarToRowExec(cometLeaf),
         isBarrier = false,
         profile = None)
-      val outer = MapInArrowExec(
-        stubPythonUDF,
-        cometLeaf.output,
-        inner,
-        isBarrier = false,
-        profile = None)
+      val outer =
+        MapInArrowExec(stubPythonUDF, cometLeaf.output, inner, isBarrier = false, profile = None)
 
       val rewritten = EliminateRedundantTransitions(spark).apply(outer)
       val cometOps = rewritten.collect { case op: CometMapInBatchExec => op }
       assert(
         cometOps.size == 1,
-        s"expected the inner MapInArrowExec to be rewritten, but the chain produced " +
+        "expected the inner MapInArrowExec to be rewritten, but the chain produced " +
           s"${cometOps.size} CometMapInBatchExec(s):\n$rewritten")
     }
   }

From 2fcd89fac7b0131ea97eb768a707dd7f07891852 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 23 Jun 2026 08:50:03 -0600
Subject: [PATCH 48/54] fix: align PyArrow UDF workflow PySpark with Spark
 4.0.2 build

The workflow compiles Comet against the spark-4.0 profile (Spark 4.0.2)
but ran the pytest against pyspark==4.0.1. The PythonArrowInput trait's
private-field mixin is not binary-compatible across that gap, so
constructing CometArrowPythonRunner failed with AbstractMethodError on
the synthesized arrowSchema setter. Pin pyspark to 4.0.2 to match.
---
 .github/workflows/pyarrow_udf_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pyarrow_udf_test.yml b/.github/workflows/pyarrow_udf_test.yml
index e325ab8b6d..f8ab15437e 100644
--- a/.github/workflows/pyarrow_udf_test.yml
+++ b/.github/workflows/pyarrow_udf_test.yml
@@ -97,7 +97,7 @@ jobs:
           apt-get install -y --no-install-recommends python3 python3-venv python3-pip
           python3 -m venv /tmp/venv
           /tmp/venv/bin/pip install --upgrade pip
-          /tmp/venv/bin/pip install "pyspark==4.0.1" "pyarrow>=14" pandas pytest
+          /tmp/venv/bin/pip install "pyspark==4.0.2" "pyarrow>=14" pandas pytest
 
       - name: Run PyArrow UDF pytest
         env:

From de6393748dacb9485c362479f8e03e451395fd1b Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 23 Jun 2026 11:31:03 -0600
Subject: [PATCH 49/54] fix: run PyArrow UDFs through a self-contained
 shaded-Arrow runner

The runner extended Spark's PythonArrowInput / BasicPythonArrowOutput
traits, whose members expose Spark's (unshaded) Arrow types. The
packaged comet-spark jar relocates org.apache.arrow to
org.apache.comet.shaded.arrow, so the synthetic Arrow members on the
generated runner no longer matched Spark's unshaded trait contract,
raising AbstractMethodError at runtime (the output path had the same
latent break, wrapping Spark's unshaded ArrowColumnVector into a shaded
CometVector). It only surfaced in the packaged jar, not in tests run
from classes, which is why CI failed while local runs passed.

Shading must stay (Comet bundles a different Arrow version than Spark),
so instead extend only the Arrow-agnostic BasePythonRunner and perform
the Arrow IPC exchange directly with Comet's shaded Arrow. The Python
worker only ever sees a standard Arrow IPC byte stream, so nothing
crosses the shaded/unshaded boundary: input copies each Comet batch into
a shaded struct root written with a shaded ArrowStreamWriter; output
reads the worker's IPC with a shaded ArrowStreamReader straight into
CometVectors, which is what CometMapInBatchExec and downstream native
operators already consume.

BasePythonRunner has the same shape across Spark 4.0/4.1/4.2, so the IPC
logic lives in one shared CometArrowPythonRunnerBase and the per-version
runners are thin subclasses. Removes the now-unused
CometColumnarPythonInput.
---
 .../rules/EliminateRedundantTransitions.scala |   2 +-
 .../spark/sql/comet/CometMapInBatchExec.scala |  24 +-
 .../python/CometArrowPythonRunner.scala       |  32 +-
 .../python/CometArrowPythonRunner.scala       |  30 +-
 .../python/CometArrowPythonRunner.scala       |  26 +-
 .../python/CometArrowPythonRunnerBase.scala   | 303 ++++++++++++++++++
 .../python/CometColumnarPythonInput.scala     | 196 -----------
 7 files changed, 341 insertions(+), 272 deletions(-)
 create mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
 delete mode 100644 spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala

diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index fa42f441e5..95b05dc1a2 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -110,7 +110,7 @@ case class EliminateRedundantTransitions(session: SparkSession)
       // 4.1+ matches the renamed `MapInArrowExec`.
       //
       // Falls back to vanilla Spark when `spark.sql.execution.arrow.useLargeVarTypes` is enabled:
-      // CometColumnarPythonInput.copyVector does raw `setBytes` on each Arrow buffer, but Comet's
+      // CometArrowPythonRunnerBase.copyVector does raw `setBytes` on each Arrow buffer, but Comet's
       // source string/binary vectors always use 4-byte offsets while the destination root is
       // allocated with 8-byte offsets when this conf is on. The buffer counts match but the
       // offset width does not, so a direct memcpy would corrupt the offsets.
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
index c39eb405ad..8ac9a70de3 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMapInBatchExec.scala
@@ -30,9 +30,9 @@ import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, UnaryExecNo
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.execution.python.PythonSQLMetrics
 import org.apache.spark.sql.types.{StructField, StructType}
-import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector}
+import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
 
-import org.apache.comet.vector.CometVector
+import org.apache.comet.vector.CometStructVector
 
 /**
  * Comet replacement for Spark's `MapInBatchExec` family (`PythonMapInArrowExec` /
@@ -104,19 +104,13 @@ case class CometMapInBatchExec(
         context)
 
       columnarBatchIter.map { batch =>
-        // Python returns a single struct column; flatten to the user's output columns and
-        // re-wrap each child as CometVector so consumers that expect Comet's vector hierarchy
-        // (e.g. another CometMapInBatchExec stacked on top, or NativeUtil.exportBatch for a
-        // downstream native Comet operator) see the right type. Sharing the underlying Arrow
-        // ValueVector with the original ArrowColumnVector is safe: close() on either ends up
-        // releasing the same buffers, and arrow-vector's release path is idempotent.
-        val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
-        val outputVectors: Array[ColumnVector] = outputAttrs.indices.map { i =>
-          val childArrow = structVector.getChild(i)
-          CometVector.getVector(
-            childArrow.getValueVector,
-            /* dictionaryProvider */ null)
-        }.toArray
+        // Python returns a single struct column; flatten to the user's output columns. The runner
+        // produces Comet vectors, so the struct's children are already CometVectors that downstream
+        // consumers (a stacked CometMapInBatchExec, or NativeUtil.exportBatch for a native Comet
+        // operator) can use directly.
+        val structVector = batch.column(0).asInstanceOf[CometStructVector]
+        val outputVectors: Array[ColumnVector] =
+          outputAttrs.indices.map(i => structVector.getChild(i)).toArray
         val flattenedBatch = new ColumnarBatch(outputVectors)
         flattenedBatch.setNumRows(batch.numRows())
         numOutputRows += flattenedBatch.numRows()
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
index 63d282e8b9..051fe14638 100644
--- a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -23,23 +23,21 @@ import java.io.DataOutputStream
 
 import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions}
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 /**
- * Comet's Arrow Python runner for Spark 4.0. Extends `BasePythonRunner` directly because Spark
- * 4.0's `BaseArrowPythonRunner` is bound to `Iterator[InternalRow]` and mixes in
- * `BasicPythonArrowInput`, so we cannot inherit from it. Wires the SQLConf-driven fields that
- * `BaseArrowPythonRunner` provides.
+ * Comet's Arrow Python runner for Spark 4.0. The Arrow IPC exchange lives in
+ * [[CometArrowPythonRunnerBase]]; this subclass only supplies the Spark 4.0 constructor shape and
+ * UDF command serialization.
  */
 class CometArrowPythonRunner(
     funcs: Seq[(ChainedPythonFunctions, Long)],
     evalType: Int,
     argOffsets: Array[Array[Int]],
-    protected override val schema: StructType,
-    protected override val timeZoneId: String,
-    protected override val largeVarTypes: Boolean,
+    schema: StructType,
+    timeZoneId: String,
+    largeVarTypes: Boolean,
     override val workerConf: Map[String, String],
     override val pythonMetrics: Map[String, SQLMetric],
     jobArtifactUUID: Option[String])
@@ -49,23 +47,7 @@ class CometArrowPythonRunner(
       argOffsets,
       jobArtifactUUID,
       pythonMetrics)
-    with CometColumnarPythonInput
-    with BasicPythonArrowOutput {
-
-  override val pythonExec: String =
-    SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head._1.funcs.head.pythonExec)
-
-  override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled
-  override val idleTimeoutSeconds: Long = SQLConf.get.pythonUDFWorkerIdleTimeoutSeconds
-  override val errorOnDuplicatedFieldNames: Boolean = true
-  override val hideTraceback: Boolean = SQLConf.get.pysparkHideTraceback
-  override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback
-
-  override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize
-  require(
-    bufferSize >= 4,
-    "Pandas execution requires more than 4 bytes. Please set higher buffer. " +
-      s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.")
+    with CometArrowPythonRunnerBase {
 
   override protected def writeUDF(dataOut: DataOutputStream): Unit =
     PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, jobArtifactUUID)
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
index 7b82b0aed8..8700e282ac 100644
--- a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -21,18 +21,16 @@ package org.apache.spark.sql.execution.python
 
 import java.io.DataOutputStream
 
-import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions}
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 /**
- * Comet's Arrow Python runner for Spark 4.1. Extends `BaseArrowPythonRunner` parameterized over
- * `Iterator[ColumnarBatch]` input, and supplies the columnar input via `CometColumnarPythonInput`
- * instead of `BasicPythonArrowInput`.
- *
- * Spark 4.1's `PythonUDFRunner.writeUDFs` takes a `profiler: Option[String]` fourth argument; we
- * pass `None` since Comet does not support Python profiling.
+ * Comet's Arrow Python runner for Spark 4.1. The Arrow IPC exchange lives in
+ * [[CometArrowPythonRunnerBase]]; this subclass only supplies the Spark 4.1 constructor shape and
+ * UDF command serialization (`PythonUDFRunner.writeUDFs` takes a `profiler: Option[String]`
+ * fourth argument, which Comet does not use).
  */
 class CometArrowPythonRunner(
     funcs: Seq[(ChainedPythonFunctions, Long)],
@@ -41,23 +39,17 @@ class CometArrowPythonRunner(
     schema: StructType,
     timeZoneId: String,
     largeVarTypes: Boolean,
-    workerConf: Map[String, String],
-    pythonMetrics: Map[String, SQLMetric],
+    override val workerConf: Map[String, String],
+    override val pythonMetrics: Map[String, SQLMetric],
     jobArtifactUUID: Option[String],
     sessionUUID: Option[String])
-    extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
-      funcs,
+    extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
+      funcs.map(_._1),
       evalType,
       argOffsets,
-      schema,
-      timeZoneId,
-      largeVarTypes,
-      workerConf,
-      pythonMetrics,
       jobArtifactUUID,
-      sessionUUID)
-    with CometColumnarPythonInput
-    with BasicPythonArrowOutput {
+      pythonMetrics)
+    with CometArrowPythonRunnerBase {
 
   override protected def writeUDF(dataOut: DataOutputStream): Unit =
     PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, None)
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
index c9714ce068..09848d602e 100644
--- a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -21,15 +21,15 @@ package org.apache.spark.sql.execution.python
 
 import java.io.DataOutputStream
 
-import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions}
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 /**
- * Comet's Arrow Python runner for Spark 4.2. Spark 4.2's `BaseArrowPythonRunner` no longer
- * accepts `workerConf` in its constructor; the subclass overrides `runnerConf` instead.
- * `PythonUDFRunner.writeUDFs` drops the `profiler` argument compared to 4.1.
+ * Comet's Arrow Python runner for Spark 4.2. The Arrow IPC exchange lives in
+ * [[CometArrowPythonRunnerBase]]; this subclass only supplies the Spark 4.2 constructor shape and
+ * UDF command serialization (`PythonUDFRunner.writeUDFs` drops the `profiler` argument).
  */
 class CometArrowPythonRunner(
     funcs: Seq[(ChainedPythonFunctions, Long)],
@@ -39,24 +39,18 @@ class CometArrowPythonRunner(
     timeZoneId: String,
     largeVarTypes: Boolean,
     pythonRunnerConf: Map[String, String],
-    pythonMetrics: Map[String, SQLMetric],
+    override val pythonMetrics: Map[String, SQLMetric],
     jobArtifactUUID: Option[String],
     sessionUUID: Option[String])
-    extends BaseArrowPythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
-      funcs,
+    extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch](
+      funcs.map(_._1),
       evalType,
       argOffsets,
-      schema,
-      timeZoneId,
-      largeVarTypes,
-      pythonMetrics,
       jobArtifactUUID,
-      sessionUUID)
-    with CometColumnarPythonInput
-    with BasicPythonArrowOutput {
+      pythonMetrics)
+    with CometArrowPythonRunnerBase {
 
-  override protected def runnerConf: Map[String, String] =
-    super.runnerConf ++ pythonRunnerConf
+  override protected def workerConf: Map[String, String] = pythonRunnerConf
 
   override protected def writeUDF(dataOut: DataOutputStream): Unit =
     PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets)
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
new file mode 100644
index 0000000000..c05b1aafd3
--- /dev/null
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.python
+
+import java.io.{DataInputStream, DataOutputStream}
+import java.nio.channels.Channels
+import java.util.concurrent.atomic.AtomicBoolean
+
+import scala.jdk.CollectionConverters._
+
+import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot}
+import org.apache.arrow.vector.complex.{LargeListVector, ListVector, StructVector}
+import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
+import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType}
+import org.apache.spark.{SparkEnv, TaskContext}
+import org.apache.spark.api.python.{BasePythonRunner, PythonRDD, PythonWorker, SpecialLengths}
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
+import org.apache.spark.unsafe.Platform
+
+import org.apache.comet.CometArrowAllocator
+import org.apache.comet.vector.{CometDecodedVector, CometVector}
+
+/**
+ * Shared base for Comet's Arrow Python runners (Spark 4.0 / 4.1 / 4.2).
+ *
+ * Unlike a stock `ArrowPythonRunner`, this does not extend Spark's `PythonArrowInput` /
+ * `BasicPythonArrowOutput` traits. Those traits expose Spark's Arrow types (`VectorSchemaRoot`,
+ * `Schema`) in their members, and the packaged `comet-spark` jar relocates `org.apache.arrow` to
+ * `org.apache.comet.shaded.arrow`, so mixing them in produces a class whose synthetic Arrow
+ * members no longer match Spark's unshaded trait contract (an `AbstractMethodError` at runtime).
+ *
+ * Instead it extends only the Arrow-agnostic `BasePythonRunner` and performs the Arrow IPC
+ * exchange itself using Comet's (shaded) Arrow. The Python worker only ever sees a standard Arrow
+ * IPC byte stream, which is version-neutral, so nothing crosses the shaded/unshaded boundary:
+ *   - Input: each Comet `ColumnarBatch` is copied into a shaded struct root and written to the
+ *     worker with a shaded `ArrowStreamWriter`.
+ *   - Output: the worker's Arrow IPC is read with a shaded `ArrowStreamReader` straight into
+ *     `CometVector`s, which is exactly what `CometMapInBatchExec` and downstream native operators
+ *     consume.
+ *
+ * `BasePythonRunner` has the same shape across Spark 4.0/4.1/4.2; only the subclass constructor
+ * arguments and `writeUDF` differ, so those stay in the per-version subclasses.
+ */
+private[python] trait CometArrowPythonRunnerBase
+    extends BasePythonRunner[Iterator[ColumnarBatch], ColumnarBatch] {
+
+  /** Worker configuration written to the Python worker before execution. */
+  protected def workerConf: Map[String, String]
+
+  /** Comet's Python SQL metrics (data sent/received, rows). */
+  protected def pythonMetrics: Map[String, SQLMetric]
+
+  /** Version-specific UDF command serialization. */
+  protected def writeUDF(dataOut: DataOutputStream): Unit
+
+  override val pythonExec: String =
+    SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head.funcs.head.pythonExec)
+
+  override val faultHandlerEnabled: Boolean = SQLConf.get.pythonUDFWorkerFaulthandlerEnabled
+  override val idleTimeoutSeconds: Long = SQLConf.get.pythonUDFWorkerIdleTimeoutSeconds
+  override val hideTraceback: Boolean = SQLConf.get.pysparkHideTraceback
+  override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback
+
+  override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize
+  require(
+    bufferSize >= 4,
+    "Pandas execution requires more than 4 bytes. Please set higher buffer. " +
+      s"Please change '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'.")
+
+  override protected def newWriter(
+      env: SparkEnv,
+      worker: PythonWorker,
+      inputIterator: Iterator[Iterator[ColumnarBatch]],
+      partitionIndex: Int,
+      context: TaskContext): Writer = {
+    new Writer(env, worker, inputIterator, partitionIndex, context) {
+
+      private val allocator =
+        CometArrowAllocator.newChildAllocator(s"stdout writer for $pythonExec", 0, Long.MaxValue)
+      private var currentGroup: Iterator[ColumnarBatch] = _
+      private var arrowWriter: ArrowStreamWriter = _
+      private var writeRoot: VectorSchemaRoot = _
+      private var structVec: StructVector = _
+
+      context.addTaskCompletionListener[Unit] { _ =>
+        if (writeRoot != null) {
+          writeRoot.close()
+        }
+        allocator.close()
+      }
+
+      protected override def writeCommand(dataOut: DataOutputStream): Unit = {
+        // handleMetadataBeforeExec: write the worker config as key/value string pairs.
+        dataOut.writeInt(workerConf.size)
+        for ((k, v) <- workerConf) {
+          PythonRDD.writeUTF(k, dataOut)
+          PythonRDD.writeUTF(v, dataOut)
+        }
+        writeUDF(dataOut)
+      }
+
+      override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = {
+        while (currentGroup == null || !currentGroup.hasNext) {
+          if (!inputIterator.hasNext) {
+            if (arrowWriter != null) {
+              arrowWriter.end()
+            }
+            return false
+          }
+          currentGroup = inputIterator.next()
+        }
+
+        val cometBatch = currentGroup.next()
+        val startData = dataOut.size()
+
+        if (arrowWriter == null) {
+          // Build the destination struct root once, sized to the first batch's child fields.
+          // mapInArrow/mapInPandas exchange the columns under a single non-nullable struct.
+          val childFields = (0 until cometBatch.numCols()).map { i =>
+            cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField
+          }
+          val structField =
+            new Field(
+              "struct",
+              new FieldType(false, ArrowType.Struct.INSTANCE, null),
+              childFields.asJava)
+          structVec = structField.createVector(allocator).asInstanceOf[StructVector]
+          writeRoot = new VectorSchemaRoot(Seq[FieldVector](structVec).asJava)
+          arrowWriter = new ArrowStreamWriter(writeRoot, null, Channels.newChannel(dataOut))
+          arrowWriter.start()
+        }
+
+        var i = 0
+        while (i < cometBatch.numCols()) {
+          val src = cometBatch
+            .column(i)
+            .asInstanceOf[CometDecodedVector]
+            .getValueVector
+            .asInstanceOf[FieldVector]
+          val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
+          copyVector(src, dst)
+          i += 1
+        }
+        val numRows = cometBatch.numRows()
+        structVec.setValueCount(numRows)
+        // Mark every row of the struct non-null (all-1 validity). The validity buffer is freshly
+        // allocated and zero-initialised, so without this Python would see an all-null struct.
+        val validityBytes = (numRows + 7) / 8
+        Platform.setMemory(
+          structVec.getValidityBuffer.memoryAddress(),
+          0xff.toByte,
+          validityBytes)
+        writeRoot.setRowCount(numRows)
+        arrowWriter.writeBatch()
+
+        pythonMetrics("pythonDataSent") += dataOut.size() - startData
+        true
+      }
+    }
+  }
+
+  override protected def newReaderIterator(
+      stream: DataInputStream,
+      writer: Writer,
+      startTime: Long,
+      env: SparkEnv,
+      worker: PythonWorker,
+      pid: Option[Int],
+      releasedOrClosed: AtomicBoolean,
+      context: TaskContext): Iterator[ColumnarBatch] = {
+    new ReaderIterator(stream, writer, startTime, env, worker, pid, releasedOrClosed, context) {
+
+      private val allocator =
+        CometArrowAllocator.newChildAllocator(s"stdin reader for $pythonExec", 0, Long.MaxValue)
+      private var reader: ArrowStreamReader = _
+      private var root: VectorSchemaRoot = _
+      private var batchLoaded = true
+
+      context.addTaskCompletionListener[Unit] { _ =>
+        if (reader != null) {
+          reader.close(false)
+        }
+        allocator.close()
+      }
+
+      protected override def read(): ColumnarBatch = {
+        if (writer.exception.isDefined) {
+          throw writer.exception.get
+        }
+        try {
+          if (reader != null && batchLoaded) {
+            batchLoaded = reader.loadNextBatch()
+            if (batchLoaded) {
+              // Re-wrap the (reloaded) field vectors fresh each batch, mirroring Comet's
+              // StreamReader, so each ColumnarBatch reflects the current buffers.
+              val vectors: Array[ColumnVector] = root.getFieldVectors.asScala.map { vector =>
+                CometVector.getVector(vector, null).asInstanceOf[ColumnVector]
+              }.toArray
+              val batch = new ColumnarBatch(vectors)
+              batch.setNumRows(root.getRowCount)
+              pythonMetrics("pythonNumRowsReceived") += root.getRowCount
+              batch
+            } else {
+              reader.close(false)
+              allocator.close()
+              read()
+            }
+          } else {
+            stream.readInt() match {
+              case SpecialLengths.START_ARROW_STREAM =>
+                reader = new ArrowStreamReader(stream, allocator)
+                root = reader.getVectorSchemaRoot()
+                read()
+              case SpecialLengths.TIMING_DATA =>
+                handleTimingData()
+                read()
+              case SpecialLengths.PYTHON_EXCEPTION_THROWN =>
+                throw handlePythonException()
+              case SpecialLengths.END_OF_DATA_SECTION =>
+                handleEndOfDataSection()
+                null
+            }
+          }
+        } catch handleException
+      }
+    }
+  }
+
+  /**
+   * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes
+   * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then
+   * sets value counts bottom-up so `setValueCount` does not rewrite the offset bytes we just
+   * copied. Both source and destination are Comet's (shaded) Arrow vectors, so no shaded /
+   * unshaded type crosses.
+   */
+  private def copyVector(src: FieldVector, dst: FieldVector): Unit = {
+    val valueCount = src.getValueCount
+
+    dst match {
+      case bfwv: BaseFixedWidthVector =>
+        bfwv.allocateNew(valueCount)
+      case bvwv: BaseVariableWidthVector =>
+        bvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount)
+      case blvwv: BaseLargeVariableWidthVector =>
+        blvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount)
+      case _ =>
+        dst.setInitialCapacity(valueCount)
+        dst.allocateNew()
+    }
+
+    val srcBufs = src.getFieldBuffers
+    val dstBufs = dst.getFieldBuffers
+    require(
+      srcBufs.size == dstBufs.size,
+      s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}")
+    var b = 0
+    while (b < srcBufs.size) {
+      val s = srcBufs.get(b)
+      dstBufs.get(b).setBytes(0, s, 0, s.readableBytes)
+      b += 1
+    }
+
+    val srcChildren = src.getChildrenFromFields
+    val dstChildren = dst.getChildrenFromFields
+    require(
+      srcChildren.size == dstChildren.size,
+      s"child count mismatch for ${dst.getField}: src=${srcChildren.size}, dst=${dstChildren.size}")
+    srcChildren.asScala.zip(dstChildren.asScala).foreach { case (sc, dc) =>
+      copyVector(sc.asInstanceOf[FieldVector], dc.asInstanceOf[FieldVector])
+    }
+
+    // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list
+    // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied offset
+    // bytes are preserved.
+    dst match {
+      case v: BaseVariableWidthVector => v.setLastSet(valueCount - 1)
+      case v: BaseLargeVariableWidthVector => v.setLastSet(valueCount - 1)
+      case v: ListVector => v.setLastSet(valueCount - 1)
+      case v: LargeListVector => v.setLastSet(valueCount - 1)
+      case _ =>
+    }
+    dst.setValueCount(valueCount)
+  }
+}
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
deleted file mode 100644
index dacf1d1638..0000000000
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometColumnarPythonInput.scala
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.python
-
-import java.io.DataOutputStream
-import java.nio.channels.Channels
-
-import scala.jdk.CollectionConverters._
-
-import org.apache.arrow.vector.{BaseFixedWidthVector, BaseLargeVariableWidthVector, BaseVariableWidthVector, FieldVector, VectorSchemaRoot, VectorUnloader}
-import org.apache.arrow.vector.complex.{LargeListVector, ListVector, StructVector}
-import org.apache.arrow.vector.compression.{CompressionCodec, CompressionUtil, NoCompressionCodec}
-import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel}
-import org.apache.arrow.vector.ipc.message.MessageSerializer
-import org.apache.spark.SparkException
-import org.apache.spark.api.python.BasePythonRunner
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.vectorized.ColumnarBatch
-import org.apache.spark.unsafe.Platform
-
-import org.apache.comet.vector.CometDecodedVector
-
-/**
- * `PythonArrowInput` implementation that streams Comet `ColumnarBatch` values to the Python
- * worker as Arrow IPC.
- *
- * Per batch: walk the destination struct's children, allocate each child sized to match the
- * corresponding Comet column, and copy each buffer with `ArrowBuf.setBytes`. The current path
- * does two copies per batch: this one (Comet vector buffers → destination IPC root), and a second
- * one inside `VectorUnloader` / `MessageSerializer.serialize` (root → pipe). The pipe write is
- * structural — Spark's transport to Python is fork + pipe + Arrow IPC, so the buffer bytes must
- * reach the pipe at least once. Dropping the first copy by serialising directly from Comet's
- * vectors is tracked in #4294; once done, the path is at the single-copy floor.
- *
- * The cross-allocator constraint on `TransferPair` is independent of the copy count: even after
- * #4294, true zero-copy at the JVM boundary is blocked because Comet's source `FieldVector`s are
- * imported from native via Arrow C Data Interface (their buffers route `release` through FFI),
- * while Spark's destination IPC root is a child of `ArrowUtils.rootAllocator`. The two reference
- * managers cannot share buffers.
- */
-private[python] trait CometColumnarPythonInput extends PythonArrowInput[Iterator[ColumnarBatch]] {
-  self: BasePythonRunner[Iterator[ColumnarBatch], _] =>
-
-  private var currentGroup: Iterator[ColumnarBatch] = _
-
-  // Constructed once per task: `root` (the trait's persistent destination IPC root) and
-  // `cometCodec` are both stable across the partition. `getRecordBatch` reads the current
-  // contents of `root.getFieldVectors` on every call, so re-using the unloader is safe.
-  private lazy val batchUnloader: VectorUnloader =
-    new VectorUnloader(root, /* includeNullCount */ true, cometCodec, /* alignBuffers */ true)
-
-  // Read the codec name via raw config key. Spark 4.0.x has no `SQLConf.arrowCompressionCodec`
-  // accessor at all (it was added after the 4.0 line was cut), so a typed `ShimSQLConf`
-  // forwarder would still need a stringly-typed fallback for the 4.0 build. The codec instances
-  // are obtained through `CompressionCodec.Factory` (arrow-vector) rather than importing the
-  // concrete `Lz4CompressionCodec` / `ZstdCompressionCodec` from the separate
-  // arrow-compression artifact, which Comet does not depend on.
-  private lazy val cometCodec: CompressionCodec = {
-    val factory = CompressionCodec.Factory.INSTANCE
-    SQLConf.get.getConfString("spark.sql.execution.arrow.compression.codec", "none") match {
-      case "none" => NoCompressionCodec.INSTANCE
-      case "lz4" =>
-        factory.createCodec(CompressionUtil.CodecType.LZ4_FRAME)
-      case "zstd" =>
-        val level =
-          SQLConf.get.getConfString("spark.sql.execution.arrow.compression.zstd.level", "3").toInt
-        factory.createCodec(CompressionUtil.CodecType.ZSTD, level)
-      case other =>
-        throw SparkException.internalError(
-          s"Unsupported Arrow compression codec: $other. Supported values: none, lz4, zstd")
-    }
-  }
-
-  override protected def writeNextBatchToArrowStream(
-      root: VectorSchemaRoot,
-      writer: ArrowStreamWriter,
-      dataOut: DataOutputStream,
-      inputIterator: Iterator[Iterator[ColumnarBatch]]): Boolean = {
-
-    while (currentGroup == null || !currentGroup.hasNext) {
-      if (!inputIterator.hasNext) {
-        super[PythonArrowInput].close()
-        return false
-      }
-      currentGroup = inputIterator.next()
-    }
-
-    val cometBatch = currentGroup.next()
-    val startData = dataOut.size()
-    val structVec = root.getVector(0).asInstanceOf[StructVector]
-
-    var i = 0
-    while (i < cometBatch.numCols()) {
-      val src =
-        cometBatch
-          .column(i)
-          .asInstanceOf[CometDecodedVector]
-          .getValueVector
-          .asInstanceOf[FieldVector]
-      val dst = structVec.getChildByOrdinal(i).asInstanceOf[FieldVector]
-      copyVector(src, dst)
-      i += 1
-    }
-    val numRows = cometBatch.numRows()
-    structVec.setValueCount(numRows)
-    // Mark every row in the struct as non-null (all-1 validity bits). The struct validity
-    // buffer is freshly allocated (or cleared) and zero-initialised, so without this step
-    // Python would see an all-null struct column and return null for every output row.
-    val validityBytes = (numRows + 7) / 8
-    Platform.setMemory(structVec.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes)
-    root.setRowCount(numRows)
-
-    val recordBatch = batchUnloader.getRecordBatch
-    try {
-      val writeChannel = new WriteChannel(Channels.newChannel(dataOut))
-      MessageSerializer.serialize(writeChannel, recordBatch)
-    } finally {
-      recordBatch.close()
-    }
-
-    pythonMetrics("pythonDataSent") += dataOut.size() - startData
-    true
-  }
-
-  /**
-   * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes
-   * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then
-   * sets value counts bottom-up so `setValueCount` does not rewrite the offset bytes we just
-   * copied.
-   */
-  private def copyVector(src: FieldVector, dst: FieldVector): Unit = {
-    val valueCount = src.getValueCount
-
-    dst match {
-      case bfwv: BaseFixedWidthVector =>
-        bfwv.allocateNew(valueCount)
-      case bvwv: BaseVariableWidthVector =>
-        bvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount)
-      case blvwv: BaseLargeVariableWidthVector =>
-        blvwv.allocateNew(src.getDataBuffer.readableBytes, valueCount)
-      case _ =>
-        dst.setInitialCapacity(valueCount)
-        dst.allocateNew()
-    }
-
-    val srcBufs = src.getFieldBuffers
-    val dstBufs = dst.getFieldBuffers
-    require(
-      srcBufs.size == dstBufs.size,
-      s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}")
-    var b = 0
-    while (b < srcBufs.size) {
-      val s = srcBufs.get(b)
-      dstBufs.get(b).setBytes(0, s, 0, s.readableBytes)
-      b += 1
-    }
-
-    val srcChildren = src.getChildrenFromFields
-    val dstChildren = dst.getChildrenFromFields
-    require(
-      srcChildren.size == dstChildren.size,
-      s"child count mismatch for ${dst.getField}: src=${srcChildren.size}, dst=${dstChildren.size}")
-    srcChildren.asScala.zip(dstChildren.asScala).foreach { case (sc, dc) =>
-      copyVector(sc.asInstanceOf[FieldVector], dc.asInstanceOf[FieldVector])
-    }
-
-    // For vectors that fill offset-buffer "holes" in setValueCount (variable-width and list
-    // types), set lastSet = vc - 1 first so fillHoles is a no-op and the already-copied
-    // offset bytes are preserved.
-    dst match {
-      case v: BaseVariableWidthVector => v.setLastSet(valueCount - 1)
-      case v: BaseLargeVariableWidthVector => v.setLastSet(valueCount - 1)
-      case v: ListVector => v.setLastSet(valueCount - 1)
-      case v: LargeListVector => v.setLastSet(valueCount - 1)
-      case _ =>
-    }
-    dst.setValueCount(valueCount)
-  }
-}

From 7a266df65f901a34d992334893d8e480d249ac7d Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 26 Jun 2026 08:22:50 -0600
Subject: [PATCH 50/54] fix: restore input field names in
 CometArrowPythonRunner IPC schema

The runner built the destination Arrow struct's child fields straight from
Comet's vectors (`getValueVector.getField`). Comet's FFI-imported vectors carry
Arrow `Field`s with null names (Comet uses positional schema), so shaded Arrow's
`AbstractStructVector.put` rejected them with `NullPointerException: field name
cannot be null`, failing every accelerated mapInArrow/mapInPandas query.

Wire the already-available input `schema` into the shared base trait and rename
each top-level field from it, recursively substituting a placeholder for any
null nested name. Field types and child structure are preserved so `copyVector`
still walks the source and destination trees in lockstep. This also fixes a
latent correctness gap: the Python worker reads columns by name, so the IPC
schema must carry the real names rather than anonymous fields.
---
 .../python/CometArrowPythonRunner.scala       |  2 +-
 .../python/CometArrowPythonRunner.scala       |  2 +-
 .../python/CometArrowPythonRunner.scala       |  2 +-
 .../python/CometArrowPythonRunnerBase.scala   | 35 ++++++++++++++++++-
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
index 051fe14638..82c9ccd9b5 100644
--- a/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -35,7 +35,7 @@ class CometArrowPythonRunner(
     funcs: Seq[(ChainedPythonFunctions, Long)],
     evalType: Int,
     argOffsets: Array[Array[Int]],
-    schema: StructType,
+    override val schema: StructType,
     timeZoneId: String,
     largeVarTypes: Boolean,
     override val workerConf: Map[String, String],
diff --git a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
index 8700e282ac..9e29de64d6 100644
--- a/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
+++ b/spark/src/main/spark-4.1/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -36,7 +36,7 @@ class CometArrowPythonRunner(
     funcs: Seq[(ChainedPythonFunctions, Long)],
     evalType: Int,
     argOffsets: Array[Array[Int]],
-    schema: StructType,
+    override val schema: StructType,
     timeZoneId: String,
     largeVarTypes: Boolean,
     override val workerConf: Map[String, String],
diff --git a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
index 09848d602e..02f0531b46 100644
--- a/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
+++ b/spark/src/main/spark-4.2/org/apache/spark/sql/execution/python/CometArrowPythonRunner.scala
@@ -35,7 +35,7 @@ class CometArrowPythonRunner(
     funcs: Seq[(ChainedPythonFunctions, Long)],
     evalType: Int,
     argOffsets: Array[Array[Int]],
-    schema: StructType,
+    override val schema: StructType,
     timeZoneId: String,
     largeVarTypes: Boolean,
     pythonRunnerConf: Map[String, String],
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
index c05b1aafd3..9b80a2bc5c 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
@@ -33,6 +33,7 @@ import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python.{BasePythonRunner, PythonRDD, PythonWorker, SpecialLengths}
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
 import org.apache.spark.unsafe.Platform
 
@@ -72,6 +73,14 @@ private[python] trait CometArrowPythonRunnerBase
   /** Version-specific UDF command serialization. */
   protected def writeUDF(dataOut: DataOutputStream): Unit
 
+  /**
+   * Input schema as Comet hands it to the runner: a single non-nullable struct named "struct"
+   * whose children are the user's input columns. Comet's FFI-imported vectors carry Arrow
+   * `Field`s with null names (Comet uses positional schema), so these names are the source of
+   * truth for the field names written into the IPC stream that the Python worker reads by name.
+   */
+  protected def schema: StructType
+
   override val pythonExec: String =
     SQLConf.get.pysparkWorkerPythonExecutable.getOrElse(funcs.head.funcs.head.pythonExec)
 
@@ -135,8 +144,15 @@ private[python] trait CometArrowPythonRunnerBase
         if (arrowWriter == null) {
           // Build the destination struct root once, sized to the first batch's child fields.
           // mapInArrow/mapInPandas exchange the columns under a single non-nullable struct.
+          // Comet's FFI-imported vectors leave the Arrow Field name null, so restore the real
+          // column names from the input schema (the worker reads columns by name, and shaded
+          // Arrow rejects a null field name). The field types and child structure are kept as-is
+          // so copyVector still walks the source and destination trees in lockstep.
+          val childNames = schema.head.dataType.asInstanceOf[StructType].fieldNames
           val childFields = (0 until cometBatch.numCols()).map { i =>
-            cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField
+            val vecField =
+              cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField
+            renamed(vecField, childNames(i))
           }
           val structField =
             new Field(
@@ -245,6 +261,23 @@ private[python] trait CometArrowPythonRunnerBase
     }
   }
 
+  /**
+   * Rebuild `field` with `name`, preserving its Arrow type and child structure. Any nested child
+   * whose name Comet's FFI import left null is given a positional placeholder so shaded Arrow can
+   * materialize the struct. Keeping the type and structure intact means the destination tree
+   * still mirrors the Comet source tree for [[copyVector]].
+   */
+  private def renamed(field: Field, name: String): Field = {
+    val children = field.getChildren
+    val newChildren =
+      if (children.isEmpty) children
+      else
+        children.asScala.zipWithIndex.map { case (child, idx) =>
+          renamed(child, if (child.getName == null) s"_$idx" else child.getName)
+        }.asJava
+    new Field(name, field.getFieldType, newChildren)
+  }
+
   /**
    * Copy a Comet column into the destination FieldVector. Walks both trees in lockstep: sizes
    * each destination node from the source, copies every buffer with `ArrowBuf.setBytes`, then

From 53f29e6ab47f38e2d2fd8aac041d4b0d4bd706f5 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 26 Jun 2026 08:22:50 -0600
Subject: [PATCH 51/54] fix: avoid decimal context rounding in precision sweep
 test

The decimal precision sweep negated the maximum value with unary minus
(`-largest`). Python's `decimal` applies the default 28-digit context to that
operation, rounding the 38-digit maximum up to 1E38 and overflowing
Decimal(38, 0) when writing the source parquet, before any UDF runs. Use
`copy_negate()`, which flips the sign without applying the context.
---
 spark/src/test/resources/pyspark/test_pyarrow_udf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 6347411cb7..8c3000fb34 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -376,10 +376,13 @@ def test_map_in_arrow_decimal_precision_sweep(
     abs_int = (10**integer_digits - 1) if integer_digits > 0 else 0
     abs_frac = (10**scale - 1) if scale > 0 else 0
     largest = Decimal(f"{abs_int}.{abs_frac:0{scale}d}") if scale else Decimal(abs_int)
+    # copy_negate() flips the sign without applying the decimal context. Plain `-largest` would
+    # round to the context's default 28 significant digits, turning the 38-digit maximum into
+    # 1E38 and overflowing Decimal(38, 0).
     rows = [
         (1, Decimal(0)),
         (2, largest),
-        (3, -largest),
+        (3, largest.copy_negate()),
         (4, None),
     ]
     src = str(tmp_path / "src.parquet")

From 92ee13b00686ecffbe6fc9a9348321043ad7073a Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 26 Jun 2026 09:06:50 -0600
Subject: [PATCH 52/54] fix: pin output Arrow schema in array-transforming UDF
 test

reverse_arrays rebuilt the batch with pa.RecordBatch.from_pandas(pdf), which
infers list<int64> from the Python-int lists and mismatches the declared
array<int> (list<int32>) output. Spark's int32 projection over the result then
called getInt on a long-backed ArrowColumnVector accessor and threw
UnsupportedOperationException, failing the test in both modes (vanilla Spark's
own output handling rejects the type-mismatched UDF result). Pass
schema=batch.schema so the output keeps the int32 element type.
---
 spark/src/test/resources/pyspark/test_pyarrow_udf.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index 8c3000fb34..b24a717e88 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -566,7 +566,11 @@ def reverse_arrays(iterator):
             pdf["nums"] = pdf["nums"].apply(
                 lambda lst: list(reversed(lst)) if lst is not None else None
             )
-            yield pa.RecordBatch.from_pandas(pdf)
+            # Pin the output to the incoming Arrow schema. Without it,
+            # from_pandas infers list<int64> from the Python-int lists, mismatching
+            # the declared array<int> (list<int32>) output and tripping Spark's
+            # int32 projection over the result.
+            yield pa.RecordBatch.from_pandas(pdf, schema=batch.schema)
 
     result_df = spark.read.parquet(src).mapInArrow(reverse_arrays, schema_in)
     _assert_plan_matches_mode(_executed_plan(result_df), accelerated)

From 53764446fe4381a339fc823d83a4ddbd35f32589 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 30 Jun 2026 08:27:04 -0600
Subject: [PATCH 53/54] fix: handle large Arrow types, empty input,
 nullability, and chaining in PyArrow UDFs

Fix the PyArrow UDF acceleration test failures:

- Read large_string / large_binary output from Python workers. PyArrow (with
  pandas 3) emits 64-bit offset variants and mapInArrow passes them through
  untouched. Map LargeUtf8/LargeBinary in Utils.fromArrowType and read 64-bit
  offsets in CometPlainVector.
- Emit a valid empty Arrow IPC stream when an upstream operator produces no
  input batches, instead of writing nothing and tripping the worker's
  ArrowStreamReader.
- Send input fields to the worker as nullable so columns containing nulls are
  accepted, while keeping Map entries non-nullable as Arrow requires, and fill
  an all-valid validity buffer when the source has no nulls.
- Rewrite chained mapInArrow operators: the outer operator now consumes the
  inner CometMapInBatchExec's columnar output directly.

Set spark.comet.scan.unsignedSmallIntSafetyCheck=false in the numeric scalar
test so the ShortType column does not force the scan to fall back, and update
the chained JVM test to expect both operators to be rewritten.
---
 .../apache/comet/vector/CometPlainVector.java | 31 ++++++--
 .../rules/EliminateRedundantTransitions.scala |  6 ++
 .../apache/spark/sql/comet/util/Utils.scala   |  5 ++
 .../python/CometArrowPythonRunnerBase.scala   | 72 +++++++++++++++----
 .../resources/pyspark/test_pyarrow_udf.py     | 23 ++++--
 .../sql/comet/CometMapInBatchSuite.scala      | 24 ++++---
 6 files changed, 128 insertions(+), 33 deletions(-)

diff --git a/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java b/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java
index 473f30d928..8fbef7a490 100644
--- a/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java
+++ b/spark/src/main/java/org/apache/comet/vector/CometPlainVector.java
@@ -41,6 +41,9 @@ public class CometPlainVector extends CometDecodedVector {
   private final long valueBufferAddress;
   private final long offsetBufferAddress;
   private final boolean isBaseFixedWidthVector;
+  // True when the variable-width offsets are 64-bit (LargeVarChar / LargeVarBinary) rather than
+  // the usual 32-bit. PyArrow UDFs can hand back large_string / large_binary columns.
+  private final boolean isLargeVarWidth;
 
   private byte booleanByteCache;
   private int booleanByteCacheIndex = -1;
@@ -61,8 +64,14 @@ public CometPlainVector(ValueVector vector, boolean isUuid) {
     if (vector instanceof BaseVariableWidthVector) {
       this.offsetBufferAddress =
           ((BaseVariableWidthVector) vector).getOffsetBuffer().memoryAddress();
+      this.isLargeVarWidth = false;
+    } else if (vector instanceof BaseLargeVariableWidthVector) {
+      this.offsetBufferAddress =
+          ((BaseLargeVariableWidthVector) vector).getOffsetBuffer().memoryAddress();
+      this.isLargeVarWidth = true;
     } else {
       this.offsetBufferAddress = -1;
+      this.isLargeVarWidth = false;
     }
   }
 
@@ -115,8 +124,15 @@ public double getDouble(int rowId) {
   public UTF8String getUTF8String(int rowId) {
     if (isNullAt(rowId)) return null;
     if (offsetBufferAddress != -1) {
-      int offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L);
-      int length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - offset;
+      long offset;
+      int length;
+      if (isLargeVarWidth) {
+        offset = Platform.getLong(null, offsetBufferAddress + rowId * 8L);
+        length = (int) (Platform.getLong(null, offsetBufferAddress + (rowId + 1L) * 8L) - offset);
+      } else {
+        offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L);
+        length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - (int) offset;
+      }
       return UTF8String.fromAddress(null, valueBufferAddress + offset, length);
     } else if (isBaseFixedWidthVector) {
       BaseFixedWidthVector fixedWidthVector = (BaseFixedWidthVector) valueVector;
@@ -139,11 +155,16 @@ public UTF8String getUTF8String(int rowId) {
   @Override
   public byte[] getBinary(int rowId) {
     if (isNullAt(rowId)) return null;
-    int offset;
+    long offset;
     int length;
     if (offsetBufferAddress != -1) {
-      offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L);
-      length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - offset;
+      if (isLargeVarWidth) {
+        offset = Platform.getLong(null, offsetBufferAddress + rowId * 8L);
+        length = (int) (Platform.getLong(null, offsetBufferAddress + (rowId + 1L) * 8L) - offset);
+      } else {
+        offset = Platform.getInt(null, offsetBufferAddress + rowId * 4L);
+        length = Platform.getInt(null, offsetBufferAddress + (rowId + 1L) * 4L) - (int) offset;
+      }
     } else if (valueVector instanceof BaseFixedWidthVector) {
       BaseFixedWidthVector fixedWidthVector = (BaseFixedWidthVector) valueVector;
       length = fixedWidthVector.getTypeWidth();
diff --git a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
index 95b05dc1a2..477c2afd99 100644
--- a/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/EliminateRedundantTransitions.scala
@@ -164,6 +164,12 @@ case class EliminateRedundantTransitions(session: SparkSession)
   private def extractColumnarChild(plan: SparkPlan): Option[SparkPlan] = plan match {
     case CometColumnarToRowExec(child) => Some(child)
     case CometNativeColumnarToRowExec(child) => Some(child)
+    // Chained `mapInArrow(udf1).mapInArrow(udf2)`: by the time the outer operator is visited
+    // (transformUp is bottom-up) the inner one has already become a `CometMapInBatchExec`, which
+    // is itself columnar. There is no row transition between them to strip, so consume its
+    // columnar output directly. Its flattened output vectors are `CometVector`s, exactly what
+    // `CometMapInBatchExec`'s input path expects.
+    case child: CometMapInBatchExec => Some(child)
     case _ => None
   }
 
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala b/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
index 15e1e2c410..dea872169d 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
@@ -97,7 +97,12 @@ object Utils extends CometTypeShim with Logging {
     case float: ArrowType.FloatingPoint if float.getPrecision == FloatingPointPrecision.DOUBLE =>
       DoubleType
     case ArrowType.Utf8.INSTANCE => StringType
+    // Large (64-bit offset) variants: a PyArrow UDF's Python output may use large_string /
+    // large_binary (e.g. pandas 3 backs string columns with Arrow large types), and mapInArrow
+    // passes those types straight through to the JVM. CometPlainVector reads both offset widths.
+    case ArrowType.LargeUtf8.INSTANCE => StringType
     case ArrowType.Binary.INSTANCE => BinaryType
+    case ArrowType.LargeBinary.INSTANCE => BinaryType
     case _: ArrowType.FixedSizeBinary => BinaryType
     case d: ArrowType.Decimal => DecimalType(d.getPrecision, d.getScale)
     case date: ArrowType.Date if date.getUnit == DateUnit.DAY => DateType
diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
index 9b80a2bc5c..ee811a5375 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
@@ -31,6 +31,7 @@ import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
 import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType}
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python.{BasePythonRunner, PythonRDD, PythonWorker, SpecialLengths}
+import org.apache.spark.sql.comet.util.Utils
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
@@ -127,12 +128,35 @@ private[python] trait CometArrowPythonRunnerBase
         writeUDF(dataOut)
       }
 
+      /** Build the destination struct root and start the writer from the given child fields. */
+      private def startWriter(childFields: Seq[Field], dataOut: DataOutputStream): Unit = {
+        val structField =
+          new Field(
+            "struct",
+            new FieldType(false, ArrowType.Struct.INSTANCE, null),
+            childFields.asJava)
+        structVec = structField.createVector(allocator).asInstanceOf[StructVector]
+        writeRoot = new VectorSchemaRoot(Seq[FieldVector](structVec).asJava)
+        arrowWriter = new ArrowStreamWriter(writeRoot, null, Channels.newChannel(dataOut))
+        arrowWriter.start()
+      }
+
       override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = {
         while (currentGroup == null || !currentGroup.hasNext) {
           if (!inputIterator.hasNext) {
-            if (arrowWriter != null) {
-              arrowWriter.end()
+            if (arrowWriter == null) {
+              // No input batch was ever produced (e.g. an upstream filter removed every row).
+              // Still emit a valid, empty Arrow IPC stream so the Python worker's
+              // ArrowStreamReader reads a schema and then sees zero batches, instead of failing
+              // on an absent stream ("Invalid IPC stream: negative continuation token"). There is
+              // no sample batch, so derive the schema from the Spark input schema. The timezone is
+              // irrelevant here because no rows are exchanged.
+              val inner = schema.head.dataType.asInstanceOf[StructType]
+              val childFields = inner.fields.toSeq.map(f =>
+                Utils.toArrowField(f.name, f.dataType, nullable = true, "UTC"))
+              startWriter(childFields, dataOut)
             }
+            arrowWriter.end()
             return false
           }
           currentGroup = inputIterator.next()
@@ -152,17 +176,9 @@ private[python] trait CometArrowPythonRunnerBase
           val childFields = (0 until cometBatch.numCols()).map { i =>
             val vecField =
               cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField
-            renamed(vecField, childNames(i))
+            renamed(vecField, childNames(i), forceNullable = true)
           }
-          val structField =
-            new Field(
-              "struct",
-              new FieldType(false, ArrowType.Struct.INSTANCE, null),
-              childFields.asJava)
-          structVec = structField.createVector(allocator).asInstanceOf[StructVector]
-          writeRoot = new VectorSchemaRoot(Seq[FieldVector](structVec).asJava)
-          arrowWriter = new ArrowStreamWriter(writeRoot, null, Channels.newChannel(dataOut))
-          arrowWriter.start()
+          startWriter(childFields, dataOut)
         }
 
         var i = 0
@@ -267,15 +283,31 @@ private[python] trait CometArrowPythonRunnerBase
    * materialize the struct. Keeping the type and structure intact means the destination tree
    * still mirrors the Comet source tree for [[copyVector]].
    */
-  private def renamed(field: Field, name: String): Field = {
+  private def renamed(field: Field, name: String, forceNullable: Boolean): Field = {
+    // A Map's descendants must keep their original nullability: Arrow requires the entries struct
+    // (and its key) to be non-nullable, and `MapVector.createVector` rejects a nullable entries
+    // struct. Stop forcing nullable once we enter a Map subtree.
+    val childrenForceNullable = forceNullable && !field.getType.isInstanceOf[ArrowType.Map]
     val children = field.getChildren
     val newChildren =
       if (children.isEmpty) children
       else
         children.asScala.zipWithIndex.map { case (child, idx) =>
-          renamed(child, if (child.getName == null) s"_$idx" else child.getName)
+          renamed(
+            child,
+            if (child.getName == null) s"_$idx" else child.getName,
+            childrenForceNullable)
         }.asJava
-    new Field(name, field.getFieldType, newChildren)
+    // Force the field nullable where allowed. Comet's FFI-imported vectors may carry a
+    // non-nullable Arrow `Field` even for columns that contain nulls (Comet uses positional schema
+    // and does not round-trip Spark's nullability), and the worker rejects a null value under a
+    // non-nullable field (`from_pandas(pdf, schema=batch.schema)` raises). Marking the field
+    // nullable is a safe superset; `copyVector` fills an all-valid validity buffer when the source
+    // has no nulls.
+    val ft = field.getFieldType
+    val nullable = forceNullable || ft.isNullable
+    val newFt = new FieldType(nullable, ft.getType, ft.getDictionary, ft.getMetadata)
+    new Field(name, newFt, newChildren)
   }
 
   /**
@@ -332,5 +364,15 @@ private[python] trait CometArrowPythonRunnerBase
       case _ =>
     }
     dst.setValueCount(valueCount)
+
+    // Destination fields are forced nullable except under a Map (see `renamed`), so the worker
+    // reads the validity buffer. When the source has no nulls its validity buffer may be empty
+    // (Comet omits it), which would otherwise leave the freshly-allocated destination validity
+    // all-zero and make the worker see every value as null. Set all-valid in that case, after
+    // setValueCount (which can rewrite validity), mirroring the fill in writeNextInputToStream.
+    if (valueCount > 0 && dst.getField.isNullable && src.getNullCount == 0) {
+      val validityBytes = (valueCount + 7) / 8
+      Platform.setMemory(dst.getValidityBuffer.memoryAddress(), 0xff.toByte, validityBytes)
+    }
   }
 }
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index b24a717e88..fc5e8053e3 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -688,11 +688,26 @@ def passthrough(iterator):
         for batch in iterator:
             yield batch
 
-    result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
-    _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
+    # ShortType (the `small` column) forces the Comet scan to fall back to vanilla Spark by
+    # default: Parquet UINT_8 maps to ShortType and Comet cannot distinguish it from signed
+    # INT16 (spark.comet.scan.unsignedSmallIntSafetyCheck). Without a Comet scan there is no
+    # columnar producer for the UDF to consume, so the rewrite cannot fire. This data is signed,
+    # so allow native execution to exercise the accelerated path.
+    prev_uint_check = spark.conf.get(
+        "spark.comet.scan.unsignedSmallIntSafetyCheck", "true"
+    )
+    spark.conf.set("spark.comet.scan.unsignedSmallIntSafetyCheck", "false")
+    try:
+        result_df = spark.read.parquet(src).mapInArrow(passthrough, schema_in)
+        _assert_plan_matches_mode(_executed_plan(result_df), accelerated)
 
-    out = {(r["id"], r["b"], r["tiny"], r["small"], r["flt"]) for r in result_df.collect()}
-    assert out == set(rows)
+        out = {
+            (r["id"], r["b"], r["tiny"], r["small"], r["flt"])
+            for r in result_df.collect()
+        }
+        assert out == set(rows)
+    finally:
+        spark.conf.set("spark.comet.scan.unsignedSmallIntSafetyCheck", prev_uint_check)
 
 
 def test_map_in_arrow_binary_type(spark, tmp_path, accelerated):
diff --git a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
index e18e838b29..c9acd2cc3f 100644
--- a/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
+++ b/spark/src/test/spark-4.x/org/apache/spark/sql/comet/CometMapInBatchSuite.scala
@@ -105,15 +105,15 @@ class CometMapInBatchSuite extends CometTestBase {
     }
   }
 
-  test("rule handles chained MapInArrowExec without crashing") {
+  test("rule rewrites chained MapInArrowExec into stacked CometMapInBatchExec") {
     // df.mapInArrow(...).mapInArrow(...) produces two MapInArrowExec operators. The outer
     // consumes rows from the inner directly (MapInArrowExec is a row producer), so there is
-    // no ColumnarToRow between them. After the rule's bottom-up rewrite the inner becomes
-    // CometMapInBatchExec; the outer keeps its row contract and is satisfied by
-    // CometMapInBatchExec.doExecute() reintroducing a ColumnarToRow internally. The
-    // assertion exists mainly to pin the structure: regress this if a future change makes
-    // both rewrite (the bulk-copy input path would then need to accept a CometVector input
-    // that did not come from a CometDecodedVector chain).
+    // no ColumnarToRow between them. The rule rewrites bottom-up: the inner becomes
+    // CometMapInBatchExec first, then the outer is matched against a child that is already a
+    // (columnar) CometMapInBatchExec and rewrites too, consuming the inner's columnar output
+    // directly. Both operators end up native and the chain stays columnar end to end. The
+    // inner's flattened output vectors are CometVectors, exactly what the outer's bulk-copy
+    // input path expects.
     withSQLConf(CometConf.COMET_PYARROW_UDF_ENABLED.key -> "true") {
       val cometLeaf = StubCometLeaf(Seq(AttributeReference("id", LongType)(ExprId(0L))))
       val inner = MapInArrowExec(
@@ -128,9 +128,15 @@ class CometMapInBatchSuite extends CometTestBase {
       val rewritten = EliminateRedundantTransitions(spark).apply(outer)
       val cometOps = rewritten.collect { case op: CometMapInBatchExec => op }
       assert(
-        cometOps.size == 1,
-        "expected the inner MapInArrowExec to be rewritten, but the chain produced " +
+        cometOps.size == 2,
+        "expected both MapInArrowExec operators to be rewritten, but the chain produced " +
           s"${cometOps.size} CometMapInBatchExec(s):\n$rewritten")
+      assert(
+        outer.output == cometOps.head.output,
+        s"expected the outer operator to be rewritten:\n$rewritten")
+      assert(
+        cometOps.head.child.isInstanceOf[CometMapInBatchExec],
+        s"expected the outer CometMapInBatchExec to consume the inner one directly:\n$rewritten")
     }
   }
 

From 5802e7ca65c037b9cbeeed66890866159d99aacf Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 1 Jul 2026 07:36:42 -0600
Subject: [PATCH 54/54] fix: address PyArrow UDF review feedback

Track pythonDataReceived bytes in the reader so the metric matches the
vanilla BasicPythonArrowOutput fallback. Hoist the repeated input struct
cast to a single lazy val, replace the manual buffer-copy loop with
zip/foreach, and document the positional field-name placeholder.

Correct two test docstrings: the Arrow DecimalVector copy is always
16 bytes wide (the 8-byte long-backed form is Spark's UnsafeRow encoding,
not the Arrow path), and the multi-batch test reuses only the struct
container, not the leaf buffers (see #4383).

The persistent-root reuse, input compression codec, and large-var-type
offset widening are deferred to #4383, which removes the copyVector
bulk-copy path entirely.
---
 .../python/CometArrowPythonRunnerBase.scala   | 24 ++++++++++++-------
 .../resources/pyspark/test_pyarrow_udf.py     | 17 +++++++------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
index ee811a5375..4030d26679 100644
--- a/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
+++ b/spark/src/main/spark-4.x/org/apache/spark/sql/execution/python/CometArrowPythonRunnerBase.scala
@@ -111,6 +111,10 @@ private[python] trait CometArrowPythonRunnerBase
       private var writeRoot: VectorSchemaRoot = _
       private var structVec: StructVector = _
 
+      // The runner's input schema is a single struct column ("struct") whose children are the
+      // user's input columns (see `schema` above). Cast once here rather than at each use site.
+      private lazy val inputStructType = schema.head.dataType.asInstanceOf[StructType]
+
       context.addTaskCompletionListener[Unit] { _ =>
         if (writeRoot != null) {
           writeRoot.close()
@@ -151,8 +155,7 @@ private[python] trait CometArrowPythonRunnerBase
               // on an absent stream ("Invalid IPC stream: negative continuation token"). There is
               // no sample batch, so derive the schema from the Spark input schema. The timezone is
               // irrelevant here because no rows are exchanged.
-              val inner = schema.head.dataType.asInstanceOf[StructType]
-              val childFields = inner.fields.toSeq.map(f =>
+              val childFields = inputStructType.fields.toSeq.map(f =>
                 Utils.toArrowField(f.name, f.dataType, nullable = true, "UTC"))
               startWriter(childFields, dataOut)
             }
@@ -172,7 +175,7 @@ private[python] trait CometArrowPythonRunnerBase
           // column names from the input schema (the worker reads columns by name, and shaded
           // Arrow rejects a null field name). The field types and child structure are kept as-is
           // so copyVector still walks the source and destination trees in lockstep.
-          val childNames = schema.head.dataType.asInstanceOf[StructType].fieldNames
+          val childNames = inputStructType.fieldNames
           val childFields = (0 until cometBatch.numCols()).map { i =>
             val vecField =
               cometBatch.column(i).asInstanceOf[CometDecodedVector].getValueVector.getField
@@ -240,6 +243,7 @@ private[python] trait CometArrowPythonRunnerBase
         }
         try {
           if (reader != null && batchLoaded) {
+            val bytesReadStart = reader.bytesRead()
             batchLoaded = reader.loadNextBatch()
             if (batchLoaded) {
               // Re-wrap the (reloaded) field vectors fresh each batch, mirroring Comet's
@@ -249,6 +253,9 @@ private[python] trait CometArrowPythonRunnerBase
               }.toArray
               val batch = new ColumnarBatch(vectors)
               batch.setNumRows(root.getRowCount)
+              // Track bytes read so `pythonDataReceived` matches the vanilla fallback path
+              // (`BasicPythonArrowOutput`), which meters the same delta around `loadNextBatch`.
+              pythonMetrics("pythonDataReceived") += reader.bytesRead() - bytesReadStart
               pythonMetrics("pythonNumRowsReceived") += root.getRowCount
               batch
             } else {
@@ -293,6 +300,10 @@ private[python] trait CometArrowPythonRunnerBase
       if (children.isEmpty) children
       else
         children.asScala.zipWithIndex.map { case (child, idx) =>
+          // Only null-named FFI children get the positional `_$idx` placeholder. This assumes no
+          // real sibling is literally named `_0`, `_1`, ... (which would collide); struct fields
+          // reaching here carry their real names, so a null name means Comet's FFI import dropped
+          // it and a synthetic positional name is safe.
           renamed(
             child,
             if (child.getName == null) s"_$idx" else child.getName,
@@ -337,11 +348,8 @@ private[python] trait CometArrowPythonRunnerBase
     require(
       srcBufs.size == dstBufs.size,
       s"buffer count mismatch for ${dst.getField}: src=${srcBufs.size}, dst=${dstBufs.size}")
-    var b = 0
-    while (b < srcBufs.size) {
-      val s = srcBufs.get(b)
-      dstBufs.get(b).setBytes(0, s, 0, s.readableBytes)
-      b += 1
+    srcBufs.asScala.zip(dstBufs.asScala).foreach { case (s, d) =>
+      d.setBytes(0, s, 0, s.readableBytes)
     }
 
     val srcChildren = src.getChildrenFromFields
diff --git a/spark/src/test/resources/pyspark/test_pyarrow_udf.py b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
index fc5e8053e3..67cdb9f134 100644
--- a/spark/src/test/resources/pyspark/test_pyarrow_udf.py
+++ b/spark/src/test/resources/pyspark/test_pyarrow_udf.py
@@ -361,10 +361,11 @@ def test_map_in_arrow_decimal_precision_sweep(
     spark, tmp_path, accelerated, precision, scale
 ):
     """
-    Spark's `BaseFixedWidthVector` handles short decimals (precision <= 18, long-backed) and long
-    decimals (precision >= 19, 16-byte `FixedSizeBinary`) on different code paths. The 18/19
-    boundary is where buffer-width assumptions in `copyVector` can hide bugs. Sweep over
-    representative precisions and scale extremes (0, half, max).
+    The Arrow `DecimalVector` that `copyVector` touches is always 16 bytes wide regardless of
+    precision, so there is no buffer-width boundary on the Arrow path (the 8-byte long-backed form
+    is Spark's `UnsafeRow` encoding, a layer this Arrow buffer copy never sees). This sweep instead
+    guards the precision/scale extremes and the 18/19 point where Spark's own decimal handling
+    changes representation, keeping the round trip value-exact. Scale extremes: 0, half, max.
     """
     schema_in = T.StructType(
         [
@@ -436,9 +437,11 @@ def passthrough(iterator):
 
 def test_map_in_arrow_multi_batch_per_partition(spark, tmp_path, accelerated):
     """
-    Force many small batches in a single partition so the writer/unloader exercises the
-    persistent destination IPC root over multiple batches. Catches buffer-reuse bugs and
-    variable-width data-buffer growth across batches that single-batch tests miss.
+    Force many small batches in a single partition so the writer runs its per-batch
+    allocate/copy/write loop hundreds of times against a reused struct container (the leaf
+    buffers are reallocated each batch today; see #4383). Catches errors that only appear across
+    the batch boundary: stale value counts, offset/validity sizing on the second and later
+    batches, and variable-width data-buffer sizing as row content changes batch to batch.
     """
     schema_in = T.StructType(
         [