apache · kerwin-zk · Jul 4, 2026 · JingsongLi · Jul 4, 2026 · kerwin-zk
diff --git a/...imon-spark-common/src/main/scala/org/apache/paimon/spark/write/PaimonV2WriteBuilder.scala b/...imon-spark-common/src/main/scala/org/apache/paimon/spark/write/PaimonV2WriteBuilder.scala
@@ -21,10 +21,15 @@ package org.apache.paimon.spark.write
 import org.apache.paimon.CoreOptions
 import org.apache.paimon.Snapshot
 import org.apache.paimon.options.Options
+import org.apache.paimon.spark.SparkTypeUtils
+import org.apache.paimon.spark.commands.SchemaEvolutionHelper
+import org.apache.paimon.spark.schema.SparkSystemColumns
 import org.apache.paimon.table.FileStoreTable
 import org.apache.paimon.types.RowType
 
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.PaimonUtils
+import org.apache.spark.sql.catalyst.util.CharVarcharUtils
+import org.apache.spark.sql.types.{DataType, StructType}
 
 import scala.collection.JavaConverters._
 
@@ -40,6 +45,7 @@ class PaimonV2WriteBuilder(table: FileStoreTable, dataSchema: StructType, option
   }
 
   override def build: PaimonV2Write = {
+    validateDataSchema()
     val finalTable = overwriteDynamic match {
       case Some(o) =>
         table.copy(Map(CoreOptions.DYNAMIC_PARTITION_OVERWRITE.key -> o.toString).asJava)
@@ -55,4 +61,63 @@ class PaimonV2WriteBuilder(table: FileStoreTable, dataSchema: StructType, option
   }
 
   override def partitionRowType(): RowType = table.schema().logicalPartitionType()
+
+  /**
+   * Fail fast when the query schema is binary-incompatible with the table schema.
+   *
+   * Paimon tables declare `ACCEPT_ANY_SCHEMA`, so Spark skips its own output resolution and the
+   * query is aligned to the table schema by `PaimonAnalysis` instead, which is only injected when
+   * `PaimonSparkSessionExtensions` is configured. If a write is planned without it, a
+   * type-mismatched query would reach the writer as-is and be interpreted with the table's row
+   * type, silently corrupting data or failing with errors like `NegativeArraySizeException` deep
+   * inside the format writer.
+   */
+  private def validateDataSchema(): Unit = {
+    // Row-level operations write the table's own rows back; schema evolution intentionally
+    // diverges from the current table schema and is validated when merging.
+    if (copyOnWriteScan.nonEmpty || SchemaEvolutionHelper.mergeSchemaEnabled(options)) {
+      return
+    }
+
+    val expectedFields = SparkTypeUtils.fromPaimonRowType(table.rowType()).fields
+    val actualFields = SparkSystemColumns.filterSparkSystemColumns(dataSchema).fields
+
+    def fail(reason: String): Unit = {
+      throw new RuntimeException(
+        s"Cannot write incompatible data to table '${table.name()}': $reason. " +
+          "The write was planned without Paimon's output resolution, which usually means " +
+          "'org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions' is not configured " +
+          "in 'spark.sql.extensions'. Please configure it, or align the query columns with the " +
+          "table schema explicitly.")
+    }
+
+    if (actualFields.length != expectedFields.length) {
+      fail(
+        s"the number of query columns (${actualFields.length}) does not match " +
+          s"the table schema's (${expectedFields.length})")
+    }
+
+    val incompatible = actualFields.zip(expectedFields).filterNot {
+      case (actual, expected) =>
+        val actualType = binaryCompatibleType(
+          CharVarcharUtils.getRawType(actual.metadata).getOrElse(actual.dataType))
+        PaimonUtils.equalsIgnoreCompatibleNullability(
+          actualType,
+          binaryCompatibleType(expected.dataType))
+    }
+    if (incompatible.nonEmpty) {
+      val details = incompatible
+        .map {
+          case (actual, expected) =>
+            s"${actual.name} ${actual.dataType.simpleString} -> " +
+              s"${expected.name} ${expected.dataType.simpleString}"
+        }
+        .mkString("[", ", ", "]")
+      fail(s"incompatible query column type(s): $details")
+    }
+  }
+
+  // CHAR/VARCHAR share the string binary layout, treat them as STRING for compatibility check.
+  private def binaryCompatibleType(dataType: DataType): DataType =
+    CharVarcharUtils.replaceCharVarcharWithString(dataType)
 }
diff --git a/...n-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/PaimonOptionTest.scala b/...n-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/PaimonOptionTest.scala
@@ -305,3 +305,41 @@ class PaimonConfigCheckTest extends SparkFunSuite {
     }
   }
 }
+
+class PaimonV2WriteSchemaValidationTest extends SparkFunSuite with SparkVersionSupport {
+
+  test("Paimon V2 write: fail fast on incompatible query schema without Paimon extensions") {
+    assume(gteqSpark3_4)
+    val spark = SparkSession
+      .builder()
+      .master("local[2]")
+      .config("spark.sql.catalog.paimon", "org.apache.paimon.spark.SparkCatalog")
+      .config("spark.sql.catalog.paimon.warehouse", Utils.createTempDir.getCanonicalPath)
+      .config("spark.paimon.requiredSparkConfsCheck.enabled", "false")
+      .config("spark.paimon.write.use-v2-write", "true")
+      .getOrCreate()
+    try {
+      spark.sql("USE paimon")
+      spark.sql("CREATE TABLE T (a STRING, b STRING, pt STRING) PARTITIONED BY (pt)")
+
+      // A query schema that matches the table schema still works without the extensions.
+      spark.sql("INSERT INTO T VALUES ('x', 'y', '20260601')")
+      assert(spark.sql("SELECT * FROM T").collect() === Array(Row("x", "y", "20260601")))
+
+      // Type mismatch: without PaimonAnalysis no cast is added, the write must fail fast
+      // instead of corrupting data or throwing NegativeArraySizeException in the writer.
+      val typeMismatch = intercept[RuntimeException] {
+        spark.sql("INSERT OVERWRITE TABLE T PARTITION (pt) SELECT 'x', 0.98d, '20260601'")
+      }
+      assert(typeMismatch.getMessage.contains("PaimonSparkSessionExtensions"))
+
+      // Column count mismatch must fail fast as well.
+      val countMismatch = intercept[RuntimeException] {
+        spark.sql("INSERT INTO T VALUES ('x', 'y')")
+      }
+      assert(countMismatch.getMessage.contains("PaimonSparkSessionExtensions"))
+    } finally {
+      spark.close()
+    }
+  }
+}