Skip to content

Commit a768b6a

Browse files
committed
fix: use Spark Error framework for DataFrame input validation
1 parent 01052c7 commit a768b6a

4 files changed

Lines changed: 32 additions & 7 deletions

File tree

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5572,6 +5572,16 @@
55725572
],
55735573
"sqlState" : "42846"
55745574
},
  "PARSE_INPUT_NOT_SINGLE_COLUMN" : {
    "message" : [
      "Input DataFrame must have exactly one column, but got <numColumns>."
    ],
    "sqlState" : "42802"
  },
  "PARSE_INPUT_NOT_STRING_TYPE" : {
    "message" : [
      "Input DataFrame column must be StringType, but got <dataType>."
    ],
    "sqlState" : "42846"
  },
55755585
"PARSE_EMPTY_STATEMENT" : {
55765586
"message" : [
55775587
"Syntax error, unexpected empty statement."

python/pyspark/sql/tests/test_datasources.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,19 +113,19 @@ def test_json_with_dataframe_input_and_schema(self):
113113

114114
def test_json_with_dataframe_input_non_string_column(self):
    """Reading JSON from a DataFrame whose single column is not a string
    must fail with the PARSE_INPUT_NOT_STRING_TYPE error condition."""
    df = self.spark.createDataFrame([(1,), (2,)], schema="value INT")
    # The error-condition name, not the free-form message, is the stable contract.
    with self.assertRaisesRegex(Exception, "PARSE_INPUT_NOT_STRING_TYPE"):
        self.spark.read.json(df).collect()
118118

119119
def test_json_with_dataframe_input_multiple_columns(self):
    """Reading JSON from a DataFrame with more than one column must fail
    with the PARSE_INPUT_NOT_SINGLE_COLUMN error condition."""
    df = self.spark.createDataFrame(
        [("a", "b"), ("c", "d")], schema="col1 STRING, col2 STRING"
    )
    with self.assertRaisesRegex(Exception, "PARSE_INPUT_NOT_SINGLE_COLUMN"):
        self.spark.read.json(df).collect()
125125

126126
def test_json_with_dataframe_input_zero_columns(self):
    """Reading JSON from a zero-column DataFrame must fail with the
    PARSE_INPUT_NOT_SINGLE_COLUMN error condition (0 is not exactly 1)."""
    # select() with no columns yields a DataFrame with an empty schema.
    df = self.spark.range(1).select()
    with self.assertRaisesRegex(Exception, "PARSE_INPUT_NOT_SINGLE_COLUMN"):
        self.spark.read.json(df).collect()
130130

131131
def test_multiline_csv(self):

sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3495,6 +3495,18 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
34953495
)
34963496
}
34973497

3498+
/**
 * Builds the error thrown when a DataFrame handed to a parsing API does not
 * contain exactly one column.
 *
 * @param numColumns the actual number of columns in the input DataFrame
 * @return an [[AnalysisException]] carrying the PARSE_INPUT_NOT_SINGLE_COLUMN
 *         error condition
 */
def parseInputNotSingleColumnError(numColumns: Int): Throwable =
  new AnalysisException(
    errorClass = "PARSE_INPUT_NOT_SINGLE_COLUMN",
    messageParameters = Map("numColumns" -> numColumns.toString))
3503+
3504+
/**
 * Builds the error thrown when the single column of a DataFrame handed to a
 * parsing API is not of StringType.
 *
 * @param dataType the actual data type of the input DataFrame's column
 * @return an [[AnalysisException]] carrying the PARSE_INPUT_NOT_STRING_TYPE
 *         error condition
 */
def parseInputNotStringTypeError(dataType: DataType): Throwable =
  new AnalysisException(
    errorClass = "PARSE_INPUT_NOT_STRING_TYPE",
    messageParameters = Map("dataType" -> toSQLType(dataType)))
3509+
34983510
def textDataSourceWithMultiColumnsError(schema: StructType): Throwable = {
34993511
new AnalysisException(
35003512
errorClass = "_LEGACY_ERROR_TEMP_1290",

sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TableFunctionRe
3333
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.classic.ClassicConversions._
import org.apache.spark.sql.classic.ExpressionUtils.expression
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.execution.{ExplainMode, QueryExecution}
@@ -201,10 +202,12 @@ private[sql] object PythonSQLUtils extends Logging {
201202
/**
 * Reads each row of a single-string-column DataFrame as a JSON document.
 *
 * Validates the input shape up front so callers get a proper error condition
 * from the Spark Error framework instead of an IllegalArgumentException:
 *  - PARSE_INPUT_NOT_SINGLE_COLUMN when the DataFrame does not have exactly
 *    one column (covers both the zero-column and multi-column cases);
 *  - PARSE_INPUT_NOT_STRING_TYPE when the single column is not StringType.
 *
 * @param reader the DataFrameReader whose JSON options/schema are applied
 * @param df     the input DataFrame; must have exactly one StringType column
 * @return a DataFrame parsed from the JSON strings in `df`
 */
def jsonFromDataFrame(
    reader: DataFrameReader,
    df: DataFrame): DataFrame = {
  // Hoist the schema lookup: it is consulted for the column count, the
  // error parameter, and the type check below.
  val fields = df.schema.fields
  if (fields.length != 1) {
    throw QueryCompilationErrors.parseInputNotSingleColumnError(fields.length)
  }
  if (fields.head.dataType != org.apache.spark.sql.types.StringType) {
    throw QueryCompilationErrors.parseInputNotStringTypeError(fields.head.dataType)
  }
  reader.json(df.as(Encoders.STRING))
}
210213

0 commit comments

Comments
 (0)