Skip to content

Commit a2f06d7

Browse files
committed
[SPARK-56176][SPARK-56232][SQL] V2-native ANALYZE TABLE/COLUMN with stats propagation to FileScan
1 parent 752ef85 commit a2f06d7

6 files changed

Lines changed: 202 additions & 30 deletions

File tree

sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -430,34 +430,9 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager)
430430
case AnalyzeTables(ResolvedV1Database(db), noScan) =>
431431
AnalyzeTablesCommand(Some(db), noScan)
432432

433-
// TODO(SPARK-56176): V2-native ANALYZE TABLE/COLUMN for file tables.
434-
// FileTable from V2SessionCatalog.loadTable doesn't match V1 extractors,
435-
// so we intercept here and delegate to V1 commands using catalogTable.
436-
case AnalyzeTable(
437-
ResolvedTable(catalog, _, ft: FileTable, _),
438-
partitionSpec, noScan)
439-
if supportsV1Command(catalog)
440-
&& ft.catalogTable.isDefined =>
441-
val tableIdent = ft.catalogTable.get.identifier
442-
if (partitionSpec.isEmpty) {
443-
AnalyzeTableCommand(tableIdent, noScan)
444-
} else {
445-
AnalyzePartitionCommand(
446-
tableIdent, partitionSpec, noScan)
447-
}
448-
449433
case AnalyzeColumn(ResolvedV1TableOrViewIdentifier(ident), columnNames, allColumns) =>
450434
AnalyzeColumnCommand(ident, columnNames, allColumns)
451435

452-
case AnalyzeColumn(
453-
ResolvedTable(catalog, _, ft: FileTable, _),
454-
columnNames, allColumns)
455-
if supportsV1Command(catalog)
456-
&& ft.catalogTable.isDefined =>
457-
AnalyzeColumnCommand(
458-
ft.catalogTable.get.identifier,
459-
columnNames, allColumns)
460-
461436
// V2 catalog doesn't support REPAIR TABLE yet, we must use v1 command here.
462437
case RepairTable(
463438
ResolvedV1TableIdentifierInSessionCatalog(ident),
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql.execution.datasources.v2
18+
19+
import org.apache.spark.sql.catalyst.InternalRow
20+
import org.apache.spark.sql.catalyst.expressions.Attribute
21+
import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog, TableChange}
22+
import org.apache.spark.sql.execution.command.CommandUtils
23+
24+
/**
 * Physical plan for ANALYZE TABLE ... FOR COLUMNS on V2 file tables.
 * Computes column-level statistics and persists them as table properties
 * via [[TableCatalog.alterTable()]].
 *
 * Column stats property key format:
 * `spark.sql.statistics.colStats.<col>.<stat>`
 */
case class AnalyzeColumnExec(
    catalog: TableCatalog,
    ident: Identifier,
    table: FileTable,
    columnNames: Option[Seq[String]],
    allColumns: Boolean)
  extends LeafV2CommandExec {

  // ANALYZE produces no result rows.
  override def output: Seq[Attribute] = Seq.empty

  override protected def run(): Seq[InternalRow] = {
    // Refresh the file listing first so that the size we record and the data
    // we scan for column stats are consistent with each other.
    table.fileIndex.refresh()
    val totalSize = table.fileIndex.sizeInBytes

    val relation = DataSourceV2Relation.create(table, Some(catalog), Some(ident))

    // Resolve the requested columns against the relation output using the
    // session resolver, so that spark.sql.caseSensitive is honored instead of
    // a hard-coded case-insensitive comparison.
    val resolver = session.sessionState.conf.resolver
    val columnsToAnalyze = if (allColumns) {
      relation.output
    } else {
      columnNames.getOrElse(Seq.empty).map { name =>
        relation.output.find(attr => resolver(attr.name, name)).getOrElse(
          throw new IllegalArgumentException(s"Column '$name' not found"))
      }
    }

    // Scans the table once, computing the row count and per-column stats.
    val (rowCount, colStats) =
      CommandUtils.computeColumnStats(session, relation, columnsToAnalyze)

    val changes = scala.collection.mutable.ArrayBuffer(
      TableChange.setProperty("spark.sql.statistics.totalSize", totalSize.toString),
      TableChange.setProperty("spark.sql.statistics.numRows", rowCount.toString))

    // Persist column stats as flat table properties, one property per stat,
    // keyed as spark.sql.statistics.colStats.<col>.<stat>.
    val prefix = "spark.sql.statistics.colStats."
    colStats.foreach { case (attr, stat) =>
      val catalogStat = stat.toCatalogColumnStat(attr.name, attr.dataType)
      catalogStat.toMap(attr.name).foreach { case (k, v) =>
        changes += TableChange.setProperty(prefix + k, v)
      }
    }

    catalog.alterTable(ident, changes.toSeq: _*)
    Seq.empty
  }
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql.execution.datasources.v2
18+
19+
import org.apache.spark.sql.catalyst.InternalRow
20+
import org.apache.spark.sql.catalyst.expressions.Attribute
21+
import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog, TableChange}
22+
23+
/**
 * Physical plan for ANALYZE TABLE on V2 file tables.
 * Computes table statistics and persists them as table properties
 * via [[TableCatalog.alterTable()]].
 *
 * Statistics property keys:
 *  - `spark.sql.statistics.totalSize`
 *  - `spark.sql.statistics.numRows`
 */
case class AnalyzeTableExec(
    catalog: TableCatalog,
    ident: Identifier,
    table: FileTable,
    partitionSpec: Map[String, Option[String]],
    noScan: Boolean) extends LeafV2CommandExec {

  // ANALYZE produces no result rows.
  override def output: Seq[Attribute] = Seq.empty

  override protected def run(): Seq[InternalRow] = {
    // Partition-level stats are not implemented here. Fail loudly rather than
    // silently recording whole-table stats for ANALYZE ... PARTITION(...).
    if (partitionSpec.nonEmpty) {
      throw new UnsupportedOperationException(
        "ANALYZE TABLE ... PARTITION is not supported for v2 file tables yet")
    }

    // Refresh the file listing so sizeInBytes reflects the current data.
    table.fileIndex.refresh()
    val totalSize = table.fileIndex.sizeInBytes

    val changes = scala.collection.mutable.ArrayBuffer(
      TableChange.setProperty("spark.sql.statistics.totalSize", totalSize.toString))

    if (!noScan) {
      // Full scan of the table for an exact row count (skipped under NOSCAN).
      val relation = DataSourceV2Relation.create(table, Some(catalog), Some(ident))
      val df = session.internalCreateDataFrame(
        session.sessionState.executePlan(relation).toRdd, relation.schema)
      changes += TableChange.setProperty(
        "spark.sql.statistics.numRows", df.count().toString)
    }

    catalog.alterTable(ident, changes.toSeq: _*)
    Seq.empty
  }
}

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -530,8 +530,26 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat
530530
case ShowTableProperties(rt: ResolvedTable, propertyKey, output) =>
531531
ShowTablePropertiesExec(output, rt.table, rt.name, propertyKey) :: Nil
532532

533-
case AnalyzeTable(_: ResolvedTable, _, _) | AnalyzeColumn(_: ResolvedTable, _, _) =>
534-
throw QueryCompilationErrors.analyzeTableNotSupportedForV2TablesError()
533+
case AnalyzeTable(
534+
ResolvedTable(catalog, ident,
535+
ft: FileTable, _),
536+
partitionSpec, noScan) =>
537+
AnalyzeTableExec(
538+
catalog, ident, ft,
539+
partitionSpec, noScan) :: Nil
540+
541+
case AnalyzeColumn(
542+
ResolvedTable(catalog, ident,
543+
ft: FileTable, _),
544+
columnNames, allColumns) =>
545+
AnalyzeColumnExec(
546+
catalog, ident, ft,
547+
columnNames, allColumns) :: Nil
548+
549+
case AnalyzeTable(_: ResolvedTable, _, _) |
550+
AnalyzeColumn(_: ResolvedTable, _, _) =>
551+
throw QueryCompilationErrors
552+
.analyzeTableNotSupportedForV2TablesError()
535553

536554
case AddPartitions(
537555
r @ ResolvedTable(_, _, table: SupportsPartitionManagement, _), parts, ignoreIfExists) =>

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import org.apache.spark.sql.internal.{SessionStateHelper, SQLConf}
3737
import org.apache.spark.sql.internal.connector.SupportsMetadata
3838
import org.apache.spark.sql.sources.Filter
3939
import org.apache.spark.sql.types.StructType
40+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
4041
import org.apache.spark.util.Utils
4142

4243
trait FileScan extends Scan
@@ -68,6 +69,8 @@ trait FileScan extends Scan
6869
*/
6970
def readPartitionSchema: StructType
7071

72+
def options: CaseInsensitiveStringMap
73+
7174
/**
7275
* Returns the filters that can be use for partition pruning
7376
*/
@@ -197,10 +200,22 @@ trait FileScan extends Scan
197200
OptionalLong.of(size)
198201
}
199202

override def numRows(): OptionalLong = {
  // Prefer the row count persisted by ANALYZE TABLE, when one is available.
  storedNumRows.fold(OptionalLong.empty())(OptionalLong.of)
}
201209
}
202210
}
203211

/**
 * Row count persisted by ANALYZE TABLE, if available.
 * Injected via FileTable.mergedOptions as the `__numRows` option.
 *
 * Uses `toLongOption` so that a malformed (possibly user-supplied) value is
 * ignored instead of throwing NumberFormatException during planning.
 */
protected def storedNumRows: Option[Long] =
  Option(options.get("__numRows")).flatMap(_.toLongOption)
218+
204219
override def toBatch: Batch = this
205220

206221
override def readSchema(): StructType =

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,16 @@ abstract class FileTable(
273273
* @return
274274
*/
275275
protected def mergedOptions(options: CaseInsensitiveStringMap): CaseInsensitiveStringMap = {
  // Scan-time options take precedence over the table-level options.
  val combined = this.options.asCaseSensitiveMap().asScala ++
    options.asCaseSensitiveMap().asScala
  // Propagate the catalog-stored row count (written by ANALYZE TABLE) so that
  // FileScan.estimateStatistics() can pick it up via the `__numRows` option.
  val rowCountEntry = catalogTable.flatMap(_.stats).flatMap(_.rowCount)
    .map(rows => "__numRows" -> rows.toString)
  new CaseInsensitiveStringMap((combined ++ rowCountEntry).asJava)
}
280287

281288
/**

0 commit comments

Comments
 (0)