Skip to content

Commit 806b66c

Browse files
authored
refactor!: New mod transforms for expression and schema transforms (delta-io#2077)
## What changes are proposed in this pull request?

Create a new top-level `transforms` module to host all things related to schema and expression transforms (which become sub-modules). Move `SchemaTransform` and `ExpressionTransform`, plus associated tests.

NOTE: The vast majority of "changes" in this PR are really just code movement that git/github can't show cleanly (they show as big blocks of deleted and added code). The rest is mod declaration and import churn.

### This PR affects the following public APIs

Module names changed/moved. No API signature or behavior changes.

## How was this change tested?

Pure code movement. Compilation suffices.
1 parent 6065e5a commit 806b66c

18 files changed

Lines changed: 439 additions & 408 deletions

File tree

CLAUDE.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,9 @@ version. From it you build a `Scan` (reads) or `Transaction` (writes).
8181
assembles commit actions, enforces protocol compliance, delegates atomic commit to a
8282
`Committer`.
8383

84-
**Engine trait:** four handlers (`StorageHandler`, `JsonHandler`, `ParquetHandler`,
85-
`EvaluationHandler`). `DefaultEngine` lives in `kernel/src/engine/default/`.
84+
**Engine trait:** five handlers (`StorageHandler`, `JsonHandler`, `ParquetHandler`,
85+
`EvaluationHandler`, optional `MetricsReporter`). `DefaultEngine` lives in
86+
`kernel/src/engine/default/`.
8687

8788
**EngineData:** opaque columnar data interface. IMPORTANT: never access `EngineData` columns
8889
directly -- always use the visitor pattern (`visit_rows` with typed `GetData` accessors).
@@ -158,6 +159,8 @@ Keep this list updated when new protocol features are added to kernel.
158159
- **Column mapping:** Physical column names can differ from logical names. Always use
159160
the schema from `Snapshot::schema()` for user data columns. Metadata/system schema
160161
column names (defined by the protocol) are not subject to column mapping.
162+
- **Transforms:** Generic recursive schema and expression transform traits and helpers
163+
are in `kernel/src/transforms/`.
161164

162165
## Code Style / Documentation
163166

CLAUDE/architecture.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ all returned batches -- the engine may split a single file across multiple batch
127127
- `kernel/src/schema/` -- `StructType`/`StructField`/`DataType`, projections
128128
- `kernel/src/expressions/` -- expression AST (`Expression`, `Predicate`, `Scalar`),
129129
`column_expr!` macro
130+
- `kernel/src/transforms/` -- generic recursive transforms (`ExpressionTransform`,
131+
`SchemaTransform`)
130132
- `kernel/src/checkpoint/` -- checkpoint writing (V1 and V2 single-file classic-named)
131133
- `kernel/src/table_configuration.rs` -- table metadata, properties, feature management
132134
- `kernel/src/table_features/` -- protocol feature definitions, `TableFeature` enum

kernel/src/engine/arrow_conversion.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -357,9 +357,9 @@ mod tests {
357357
use crate::engine::arrow_data::unshredded_variant_arrow_type;
358358
use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY;
359359
use crate::schema::{
360-
ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, SchemaTransform,
361-
StructField, StructType,
360+
ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, StructField, StructType,
362361
};
362+
use crate::transforms::SchemaTransform;
363363
use crate::DeltaResult;
364364
use std::collections::HashMap;
365365

@@ -410,7 +410,7 @@ mod tests {
410410
}
411411
}
412412

413-
impl<'a> crate::schema::SchemaTransform<'a> for FieldIdCollector {
413+
impl<'a> SchemaTransform<'a> for FieldIdCollector {
414414
fn transform_struct_field(
415415
&mut self,
416416
field: &'a StructField,

kernel/src/expressions/literal_expression_transform.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ use std::ops::Deref as _;
77
use tracing::debug;
88

99
use crate::expressions::{Expression, Scalar};
10-
use crate::schema::{
11-
ArrayType, DataType, MapType, PrimitiveType, SchemaTransform, StructField, StructType,
12-
};
10+
use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType};
11+
use crate::transforms::SchemaTransform;
1312
use crate::DeltaResult;
1413

1514
/// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values

kernel/src/expressions/mod.rs

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,18 @@ pub use self::column_names::{
1313
ColumnName,
1414
};
1515
pub use self::scalars::{ArrayData, DecimalData, MapData, Scalar, StructData};
16-
use self::transforms::ExpressionTransform;
1716
use crate::kernel_predicates::{
1817
DirectDataSkippingPredicateEvaluator, DirectPredicateEvaluator,
1918
IndirectDataSkippingPredicateEvaluator,
2019
};
2120
use crate::schema::SchemaRef;
21+
use crate::transforms::ExpressionTransform;
2222
use crate::{DataType, DeltaResult, DynPartialEq};
2323

2424
mod column_names;
2525
pub(crate) mod literal_expression_transform;
2626
pub(crate) use literal_expression_transform::literal_expression_transform;
2727
mod scalars;
28-
pub mod transforms;
2928

3029
pub type ExpressionRef = std::sync::Arc<Expression>;
3130
pub type PredicateRef = std::sync::Arc<Predicate>;
@@ -286,7 +285,10 @@ where
286285
}
287286

288287
impl OpaquePredicate {
289-
fn new(op: OpaquePredicateOpRef, exprs: impl IntoIterator<Item = Expression>) -> Self {
288+
pub(crate) fn new(
289+
op: OpaquePredicateOpRef,
290+
exprs: impl IntoIterator<Item = Expression>,
291+
) -> Self {
290292
let exprs = exprs.into_iter().collect();
291293
Self { op, exprs }
292294
}
@@ -305,7 +307,10 @@ pub struct OpaqueExpression {
305307
}
306308

307309
impl OpaqueExpression {
308-
fn new(op: OpaqueExpressionOpRef, exprs: impl IntoIterator<Item = Expression>) -> Self {
310+
pub(crate) fn new(
311+
op: OpaqueExpressionOpRef,
312+
exprs: impl IntoIterator<Item = Expression>,
313+
) -> Self {
309314
let exprs = exprs.into_iter().collect();
310315
Self { op, exprs }
311316
}
@@ -545,21 +550,21 @@ impl JunctionPredicateOp {
545550
}
546551

547552
impl UnaryExpression {
548-
fn new(op: UnaryExpressionOp, expr: impl Into<Expression>) -> Self {
553+
pub(crate) fn new(op: UnaryExpressionOp, expr: impl Into<Expression>) -> Self {
549554
let expr = Box::new(expr.into());
550555
Self { op, expr }
551556
}
552557
}
553558

554559
impl UnaryPredicate {
555-
fn new(op: UnaryPredicateOp, expr: impl Into<Expression>) -> Self {
560+
pub(crate) fn new(op: UnaryPredicateOp, expr: impl Into<Expression>) -> Self {
556561
let expr = Box::new(expr.into());
557562
Self { op, expr }
558563
}
559564
}
560565

561566
impl BinaryExpression {
562-
fn new(
567+
pub(crate) fn new(
563568
op: BinaryExpressionOp,
564569
left: impl Into<Expression>,
565570
right: impl Into<Expression>,
@@ -571,7 +576,7 @@ impl BinaryExpression {
571576
}
572577

573578
impl BinaryPredicate {
574-
fn new(
579+
pub(crate) fn new(
575580
op: BinaryPredicateOp,
576581
left: impl Into<Expression>,
577582
right: impl Into<Expression>,
@@ -583,7 +588,7 @@ impl BinaryPredicate {
583588
}
584589

585590
impl VariadicExpression {
586-
fn new(
591+
pub(crate) fn new(
587592
op: VariadicExpressionOp,
588593
exprs: impl IntoIterator<Item = impl Into<Expression>>,
589594
) -> Self {
@@ -593,7 +598,7 @@ impl VariadicExpression {
593598
}
594599

595600
impl ParseJsonExpression {
596-
fn new(json_expr: impl Into<Expression>, output_schema: SchemaRef) -> Self {
601+
pub(crate) fn new(json_expr: impl Into<Expression>, output_schema: SchemaRef) -> Self {
597602
Self {
598603
json_expr: Box::new(json_expr.into()),
599604
output_schema,
@@ -619,15 +624,15 @@ pub struct MapToStructExpression {
619624
}
620625

621626
impl MapToStructExpression {
622-
fn new(map_expr: impl Into<Expression>) -> Self {
627+
pub(crate) fn new(map_expr: impl Into<Expression>) -> Self {
623628
Self {
624629
map_expr: Box::new(map_expr.into()),
625630
}
626631
}
627632
}
628633

629634
impl JunctionPredicate {
630-
fn new(op: JunctionPredicateOp, preds: Vec<Predicate>) -> Self {
635+
pub(crate) fn new(op: JunctionPredicateOp, preds: Vec<Predicate>) -> Self {
631636
Self { op, preds }
632637
}
633638
}

kernel/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ pub mod table_configuration;
108108
pub mod table_features;
109109
pub mod table_properties;
110110
pub mod transaction;
111+
pub mod transforms;
111112

112113
pub use log_path::LogPath;
113114

kernel/src/scan/data_skipping/stats_schema/mod.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ use std::borrow::Cow;
66
use std::sync::Arc;
77

88
use crate::schema::{
9-
ArrayType, ColumnName, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform,
10-
StructField, StructType,
9+
ArrayType, ColumnName, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField,
10+
StructType,
1111
};
12+
use crate::transforms::SchemaTransform;
1213
use crate::{DeltaResult, Error};
1314

1415
use column_filter::StatsColumnFilter;

kernel/src/scan/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ use crate::actions::deletion_vector::{
1616
};
1717
use crate::actions::{get_commit_schema, Add, ADD_NAME, REMOVE_NAME};
1818
use crate::engine_data::FilteredEngineData;
19-
use crate::expressions::transforms::ExpressionTransform;
2019
use crate::expressions::{ColumnName, ExpressionRef, Predicate, PredicateRef, Scalar};
2120
use crate::kernel_predicates::{DefaultKernelPredicateEvaluator, EmptyColumnResolver};
2221
use crate::log_replay::{ActionsBatch, HasSelectionVector};
@@ -27,10 +26,11 @@ use crate::scan::log_replay::ScanLogReplayProcessor;
2726
use crate::scan::log_replay::{BASE_ROW_ID_NAME, CLUSTERING_PROVIDER_NAME};
2827
use crate::scan::state_info::StateInfo;
2928
use crate::schema::{
30-
ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField,
31-
StructType, ToSchema as _,
29+
ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField, StructType,
30+
ToSchema as _,
3231
};
3332
use crate::table_features::{ColumnMappingMode, Operation};
33+
use crate::transforms::{ExpressionTransform, SchemaTransform};
3434
use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, SnapshotRef, Version};
3535

3636
use self::log_replay::scan_action_iter;

0 commit comments

Comments
 (0)