diff --git a/vortex-array/src/expr/exprs/like.rs b/vortex-array/src/expr/exprs/like.rs index 4c876db00a5..5ad5b408ec7 100644 --- a/vortex-array/src/expr/exprs/like.rs +++ b/vortex-array/src/expr/exprs/like.rs @@ -9,6 +9,7 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_proto::expr as pb; +use vortex_scalar::StringLike; use crate::ArrayRef; use crate::compute::LikeOptions; @@ -19,9 +20,16 @@ use crate::expr::ExecutionArgs; use crate::expr::ExecutionResult; use crate::expr::ExprId; use crate::expr::Expression; +use crate::expr::Literal; +use crate::expr::StatsCatalog; use crate::expr::VTable; use crate::expr::VTableExt; use crate::expr::and; +use crate::expr::gt; +use crate::expr::gt_eq; +use crate::expr::lit; +use crate::expr::lt; +use crate::expr::or; /// Expression that performs SQL LIKE pattern matching. pub struct Like; @@ -127,6 +135,67 @@ impl VTable for Like { fn is_null_sensitive(&self, _instance: &Self::Options) -> bool { false } + + fn stat_falsification( + &self, + like_opts: &LikeOptions, + expr: &Expression, + catalog: &dyn StatsCatalog, + ) -> Option { + // Attempt min/max pruning for LIKE 'exact' or LIKE 'prefix%'; returns None when no pruning predicate can be built. + + // Don't attempt to handle ilike or negated like + if like_opts.negated || like_opts.case_insensitive { + return None; + } + + // Extract the pattern out. NOTE(review): assumes child(1) is always a literal - confirm `as_` cannot panic for a non-literal (e.g. column) pattern. + let pat = expr.child(1).as_::(); + + // LIKE NULL is nonsensical, don't try to handle it + let pat_str = pat.as_utf8().value()?; + + let src = expr.child(0).clone(); + let src_min = src.stat_min(catalog)?; + let src_max = src.stat_max(catalog)?; + + match LikeVariant::from_str(&pat_str)? 
{ + LikeVariant::Exact(text) => { + // col LIKE 'exact' can be pruned when: col.min > 'exact' || col.max < 'exact' + Some(or(gt(src_min, lit(text)), lt(src_max, lit(text)))) + } + LikeVariant::Prefix(prefix) => { + // col LIKE 'prefix%' can be pruned when: col.max < 'prefix' || col.min >= 'prefiy' (prefix with its last char incremented) + let succ = prefix.to_string().increment().ok()?; + + Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix)))) + } + } + } +} + +/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate. +#[derive(Debug, PartialEq)] +enum LikeVariant<'a> { + Exact(&'a str), + Prefix(&'a str), +} + +impl<'a> LikeVariant<'a> { + /// Parse a LIKE pattern string into its relevant variant: no wildcard is `Exact`, otherwise the text before the first `%` or `_` is the `Prefix`. NOTE(review): a backslash before a wildcard is not treated specially and stays in the prefix - confirm the LIKE kernel has no escape syntax. + fn from_str(string: &str) -> Option> { + let Some(wildcard_pos) = string.find(['%', '_']) else { + return Some(LikeVariant::Exact(string)); + }; + + // Can't handle wildcard in the front. + if wildcard_pos == 0 { + return None; + } + + let prefix = &string[..wildcard_pos]; + Some(LikeVariant::Prefix(prefix)) + } } pub fn like(child: Expression, pattern: Expression) -> Expression { @@ -176,12 +245,17 @@ mod tests { use crate::ToCanonical; use crate::arrays::BoolArray; + use crate::expr::col; use crate::expr::exprs::get_item::get_item; + use crate::expr::exprs::like::LikeVariant; use crate::expr::exprs::like::like; use crate::expr::exprs::like::not_ilike; use crate::expr::exprs::literal::lit; use crate::expr::exprs::not::not; use crate::expr::exprs::root::root; + use crate::expr::ilike; + use crate::expr::not_like; + use crate::expr::pruning::pruning_expr::TrackingStatsCatalog; #[test] fn invert_booleans() { @@ -217,4 +291,66 @@ mod tests { let expr2 = not_ilike(root(), lit("test*")); assert_eq!(expr2.to_string(), "$ not ilike \"test*\""); } + + #[test] + fn test_like_variant() { + // Supported patterns + assert_eq!( + LikeVariant::from_str("simple"), + Some(LikeVariant::Exact("simple")) + ); + assert_eq!( + LikeVariant::from_str("prefix%"), + Some(LikeVariant::Prefix("prefix")) + ); + assert_eq!( 
LikeVariant::from_str("first%rest_stuff"), + Some(LikeVariant::Prefix("first")) + ); + + // Unsupported patterns + assert_eq!(LikeVariant::from_str("%suffix"), None); + assert_eq!(LikeVariant::from_str("_pattern"), None); + } + + #[test] + fn test_like_pushdown() { + // Test that LIKE prefix and exactness filters can be pushed down into stats filtering + // at scan time. + let catalog = TrackingStatsCatalog::default(); + + let pruning_expr = like(col("a"), lit("prefix%")) + .stat_falsification(&catalog) + .expect("LIKE stat falsification"); + + insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#); + + // Multiple wildcards + let pruning_expr = like(col("a"), lit("pref%ix%")) + .stat_falsification(&catalog) + .expect("LIKE stat falsification"); + insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#); + + let pruning_expr = like(col("a"), lit("pref_ix_")) + .stat_falsification(&catalog) + .expect("LIKE stat falsification"); + insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#); + + // Exact match + let pruning_expr = like(col("a"), lit("exactly")) + .stat_falsification(&catalog) + .expect("LIKE stat falsification"); + insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#); + + // Suffix search skips pushdown + let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog); + assert_eq!(pruning_expr, None); + + // NOT LIKE, ILIKE not supported currently + assert_eq!( + None, + not_like(col("a"), lit("a")).stat_falsification(&catalog) + ); + assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog)); + } } diff --git a/vortex-array/src/expr/exprs/mod.rs b/vortex-array/src/expr/exprs/mod.rs index c606b53f5a0..145d225bcae 100644 --- a/vortex-array/src/expr/exprs/mod.rs +++ b/vortex-array/src/expr/exprs/mod.rs @@ -17,7 +17,6 @@ pub(crate) mod operators; pub(crate) mod pack; pub(crate) mod root; 
pub(crate) mod select; - pub use between::*; pub use binary::*; pub use cast::*; diff --git a/vortex-array/src/expr/pruning/mod.rs b/vortex-array/src/expr/pruning/mod.rs index 404f0aa8563..2446a27fcb4 100644 --- a/vortex-array/src/expr/pruning/mod.rs +++ b/vortex-array/src/expr/pruning/mod.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -mod pruning_expr; +pub(crate) mod pruning_expr; mod relation; pub use pruning_expr::RequiredStats; diff --git a/vortex-array/src/expr/pruning/pruning_expr.rs b/vortex-array/src/expr/pruning/pruning_expr.rs index d47c436632b..1e2e70b0c7f 100644 --- a/vortex-array/src/expr/pruning/pruning_expr.rs +++ b/vortex-array/src/expr/pruning/pruning_expr.rs @@ -23,7 +23,7 @@ pub type RequiredStats = Relation; // A catalog that return a stat column whenever it is required, tracking all accessed // stats and returning them later. #[derive(Default)] -struct TrackingStatsCatalog { +pub(crate) struct TrackingStatsCatalog { usage: RefCell>, } @@ -37,7 +37,7 @@ impl TrackingStatsCatalog { // A catalog that return a stat column if it exists in the given scope. 
struct ScopeStatsCatalog<'a> { - any_catalog: TrackingStatsCatalog, + inner: TrackingStatsCatalog, available_stats: &'a FieldPathSet, } @@ -46,7 +46,7 @@ impl StatsCatalog for ScopeStatsCatalog<'_> { let stat_path = field_path.clone().push(stat.name()); if self.available_stats.contains(&stat_path) { - self.any_catalog.stats_ref(field_path, stat) + self.inner.stats_ref(field_path, stat) } else { None } @@ -93,7 +93,7 @@ pub fn checked_pruning_expr( available_stats: &FieldPathSet, ) -> Option<(Expression, RequiredStats)> { let catalog = ScopeStatsCatalog { - any_catalog: Default::default(), + inner: Default::default(), available_stats, }; @@ -101,7 +101,7 @@ pub fn checked_pruning_expr( // TODO(joe): filter access by used exprs let mut relation: Relation = Relation::new(); - for ((field_path, stat), _) in catalog.any_catalog.into_usages() { + for ((field_path, stat), _) in catalog.inner.into_usages() { relation.insert(field_path, stat) } diff --git a/vortex-scalar/src/binary.rs b/vortex-scalar/src/binary.rs index 5c8c8060b2b..79d8b352ca7 100644 --- a/vortex-scalar/src/binary.rs +++ b/vortex-scalar/src/binary.rs @@ -95,7 +95,8 @@ impl<'a> BinaryScalar<'a> { self.value.as_ref().map(|v| v.as_ref()) } - /// Constructs a value at most `max_length` in size that's greater than this value. + /// Constructs the next scalar at most `max_length` bytes that's lexicographically greater than + /// this. /// /// Returns None if constructing a greater value would overflow. pub fn upper_bound(self, max_length: usize) -> Option { diff --git a/vortex-scalar/src/utf8.rs b/vortex-scalar/src/utf8.rs index 8b7b164ecb0..c274ac6b735 100644 --- a/vortex-scalar/src/utf8.rs +++ b/vortex-scalar/src/utf8.rs @@ -22,6 +22,72 @@ use crate::InnerScalarValue; use crate::Scalar; use crate::ScalarValue; +/// Types that can hold a valid UTF-8 string. +pub trait StringLike: private::Sealed + Sized { + /// Replace the last codepoint in the string with the next codepoint. 
+ /// + /// This operation will attempt to reuse the original memory. + /// + /// If incrementing the last char fails, or if the string is empty, + /// we return an Err with the original unmodified string. + fn increment(self) -> Result; +} + +mod private { + use vortex_buffer::BufferString; + + use crate::StringLike; + + pub trait Sealed {} + + impl Sealed for String {} + + impl StringLike for String { + fn increment(mut self) -> Result { + let Some(last_char) = self.pop() else { + return Err(self); // empty string has no last char to increment; Err per the trait contract + }; + + if let Some(next_char) = char::from_u32(last_char as u32 + 1) { + self.push(next_char); + Ok(self) + } else { + // Return the original string + self.push(last_char); + Err(self) + } + } + } + + impl Sealed for BufferString {} + + impl StringLike for BufferString { + #[allow(clippy::unwrap_in_result, clippy::expect_used)] + fn increment(self) -> Result { + if self.is_empty() { + return Err(self); + } + + // Locate the last char and its starting byte offset. + let (last_idx, last_char) = self.char_indices().last().expect("non-empty"); + if let Some(next_char) = char::from_u32(last_char as u32 + 1) + && next_char.len_utf8() == last_char.len_utf8() + { + // Because the next char has the same byte width as the last char, we can overwrite + // the memory directly. + let mut bytes = self.into_inner().into_mut(); + next_char.encode_utf8(&mut bytes.as_mut()[last_idx..]); + + // SAFETY: we overwrite the last valid char with new valid char, so + // the buffer continues to hold valid UTF-8 data. + unsafe { Ok(BufferString::new_unchecked(bytes.freeze())) } + } else { + Err(self) + } + } + } +} + /// A scalar value representing a UTF-8 encoded string. /// /// This type provides a view into a UTF-8 string scalar value, which can be either @@ -92,7 +158,8 @@ impl<'a> Utf8Scalar<'a> { self.value.as_ref().map(|v| v.as_ref()) } - /// Constructs a value at most `max_length` in size that's greater than this value. 
+ /// Constructs the next scalar at most `max_length` bytes that's lexicographically greater than + /// this. /// /// Returns None if constructing a greater value would overflow. pub fn upper_bound(self, max_length: usize) -> Option { @@ -102,29 +169,16 @@ impl<'a> Utf8Scalar<'a> { .rfind(|p| value.is_char_boundary(*p)) .vortex_expect("Failed to find utf8 character boundary"); - let utf8_mut = value - .get(..utf8_split_pos) - .vortex_expect("Slicing with existing index"); - - for (idx, original_char) in utf8_mut.char_indices().rev() { - let original_len = original_char.len_utf8(); - if let Some(next_char) = char::from_u32(original_char as u32 + 1) { - // do not allow increasing byte width of incremented char - if next_char.len_utf8() == original_len { - let sliced = value.inner().slice(0..idx + original_len); - drop(value); - let mut result = sliced.into_mut(); - next_char.encode_utf8(&mut result[idx..]); - return Some(Self { - dtype: self.dtype, - value: Some(Arc::new(unsafe { - BufferString::new_unchecked(result.freeze()) - })), - }); - } - } - } - None + let sliced = value.inner().slice(..utf8_split_pos); + drop(value); + + // SAFETY: we slice to a char boundary so the sliced range contains valid UTF-8. + let sliced_buf = unsafe { BufferString::new_unchecked(sliced) }; + let incremented = sliced_buf.increment().ok()?; + Some(Self { + dtype: self.dtype, + value: Some(Arc::new(incremented)), + }) } else { Some(Self { dtype: self.dtype, @@ -382,6 +436,7 @@ mod tests { #[test] fn upper_bound_overflow() { let utf8 = Scalar::utf8("🂑🂒🂓", Nullability::NonNullable); + assert!( Utf8Scalar::try_from(&utf8) .vortex_expect("utf8 scalar conversion should succeed")