From 630ceafba828ad4672e33a80c449474aa2566c19 Mon Sep 17 00:00:00 2001 From: "mintlify[bot]" <109931778+mintlify[bot]@users.noreply.github.com> Date: Tue, 23 Jun 2026 14:04:53 +0000 Subject: [PATCH] docs: document Rust blob v2 read APIs for multimodal tables --- docs/snippets/multimodal.mdx | 60 ++++++++++++++++++++++++++++++++++++ docs/tables/multimodal.mdx | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/docs/snippets/multimodal.mdx b/docs/snippets/multimodal.mdx index 24c66af2..3e074e2a 100644 --- a/docs/snippets/multimodal.mdx +++ b/docs/snippets/multimodal.mdx @@ -52,3 +52,63 @@ export const RsProcessResults = "for batch in &results {\n let filenames = ba export const RsSearchData = "let query_vector = vec![0.1_f32; 128];\nlet results = table\n .query()\n .nearest_to(query_vector)\n .unwrap()\n .limit(1)\n .execute()\n .await\n .unwrap()\n .try_collect::<Vec<_>>()\n .await\n .unwrap();\n"; +export const RsBlobV2Schema = `use std::sync::Arc; +use arrow_schema::{DataType, Field, Schema}; +use lancedb::blob; + +let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + // Declare a blob v2 column. Pass nullable: true to allow null payloads. + blob("image", true), +])); +`; + +export const RsBlobV2Fetch = `use arrow_array::UInt64Array; +use futures::TryStreamExt; +use lancedb::query::{ExecutableQuery, QueryBase}; + +// 1. Run any query, asking LanceDB to return the stable _rowid column. +let mut stream = table + .query() + .with_row_id() + .only_if("label = 'red'") + .limit(32) + .execute() + .await?; + +// 2. Collect row ids from the result. +let mut row_ids: Vec<u64> = Vec::new(); +while let Some(batch) = stream.try_next().await? { + let ids = batch + .column_by_name("_rowid") + .unwrap() + .as_any() + .downcast_ref::<UInt64Array>() + .unwrap(); + row_ids.extend(ids.values()); +} + +// 3. Materialize blob bytes. Output length and order match row_ids. +// Null and zero-length rows come back as null. 
+let images = table.fetch_blobs("image", &row_ids).await?; +for i in 0..images.len() { + if images.is_null(i) { + continue; + } + let bytes = images.value(i); + println!("row {} -> {} bytes", row_ids[i], bytes.len()); +} +`; + +export const RsBlobV2FetchFiles = `// Open one lazy handle per row id. No bytes are read yet. +let handles = table.fetch_blob_files("image", &row_ids).await?; + +// Read only the rows you actually need. +for (row_id, handle) in row_ids.iter().zip(handles.iter()) { + let Some(handle) = handle.as_ref() else { + continue; // null row + }; + let bytes = handle.read().await?; + println!("row {} -> {} bytes", row_id, bytes.len()); +} +`; diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx index 95670fc0..aa8de166 100644 --- a/docs/tables/multimodal.mdx +++ b/docs/tables/multimodal.mdx @@ -33,6 +33,9 @@ import { RsBlobApiIngest as RsBlobApiIngest, PyBlobApiToPandas as BlobApiToPandas, PyQueryToPandasKwargs as QueryToPandasKwargs, + RsBlobV2Schema, + RsBlobV2Fetch, + RsBlobV2FetchFiles, } from '/snippets/multimodal.mdx'; LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases. @@ -224,6 +227,60 @@ Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`. +## Blob v2 columns (Rust) + +The Rust crate exposes blob v2 columns — a successor to the `lance-encoding:blob` metadata +flag — together with read APIs that materialize bytes on demand. Use blob v2 when you want +queries to return small descriptors instead of full payloads, then pull the bytes for only +the rows you need. + +Blob v2 columns require Lance file format 2.2 or newer and stable row IDs at table creation +time. 
They are currently available in the Rust SDK only; remote tables return `NotSupported`. + +### Declare a blob v2 column + +Use [`lancedb::blob`](https://docs.rs/lancedb/latest/lancedb/fn.blob.html) to create the field. +A blob column may be top-level or nested inside a struct or list — nested blobs are addressed +by a dotted path (for example `info.blob`) when reading. + + + + {RsBlobV2Schema} + + + +### Materialize bytes for selected rows + +Run a query with `with_row_id()` to surface the `_rowid` column, then pass those IDs to +`fetch_blobs` to pull the raw payloads. The returned `LargeBinaryArray` matches the input +length and order; null and zero-length rows come back as null. + + + + {RsBlobV2Fetch} + + + +Use `fetch_blobs` when the selection fits comfortably in memory and you want one round trip. +For larger payloads, switch to lazy handles. + +### Open lazy blob handles + +`fetch_blob_files` returns one [`BlobFile`](https://docs.rs/lancedb/latest/lancedb/blob/struct.BlobFile.html) +handle per requested row ID. Bytes are not read from disk until you call `read()` (or a +ranged read) on the handle, so you can defer or skip I/O for rows you do not end up using. + + + + {RsBlobV2FetchFiles} + + + +Call `table.blob_columns().await?` to discover the blob v2 columns on a table — useful when +you ship generic tooling that should adapt to a table's schema. Multiple blob columns +(for example `image` and `thumbnail`) can be materialized independently with separate +`fetch_blobs` or `fetch_blob_files` calls. + ## Other modalities The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data: