Changes from all commits (63 commits)
44b4473
Begin examining how to best add structured array support to Zarr v3 d…
BrianMichell Nov 24, 2025
e6df164
Merge branch 'google:master' into v3_structs
BrianMichell Nov 24, 2025
187f424
Updates to have proper reads
BrianMichell Nov 24, 2025
c2e73cd
Local testing and examples
BrianMichell Nov 24, 2025
9e8ed94
Begin adding support for opening struct arrays as void and add suppor…
BrianMichell Nov 25, 2025
44c765e
Fix failing tests
BrianMichell Nov 25, 2025
547642d
Resolve issues with opening struct as void
BrianMichell Nov 25, 2025
2a4c3d8
Remove debug print
BrianMichell Nov 26, 2025
b0abb94
Add field for open as void
BrianMichell Dec 2, 2025
fff0a5b
Add a shim for new open_as_void flag open option
BrianMichell Dec 3, 2025
b6c24f9
Revert some formatting changes
BrianMichell Dec 3, 2025
488b160
revert gitignore changes
BrianMichell Dec 3, 2025
537d309
Merge pull request #1 from BrianMichell/v3_structs_compatibility
BrianMichell Dec 3, 2025
54941a0
V3 structs remove shim (#2)
BrianMichell Dec 3, 2025
c9f58f9
Fix structured fill value population
BrianMichell Dec 3, 2025
7655cfd
V3 examples merge (#3)
BrianMichell Dec 4, 2025
8c4c4ca
Remove vestigial example build
BrianMichell Dec 4, 2025
4b590f8
V3 structs fix fills (#4)
BrianMichell Dec 4, 2025
7691c83
Merge branch 'google:master' into v3_structs
BrianMichell Dec 4, 2025
c0082a0
Add new options to schema
BrianMichell Dec 4, 2025
9a46c82
Fix copyright header date
BrianMichell Dec 4, 2025
b9b5e41
Cleanup (#5)
BrianMichell Dec 4, 2025
d5f6201
Merge zarr3 structured dtype support
BrianMichell Jan 5, 2026
31e55ec
Remove default values
BrianMichell Jan 5, 2026
aad0ee0
Merge branch 'google:master' into v3_structs_and_void
BrianMichell Jan 23, 2026
89098f8
zarr3: Add mutual exclusivity validation for field and open_as_void
BrianMichell Jan 26, 2026
471aa1b
zarr3: Reject URL syntax when selected_field or open_as_void specified
BrianMichell Jan 26, 2026
34e52fe
zarr3: Preserve open_as_void flag in GetBoundSpecData for spec round-…
BrianMichell Jan 26, 2026
0db22e4
zarr3: Add open_as_void tests and fix BUILD dependency
BrianMichell Jan 26, 2026
5fadaf0
zarr3: Fix DecodeChunk and EncodeChunk for void access
BrianMichell Jan 26, 2026
a25dd7d
zarr3: Add read/write tests for open_as_void
BrianMichell Jan 26, 2026
7065b42
zarr3: Add compression test for open_as_void
BrianMichell Jan 26, 2026
b8daec0
zarr3: Add original_is_structured flag for void access
BrianMichell Jan 26, 2026
5dab237
zarr3: Mark structured type void access test as TODO
BrianMichell Jan 26, 2026
6a05640
zarr3: Fix GetDomain rank handling for void access
BrianMichell Jan 26, 2026
76b3002
zarr3: Mark structured type void access test as TODO
BrianMichell Jan 26, 2026
9794450
zarr3: Improve comments for void access rank handling
BrianMichell Jan 26, 2026
a027195
zarr3: Fix void access EncodeChunk to use original dtype
BrianMichell Jan 26, 2026
83a5199
zarr3: Add OpenAsVoidWriteWithCompression test
BrianMichell Jan 26, 2026
0d307c8
zarr3: Add GetSpecInfo rank tests for void access
BrianMichell Jan 26, 2026
5819c8a
zarr3: Add fill_value handling for void access
BrianMichell Jan 26, 2026
53ced58
zarr3: Add structured type support for void access
BrianMichell Jan 26, 2026
13cae40
zarr3: Enhance structured type tests and add GetSpecInfo test
BrianMichell Jan 26, 2026
cfea1dd
zarr3: Add OpenAsVoidIncompatibleMetadata test
BrianMichell Jan 26, 2026
dda05a8
zarr3: Add OpenAsVoidWithSharding test
BrianMichell Jan 26, 2026
d609dd8
zarr3: Fix schema.yml field/open_as_void documentation for consistenc…
BrianMichell Jan 26, 2026
5d849f6
zarr3: Add explicit implicit_lower_bounds in GetExternalToInternalTra…
BrianMichell Jan 26, 2026
1298bcb
zarr3: Add assertion in DecodeChunk for void access field count
BrianMichell Jan 26, 2026
e197bc6
Merge pull request #7 from BrianMichell/v3_open_as_void_validation
BrianMichell Jan 26, 2026
72968e8
zarr3: Add contiguity assertions for encode/decode chunk operations
BrianMichell Jan 26, 2026
14546a1
zarr3: Use CopyArray for safe structured type encode/decode
BrianMichell Jan 26, 2026
faf4d4f
zarr3: Use ComputeStrides utility for stride computation
BrianMichell Jan 26, 2026
f0a5dbc
zarr3: Use DimensionSet::UpTo and std::fill_n/std::copy_n utilities
BrianMichell Jan 26, 2026
ac98313
zarr3: Fix open_as_void with sharding for non-structured types
BrianMichell Jan 26, 2026
2d7f34c
zarr3: Remove redundant member variables in ZarrShardSubChunkCache
BrianMichell Jan 26, 2026
d139d87
Merge pull request #8 from BrianMichell/v3_review
BrianMichell Jan 26, 2026
a20a686
Fix includes.
BrianMichell Feb 3, 2026
e9ac828
Fix indentation
BrianMichell Feb 3, 2026
858e40d
Fix imports.
BrianMichell Feb 3, 2026
be1ab7c
friend inline the equality and inequality operator overloads.
BrianMichell Feb 3, 2026
24e16c2
Prefer absl::StrFormat over tensorstore::StrCat
BrianMichell Feb 3, 2026
f9c6750
Add return type annotation to lambdas.
BrianMichell Feb 3, 2026
6a773e2
Update key generation to handle grid indices safely by ensuring the s…
BrianMichell Feb 3, 2026
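
Editor's note: taken together, the commits above add an open-as-void mode to the zarr3 driver, letting a (possibly structured) array be opened as raw bytes with one extra trailing dimension of length bytes-per-element. A minimal usage sketch follows; the spec key "open_as_void" and its mutual exclusivity with "field" are inferred from the commit messages and may not match the final schema exactly:

// Hypothetical usage sketch (not part of this PR's test code). Assumes
// the option is exposed as "open_as_void" in the zarr3 driver spec.
#include "tensorstore/open.h"

auto store_result =
    tensorstore::Open({{"driver", "zarr3"},
                       {"kvstore", {{"driver", "file"}, {"path", "/tmp/arr/"}}},
                       // Mutually exclusive with "field", per the
                       // validation commit above.
                       {"open_as_void", true}})
        .result();
// On success, the store has dtype byte and rank original_rank + 1,
// with the last dimension equal to bytes_per_element.
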
24 changes: 22 additions & 2 deletions tensorstore/driver/zarr3/BUILD
@@ -94,8 +94,8 @@ tensorstore_cc_library(

tensorstore_cc_library(
name = "metadata",
srcs = ["metadata.cc"],
hdrs = ["metadata.h"],
srcs = ["metadata.cc", "dtype.cc"],
hdrs = ["metadata.h", "dtype.h"],
deps = [
":default_nan",
":name_configuration_json_binder",
@@ -145,6 +145,23 @@ tensorstore_cc_library(
],
)

tensorstore_cc_test(
name = "dtype_test",
size = "small",
srcs = ["dtype_test.cc"],
deps = [
":metadata",
"//tensorstore:data_type",
"//tensorstore:index",
"//tensorstore/internal/testing:json_gtest",
"//tensorstore/util:status_testutil",
"//tensorstore/util:str_cat",
"@abseil-cpp//absl/status",
"@googletest//:gtest_main",
"@nlohmann_json//:json",
],
)

tensorstore_cc_test(
name = "driver_test",
size = "small",
@@ -204,10 +221,12 @@
srcs = ["chunk_cache.cc"],
hdrs = ["chunk_cache.h"],
deps = [
":metadata",
"//tensorstore:array",
"//tensorstore:array_storage_statistics",
"//tensorstore:batch",
"//tensorstore:box",
"//tensorstore:contiguous_layout",
"//tensorstore:index",
"//tensorstore:index_interval",
"//tensorstore:rank",
@@ -224,6 +243,7 @@
"//tensorstore/internal:intrusive_ptr",
"//tensorstore/internal:lexicographical_grid_index_key",
"//tensorstore/internal:regular_grid",
"//tensorstore:strided_layout",
"//tensorstore/internal:storage_statistics",
"//tensorstore/internal/cache",
"//tensorstore/internal/cache:chunk_cache",
240 changes: 224 additions & 16 deletions tensorstore/driver/zarr3/chunk_cache.cc
@@ -18,6 +18,7 @@

#include <algorithm>
#include <cassert>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
@@ -31,11 +32,14 @@
#include "tensorstore/array_storage_statistics.h"
#include "tensorstore/batch.h"
#include "tensorstore/box.h"
#include "tensorstore/contiguous_layout.h"
#include "tensorstore/data_type.h"
#include "tensorstore/driver/chunk.h"
#include "tensorstore/driver/chunk_receiver_utils.h"
#include "tensorstore/driver/read_request.h"
#include "tensorstore/driver/write_request.h"
#include "tensorstore/driver/zarr3/codec/codec.h"
#include "tensorstore/driver/zarr3/dtype.h"
#include "tensorstore/index.h"
#include "tensorstore/index_interval.h"
#include "tensorstore/index_space/index_transform.h"
@@ -51,6 +55,7 @@
#include "tensorstore/internal/meta/type_traits.h"
#include "tensorstore/internal/regular_grid.h"
#include "tensorstore/internal/storage_statistics.h"
#include "tensorstore/strided_layout.h"
#include "tensorstore/kvstore/driver.h"
#include "tensorstore/kvstore/key_range.h"
#include "tensorstore/kvstore/kvstore.h"
@@ -73,15 +78,23 @@ ZarrChunkCache::~ZarrChunkCache() = default;

ZarrLeafChunkCache::ZarrLeafChunkCache(
kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state,
-    internal::CachePool::WeakPtr /*data_cache_pool*/)
-    : Base(std::move(store)), codec_state_(std::move(codec_state)) {}
+    ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/,
+    bool open_as_void, bool original_is_structured, DataType original_dtype,
+    bool grid_has_void_dimension)
: Base(std::move(store)),
codec_state_(std::move(codec_state)),
dtype_(std::move(dtype)),
open_as_void_(open_as_void),
original_is_structured_(original_is_structured),
original_dtype_(original_dtype),
grid_has_void_dimension_(grid_has_void_dimension) {}

void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request,
AnyFlowReceiver<absl::Status, internal::ReadChunk,
IndexTransform<>>&& receiver) {
return internal::ChunkCache::Read(
{static_cast<internal::DriverReadRequest&&>(request),
-       /*component_index=*/0, request.staleness_bound,
+       request.component_index, request.staleness_bound,
request.fill_missing_data_reads},
std::move(receiver));
}
@@ -92,7 +105,7 @@ void ZarrLeafChunkCache::Write(
receiver) {
return internal::ChunkCache::Write(
{static_cast<internal::DriverWriteRequest&&>(request),
-       /*component_index=*/0, request.store_data_equal_to_fill_value},
+       request.component_index, request.store_data_equal_to_fill_value},
std::move(receiver));
}

@@ -149,19 +162,206 @@ std::string ZarrLeafChunkCache::GetChunkStorageKey(
Result<absl::InlinedVector<SharedArray<const void>, 1>>
ZarrLeafChunkCache::DecodeChunk(span<const Index> chunk_indices,
absl::Cord data) {
const size_t num_fields = dtype_.fields.size();
absl::InlinedVector<SharedArray<const void>, 1> field_arrays(num_fields);

// Special case: void access - decode and return as bytes.
//
// For non-structured types: codec was prepared for [chunk_shape] with
// original dtype. We decode to that shape then reinterpret as bytes.
//
// For structured types: codec was already prepared for
// [chunk_shape, bytes_per_elem] with byte dtype. Just decode directly.
if (open_as_void_) {
assert(num_fields == 1); // Void access uses a single synthesized field
const auto& void_component_shape = grid().components[0].shape();

if (original_is_structured_) {
// Structured types: codec already expects bytes with extra dimension.
// Just decode directly to the void component shape.
TENSORSTORE_ASSIGN_OR_RETURN(
field_arrays[0],
codec_state_->DecodeArray(void_component_shape, std::move(data)));
return field_arrays;
}

// Non-structured types: codec expects original dtype without extra
// dimension. Decode, then reinterpret as bytes.
//
// For top-level caches, grid().chunk_shape includes bytes dimension.
// For sub-chunk caches (inside sharding), grid() returns the sharding
// codec's sub_chunk_grid which doesn't have bytes dimension.
const Index bytes_per_element = dtype_.bytes_per_outer_element;
const auto& grid_chunk_shape = grid().chunk_shape;

std::vector<Index> original_chunk_shape;
if (grid_has_void_dimension_) {
// Strip the bytes dimension to get original shape
original_chunk_shape.assign(grid_chunk_shape.begin(),
grid_chunk_shape.end() - 1);
} else {
// Sub-chunk cache: grid shape is already the original shape
original_chunk_shape.assign(grid_chunk_shape.begin(),
grid_chunk_shape.end());
}

// Decode using original codec shape
TENSORSTORE_ASSIGN_OR_RETURN(
auto decoded_array,
codec_state_->DecodeArray(original_chunk_shape, std::move(data)));

// Verify decoded array is C-contiguous (codec chain should guarantee this)
assert(IsContiguousLayout(decoded_array.layout(), c_order,
decoded_array.dtype().size()));

// Build the void output shape: original_shape + [bytes_per_element]
std::vector<Index> void_output_shape = original_chunk_shape;
void_output_shape.push_back(bytes_per_element);

// Reinterpret the decoded array's bytes as [chunk_shape..., bytes_per_elem]
auto byte_array = AllocateArray(
void_output_shape, c_order, default_init,
dtype_v<tensorstore::dtypes::byte_t>);

// Copy the decoded data into the byte array; both buffers are
// C-contiguous (asserted above), so a flat memcpy is safe.
std::memcpy(byte_array.data(), decoded_array.data(),
decoded_array.num_elements() *
decoded_array.dtype().size());

field_arrays[0] = std::move(byte_array);
return field_arrays;
}

// For single non-structured field, decode directly
if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) {
TENSORSTORE_ASSIGN_OR_RETURN(
field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(),
std::move(data)));
return field_arrays;
}

// For structured types, decode byte array then extract fields
// Build decode shape: [chunk_dims..., bytes_per_outer_element]
const auto& chunk_shape = grid().chunk_shape;
std::vector<Index> decode_shape(chunk_shape.begin(), chunk_shape.end());
decode_shape.push_back(dtype_.bytes_per_outer_element);

TENSORSTORE_ASSIGN_OR_RETURN(
-      auto array,
-      codec_state_->DecodeArray(grid().components[0].shape(), std::move(data)));
-  absl::InlinedVector<SharedArray<const void>, 1> components;
-  components.push_back(std::move(array));
-  return components;
+      auto byte_array, codec_state_->DecodeArray(decode_shape, std::move(data)));

// Extract each field from the byte array.
// We create a strided view into the source that maps to each field's
// position within the interleaved struct layout, then use CopyArray which
// safely handles any layout differences via IterateOverArrays.
for (size_t field_i = 0; field_i < num_fields; ++field_i) {
const auto& field = dtype_.fields[field_i];
// Use the component's shape (from the grid) for the result array
const auto& component_shape = grid().components[field_i].shape();
auto result_array =
AllocateArray(component_shape, c_order, default_init, field.dtype);

// Build strides for the source view: each element is separated by
// bytes_per_outer_element (the struct size), not field_size.
std::vector<Index> src_byte_strides(chunk_shape.size());
ComputeStrides(c_order, dtype_.bytes_per_outer_element, chunk_shape,
src_byte_strides);

// Create source ArrayView pointing to this field's offset within
// the interleaved byte array, with strides that skip over other fields.
ArrayView<const void> src_field_view(
{static_cast<const void*>(
static_cast<const std::byte*>(byte_array.data()) + field.byte_offset),
field.dtype},
StridedLayoutView<>(chunk_shape, src_byte_strides));

// Use CopyArray which safely handles any layout differences
CopyArray(src_field_view, result_array);
field_arrays[field_i] = std::move(result_array);
}

return field_arrays;
}
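
Editor's note: to make the stride arithmetic in the field-extraction loop above concrete, here is a standalone sketch (not part of the diff). Consecutive elements of one field in an interleaved struct buffer are bytes_per_outer_element apart, starting at that field's byte_offset; the Element struct and ExtractX helper are hypothetical:

// Standalone illustration (hypothetical, not PR code): pull field `x`
// out of an interleaved array of structs, mirroring what the strided
// source view + CopyArray do above.
#include <cstddef>  // std::byte, offsetof
#include <cstdint>
#include <cstring>
#include <vector>

struct Element {    // stand-in for a structured zarr dtype
  float x;          // field 0: byte_offset == offsetof(Element, x)
  std::int32_t y;   // field 1
};

std::vector<float> ExtractX(const std::byte* bytes, std::size_t n) {
  std::vector<float> out(n);
  constexpr std::size_t kStride = sizeof(Element);       // bytes_per_outer_element
  constexpr std::size_t kOffset = offsetof(Element, x);  // field.byte_offset
  for (std::size_t i = 0; i < n; ++i) {
    std::memcpy(&out[i], bytes + i * kStride + kOffset, sizeof(float));
  }
  return out;
}
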

Result<absl::Cord> ZarrLeafChunkCache::EncodeChunk(
span<const Index> chunk_indices,
span<const SharedArray<const void>> component_arrays) {
-  assert(component_arrays.size() == 1);
-  return codec_state_->EncodeArray(component_arrays[0]);
const size_t num_fields = dtype_.fields.size();

// Special case: void access - encode bytes back to original format.
if (open_as_void_) {
assert(component_arrays.size() == 1);

if (original_is_structured_) {
// Structured types: codec already expects bytes with extra dimension.
return codec_state_->EncodeArray(component_arrays[0]);
}

// Non-structured types: reinterpret bytes as original dtype/shape.
const auto& byte_array = component_arrays[0];
const Index bytes_per_element = dtype_.bytes_per_outer_element;

// Build original chunk shape by stripping the bytes dimension
const auto& void_shape = byte_array.shape();
std::vector<Index> original_shape(void_shape.begin(), void_shape.end() - 1);

// Use the original dtype (stored during cache creation) for encoding.
// Create a view over the byte data with original dtype and layout.
// Use the aliasing constructor to share ownership with byte_array but
// interpret the data with the original dtype.
SharedArray<const void> encoded_array;
auto aliased_ptr = std::shared_ptr<const void>(
byte_array.pointer(), // Share ownership with byte_array
byte_array.data()); // But point to the raw data
encoded_array.element_pointer() = SharedElementPointer<const void>(
std::move(aliased_ptr), original_dtype_);
encoded_array.layout() = StridedLayout<>(c_order, bytes_per_element,
original_shape);

return codec_state_->EncodeArray(encoded_array);
}

// For single non-structured field, encode directly
if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) {
assert(component_arrays.size() == 1);
return codec_state_->EncodeArray(component_arrays[0]);
}

// For structured types, combine multiple field arrays into a single byte array
assert(component_arrays.size() == num_fields);

// Build encode shape: [chunk_dims..., bytes_per_outer_element]
const auto& chunk_shape = grid().chunk_shape;
std::vector<Index> encode_shape(chunk_shape.begin(), chunk_shape.end());
encode_shape.push_back(dtype_.bytes_per_outer_element);

// Allocate byte array for combined fields
auto byte_array = AllocateArray<std::byte>(encode_shape, c_order, value_init);

// Copy each field's data into the byte array at their respective offsets.
// We create a strided view into the destination that maps to each field's
// position within the interleaved struct layout, then use CopyArray which
// safely handles any source array strides via IterateOverArrays.
for (size_t field_i = 0; field_i < num_fields; ++field_i) {
const auto& field = dtype_.fields[field_i];
const auto& field_array = component_arrays[field_i];

// Build strides for the destination view: each element is separated by
// bytes_per_outer_element (the struct size), not field_size.
std::vector<Index> dest_byte_strides(chunk_shape.size());
ComputeStrides(c_order, dtype_.bytes_per_outer_element, chunk_shape,
dest_byte_strides);

// Create destination ArrayView pointing to this field's offset within
// the interleaved byte array, with strides that skip over other fields.
ArrayView<void> dest_field_view(
{static_cast<void*>(byte_array.data() + field.byte_offset), field.dtype},
StridedLayoutView<>(chunk_shape, dest_byte_strides));

// Use CopyArray which safely handles any source strides via IterateOverArrays
CopyArray(field_array, dest_field_view);
}

return codec_state_->EncodeArray(byte_array);
}
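
Editor's note: the void-access encode path above relies on the std::shared_ptr aliasing constructor, which shares ownership with an existing pointer while addressing different memory, so the byte buffer is re-viewed with the original dtype without a copy. A minimal standalone sketch (not part of the diff):

// Minimal sketch of the aliasing-constructor pattern (hypothetical,
// not PR code).
#include <cstddef>  // std::byte
#include <memory>
#include <vector>

int main() {
  auto owner = std::make_shared<std::vector<std::byte>>(64);
  // `alias` shares owner's control block but points at the raw
  // storage, so the buffer can be reinterpreted with a different
  // element type without copying.
  std::shared_ptr<const void> alias(owner, owner->data());
  return 0;  // the vector is freed once both pointers are gone
}
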

kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() {
@@ -170,9 +370,15 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() {

ZarrShardedChunkCache::ZarrShardedChunkCache(
kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state,
-    internal::CachePool::WeakPtr data_cache_pool)
+    ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool,
+    bool open_as_void, bool original_is_structured, DataType original_dtype,
+    bool /*grid_has_void_dimension*/)
: base_kvstore_(std::move(store)),
codec_state_(std::move(codec_state)),
dtype_(std::move(dtype)),
open_as_void_(open_as_void),
original_is_structured_(original_is_structured),
original_dtype_(original_dtype),
data_cache_pool_(std::move(data_cache_pool)) {}

Result<IndexTransform<>> TranslateCellToSourceTransformForShard(
@@ -326,6 +532,7 @@ void ZarrShardedChunkCache::Read(
*this, std::move(request.transform), std::move(receiver),
[transaction = std::move(request.transaction),
batch = std::move(request.batch),
component_index = request.component_index,
staleness_bound = request.staleness_bound,
fill_missing_data_reads = request.fill_missing_data_reads](auto entry) {
Batch shard_batch = batch;
@@ -339,8 +546,7 @@
IndexTransform<>>&& receiver) {
entry->sub_chunk_cache.get()->Read(
{{transaction, std::move(transform), shard_batch},
-             staleness_bound,
-             fill_missing_data_reads},
+             component_index, staleness_bound, fill_missing_data_reads},
std::move(receiver));
};
});
@@ -354,6 +560,7 @@
&ZarrArrayToArrayCodec::PreparedState::Write>(
*this, std::move(request.transform), std::move(receiver),
[transaction = std::move(request.transaction),
component_index = request.component_index,
store_data_equal_to_fill_value =
request.store_data_equal_to_fill_value](auto entry) {
internal::OpenTransactionPtr shard_transaction = transaction;
@@ -366,7 +573,7 @@
AnyFlowReceiver<absl::Status, internal::WriteChunk,
IndexTransform<>>&& receiver) {
entry->sub_chunk_cache.get()->Write(
-            {{shard_transaction, std::move(transform)},
+            {{shard_transaction, std::move(transform)}, component_index,
store_data_equal_to_fill_value},
std::move(receiver));
};
@@ -481,7 +688,8 @@ void ZarrShardedChunkCache::Entry::DoInitialize() {
*sharding_state.sub_chunk_codec_chain,
std::move(sharding_kvstore), cache.executor(),
ZarrShardingCodec::PreparedState::Ptr(&sharding_state),
-        cache.data_cache_pool_);
+        cache.dtype_, cache.data_cache_pool_, cache.open_as_void_,
+        cache.original_is_structured_, cache.original_dtype_);
zarr_chunk_cache = new_cache.release();
return std::unique_ptr<internal::Cache>(&zarr_chunk_cache->cache());
})