14 changes: 14 additions & 0 deletions xrspatial/geotiff/__init__.py
@@ -1617,7 +1617,9 @@ def read_geotiff_gpu(source: str, *,

from ._reader import (
_FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
_resolve_masked_fill,
)
from ._compression import COMPRESSION_LERC
from ._header import (
parse_header, parse_all_ifds, select_overview_ifd, validate_tile_layout,
)
@@ -1709,6 +1711,16 @@ def read_geotiff_gpu(source: str, *,
# Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline;
# the CPU reader fills them with nodata and copies onto the GPU.
has_sparse_tile = any(bc == 0 for bc in byte_counts)
# LERC tiles can carry a per-pixel valid mask that GDAL writes
# zero-filled in the data array. Compute the nodata fill the same
# way the CPU reader does so the GPU decode path can restore it
# post-assembly (mirrors PR #1529 for the CPU path). Only the
# chunky (planar=1) GPU path threads masked_fill into its kernel
# call below; the planar=2 per-band branch falls back to the CPU
# reader for masked pixels (rare in practice -- LERC files
# typically use chunky layout).
masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype)
if compression == COMPRESSION_LERC else None)

# PlanarConfiguration=2 (separate bands): each band has its own list
# of tiles back-to-back in TileOffsets / TileByteCounts. The GPU
@@ -1795,6 +1807,7 @@ def _read_once():
tw, th, width, height,
compression, predictor, file_dtype, samples,
byte_order=header.byte_order,
masked_fill=masked_fill,
)
except Exception as e:
if gpu == 'strict':
@@ -1826,6 +1839,7 @@ def _read_once():
tw, th, width, height,
compression, predictor, file_dtype, samples,
byte_order=header.byte_order,
masked_fill=masked_fill,
)
except Exception as e:
if gpu == 'strict':
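
For orientation, here is a minimal host-side sketch of the nodata-resolution step that ``_resolve_masked_fill`` performs. Its body is not part of this diff, so the behaviour below (parsing the GDAL_NODATA ASCII value, NaN for float dtypes, integer sentinels otherwise) is an assumption inferred from how the comments describe the CPU reader's semantics.

import numpy as np

def resolve_masked_fill_sketch(nodata_str, file_dtype):
    # Hypothetical stand-in for _reader._resolve_masked_fill, which this
    # diff imports but does not define. Assumed behaviour: turn the
    # file's GDAL_NODATA ASCII value into a scalar of the file dtype.
    dt = np.dtype(file_dtype)
    if nodata_str is None:
        # No nodata tag: floats can still fall back to NaN.
        return dt.type(np.nan) if dt.kind == 'f' else None
    value = float(nodata_str)          # 'nan' round-trips through float()
    if dt.kind == 'f':
        return dt.type(value)
    return dt.type(int(value))         # e.g. '65535' -> uint16 sentinel

Under these assumptions, a float32 file written with nodata=-9999 would yield np.float32(-9999.0), which is the value the GPU fill pass writes into masked pixels below.
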
113 changes: 108 additions & 5 deletions xrspatial/geotiff/_gpu_decode.py
@@ -1452,6 +1452,7 @@ def gpu_decode_tiles_from_file(
dtype: np.dtype,
samples: int = 1,
byte_order: str = '<',
masked_fill=None,
):
"""Decode tiles from a file, using GDS if available.

@@ -1496,7 +1497,7 @@ def gpu_decode_tiles_from_file(
return gpu_decode_tiles(
compressed_tiles, tile_width, tile_height,
image_width, image_height, compression, predictor, dtype, samples,
byte_order=byte_order, masked_fill=masked_fill)


def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression):
@@ -1711,6 +1712,7 @@ def gpu_decode_tiles(
dtype: np.dtype,
samples: int = 1,
byte_order: str = '<',
masked_fill=None,
):
"""Decode and assemble TIFF tiles entirely on GPU.

@@ -1730,6 +1732,12 @@
Output pixel dtype.
samples : int
Samples per pixel.
masked_fill : scalar or None
Value to write into pixels that LERC reports as invalid. Only
consulted for LERC-compressed inputs (tag 34887). ``None``
leaves any masked pixels at LERC's zero fill. Use
``_resolve_masked_fill(ifd.nodata_str, dtype)`` from
``_reader.py`` to mirror the CPU reader's nodata semantics.

Returns
-------
@@ -1742,6 +1750,12 @@
bytes_per_pixel = dtype.itemsize * samples
tile_bytes = tile_width * tile_height * bytes_per_pixel

# Per-tile LERC valid masks; populated only by the LERC branch
# below. Kept as a single function-scope local so the post-assembly
# fill block at the end of this function can find it regardless of
# which decode branch ran.
_lerc_masks: list[np.ndarray | None] | None = None

# Try nvCOMP batch decompression first (much faster if available)
nvcomp_result = _try_nvcomp_batch_decompress(
compressed_tiles, tile_bytes, compression)
@@ -1869,19 +1883,34 @@ def gpu_decode_tiles(
d_decomp_offsets = cupy.asarray(decomp_offsets)

elif compression == 34887: # LERC
from ._compression import lerc_decompress_with_mask
raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8)
# Per-tile valid masks captured from LERC. ``None`` entries mean
# "all valid" (LERC returned no mask, or an all-True mask).
# ``_lerc_masks`` stays ``None`` unless at least one tile returns
# a mask, so mask-free files skip the fill pass entirely.
per_tile_masks: list[np.ndarray | None] = [None] * n_tiles
any_lerc_mask = False
for i, tile in enumerate(compressed_tiles):
start = i * tile_bytes
decoded_bytes, valid_mask = lerc_decompress_with_mask(tile)
chunk = np.frombuffer(decoded_bytes, dtype=np.uint8)
raw_host[start:start + tile_bytes] = \
chunk[:tile_bytes] if len(chunk) >= tile_bytes else \
np.pad(chunk, (0, tile_bytes - len(chunk)))
if valid_mask is not None:
per_tile_masks[i] = np.asarray(valid_mask)
any_lerc_mask = True
d_decomp = cupy.asarray(raw_host)
decomp_offsets = np.arange(n_tiles, dtype=np.int64) * tile_bytes
d_decomp_offsets = cupy.asarray(decomp_offsets)
if any_lerc_mask:
# Stash per-tile masks for the post-assembly fill pass below.
# Stored in a ``_lerc_masks`` local that the LERC fill block
# picks up after predictor decode and tile assembly.
_lerc_masks = per_tile_masks
else:
_lerc_masks = None

elif compression == 1: # Uncompressed
raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8)
@@ -1965,6 +1994,80 @@ def gpu_decode_tiles(
if byte_order == '>' and dtype.itemsize > 1 and predictor != 2:
# cupy.ndarray has no .byteswap(), so use the dtype-view helper.
out = _xp_byteswap(out)

# LERC valid-mask fill: GDAL writes LERC TIFFs with masked pixels
# zero-filled in the data array, so without restoring nodata here a
# masked pixel reads back as a real zero measurement. Mirrors the
# CPU path in ``_decode_strip_or_tile`` (PR #1529).
if _lerc_masks is not None and masked_fill is not None:
out = _apply_lerc_mask_fill(
out, _lerc_masks, tile_width, tile_height,
image_width, image_height, samples, masked_fill)
return out


def _apply_lerc_mask_fill(out, per_tile_masks, tile_width, tile_height,
image_width, image_height, samples, masked_fill):
"""Write *masked_fill* into LERC-invalid positions of an assembled image.

Each tile contributes either an ``(h, w)``/``(h, w, samples)`` valid
mask (1=valid, 0=invalid) or ``None`` for "all valid". The host
builds a single assembled boolean invalid-mask sized to the output
image, transfers it to the GPU once, and uses it to overwrite
masked positions on device.
"""
import cupy

n_tiles = len(per_tile_masks)
tiles_across = math.ceil(image_width / tile_width)
tiles_down = math.ceil(image_height / tile_height)
if n_tiles != tiles_across * tiles_down:
# Defensive: tile grid mismatch means we cannot place masks
# safely. Skip the fill rather than corrupt the output.
return out

invalid = np.zeros((image_height, image_width), dtype=bool)
for idx, mask in enumerate(per_tile_masks):
if mask is None:
continue
tr = idx // tiles_across
tc = idx % tiles_across
y0 = tr * tile_height
x0 = tc * tile_width
# Trim the tile mask to the visible portion of the image so
# right- and bottom-edge tile padding does not leak into the
# assembled mask.
h = min(tile_height, image_height - y0)
w = min(tile_width, image_width - x0)
if h <= 0 or w <= 0:
continue
m = np.asarray(mask)
# LERC may report the mask as (h, w) or (h, w, samples); for the
# invalid-fill we collapse multi-sample masks via "any sample
# invalid".
if m.ndim == 3:
m2 = (m == 0).any(axis=2)
else:
m2 = (m == 0)
invalid[y0:y0 + h, x0:x0 + w] = m2[:h, :w]

if not invalid.any():
return out

# Account for the boolean mask buffer up front so a borderline-OK
# decode doesn't tip into CUDA OOM on the mask transfer. Boolean
# indexing on cupy materialises a temporary index array (typically
# int64 of length sum(invalid)); cap that at the worst case of
# one int64 per pixel so the budget covers both buffers.
_check_gpu_memory(invalid.nbytes, what="LERC valid-mask buffer")
_check_gpu_memory(invalid.size * 8, what="LERC mask index buffer")

d_invalid = cupy.asarray(invalid)
if out.ndim == 3:
# Broadcast (H, W) mask across the sample axis.
out[d_invalid, ...] = out.dtype.type(masked_fill)
else:
out[d_invalid] = out.dtype.type(masked_fill)
return out


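
To make the fill placement concrete, here is a numpy-only illustration of the same tile-grid arithmetic and invalid-pixel overwrite that ``_apply_lerc_mask_fill`` runs on device. The 6x6 image, 4x4 tiles, and mask positions are invented for the example; on the GPU path the final assignment happens on a cupy array instead.

import math
import numpy as np

tile_w = tile_h = 4
img_w, img_h = 6, 6                        # right/bottom tiles are padded
tiles_across = math.ceil(img_w / tile_w)   # 2
tiles_down = math.ceil(img_h / tile_h)     # 2

out = np.arange(img_h * img_w, dtype=np.float32).reshape(img_h, img_w)
# Tile 0 carries a partial mask (0 = invalid); the rest are all-valid.
mask0 = np.ones((tile_h, tile_w), dtype=np.uint8)
mask0[1, 2] = 0
per_tile_masks = [mask0, None, None, None]

invalid = np.zeros((img_h, img_w), dtype=bool)
for idx, m in enumerate(per_tile_masks):
    if m is None:
        continue
    y0 = (idx // tiles_across) * tile_h    # tile row -> pixel origin
    x0 = (idx % tiles_across) * tile_w     # tile col -> pixel origin
    h = min(tile_h, img_h - y0)            # trim edge-tile padding
    w = min(tile_w, img_w - x0)
    invalid[y0:y0 + h, x0:x0 + w] = (m[:h, :w] == 0)

out[invalid] = np.nan                      # masked_fill for a float image
assert np.isnan(out[1, 2]) and not np.isnan(out[0, 0])
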
181 changes: 181 additions & 0 deletions xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
@@ -0,0 +1,181 @@
"""GPU follow-up to PR #1529 (LERC valid-mask on decode).

The CPU LERC reader honours the LERC valid-mask and writes the file's
nodata sentinel into masked pixels. The GPU LERC tile-decode path used
to discard the mask, so masked pixels read back as LERC's zero fill
(real-looking measurements at z == 0) on GPU but as NaN/sentinel on
CPU. These tests confirm the GPU path now matches the CPU path for
representative LERC mask combinations.

Mirrors the structure of ``test_lerc_valid_mask.py`` but compares
``read_geotiff_gpu`` output to ``read_to_array`` output for each case.
"""
from __future__ import annotations

import importlib.util

import numpy as np
import pytest

lerc = pytest.importorskip("lerc")

from xrspatial.geotiff._compression import LERC_AVAILABLE # noqa: E402


def _gpu_available() -> bool:
"""True if cupy is importable and CUDA is initialised."""
if importlib.util.find_spec("cupy") is None:
return False
try:
import cupy
return bool(cupy.cuda.is_available())
except Exception:
return False


_HAS_GPU = _gpu_available()
_gpu_only = pytest.mark.skipif(
not (_HAS_GPU and LERC_AVAILABLE),
reason="cupy + CUDA + lerc required",
)


@pytest.fixture
def lerc_writer_with_mask(monkeypatch):
"""Patch ``lerc_compress`` to embed a valid-mask the writer can't pass.

The xrspatial writer hard-codes ``hasMask=False`` in its call to
``lerc.encode``. Tests inject a per-tile mask through this holder's
``invalid`` predicate so the masked pixels survive the encode and
show up at decode time. Same pattern as the CPU test fixture in
``test_lerc_valid_mask.py``.
"""
holder = {"invalid": None}

def _patched(data, width, height, samples=1,
dtype=np.dtype('float32'), max_z_error=0.0):
if samples == 1:
arr = np.frombuffer(data, dtype=dtype).reshape(height, width)
else:
arr = np.frombuffer(data, dtype=dtype).reshape(
height, width, samples)
invalid_pred = holder["invalid"]
if invalid_pred is None:
mask = None
has_mask = False
else:
invalid = invalid_pred(arr)
mask = np.where(invalid, np.uint8(0), np.uint8(1))
has_mask = True
result = lerc.encode(arr, samples, has_mask, mask, max_z_error, 1)
if result[0] != 0:
raise RuntimeError(
f"LERC encode failed with error code {result[0]}")
return bytes(result[2])

monkeypatch.setattr(
"xrspatial.geotiff._compression.lerc_compress", _patched,
)
return holder


def _read_cpu_gpu(path):
"""Read *path* with both readers and return ``(cpu_array, gpu_host_array)``."""
from xrspatial.geotiff import read_geotiff_gpu
from xrspatial.geotiff._reader import read_to_array

cpu, _geo = read_to_array(path)
gpu_da = read_geotiff_gpu(path, gpu='strict')
gpu_host = gpu_da.data.get()
return cpu, gpu_host


@_gpu_only
class TestGpuLercValidMask:
"""End-to-end TIFF round-trips comparing GPU vs CPU output."""

def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask):
"""Float32 LERC + NaN nodata: GPU output matches CPU output."""
from xrspatial.geotiff._writer import write

arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
invalid_positions = {(0, 1), (5, 4)}

def invalid_pred(a):
m = np.zeros(a.shape[:2], dtype=bool)
for r, c in invalid_positions:
m[r, c] = True
return m
lerc_writer_with_mask["invalid"] = invalid_pred

path = str(tmp_path / "lerc_mask_nan_gpu.tif")
write(arr, path, compression="lerc", tiled=True, tile_size=8,
nodata=float("nan"))

cpu, gpu = _read_cpu_gpu(path)
# NaN positions
for (r, c) in invalid_positions:
assert np.isnan(cpu[r, c])
assert np.isnan(gpu[r, c])
# Valid positions agree exactly
cpu_valid = np.where(np.isnan(cpu), 0.0, cpu)
gpu_valid = np.where(np.isnan(gpu), 0.0, gpu)
np.testing.assert_array_equal(cpu_valid, gpu_valid)

def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
"""Float32 LERC + sentinel nodata (-9999): GPU matches CPU."""
from xrspatial.geotiff._writer import write

arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
invalid_positions = {(0, 1), (3, 3), (7, 7)}

def invalid_pred(a):
m = np.zeros(a.shape[:2], dtype=bool)
for r, c in invalid_positions:
m[r, c] = True
return m
lerc_writer_with_mask["invalid"] = invalid_pred

path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif")
write(arr, path, compression="lerc", tiled=True, tile_size=8,
nodata=-9999.0)

cpu, gpu = _read_cpu_gpu(path)
np.testing.assert_array_equal(cpu, gpu)
for (r, c) in invalid_positions:
assert gpu[r, c] == np.float32(-9999.0)

def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
"""Uint16 LERC + sentinel nodata (65535): GPU matches CPU."""
from xrspatial.geotiff._writer import write

arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8)
invalid_positions = {(0, 1), (4, 4)}

def invalid_pred(a):
m = np.zeros(a.shape[:2], dtype=bool)
for r, c in invalid_positions:
m[r, c] = True
return m
lerc_writer_with_mask["invalid"] = invalid_pred

path = str(tmp_path / "lerc_mask_uint16_gpu.tif")
write(arr, path, compression="lerc", tiled=True, tile_size=8,
nodata=65535)

cpu, gpu = _read_cpu_gpu(path)
np.testing.assert_array_equal(cpu, gpu)
for (r, c) in invalid_positions:
assert gpu[r, c] == np.uint16(65535)

def test_no_mask_roundtrip_bitexact(self, tmp_path):
"""All-valid LERC (no encoded mask): GPU and CPU agree bit-exact."""
from xrspatial.geotiff._writer import write

arr = np.arange(64, dtype=np.float32).reshape(8, 8)
path = str(tmp_path / "lerc_no_mask_gpu.tif")
write(arr, path, compression="lerc", tiled=True, tile_size=8)

cpu, gpu = _read_cpu_gpu(path)
np.testing.assert_array_equal(cpu, arr)
np.testing.assert_array_equal(gpu, arr)
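
As a usage note, the round-trip these tests exercise looks like this from a caller's perspective. ``write``, ``read_to_array``, and ``read_geotiff_gpu`` are the same entry points the tests import; the array contents and file name are illustrative, and without the monkeypatched encoder the file carries no valid-mask, so CPU and GPU output already agree.

import numpy as np
from xrspatial.geotiff import read_geotiff_gpu
from xrspatial.geotiff._reader import read_to_array
from xrspatial.geotiff._writer import write

arr = np.arange(64, dtype=np.float32).reshape(8, 8)
write(arr, "demo_lerc.tif", compression="lerc", tiled=True,
      tile_size=8, nodata=float("nan"))

cpu, _geo = read_to_array("demo_lerc.tif")                # CPU reference
gpu = read_geotiff_gpu("demo_lerc.tif", gpu='strict').data.get()
np.testing.assert_array_equal(cpu, gpu)                   # paths agree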