diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index bd7d97fe..4b27f245 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -1617,7 +1617,9 @@ def read_geotiff_gpu(source: str, *, from ._reader import ( _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path, + _resolve_masked_fill, ) + from ._compression import COMPRESSION_LERC from ._header import ( parse_header, parse_all_ifds, select_overview_ifd, validate_tile_layout, ) @@ -1709,6 +1711,16 @@ def read_geotiff_gpu(source: str, *, # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; # the CPU reader fills them with nodata and copies onto the GPU. has_sparse_tile = any(bc == 0 for bc in byte_counts) + # LERC tiles can carry a per-pixel valid mask that GDAL writes + # zero-filled in the data array. Compute the nodata fill the same + # way the CPU reader does so the GPU decode path can restore it + # post-assembly (mirrors PR #1529 for the CPU path). Only the + # chunky (planar=1) GPU path threads masked_fill into its kernel + # call below; the planar=2 per-band branch falls back to the CPU + # reader for masked pixels (rare in practice -- LERC files + # typically use chunky layout). + masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype) + if compression == COMPRESSION_LERC else None) # PlanarConfiguration=2 (separate bands): each band has its own list # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU @@ -1795,6 +1807,7 @@ def _read_once(): tw, th, width, height, compression, predictor, file_dtype, samples, byte_order=header.byte_order, + masked_fill=masked_fill, ) except Exception as e: if gpu == 'strict': @@ -1826,6 +1839,7 @@ def _read_once(): tw, th, width, height, compression, predictor, file_dtype, samples, byte_order=header.byte_order, + masked_fill=masked_fill, ) except Exception as e: if gpu == 'strict': diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index b2c51e09..ac8ba4bb 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1452,6 +1452,7 @@ def gpu_decode_tiles_from_file( dtype: np.dtype, samples: int = 1, byte_order: str = '<', + masked_fill=None, ): """Decode tiles from a file, using GDS if available. @@ -1496,7 +1497,7 @@ def gpu_decode_tiles_from_file( return gpu_decode_tiles( compressed_tiles, tile_width, tile_height, image_width, image_height, compression, predictor, dtype, samples, - byte_order=byte_order) + byte_order=byte_order, masked_fill=masked_fill) def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression): @@ -1711,6 +1712,7 @@ def gpu_decode_tiles( dtype: np.dtype, samples: int = 1, byte_order: str = '<', + masked_fill=None, ): """Decode and assemble TIFF tiles entirely on GPU. @@ -1730,6 +1732,12 @@ def gpu_decode_tiles( Output pixel dtype. samples : int Samples per pixel. + masked_fill : scalar or None + Value to write into pixels that LERC reports as invalid. Only + consulted for LERC-compressed inputs (tag 34887). ``None`` + leaves any masked pixels at LERC's zero fill. Use + ``_resolve_masked_fill(ifd.nodata_str, dtype)`` from + ``_reader.py`` to mirror the CPU reader's nodata semantics. Returns ------- @@ -1742,6 +1750,12 @@ def gpu_decode_tiles( bytes_per_pixel = dtype.itemsize * samples tile_bytes = tile_width * tile_height * bytes_per_pixel + # Per-tile LERC valid masks; populated only by the LERC branch + # below. Kept as a flat local so the post-assembly fill block at + # the end of this function can find it regardless of which decode + # branch ran. + _lerc_masks: list[np.ndarray | None] | None = None + # Try nvCOMP batch decompression first (much faster if available) nvcomp_result = _try_nvcomp_batch_decompress( compressed_tiles, tile_bytes, compression) @@ -1869,19 +1883,34 @@ def gpu_decode_tiles( d_decomp_offsets = cupy.asarray(decomp_offsets) elif compression == 34887: # LERC - from ._compression import lerc_decompress + from ._compression import lerc_decompress_with_mask raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8) + # Per-tile valid masks captured from LERC. None entries mean + # "all valid" (LERC returned no mask, or an all-True mask). + # The host-side mask buffer is materialised lazily: it stays + # None until at least one tile reports a real partial mask. + per_tile_masks: list[np.ndarray | None] = [None] * n_tiles + any_lerc_mask = False for i, tile in enumerate(compressed_tiles): start = i * tile_bytes - chunk = np.frombuffer( - lerc_decompress(tile, tile_width, tile_height, samples), - dtype=np.uint8) + decoded_bytes, valid_mask = lerc_decompress_with_mask(tile) + chunk = np.frombuffer(decoded_bytes, dtype=np.uint8) raw_host[start:start + min(len(chunk), tile_bytes)] = \ chunk[:tile_bytes] if len(chunk) >= tile_bytes else \ np.pad(chunk, (0, tile_bytes - len(chunk))) + if valid_mask is not None: + per_tile_masks[i] = np.asarray(valid_mask) + any_lerc_mask = True d_decomp = cupy.asarray(raw_host) decomp_offsets = np.arange(n_tiles, dtype=np.int64) * tile_bytes d_decomp_offsets = cupy.asarray(decomp_offsets) + if any_lerc_mask: + # Stash per-tile masks for the post-assembly fill pass below. + # Stored in a ``_lerc_masks`` local that the LERC fill block + # picks up after predictor decode and tile assembly. + _lerc_masks = per_tile_masks + else: + _lerc_masks = None elif compression == 1: # Uncompressed raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8) @@ -1965,6 +1994,80 @@ def gpu_decode_tiles( if byte_order == '>' and dtype.itemsize > 1 and predictor != 2: # cupy.ndarray has no .byteswap(), so use the dtype-view helper. out = _xp_byteswap(out) + + # LERC valid-mask fill: GDAL writes LERC TIFFs with masked pixels + # zero-filled in the data array, so without restoring nodata here a + # masked pixel reads back as a real zero measurement. Mirrors the + # CPU path in ``_decode_strip_or_tile`` (PR #1529). + if _lerc_masks is not None and masked_fill is not None: + out = _apply_lerc_mask_fill( + out, _lerc_masks, tile_width, tile_height, + image_width, image_height, samples, masked_fill) + return out + + +def _apply_lerc_mask_fill(out, per_tile_masks, tile_width, tile_height, + image_width, image_height, samples, masked_fill): + """Write *masked_fill* into LERC-invalid positions of an assembled image. + + Each tile contributes either an ``(h, w)``/``(h, w, samples)`` valid + mask (1=valid, 0=invalid) or ``None`` for "all valid". The host + builds a single assembled boolean invalid-mask sized to the output + image, transfers it to the GPU once, and uses it to overwrite + masked positions on device. + """ + import cupy + + n_tiles = len(per_tile_masks) + tiles_across = math.ceil(image_width / tile_width) + tiles_down = math.ceil(image_height / tile_height) + if n_tiles != tiles_across * tiles_down: + # Defensive: tile grid mismatch means we cannot place masks + # safely. Skip the fill rather than corrupt the output. + return out + + invalid = np.zeros((image_height, image_width), dtype=bool) + for idx, mask in enumerate(per_tile_masks): + if mask is None: + continue + tr = idx // tiles_across + tc = idx % tiles_across + y0 = tr * tile_height + x0 = tc * tile_width + # Trim the tile mask to the visible portion of the image so + # right- and bottom-edge tile padding does not leak into the + # assembled mask. + h = min(tile_height, image_height - y0) + w = min(tile_width, image_width - x0) + if h <= 0 or w <= 0: + continue + m = np.asarray(mask) + # LERC may report the mask as (h, w) or (h, w, samples); for the + # invalid-fill we collapse multi-sample masks via "any sample + # invalid". + if m.ndim == 3: + m2 = (m == 0).any(axis=2) + else: + m2 = (m == 0) + invalid[y0:y0 + h, x0:x0 + w] = m2[:h, :w] + + if not invalid.any(): + return out + + # Account for the boolean mask buffer up front so a borderline-OK + # decode doesn't tip into CUDA OOM on the mask transfer. Boolean + # indexing on cupy materialises a temporary index array (typically + # int64 of length sum(invalid)); cap that at the worst case of + # one int64 per pixel so the budget covers both buffers. + _check_gpu_memory(invalid.nbytes, what="LERC valid-mask buffer") + _check_gpu_memory(invalid.size * 8, what="LERC mask index buffer") + + d_invalid = cupy.asarray(invalid) + if out.ndim == 3: + # Broadcast (H, W) mask across the sample axis. + out[d_invalid, ...] = out.dtype.type(masked_fill) + else: + out[d_invalid] = out.dtype.type(masked_fill) return out diff --git a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py new file mode 100644 index 00000000..0f39697b --- /dev/null +++ b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py @@ -0,0 +1,181 @@ +"""GPU follow-up to PR #1529 (LERC valid-mask on decode). + +The CPU LERC reader honours the LERC valid-mask and writes the file's +nodata sentinel into masked pixels. The GPU LERC tile-decode path used +to discard the mask, so masked pixels read back as LERC's zero fill +(real-looking measurements at z == 0) on GPU but as NaN/sentinel on +CPU. These tests confirm the GPU path now matches the CPU path for +representative LERC mask combinations. + +Mirrors the structure of ``test_lerc_valid_mask.py`` but compares +``read_geotiff_gpu`` output to ``read_to_array`` output for each case. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest + +lerc = pytest.importorskip("lerc") + +from xrspatial.geotiff._compression import LERC_AVAILABLE # noqa: E402 + + +def _gpu_available() -> bool: + """True if cupy is importable and CUDA is initialised.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif( + not (_HAS_GPU and LERC_AVAILABLE), + reason="cupy + CUDA + lerc required", +) + + +@pytest.fixture +def lerc_writer_with_mask(monkeypatch): + """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass. + + The xrspatial writer hard-codes ``hasMask=False`` in its call to + ``lerc.encode``. Tests inject a per-tile mask through this holder's + ``invalid`` predicate so the masked pixels survive the encode and + show up at decode time. Same pattern as the CPU test fixture in + ``test_lerc_valid_mask.py``. + """ + holder = {"invalid": None} + + def _patched(data, width, height, samples=1, + dtype=np.dtype('float32'), max_z_error=0.0): + if samples == 1: + arr = np.frombuffer(data, dtype=dtype).reshape(height, width) + else: + arr = np.frombuffer(data, dtype=dtype).reshape( + height, width, samples) + invalid_pred = holder["invalid"] + if invalid_pred is None: + mask = None + has_mask = False + else: + invalid = invalid_pred(arr) + mask = np.where(invalid, np.uint8(0), np.uint8(1)) + has_mask = True + result = lerc.encode(arr, samples, has_mask, mask, max_z_error, 1) + if result[0] != 0: + raise RuntimeError( + f"LERC encode failed with error code {result[0]}") + return bytes(result[2]) + + monkeypatch.setattr( + "xrspatial.geotiff._compression.lerc_compress", _patched, + ) + return holder + + +def _read_cpu_gpu(path): + """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``.""" + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff._reader import read_to_array + + cpu, _geo = read_to_array(path) + gpu_da = read_geotiff_gpu(path, gpu='strict') + gpu_host = gpu_da.data.get() + return cpu, gpu_host + + +@_gpu_only +class TestGpuLercValidMask: + """End-to-end TIFF round-trips comparing GPU vs CPU output.""" + + def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask): + """Float32 LERC + NaN nodata: GPU output matches CPU output.""" + from xrspatial.geotiff._writer import write + + arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8) + invalid_positions = {(0, 1), (5, 4)} + + def invalid_pred(a): + m = np.zeros(a.shape[:2], dtype=bool) + for r, c in invalid_positions: + m[r, c] = True + return m + lerc_writer_with_mask["invalid"] = invalid_pred + + path = str(tmp_path / "lerc_mask_nan_gpu.tif") + write(arr, path, compression="lerc", tiled=True, tile_size=8, + nodata=float("nan")) + + cpu, gpu = _read_cpu_gpu(path) + # NaN positions + for (r, c) in invalid_positions: + assert np.isnan(cpu[r, c]) + assert np.isnan(gpu[r, c]) + # Valid positions agree exactly + cpu_valid = np.where(np.isnan(cpu), 0.0, cpu) + gpu_valid = np.where(np.isnan(gpu), 0.0, gpu) + np.testing.assert_array_equal(cpu_valid, gpu_valid) + + def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask): + """Float32 LERC + sentinel nodata (-9999): GPU matches CPU.""" + from xrspatial.geotiff._writer import write + + arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8) + invalid_positions = {(0, 1), (3, 3), (7, 7)} + + def invalid_pred(a): + m = np.zeros(a.shape[:2], dtype=bool) + for r, c in invalid_positions: + m[r, c] = True + return m + lerc_writer_with_mask["invalid"] = invalid_pred + + path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif") + write(arr, path, compression="lerc", tiled=True, tile_size=8, + nodata=-9999.0) + + cpu, gpu = _read_cpu_gpu(path) + np.testing.assert_array_equal(cpu, gpu) + for (r, c) in invalid_positions: + assert gpu[r, c] == np.float32(-9999.0) + + def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask): + """Uint16 LERC + sentinel nodata (65535): GPU matches CPU.""" + from xrspatial.geotiff._writer import write + + arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8) + invalid_positions = {(0, 1), (4, 4)} + + def invalid_pred(a): + m = np.zeros(a.shape[:2], dtype=bool) + for r, c in invalid_positions: + m[r, c] = True + return m + lerc_writer_with_mask["invalid"] = invalid_pred + + path = str(tmp_path / "lerc_mask_uint16_gpu.tif") + write(arr, path, compression="lerc", tiled=True, tile_size=8, + nodata=65535) + + cpu, gpu = _read_cpu_gpu(path) + np.testing.assert_array_equal(cpu, gpu) + for (r, c) in invalid_positions: + assert gpu[r, c] == np.uint16(65535) + + def test_no_mask_roundtrip_bitexact(self, tmp_path): + """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact.""" + from xrspatial.geotiff._writer import write + + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + path = str(tmp_path / "lerc_no_mask_gpu.tif") + write(arr, path, compression="lerc", tiled=True, tile_size=8) + + cpu, gpu = _read_cpu_gpu(path) + np.testing.assert_array_equal(cpu, arr) + np.testing.assert_array_equal(gpu, arr)