diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index ac8ba4bb..f6e5cc63 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1180,10 +1180,18 @@ def _get_nvjpeg(): # nvJPEG status codes _NVJPEG_STATUS_SUCCESS = 0 -# nvJPEG output formats -_NVJPEG_OUTPUT_RGB = 2 # planar RGB -_NVJPEG_OUTPUT_RGBI = 3 # interleaved RGB (R0G0B0 R1G1B1 ...) -_NVJPEG_OUTPUT_UNCHANGED = 5 # native colorspace +# nvJPEG output formats. Values must match ``nvjpegOutputFormat_t`` in +# ``nvjpeg.h`` (CUDA Toolkit). They were previously off-by-two, which made +# ``_NVJPEG_OUTPUT_RGBI`` resolve to the SDK's ``NVJPEG_OUTPUT_RGB`` (planar) +# constant. nvJPEG then dereferenced ``channel[1]``/``channel[2]`` of the +# output struct, both of which the wrappers below set to NULL for +# interleaved layouts, producing an out-of-bounds GPU write inside +# ``ycbcr_to_format_kernel_roi`` and a sticky ``cudaErrorIllegalAddress`` +# (issue #1549). +_NVJPEG_OUTPUT_UNCHANGED = 0 # source colorspace (channel[0] only for Y) +_NVJPEG_OUTPUT_Y = 2 # luma plane only +_NVJPEG_OUTPUT_RGB = 3 # planar RGB +_NVJPEG_OUTPUT_RGBI = 5 # interleaved RGB (R0G0B0 R1G1B1 ...) # nvJPEG backend _NVJPEG_BACKEND_DEFAULT = 0 @@ -1274,8 +1282,14 @@ class _NvjpegImage(ctypes.Structure): ) if status != _NVJPEG_STATUS_SUCCESS: return None + # nvjpegDecode is asynchronous on the default stream + # (we pass stream=0 above); the shared jpeg_state must + # not be reused for the next tile until this decode is + # complete. Sync only the default stream so concurrent + # work on other streams isn't blocked -- the data + # dependency is on jpeg_state, not on the whole device. + cupy.cuda.Stream.null.synchronize() - cupy.cuda.Device().synchronize() return d_all finally: diff --git a/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py b/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py new file mode 100644 index 00000000..607a8361 --- /dev/null +++ b/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py @@ -0,0 +1,265 @@ +"""GPU regression test for issue #1549. + +Reading a 3-band tiled JPEG GeoTIFF with ``open_geotiff(..., gpu=True)`` +crashed inside the nvJPEG decode kernel with:: + + cupy.cuda.runtime.CUDARuntimeError: cudaErrorIllegalAddress + +The crash was sticky -- it poisoned the CUDA context so every later GPU +call in the same process also failed. Root cause: the +``nvjpegOutputFormat_t`` constants in ``_gpu_decode.py`` were defined +two values lower than the SDK's enum. The wrapper sent ``3`` thinking +it was ``NVJPEG_OUTPUT_RGBI`` (interleaved RGB), but ``3`` is +``NVJPEG_OUTPUT_RGB`` (planar) in the real SDK. nvJPEG then wrote the +G and B planes through ``nvjpegImage.channel[1]`` and +``channel[2]``, which the wrapper had set to NULL for an interleaved +output buffer, producing an out-of-bounds GPU write inside +``ycbcr_to_format_kernel_roi``. + +The same off-by-two affected the single-band path: it sent ``5`` +(thinking ``NVJPEG_OUTPUT_UNCHANGED``) which is actually +``NVJPEG_OUTPUT_RGBI`` -- nvJPEG produced 3-byte-per-pixel output into a +1-byte-per-pixel buffer, returning visibly wrong pixels rather than +crashing. + +These tests build the exact reproducer from the issue, decode it on GPU, +and verify (a) the decode does not crash, (b) the GPU pixels match the +CPU pixels within the typical libjpeg/nvjpeg rounding tolerance, and +(c) the CUDA context survives a follow-up GPU read of an unrelated +file. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest + + +def _gpu_available() -> bool: + """True when cupy is importable and CUDA is initialised.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +def _nvjpeg_available() -> bool: + """True when libnvjpeg.so loads on this host. + + Without nvJPEG the GPU pipeline silently falls back to CPU Pillow + decode, so the regression for issue #1549 (an out-of-bounds write + inside the nvJPEG kernel) would never be exercised. Skip rather + than test a path the bug never lived on. + """ + if not _gpu_available(): + return False + try: + from xrspatial.geotiff._gpu_decode import _get_nvjpeg + return _get_nvjpeg() is not None + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None +_HAS_PIL = importlib.util.find_spec("PIL") is not None +# tifffile.imwrite(compression='jpeg') delegates the codec to imagecodecs +# (or libjpeg via Pillow on some installs); when neither is wired up the +# write raises and the suite would error instead of skipping cleanly. +_HAS_IMAGECODECS = importlib.util.find_spec("imagecodecs") is not None +_HAS_NVJPEG = _nvjpeg_available() + +_gpu_only = pytest.mark.skipif( + not (_HAS_GPU and _HAS_TIFFFILE and _HAS_PIL + and _HAS_IMAGECODECS and _HAS_NVJPEG), + reason="cupy + CUDA + tifffile + Pillow + imagecodecs + nvJPEG required", +) + + +def _write_jpeg_rgb_tiff(path: str, seed: int = 0, + noise: bool = True) -> np.ndarray: + """Write a 3-band 256x256 tiled JPEG TIFF using tifffile. + + tifffile emits a complete JFIF stream (SOI + APP0 + DQT + DHT + SOF0 + + ...) per tile rather than a JPEGTables-style abbreviated stream, + so the GPU decode path runs without any extra splice step. + + With ``noise=True`` the payload is uniform random bytes -- the + worst-case input for JPEG compression and useful for the + crash-reproducer test which only cares about safe completion. + With ``noise=False`` the payload is a smooth gradient where + libjpeg and nvjpeg agree to within a few LSBs per pixel; that lets + the cross-backend match assertion run with a tight tolerance. + """ + import tifffile + if noise: + rng = np.random.default_rng(seed) + arr = rng.integers(0, 256, size=(256, 256, 3), dtype=np.uint8) + else: + # Smooth gradient: per-channel ramp + cross terms. + ys, xs = np.mgrid[0:256, 0:256].astype(np.int32) + r = (ys + xs) // 2 + g = ys + b = xs + arr = np.stack([r, g, b], axis=2).clip(0, 255).astype(np.uint8) + tifffile.imwrite(path, arr, photometric='rgb', tile=(128, 128), + compression='jpeg') + return arr + + +def _write_jpeg_gray_tiff(path: str, seed: int = 42) -> np.ndarray: + """Write a 1-band 256x256 tiled JPEG TIFF using tifffile.""" + import tifffile + rng = np.random.default_rng(seed) + arr = rng.integers(0, 256, size=(256, 256), dtype=np.uint8) + tifffile.imwrite(path, arr, photometric='minisblack', tile=(128, 128), + compression='jpeg') + return arr + + +@_gpu_only +def test_rgb_jpeg_gpu_no_crash(tmp_path, monkeypatch): + """3-band JPEG must not raise CUDARuntimeError on GPU read. + + Uses ``gpu='strict'`` so the original + ``cudaErrorIllegalAddress`` would propagate up the stack instead of + being swallowed by the auto-mode CPU fallback. Without strict the + bug presented as a ``RuntimeWarning: GPU decode failed (...); + falling back to CPU`` and the returned array was the CPU array, so + the assertions below would still pass even with the bug present. + + Also spies on ``_try_nvjpeg_batch_decode`` to fail loudly if the + decode took a CPU fallback path instead of nvJPEG. Without this + guard the test would pass on a system whose nvJPEG returned None for + any reason, defeating the point of the regression test. + """ + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff import _gpu_decode + import cupy + + spy = {"calls": 0, "successes": 0} + original = _gpu_decode._try_nvjpeg_batch_decode + + def wrapped(*args, **kwargs): + spy["calls"] += 1 + result = original(*args, **kwargs) + if result is not None: + spy["successes"] += 1 + return result + + monkeypatch.setattr(_gpu_decode, "_try_nvjpeg_batch_decode", wrapped) + + path = str(tmp_path / "rgb_jpeg_1549.tif") + _write_jpeg_rgb_tiff(path) + + arr = read_geotiff_gpu(path, gpu='strict') + # Materialise the GPU buffer so any deferred kernel actually runs + # and surface any sticky error from the decode pipeline. + assert isinstance(arr.data, cupy.ndarray) + decoded = arr.data.get() + assert decoded.shape == (256, 256, 3) + assert decoded.dtype == np.uint8 + + assert spy["calls"] >= 1, ( + "nvJPEG branch was never called — test did not exercise the " + "code path the #1549 fix lives on" + ) + assert spy["successes"] >= 1, ( + "nvJPEG returned None — CPU Pillow fallback ran and the fix was " + "not exercised" + ) + + +@_gpu_only +def test_rgb_jpeg_gpu_matches_cpu(tmp_path): + """GPU pixels must be within JPEG decoder tolerance of CPU pixels. + + With a smooth gradient input the libjpeg (CPU) and nvjpeg (GPU) + decoders agree to a couple of LSBs per pixel. The off-by-two + constant bug scrambled channels enough to push the mean diff above + 60 and the max diff above 200, so a tight bound here pins both the + constant fix and the per-tile sync that keeps the multi-tile + decode deterministic. + """ + from xrspatial.geotiff import open_geotiff + + path = str(tmp_path / "rgb_jpeg_match_1549.tif") + _write_jpeg_rgb_tiff(path, noise=False) + + cpu = open_geotiff(path) + gpu = open_geotiff(path, gpu=True) + assert cpu.shape == gpu.shape == (256, 256, 3) + + cpu_arr = np.asarray(cpu.data) + gpu_arr = np.asarray(gpu.data.get()) + + diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int)) + assert diff.mean() < 1.0, f"mean diff {diff.mean():.3f} too large" + assert diff.max() < 8, f"max diff {diff.max()} too large" + + +@_gpu_only +def test_grayscale_jpeg_gpu_matches_cpu(tmp_path): + """Single-band JPEG GPU read must also produce correct pixels. + + With the off-by-two constants the single-band path silently produced + wrong output (each Y value duplicated three times then wrapped) -- a + quieter failure mode than the 3-band crash but a corruption + nonetheless. + """ + from xrspatial.geotiff import open_geotiff + + path = str(tmp_path / "gray_jpeg_1549.tif") + _write_jpeg_gray_tiff(path) + + cpu = open_geotiff(path) + gpu = open_geotiff(path, gpu=True) + assert cpu.shape == gpu.shape == (256, 256) + + cpu_arr = np.asarray(cpu.data) + gpu_arr = np.asarray(gpu.data.get()) + diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int)) + # For grayscale there is no chroma involved, so libjpeg and nvjpeg + # only diverge by IDCT rounding (typically <= 1 LSB). + assert diff.max() <= 2, ( + f"grayscale max diff {diff.max()} indicates corruption, " + f"not just rounding" + ) + + +@_gpu_only +def test_cuda_context_survives_after_jpeg_gpu_read(tmp_path): + """Verify the CUDA context is healthy after a GPU JPEG read. + + Before the fix, the failing nvJPEG kernel left the context in an + error state so every later GPU call in the same process raised + ``cudaErrorIllegalAddress`` -- even unrelated allocations. This + test reads the JPEG on GPU, then performs a small follow-up GPU + operation and an unrelated GPU read and asserts both succeed. + """ + import cupy + from xrspatial.geotiff import open_geotiff + + path = str(tmp_path / "rgb_ctx_1549.tif") + _write_jpeg_rgb_tiff(path) + + arr = open_geotiff(path, gpu=True) + _ = arr.data.get() + + # Plain CuPy op -- this is the call that used to surface the sticky + # cudaErrorIllegalAddress on its first allocation. + x = cupy.arange(1024, dtype=cupy.float32) + s = float(cupy.sum(x).item()) + assert s == 1023 * 1024 / 2 + + # Unrelated GPU TIFF read -- closes the loop on the issue's + # "every later GPU call fails" symptom. + other_path = str(tmp_path / "other_1549.tif") + _write_jpeg_gray_tiff(other_path, seed=7) + other = open_geotiff(other_path, gpu=True) + assert other.shape == (256, 256) + assert other.dtype == np.uint8