From 229a46796f84a400bcf786e8a49c44b73c6017a1 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 8 May 2026 08:40:58 -0700 Subject: [PATCH 1/3] Fix GPU predictor=2 decode on big-endian TIFFs (#1517) The per-dtype predictor=2 kernels view the raw decompressed byte buffer as native uint16/uint32/uint64. On big-endian files the bytes are MSB-first, so the prefix-sum runs on the wrong integer interpretation and the differencing produces garbage (~93% pixel mismatch in the reproducer from #1517). Swap the buffer to native byte order before running the predictor=2 kernel, then skip the post-assembly byteswap that previously tried to fix things up. Predictor=1 and predictor=3 paths keep the existing post-decode swap. Closes #1517. Builds on the byteswap helper added in #1515. --- xrspatial/geotiff/_gpu_decode.py | 51 +++- .../test_predictor2_big_endian_gpu_1517.py | 219 ++++++++++++++++++ 2 files changed, 267 insertions(+), 3 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 2afe393d..17219a0b 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -78,6 +78,29 @@ def _xp_byteswap(arr): return u8[..., ::-1].copy().view(arr.dtype).reshape(arr.shape) +def _swap_bytes_inplace(buf, bps: int) -> None: + """Reverse bytes within each *bps*-sized sample of a flat uint8 buffer. + + Used by the GPU predictor=2 path to convert the raw decompressed byte + stream from file byte order to native byte order before differencing + (#1517). The per-dtype predictor kernels view ``buf`` as native + unsigned integers, so on big-endian files the prefix-sum would run on + a byte-swapped integer interpretation and produce wrong values. + + Operates on the underlying buffer of *buf* via a uint8 reshape, so + callers see the bytes mutated in place. Works on both numpy and cupy + arrays. + """ + if bps <= 1: + return + n = buf.size + if n % bps != 0: + raise ValueError( + f"buffer size {n} is not a multiple of bps={bps}") + view = buf.reshape(n // bps, bps) + view[:, :] = view[:, ::-1].copy() + + # LZW constants (same as _compression.py) LZW_CLEAR_CODE = 256 LZW_EOI_CODE = 257 @@ -1538,6 +1561,15 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, if predictor == 2: total_rows = n_tiles * tile_height + # Predictor=2 differences adjacent samples at the sample's natural + # width (uint8/16/32/64). The per-dtype kernels view the byte + # buffer as native unsigned dtype, so on big-endian files we must + # swap the bytes to native order BEFORE running the kernel, + # otherwise the prefix-sum runs on the wrong integer + # interpretation (#1517). The pre-swap then makes the post- + # assembly byteswap unnecessary; see the BE branch below. + if big_endian and dtype.itemsize > 1: + _swap_bytes_inplace(d_decomp, dtype.itemsize) # Sample-level differencing: stride is samples_per_pixel samples, # row width is tile_width pixels. Per-dtype kernels handle the # modular wrap at the sample's natural bit width. @@ -1575,7 +1607,12 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, else: out = d_output.view(dtype=cupy.dtype(dtype)).reshape( image_height, image_width) - if big_endian and dtype.itemsize > 1: + # Predictor=2 BE swapped d_decomp to native order pre-decode (#1517), + # so the assembled output is already native; skip the final swap. + needs_post_swap = ( + big_endian and dtype.itemsize > 1 and predictor != 2 + ) + if needs_post_swap: # See gpu_decode_tiles for why BE samples need a final byteswap. # cupy.ndarray has no .byteswap(), so use the dtype-view helper. out = _xp_byteswap(out) @@ -1793,6 +1830,12 @@ def gpu_decode_tiles( if predictor == 2: # Sample-level horizontal differencing: stride is samples_per_pixel # samples; per-dtype kernels handle the natural-width modular wrap. + # On big-endian multi-byte files the kernels would otherwise view + # the buffer with the wrong integer interpretation, so swap to + # native order first and skip the post-assembly swap below + # (#1517). + if byte_order == '>' and dtype.itemsize > 1: + _swap_bytes_inplace(d_decomp, dtype.itemsize) total_rows = n_tiles * tile_height _gpu_predictor2_decode( d_decomp, tile_width, total_rows, dtype, samples) @@ -1835,8 +1878,10 @@ def gpu_decode_tiles( # The decoded byte stream is in the file's byte order; cupy view # interprets it as native (always little-endian on supported GPUs), # so big-endian samples that are wider than a byte must be swapped - # back to native before the values mean anything. - if byte_order == '>' and dtype.itemsize > 1: + # back to native before the values mean anything. Predictor=2 BE + # already swapped the buffer pre-decode (#1517), so skip the swap + # in that case. + if byte_order == '>' and dtype.itemsize > 1 and predictor != 2: # cupy.ndarray has no .byteswap(), so use the dtype-view helper. out = _xp_byteswap(out) return out diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py new file mode 100644 index 00000000..af456491 --- /dev/null +++ b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py @@ -0,0 +1,219 @@ +"""Regression tests for issue #1517. + +PR #1515 fixed the ``AttributeError: 'ndarray' object has no attribute +'byteswap'`` crash on big-endian multi-byte TIFFs read via +``read_geotiff_gpu``. After that fix the GPU path no longer raised, but +predictor=2 BE files came back with wrong values: the per-dtype +predictor kernels view the byte buffer as native unsigned integers, so +on a BE file the prefix-sum runs on the wrong integer interpretation +and the differencing produces garbage. + +These tests confirm the GPU output now matches the CPU +``read_to_array`` baseline for predictor=2 BE files across several +dtypes and tile layouts, and that the LE predictor=2 path still +round-trips. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest + + +def _gpu_available() -> bool: + """True if cupy is importable and CUDA is initialised.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None +_gpu_only = pytest.mark.skipif( + not (_HAS_GPU and _HAS_TIFFFILE), + reason="cupy + CUDA + tifffile required", +) + + +@_gpu_only +def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path): + """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2.""" + import cupy + import tifffile + + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff._reader import read_to_array + + rng = np.random.RandomState(20260507) + arr = rng.randint( + -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64 + ).astype(np.int32) + + path = tmp_path / "be_pred2_int32.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=2, + compression="deflate", tile=(16, 16), + ) + + cpu, _ = read_to_array(str(path)) + np.testing.assert_array_equal(cpu, arr) + + gpu_da = read_geotiff_gpu(str(path)) + assert isinstance(gpu_da.data, cupy.ndarray), ( + "expected cupy-backed DataArray; GPU path may have fallen back" + ) + assert gpu_da.data.dtype == np.dtype(np.int32) + assert gpu_da.data.dtype.isnative + np.testing.assert_array_equal(gpu_da.data.get(), cpu) + + +@_gpu_only +@pytest.mark.parametrize( + "dtype", + [np.uint16, np.int16, np.uint32, np.int32], +) +def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, dtype): + """BE predictor=2 tiled files match CPU baseline across dtypes.""" + import cupy + import tifffile + + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff._reader import read_to_array + + rng = np.random.RandomState(20260508) + info = np.iinfo(dtype) + arr = rng.randint( + max(info.min, -1_000_000), + min(info.max, 1_000_000), + size=(32, 48), + dtype=np.int64, + ).astype(dtype) + + path = tmp_path / f"be_pred2_{np.dtype(dtype).name}.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=2, + compression="deflate", tile=(16, 16), + ) + + cpu, _ = read_to_array(str(path)) + np.testing.assert_array_equal(cpu, arr) + + gpu_da = read_geotiff_gpu(str(path)) + assert isinstance(gpu_da.data, cupy.ndarray) + assert gpu_da.data.dtype == np.dtype(dtype) + assert gpu_da.data.dtype.isnative + np.testing.assert_array_equal(gpu_da.data.get(), cpu) + + +@_gpu_only +def test_gpu_predictor2_big_endian_stripped_uint16(tmp_path): + """Stripped BE predictor=2 files take the CPU fallback but stay correct. + + ``read_geotiff_gpu`` falls back to the CPU reader for stripped + layouts, then transfers the result to GPU. The fix must not regress + that path. + """ + import cupy + import tifffile + + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff._reader import read_to_array + + rng = np.random.RandomState(20260509) + arr = rng.randint(0, 60000, size=(32, 48), dtype=np.uint16) + + path = tmp_path / "be_pred2_uint16_strip.tif" + # Omit ``tile`` to get the strip layout. + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=2, compression="deflate", + ) + + cpu, _ = read_to_array(str(path)) + np.testing.assert_array_equal(cpu, arr) + + gpu_da = read_geotiff_gpu(str(path)) + assert isinstance(gpu_da.data, cupy.ndarray) + assert gpu_da.data.dtype == np.dtype(np.uint16) + assert gpu_da.data.dtype.isnative + np.testing.assert_array_equal(gpu_da.data.get(), cpu) + + +@_gpu_only +def test_gpu_predictor2_little_endian_still_works(tmp_path): + """LE predictor=2 must still round-trip after the BE fix.""" + import cupy + import tifffile + + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff._reader import read_to_array + + rng = np.random.RandomState(20260510) + arr = rng.randint( + -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64 + ).astype(np.int32) + + path = tmp_path / "le_pred2_int32.tif" + tifffile.imwrite( + str(path), arr, byteorder="<", predictor=2, + compression="deflate", tile=(16, 16), + ) + + cpu, _ = read_to_array(str(path)) + np.testing.assert_array_equal(cpu, arr) + + gpu_da = read_geotiff_gpu(str(path)) + assert isinstance(gpu_da.data, cupy.ndarray) + assert gpu_da.data.dtype == np.dtype(np.int32) + np.testing.assert_array_equal(gpu_da.data.get(), cpu) + + +@_gpu_only +def test_gpu_predictor3_big_endian_still_works(tmp_path): + """Floating-point predictor BE must still match CPU after the fix.""" + import cupy + import tifffile + + from xrspatial.geotiff import read_geotiff_gpu + from xrspatial.geotiff._reader import read_to_array + + rng = np.random.RandomState(20260511) + arr = rng.standard_normal((32, 48)).astype(np.float32) + + path = tmp_path / "be_pred3_float32.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, + compression="deflate", tile=(16, 16), + ) + + cpu, _ = read_to_array(str(path)) + np.testing.assert_array_equal(cpu, arr) + + gpu_da = read_geotiff_gpu(str(path)) + assert isinstance(gpu_da.data, cupy.ndarray) + assert gpu_da.data.dtype == np.dtype(np.float32) + np.testing.assert_array_equal(gpu_da.data.get(), cpu) + + +def test_swap_bytes_inplace_numpy(): + """The byte-swap helper reverses bytes per sample on a numpy buffer.""" + from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace + + # uint16 values 0x0102, 0x0304 in BE bytes: 01 02 03 04 + buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8) + _swap_bytes_inplace(buf, 2) + np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03], + dtype=np.uint8)) + + +def test_swap_bytes_inplace_uint8_noop(): + """bps=1 must be a no-op.""" + from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace + + buf = np.array([1, 2, 3], dtype=np.uint8) + _swap_bytes_inplace(buf, 1) + np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8)) From 26b16110e1a7ee4b8d84eecbc2b9d12f500773bb Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 8 May 2026 10:11:04 -0700 Subject: [PATCH 2/3] Address Copilot review on #1524 - Rename _swap_bytes_inplace -> _swap_byte_lanes. The previous name implied a zero-copy operation but view[:, ::-1].copy() allocates a same-sized temporary. The docstring now states the temp explicitly so callers can size their working set, and a CUDA kernel is left as a follow-up if peak GPU memory becomes an issue. - Add _block_cpu_fallback helper that monkeypatches the read_to_array binding inside xrspatial.geotiff. When the GPU tests call read_geotiff_gpu, any silent CPU fallback now raises an AssertionError instead of producing a passing-via-fallback result. - Apply the guard to the four tests where the GPU decode path is what is being exercised (BE int32 reproducer, BE dtype matrix, LE pred2, BE pred3). The stripped-uint16 test still relies on the CPU fallback by design and stays unguarded. --- xrspatial/geotiff/_gpu_decode.py | 14 ++--- .../test_predictor2_big_endian_gpu_1517.py | 54 ++++++++++++++----- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 17219a0b..5a200244 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -78,7 +78,7 @@ def _xp_byteswap(arr): return u8[..., ::-1].copy().view(arr.dtype).reshape(arr.shape) -def _swap_bytes_inplace(buf, bps: int) -> None: +def _swap_byte_lanes(buf, bps: int) -> None: """Reverse bytes within each *bps*-sized sample of a flat uint8 buffer. Used by the GPU predictor=2 path to convert the raw decompressed byte @@ -87,9 +87,11 @@ def _swap_bytes_inplace(buf, bps: int) -> None: unsigned integers, so on big-endian files the prefix-sum would run on a byte-swapped integer interpretation and produce wrong values. - Operates on the underlying buffer of *buf* via a uint8 reshape, so - callers see the bytes mutated in place. Works on both numpy and cupy - arrays. + The original buffer is mutated, but the swap goes through a same-sized + temporary (``view[:, ::-1].copy()``); this roughly doubles peak memory + for the buffer being swapped. Callers should size their working set + accordingly. A true zero-temp swap is possible with a small CUDA + kernel; for now the simple form keeps numpy and cupy on the same path. """ if bps <= 1: return @@ -1569,7 +1571,7 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, # interpretation (#1517). The pre-swap then makes the post- # assembly byteswap unnecessary; see the BE branch below. if big_endian and dtype.itemsize > 1: - _swap_bytes_inplace(d_decomp, dtype.itemsize) + _swap_byte_lanes(d_decomp, dtype.itemsize) # Sample-level differencing: stride is samples_per_pixel samples, # row width is tile_width pixels. Per-dtype kernels handle the # modular wrap at the sample's natural bit width. @@ -1835,7 +1837,7 @@ def gpu_decode_tiles( # native order first and skip the post-assembly swap below # (#1517). if byte_order == '>' and dtype.itemsize > 1: - _swap_bytes_inplace(d_decomp, dtype.itemsize) + _swap_byte_lanes(d_decomp, dtype.itemsize) total_rows = n_tiles * tile_height _gpu_predictor2_decode( d_decomp, tile_width, total_rows, dtype, samples) diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py index af456491..e6790639 100644 --- a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py +++ b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py @@ -40,8 +40,34 @@ def _gpu_available() -> bool: ) +def _block_cpu_fallback(monkeypatch): + """Make any call to ``read_to_array`` from ``read_geotiff_gpu`` fail loudly. + + ``read_geotiff_gpu`` returns a cupy-backed array even when its silent CPU + fallback fires (the fallback wraps the CPU result with ``cupy.asarray``), + so ``isinstance(gpu_da.data, cupy.ndarray)`` cannot distinguish the two + paths. Patching the module-bound ``read_to_array`` to raise turns any + silent fallback into a hard test failure, which is what we want when the + point of a test is to exercise the actual GPU decode kernels. + + Tests that legitimately rely on the CPU fallback (e.g. stripped + layouts) must not call this helper. + """ + from xrspatial import geotiff as geotiff_pkg + + def _no_fallback(*args, **kwargs): + raise AssertionError( + "read_geotiff_gpu fell back to read_to_array; " + "the GPU decode path was not exercised." + ) + + monkeypatch.setattr( + geotiff_pkg, 'read_to_array', _no_fallback, raising=True, + ) + + @_gpu_only -def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path): +def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path, monkeypatch): """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2.""" import cupy import tifffile @@ -63,10 +89,9 @@ def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path): cpu, _ = read_to_array(str(path)) np.testing.assert_array_equal(cpu, arr) + _block_cpu_fallback(monkeypatch) gpu_da = read_geotiff_gpu(str(path)) - assert isinstance(gpu_da.data, cupy.ndarray), ( - "expected cupy-backed DataArray; GPU path may have fallen back" - ) + assert isinstance(gpu_da.data, cupy.ndarray) assert gpu_da.data.dtype == np.dtype(np.int32) assert gpu_da.data.dtype.isnative np.testing.assert_array_equal(gpu_da.data.get(), cpu) @@ -77,7 +102,7 @@ def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path): "dtype", [np.uint16, np.int16, np.uint32, np.int32], ) -def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, dtype): +def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, monkeypatch, dtype): """BE predictor=2 tiled files match CPU baseline across dtypes.""" import cupy import tifffile @@ -103,6 +128,7 @@ def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, dtype): cpu, _ = read_to_array(str(path)) np.testing.assert_array_equal(cpu, arr) + _block_cpu_fallback(monkeypatch) gpu_da = read_geotiff_gpu(str(path)) assert isinstance(gpu_da.data, cupy.ndarray) assert gpu_da.data.dtype == np.dtype(dtype) @@ -144,7 +170,7 @@ def test_gpu_predictor2_big_endian_stripped_uint16(tmp_path): @_gpu_only -def test_gpu_predictor2_little_endian_still_works(tmp_path): +def test_gpu_predictor2_little_endian_still_works(tmp_path, monkeypatch): """LE predictor=2 must still round-trip after the BE fix.""" import cupy import tifffile @@ -166,6 +192,7 @@ def test_gpu_predictor2_little_endian_still_works(tmp_path): cpu, _ = read_to_array(str(path)) np.testing.assert_array_equal(cpu, arr) + _block_cpu_fallback(monkeypatch) gpu_da = read_geotiff_gpu(str(path)) assert isinstance(gpu_da.data, cupy.ndarray) assert gpu_da.data.dtype == np.dtype(np.int32) @@ -173,7 +200,7 @@ def test_gpu_predictor2_little_endian_still_works(tmp_path): @_gpu_only -def test_gpu_predictor3_big_endian_still_works(tmp_path): +def test_gpu_predictor3_big_endian_still_works(tmp_path, monkeypatch): """Floating-point predictor BE must still match CPU after the fix.""" import cupy import tifffile @@ -193,27 +220,28 @@ def test_gpu_predictor3_big_endian_still_works(tmp_path): cpu, _ = read_to_array(str(path)) np.testing.assert_array_equal(cpu, arr) + _block_cpu_fallback(monkeypatch) gpu_da = read_geotiff_gpu(str(path)) assert isinstance(gpu_da.data, cupy.ndarray) assert gpu_da.data.dtype == np.dtype(np.float32) np.testing.assert_array_equal(gpu_da.data.get(), cpu) -def test_swap_bytes_inplace_numpy(): +def test_swap_byte_lanes_numpy(): """The byte-swap helper reverses bytes per sample on a numpy buffer.""" - from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes # uint16 values 0x0102, 0x0304 in BE bytes: 01 02 03 04 buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8) - _swap_bytes_inplace(buf, 2) + _swap_byte_lanes(buf, 2) np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03], dtype=np.uint8)) -def test_swap_bytes_inplace_uint8_noop(): +def test_swap_byte_lanes_uint8_noop(): """bps=1 must be a no-op.""" - from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes buf = np.array([1, 2, 3], dtype=np.uint8) - _swap_bytes_inplace(buf, 1) + _swap_byte_lanes(buf, 1) np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8)) From cbac10c255fb2f396fd85a38125bfa9aa9452718 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 8 May 2026 10:16:57 -0700 Subject: [PATCH 3/3] Make _swap_byte_lanes truly zero-temp Replaces the view[:, ::-1].copy() form with in-place byte reversal: - numpy: dispatch to ndarray.byteswap(inplace=True) via a uint16/32/64 view of the original buffer, so the original allocation is mutated with no temp. - cupy: launch _byte_swap_lanes_kernel, one thread per sample, swapping bytes through register-resident temporaries. Peak GPU memory for the swap is now O(1) instead of O(buffer size). Adds explicit validation: bps must be 2, 4, or 8 (3/5/6/7 are rejected rather than silently corrupting), and size must be a multiple of bps. Tests cover bps=2/4/8 on numpy and cupy, the bps=1 no-op, the validation errors, and address-equality assertions confirming both paths really are in-place. The existing integration tests (test_gpu_predictor2_big_endian_*, predictor3_big_endian) still pass unchanged. --- xrspatial/geotiff/_gpu_decode.py | 47 +++++++-- .../test_predictor2_big_endian_gpu_1517.py | 97 ++++++++++++++++++- 2 files changed, 136 insertions(+), 8 deletions(-) diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 5a200244..d18b5cfd 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -78,6 +78,30 @@ def _xp_byteswap(arr): return u8[..., ::-1].copy().view(arr.dtype).reshape(arr.shape) +@cuda.jit +def _byte_swap_lanes_kernel(buf, bps): + """Reverse bytes within each *bps*-sized sample, one thread per sample. + + Each thread loads ``bps`` bytes from its sample, swaps them in place + via register-resident temporaries, and writes them back. No global + memory beyond *buf* is touched, so peak GPU memory is unchanged. + """ + i = cuda.grid(1) + n_samples = buf.size // bps + if i >= n_samples: + return + base = i * bps + half = bps // 2 + for j in range(half): + a = buf[base + j] + b = buf[base + bps - 1 - j] + buf[base + j] = b + buf[base + bps - 1 - j] = a + + +_BPS_TO_UINT = {2: np.uint16, 4: np.uint32, 8: np.uint64} + + def _swap_byte_lanes(buf, bps: int) -> None: """Reverse bytes within each *bps*-sized sample of a flat uint8 buffer. @@ -87,11 +111,11 @@ def _swap_byte_lanes(buf, bps: int) -> None: unsigned integers, so on big-endian files the prefix-sum would run on a byte-swapped integer interpretation and produce wrong values. - The original buffer is mutated, but the swap goes through a same-sized - temporary (``view[:, ::-1].copy()``); this roughly doubles peak memory - for the buffer being swapped. Callers should size their working set - accordingly. A true zero-temp swap is possible with a small CUDA - kernel; for now the simple form keeps numpy and cupy on the same path. + The swap is true in-place: no same-sized temporary is allocated. + On numpy arrays it dispatches to ``ndarray.byteswap(inplace=True)`` + via a uint16/32/64 view; on cupy device arrays it launches + :func:`_byte_swap_lanes_kernel`, which swaps bytes per sample using + only register-resident temporaries. """ if bps <= 1: return @@ -99,8 +123,17 @@ def _swap_byte_lanes(buf, bps: int) -> None: if n % bps != 0: raise ValueError( f"buffer size {n} is not a multiple of bps={bps}") - view = buf.reshape(n // bps, bps) - view[:, :] = view[:, ::-1].copy() + if bps not in _BPS_TO_UINT: + raise ValueError(f"unsupported bps={bps}; expected 2, 4, or 8") + + if isinstance(buf, np.ndarray): + buf.view(_BPS_TO_UINT[bps]).byteswap(inplace=True) + return + + n_samples = n // bps + threads = 256 + blocks = (n_samples + threads - 1) // threads + _byte_swap_lanes_kernel[blocks, threads](buf, bps) # LZW constants (same as _compression.py) diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py index e6790639..f2fee9ac 100644 --- a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py +++ b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py @@ -227,7 +227,7 @@ def test_gpu_predictor3_big_endian_still_works(tmp_path, monkeypatch): np.testing.assert_array_equal(gpu_da.data.get(), cpu) -def test_swap_byte_lanes_numpy(): +def test_swap_byte_lanes_numpy_bps2(): """The byte-swap helper reverses bytes per sample on a numpy buffer.""" from xrspatial.geotiff._gpu_decode import _swap_byte_lanes @@ -238,6 +238,29 @@ def test_swap_byte_lanes_numpy(): dtype=np.uint8)) +def test_swap_byte_lanes_numpy_bps4(): + """bps=4: full byte reversal within each 4-byte sample.""" + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + buf = np.array([0x01, 0x02, 0x03, 0x04, + 0x05, 0x06, 0x07, 0x08], dtype=np.uint8) + _swap_byte_lanes(buf, 4) + np.testing.assert_array_equal( + buf, np.array([0x04, 0x03, 0x02, 0x01, + 0x08, 0x07, 0x06, 0x05], dtype=np.uint8)) + + +def test_swap_byte_lanes_numpy_bps8(): + """bps=8: full byte reversal within each 8-byte sample.""" + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + sample = np.arange(1, 9, dtype=np.uint8) + buf = np.tile(sample, 2).copy() + _swap_byte_lanes(buf, 8) + np.testing.assert_array_equal( + buf, np.tile(sample[::-1], 2)) + + def test_swap_byte_lanes_uint8_noop(): """bps=1 must be a no-op.""" from xrspatial.geotiff._gpu_decode import _swap_byte_lanes @@ -245,3 +268,75 @@ def test_swap_byte_lanes_uint8_noop(): buf = np.array([1, 2, 3], dtype=np.uint8) _swap_byte_lanes(buf, 1) np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8)) + + +def test_swap_byte_lanes_rejects_unsupported_bps(): + """Unsupported bps values raise ValueError rather than corrupt data.""" + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + buf = np.zeros(6, dtype=np.uint8) + with pytest.raises(ValueError, match="unsupported bps"): + _swap_byte_lanes(buf, 3) + + +def test_swap_byte_lanes_rejects_misaligned_size(): + """Buffer size must be a multiple of bps.""" + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + buf = np.zeros(5, dtype=np.uint8) + with pytest.raises(ValueError, match="not a multiple"): + _swap_byte_lanes(buf, 2) + + +def test_swap_byte_lanes_numpy_is_zero_temp(): + """The numpy path must mutate the original buffer without realloc.""" + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8) + addr_before = buf.ctypes.data + _swap_byte_lanes(buf, 2) + assert buf.ctypes.data == addr_before + np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03], + dtype=np.uint8)) + + +@_gpu_only +@pytest.mark.parametrize("bps,dtype", [ + (2, np.uint16), + (4, np.uint32), + (8, np.uint64), +]) +def test_swap_byte_lanes_cupy_kernel(bps, dtype): + """The cupy path runs the CUDA kernel and matches numpy.byteswap.""" + import cupy + + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + rng = np.random.RandomState(20260512 + bps) + n_samples = 1024 + src = rng.randint(0, np.iinfo(dtype).max, size=n_samples, + dtype=np.uint64).astype(dtype) + expected = src.byteswap() # numpy reference, returns swapped copy + + d_buf = cupy.asarray(src.view(np.uint8)) + addr_before = int(d_buf.data.ptr) + _swap_byte_lanes(d_buf, bps) + addr_after = int(d_buf.data.ptr) + + assert addr_after == addr_before, "kernel must operate in place" + np.testing.assert_array_equal( + d_buf.get().view(dtype), expected, + ) + + +@_gpu_only +def test_swap_byte_lanes_cupy_uint8_noop(): + """bps=1 leaves cupy buffers untouched (no kernel launch).""" + import cupy + + from xrspatial.geotiff._gpu_decode import _swap_byte_lanes + + src = np.arange(16, dtype=np.uint8) + d_buf = cupy.asarray(src) + _swap_byte_lanes(d_buf, 1) + np.testing.assert_array_equal(d_buf.get(), src)