From 229a46796f84a400bcf786e8a49c44b73c6017a1 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 8 May 2026 08:40:58 -0700
Subject: [PATCH 1/3] Fix GPU predictor=2 decode on big-endian TIFFs (#1517)

The per-dtype predictor=2 kernels view the raw decompressed byte buffer
as native uint16/uint32/uint64. On big-endian files the bytes are
MSB-first, so the prefix-sum runs on the wrong integer interpretation
and the differencing produces garbage (~93% pixel mismatch in the
reproducer from #1517).

Swap the buffer to native byte order before running the predictor=2
kernel, then skip the post-assembly byteswap that previously tried to
fix things up. Predictor=1 and predictor=3 paths keep the existing
post-decode swap.

Closes #1517. Builds on the byteswap helper added in #1515.
---
 xrspatial/geotiff/_gpu_decode.py              |  51 +++-
 .../test_predictor2_big_endian_gpu_1517.py    | 219 ++++++++++++++++++
 2 files changed, 267 insertions(+), 3 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 2afe393d..17219a0b 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -78,6 +78,29 @@ def _xp_byteswap(arr):
     return u8[..., ::-1].copy().view(arr.dtype).reshape(arr.shape)
 
 
+def _swap_bytes_inplace(buf, bps: int) -> None:
+    """Reverse bytes within each *bps*-sized sample of a flat uint8 buffer.
+
+    Used by the GPU predictor=2 path to convert the raw decompressed byte
+    stream from file byte order to native byte order before differencing
+    (#1517). The per-dtype predictor kernels view ``buf`` as native
+    unsigned integers, so on big-endian files the prefix-sum would run on
+    a byte-swapped integer interpretation and produce wrong values.
+
+    Operates on the underlying buffer of *buf* via a uint8 reshape, so
+    callers see the bytes mutated in place. Works on both numpy and cupy
+    arrays.
+    """
+    if bps <= 1:
+        return
+    n = buf.size
+    if n % bps != 0:
+        raise ValueError(
+            f"buffer size {n} is not a multiple of bps={bps}")
+    view = buf.reshape(n // bps, bps)
+    view[:, :] = view[:, ::-1].copy()
+
+
 # LZW constants (same as _compression.py)
 LZW_CLEAR_CODE = 256
 LZW_EOI_CODE = 257
@@ -1538,6 +1561,15 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
 
     if predictor == 2:
         total_rows = n_tiles * tile_height
+        # Predictor=2 differences adjacent samples at the sample's natural
+        # width (uint8/16/32/64). The per-dtype kernels view the byte
+        # buffer as native unsigned dtype, so on big-endian files we must
+        # swap the bytes to native order BEFORE running the kernel,
+        # otherwise the prefix-sum runs on the wrong integer
+        # interpretation (#1517). The pre-swap then makes the post-
+        # assembly byteswap unnecessary; see the BE branch below.
+        if big_endian and dtype.itemsize > 1:
+            _swap_bytes_inplace(d_decomp, dtype.itemsize)
         # Sample-level differencing: stride is samples_per_pixel samples,
         # row width is tile_width pixels.  Per-dtype kernels handle the
         # modular wrap at the sample's natural bit width.
@@ -1575,7 +1607,12 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
     else:
         out = d_output.view(dtype=cupy.dtype(dtype)).reshape(
             image_height, image_width)
-    if big_endian and dtype.itemsize > 1:
+    # Predictor=2 BE swapped d_decomp to native order pre-decode (#1517),
+    # so the assembled output is already native; skip the final swap.
+    needs_post_swap = (
+        big_endian and dtype.itemsize > 1 and predictor != 2
+    )
+    if needs_post_swap:
         # See gpu_decode_tiles for why BE samples need a final byteswap.
         # cupy.ndarray has no .byteswap(), so use the dtype-view helper.
         out = _xp_byteswap(out)
@@ -1793,6 +1830,12 @@ def gpu_decode_tiles(
     if predictor == 2:
         # Sample-level horizontal differencing: stride is samples_per_pixel
         # samples; per-dtype kernels handle the natural-width modular wrap.
+        # On big-endian multi-byte files the kernels would otherwise view
+        # the buffer with the wrong integer interpretation, so swap to
+        # native order first and skip the post-assembly swap below
+        # (#1517).
+        if byte_order == '>' and dtype.itemsize > 1:
+            _swap_bytes_inplace(d_decomp, dtype.itemsize)
         total_rows = n_tiles * tile_height
         _gpu_predictor2_decode(
             d_decomp, tile_width, total_rows, dtype, samples)
@@ -1835,8 +1878,10 @@ def gpu_decode_tiles(
     # The decoded byte stream is in the file's byte order; cupy view
     # interprets it as native (always little-endian on supported GPUs),
     # so big-endian samples that are wider than a byte must be swapped
-    # back to native before the values mean anything.
-    if byte_order == '>' and dtype.itemsize > 1:
+    # back to native before the values mean anything. Predictor=2 BE
+    # already swapped the buffer pre-decode (#1517), so skip the swap
+    # in that case.
+    if byte_order == '>' and dtype.itemsize > 1 and predictor != 2:
         # cupy.ndarray has no .byteswap(), so use the dtype-view helper.
         out = _xp_byteswap(out)
     return out
diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
new file mode 100644
index 00000000..af456491
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
@@ -0,0 +1,219 @@
+"""Regression tests for issue #1517.
+
+PR #1515 fixed the ``AttributeError: 'ndarray' object has no attribute
+'byteswap'`` crash on big-endian multi-byte TIFFs read via
+``read_geotiff_gpu``. After that fix the GPU path no longer raised, but
+predictor=2 BE files came back with wrong values: the per-dtype
+predictor kernels view the byte buffer as native unsigned integers, so
+on a BE file the prefix-sum runs on the wrong integer interpretation
+and the differencing produces garbage.
+
+These tests confirm the GPU output now matches the CPU
+``read_to_array`` baseline for predictor=2 BE files across several
+dtypes and tile layouts, and that the LE predictor=2 path still
+round-trips.
+"""
+from __future__ import annotations
+
+import importlib.util
+
+import numpy as np
+import pytest
+
+
+def _gpu_available() -> bool:
+    """True if cupy is importable and CUDA is initialised."""
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+_HAS_GPU = _gpu_available()
+_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None
+_gpu_only = pytest.mark.skipif(
+    not (_HAS_GPU and _HAS_TIFFFILE),
+    reason="cupy + CUDA + tifffile required",
+)
+
+
+@_gpu_only
+def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path):
+    """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260507)
+    arr = rng.randint(
+        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
+    ).astype(np.int32)
+
+    path = tmp_path / "be_pred2_int32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray), (
+        "expected cupy-backed DataArray; GPU path may have fallen back"
+    )
+    assert gpu_da.data.dtype == np.dtype(np.int32)
+    assert gpu_da.data.dtype.isnative
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only
+@pytest.mark.parametrize(
+    "dtype",
+    [np.uint16, np.int16, np.uint32, np.int32],
+)
+def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, dtype):
+    """BE predictor=2 tiled files match CPU baseline across dtypes."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260508)
+    info = np.iinfo(dtype)
+    arr = rng.randint(
+        max(info.min, -1_000_000),
+        min(info.max, 1_000_000),
+        size=(32, 48),
+        dtype=np.int64,
+    ).astype(dtype)
+
+    path = tmp_path / f"be_pred2_{np.dtype(dtype).name}.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(dtype)
+    assert gpu_da.data.dtype.isnative
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only
+def test_gpu_predictor2_big_endian_stripped_uint16(tmp_path):
+    """Stripped BE predictor=2 files take the CPU fallback but stay correct.
+
+    ``read_geotiff_gpu`` falls back to the CPU reader for stripped
+    layouts, then transfers the result to GPU. The fix must not regress
+    that path.
+    """
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260509)
+    arr = rng.randint(0, 60000, size=(32, 48), dtype=np.uint16)
+
+    path = tmp_path / "be_pred2_uint16_strip.tif"
+    # Omit ``tile`` to get the strip layout.
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2, compression="deflate",
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.uint16)
+    assert gpu_da.data.dtype.isnative
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only
+def test_gpu_predictor2_little_endian_still_works(tmp_path):
+    """LE predictor=2 must still round-trip after the BE fix."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260510)
+    arr = rng.randint(
+        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
+    ).astype(np.int32)
+
+    path = tmp_path / "le_pred2_int32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder="<", predictor=2,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.int32)
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only
+def test_gpu_predictor3_big_endian_still_works(tmp_path):
+    """Floating-point predictor BE must still match CPU after the fix."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260511)
+    arr = rng.standard_normal((32, 48)).astype(np.float32)
+
+    path = tmp_path / "be_pred3_float32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=3,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.float32)
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+def test_swap_bytes_inplace_numpy():
+    """The byte-swap helper reverses bytes per sample on a numpy buffer."""
+    from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace
+
+    # uint16 values 0x0102, 0x0304 in BE bytes: 01 02 03 04
+    buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
+    _swap_bytes_inplace(buf, 2)
+    np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
+                                                dtype=np.uint8))
+
+
+def test_swap_bytes_inplace_uint8_noop():
+    """bps=1 must be a no-op."""
+    from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace
+
+    buf = np.array([1, 2, 3], dtype=np.uint8)
+    _swap_bytes_inplace(buf, 1)
+    np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8))

From 26b16110e1a7ee4b8d84eecbc2b9d12f500773bb Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 8 May 2026 10:11:04 -0700
Subject: [PATCH 2/3] Address Copilot review on #1524

- Rename _swap_bytes_inplace -> _swap_byte_lanes. The previous name
  implied a zero-copy operation but view[:, ::-1].copy() allocates a
  same-sized temporary. The docstring now states the temp explicitly so
  callers can size their working set, and a CUDA kernel is left as a
  follow-up if peak GPU memory becomes an issue.
- Add _block_cpu_fallback helper that monkeypatches the
  read_to_array binding inside xrspatial.geotiff. When the GPU tests
  call read_geotiff_gpu, any silent CPU fallback now raises an
  AssertionError instead of producing a passing-via-fallback result.
- Apply the guard to the four tests where the GPU decode path is what
  is being exercised (BE int32 reproducer, BE dtype matrix, LE pred2,
  BE pred3). The stripped-uint16 test still relies on the CPU
  fallback by design and stays unguarded.
---
 xrspatial/geotiff/_gpu_decode.py              | 14 ++---
 .../test_predictor2_big_endian_gpu_1517.py    | 54 ++++++++++++++-----
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 17219a0b..5a200244 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -78,7 +78,7 @@ def _xp_byteswap(arr):
     return u8[..., ::-1].copy().view(arr.dtype).reshape(arr.shape)
 
 
-def _swap_bytes_inplace(buf, bps: int) -> None:
+def _swap_byte_lanes(buf, bps: int) -> None:
     """Reverse bytes within each *bps*-sized sample of a flat uint8 buffer.
 
     Used by the GPU predictor=2 path to convert the raw decompressed byte
@@ -87,9 +87,11 @@ def _swap_bytes_inplace(buf, bps: int) -> None:
     unsigned integers, so on big-endian files the prefix-sum would run on
     a byte-swapped integer interpretation and produce wrong values.
 
-    Operates on the underlying buffer of *buf* via a uint8 reshape, so
-    callers see the bytes mutated in place. Works on both numpy and cupy
-    arrays.
+    The original buffer is mutated, but the swap goes through a same-sized
+    temporary (``view[:, ::-1].copy()``); this roughly doubles peak memory
+    for the buffer being swapped. Callers should size their working set
+    accordingly. A true zero-temp swap is possible with a small CUDA
+    kernel; for now the simple form keeps numpy and cupy on the same path.
     """
     if bps <= 1:
         return
@@ -1569,7 +1571,7 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
         # interpretation (#1517). The pre-swap then makes the post-
         # assembly byteswap unnecessary; see the BE branch below.
         if big_endian and dtype.itemsize > 1:
-            _swap_bytes_inplace(d_decomp, dtype.itemsize)
+            _swap_byte_lanes(d_decomp, dtype.itemsize)
         # Sample-level differencing: stride is samples_per_pixel samples,
         # row width is tile_width pixels.  Per-dtype kernels handle the
         # modular wrap at the sample's natural bit width.
@@ -1835,7 +1837,7 @@ def gpu_decode_tiles(
         # native order first and skip the post-assembly swap below
         # (#1517).
         if byte_order == '>' and dtype.itemsize > 1:
-            _swap_bytes_inplace(d_decomp, dtype.itemsize)
+            _swap_byte_lanes(d_decomp, dtype.itemsize)
         total_rows = n_tiles * tile_height
         _gpu_predictor2_decode(
             d_decomp, tile_width, total_rows, dtype, samples)
diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
index af456491..e6790639 100644
--- a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
+++ b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
@@ -40,8 +40,34 @@ def _gpu_available() -> bool:
 )
 
 
+def _block_cpu_fallback(monkeypatch):
+    """Make any call to ``read_to_array`` from ``read_geotiff_gpu`` fail loudly.
+
+    ``read_geotiff_gpu`` returns a cupy-backed array even when its silent CPU
+    fallback fires (the fallback wraps the CPU result with ``cupy.asarray``),
+    so ``isinstance(gpu_da.data, cupy.ndarray)`` cannot distinguish the two
+    paths. Patching the module-bound ``read_to_array`` to raise turns any
+    silent fallback into a hard test failure, which is what we want when the
+    point of a test is to exercise the actual GPU decode kernels.
+
+    Tests that legitimately rely on the CPU fallback (e.g. stripped
+    layouts) must not call this helper.
+    """
+    from xrspatial import geotiff as geotiff_pkg
+
+    def _no_fallback(*args, **kwargs):
+        raise AssertionError(
+            "read_geotiff_gpu fell back to read_to_array; "
+            "the GPU decode path was not exercised."
+        )
+
+    monkeypatch.setattr(
+        geotiff_pkg, 'read_to_array', _no_fallback, raising=True,
+    )
+
+
 @_gpu_only
-def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path):
+def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path, monkeypatch):
     """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2."""
     import cupy
     import tifffile
@@ -63,10 +89,9 @@ def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path):
     cpu, _ = read_to_array(str(path))
     np.testing.assert_array_equal(cpu, arr)
 
+    _block_cpu_fallback(monkeypatch)
     gpu_da = read_geotiff_gpu(str(path))
-    assert isinstance(gpu_da.data, cupy.ndarray), (
-        "expected cupy-backed DataArray; GPU path may have fallen back"
-    )
+    assert isinstance(gpu_da.data, cupy.ndarray)
     assert gpu_da.data.dtype == np.dtype(np.int32)
     assert gpu_da.data.dtype.isnative
     np.testing.assert_array_equal(gpu_da.data.get(), cpu)
@@ -77,7 +102,7 @@ def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path):
     "dtype",
     [np.uint16, np.int16, np.uint32, np.int32],
 )
-def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, dtype):
+def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, monkeypatch, dtype):
     """BE predictor=2 tiled files match CPU baseline across dtypes."""
     import cupy
     import tifffile
@@ -103,6 +128,7 @@ def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, dtype):
     cpu, _ = read_to_array(str(path))
     np.testing.assert_array_equal(cpu, arr)
 
+    _block_cpu_fallback(monkeypatch)
     gpu_da = read_geotiff_gpu(str(path))
     assert isinstance(gpu_da.data, cupy.ndarray)
     assert gpu_da.data.dtype == np.dtype(dtype)
@@ -144,7 +170,7 @@ def test_gpu_predictor2_big_endian_stripped_uint16(tmp_path):
 
 
 @_gpu_only
-def test_gpu_predictor2_little_endian_still_works(tmp_path):
+def test_gpu_predictor2_little_endian_still_works(tmp_path, monkeypatch):
     """LE predictor=2 must still round-trip after the BE fix."""
     import cupy
     import tifffile
@@ -166,6 +192,7 @@ def test_gpu_predictor2_little_endian_still_works(tmp_path):
     cpu, _ = read_to_array(str(path))
     np.testing.assert_array_equal(cpu, arr)
 
+    _block_cpu_fallback(monkeypatch)
     gpu_da = read_geotiff_gpu(str(path))
     assert isinstance(gpu_da.data, cupy.ndarray)
     assert gpu_da.data.dtype == np.dtype(np.int32)
@@ -173,7 +200,7 @@ def test_gpu_predictor2_little_endian_still_works(tmp_path):
 
 
 @_gpu_only
-def test_gpu_predictor3_big_endian_still_works(tmp_path):
+def test_gpu_predictor3_big_endian_still_works(tmp_path, monkeypatch):
     """Floating-point predictor BE must still match CPU after the fix."""
     import cupy
     import tifffile
@@ -193,27 +220,28 @@ def test_gpu_predictor3_big_endian_still_works(tmp_path):
     cpu, _ = read_to_array(str(path))
     np.testing.assert_array_equal(cpu, arr)
 
+    _block_cpu_fallback(monkeypatch)
     gpu_da = read_geotiff_gpu(str(path))
     assert isinstance(gpu_da.data, cupy.ndarray)
     assert gpu_da.data.dtype == np.dtype(np.float32)
     np.testing.assert_array_equal(gpu_da.data.get(), cpu)
 
 
-def test_swap_bytes_inplace_numpy():
+def test_swap_byte_lanes_numpy():
     """The byte-swap helper reverses bytes per sample on a numpy buffer."""
-    from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
 
     # uint16 values 0x0102, 0x0304 in BE bytes: 01 02 03 04
     buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
-    _swap_bytes_inplace(buf, 2)
+    _swap_byte_lanes(buf, 2)
     np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
                                                 dtype=np.uint8))
 
 
-def test_swap_bytes_inplace_uint8_noop():
+def test_swap_byte_lanes_uint8_noop():
     """bps=1 must be a no-op."""
-    from xrspatial.geotiff._gpu_decode import _swap_bytes_inplace
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
 
     buf = np.array([1, 2, 3], dtype=np.uint8)
-    _swap_bytes_inplace(buf, 1)
+    _swap_byte_lanes(buf, 1)
     np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8))

From cbac10c255fb2f396fd85a38125bfa9aa9452718 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 8 May 2026 10:16:57 -0700
Subject: [PATCH 3/3] Make _swap_byte_lanes truly zero-temp

Replaces the view[:, ::-1].copy() form with in-place byte reversal:

- numpy: dispatch to ndarray.byteswap(inplace=True) via a uint16/32/64
  view of the original buffer, so the original allocation is mutated
  with no temp.
- cupy: launch _byte_swap_lanes_kernel, one thread per sample, swapping
  bytes through register-resident temporaries. Peak GPU memory for the
  swap is now O(1) instead of O(buffer size).

Adds explicit validation: bps must be 2, 4, or 8 (3/5/6/7 are rejected
rather than silently corrupting), and size must be a multiple of bps.

Tests cover bps=2/4/8 on numpy and cupy, the bps=1 no-op, the
validation errors, and address-equality assertions confirming both
paths really are in-place. The existing integration tests
(test_gpu_predictor2_big_endian_*, predictor3_big_endian) still pass
unchanged.
---
 xrspatial/geotiff/_gpu_decode.py              | 47 +++++++--
 .../test_predictor2_big_endian_gpu_1517.py    | 97 ++++++++++++++++++-
 2 files changed, 136 insertions(+), 8 deletions(-)

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 5a200244..d18b5cfd 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -78,6 +78,30 @@ def _xp_byteswap(arr):
     return u8[..., ::-1].copy().view(arr.dtype).reshape(arr.shape)
 
 
+@cuda.jit
+def _byte_swap_lanes_kernel(buf, bps):
+    """Reverse bytes within each *bps*-sized sample, one thread per sample.
+
+    Each thread loads ``bps`` bytes from its sample, swaps them in place
+    via register-resident temporaries, and writes them back. No global
+    memory beyond *buf* is touched, so peak GPU memory is unchanged.
+    """
+    i = cuda.grid(1)
+    n_samples = buf.size // bps
+    if i >= n_samples:
+        return
+    base = i * bps
+    half = bps // 2
+    for j in range(half):
+        a = buf[base + j]
+        b = buf[base + bps - 1 - j]
+        buf[base + j] = b
+        buf[base + bps - 1 - j] = a
+
+
+_BPS_TO_UINT = {2: np.uint16, 4: np.uint32, 8: np.uint64}
+
+
 def _swap_byte_lanes(buf, bps: int) -> None:
     """Reverse bytes within each *bps*-sized sample of a flat uint8 buffer.
 
@@ -87,11 +111,11 @@ def _swap_byte_lanes(buf, bps: int) -> None:
     unsigned integers, so on big-endian files the prefix-sum would run on
     a byte-swapped integer interpretation and produce wrong values.
 
-    The original buffer is mutated, but the swap goes through a same-sized
-    temporary (``view[:, ::-1].copy()``); this roughly doubles peak memory
-    for the buffer being swapped. Callers should size their working set
-    accordingly. A true zero-temp swap is possible with a small CUDA
-    kernel; for now the simple form keeps numpy and cupy on the same path.
+    The swap is true in-place: no same-sized temporary is allocated.
+    On numpy arrays it dispatches to ``ndarray.byteswap(inplace=True)``
+    via a uint16/32/64 view; on cupy device arrays it launches
+    :func:`_byte_swap_lanes_kernel`, which swaps bytes per sample using
+    only register-resident temporaries.
     """
     if bps <= 1:
         return
@@ -99,8 +123,17 @@ def _swap_byte_lanes(buf, bps: int) -> None:
     if n % bps != 0:
         raise ValueError(
             f"buffer size {n} is not a multiple of bps={bps}")
-    view = buf.reshape(n // bps, bps)
-    view[:, :] = view[:, ::-1].copy()
+    if bps not in _BPS_TO_UINT:
+        raise ValueError(f"unsupported bps={bps}; expected 2, 4, or 8")
+
+    if isinstance(buf, np.ndarray):
+        buf.view(_BPS_TO_UINT[bps]).byteswap(inplace=True)
+        return
+
+    n_samples = n // bps
+    threads = 256
+    blocks = (n_samples + threads - 1) // threads
+    _byte_swap_lanes_kernel[blocks, threads](buf, bps)
 
 
 # LZW constants (same as _compression.py)
diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
index e6790639..f2fee9ac 100644
--- a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
+++ b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
@@ -227,7 +227,7 @@ def test_gpu_predictor3_big_endian_still_works(tmp_path, monkeypatch):
     np.testing.assert_array_equal(gpu_da.data.get(), cpu)
 
 
-def test_swap_byte_lanes_numpy():
+def test_swap_byte_lanes_numpy_bps2():
     """The byte-swap helper reverses bytes per sample on a numpy buffer."""
     from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
 
@@ -238,6 +238,29 @@ def test_swap_byte_lanes_numpy():
                                                 dtype=np.uint8))
 
 
+def test_swap_byte_lanes_numpy_bps4():
+    """bps=4: full byte reversal within each 4-byte sample."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.array([0x01, 0x02, 0x03, 0x04,
+                    0x05, 0x06, 0x07, 0x08], dtype=np.uint8)
+    _swap_byte_lanes(buf, 4)
+    np.testing.assert_array_equal(
+        buf, np.array([0x04, 0x03, 0x02, 0x01,
+                       0x08, 0x07, 0x06, 0x05], dtype=np.uint8))
+
+
+def test_swap_byte_lanes_numpy_bps8():
+    """bps=8: full byte reversal within each 8-byte sample."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    sample = np.arange(1, 9, dtype=np.uint8)
+    buf = np.tile(sample, 2).copy()
+    _swap_byte_lanes(buf, 8)
+    np.testing.assert_array_equal(
+        buf, np.tile(sample[::-1], 2))
+
+
 def test_swap_byte_lanes_uint8_noop():
     """bps=1 must be a no-op."""
     from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
@@ -245,3 +268,75 @@ def test_swap_byte_lanes_uint8_noop():
     buf = np.array([1, 2, 3], dtype=np.uint8)
     _swap_byte_lanes(buf, 1)
     np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8))
+
+
+def test_swap_byte_lanes_rejects_unsupported_bps():
+    """Unsupported bps values raise ValueError rather than corrupt data."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.zeros(6, dtype=np.uint8)
+    with pytest.raises(ValueError, match="unsupported bps"):
+        _swap_byte_lanes(buf, 3)
+
+
+def test_swap_byte_lanes_rejects_misaligned_size():
+    """Buffer size must be a multiple of bps."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.zeros(5, dtype=np.uint8)
+    with pytest.raises(ValueError, match="not a multiple"):
+        _swap_byte_lanes(buf, 2)
+
+
+def test_swap_byte_lanes_numpy_is_zero_temp():
+    """The numpy path must mutate the original buffer without realloc."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
+    addr_before = buf.ctypes.data
+    _swap_byte_lanes(buf, 2)
+    assert buf.ctypes.data == addr_before
+    np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
+                                                dtype=np.uint8))
+
+
+@_gpu_only
+@pytest.mark.parametrize("bps,dtype", [
+    (2, np.uint16),
+    (4, np.uint32),
+    (8, np.uint64),
+])
+def test_swap_byte_lanes_cupy_kernel(bps, dtype):
+    """The cupy path runs the CUDA kernel and matches numpy.byteswap."""
+    import cupy
+
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    rng = np.random.RandomState(20260512 + bps)
+    n_samples = 1024
+    src = rng.randint(0, np.iinfo(dtype).max, size=n_samples,
+                      dtype=np.uint64).astype(dtype)
+    expected = src.byteswap()  # numpy reference, returns swapped copy
+
+    d_buf = cupy.asarray(src.view(np.uint8))
+    addr_before = int(d_buf.data.ptr)
+    _swap_byte_lanes(d_buf, bps)
+    addr_after = int(d_buf.data.ptr)
+
+    assert addr_after == addr_before, "kernel must operate in place"
+    np.testing.assert_array_equal(
+        d_buf.get().view(dtype), expected,
+    )
+
+
+@_gpu_only
+def test_swap_byte_lanes_cupy_uint8_noop():
+    """bps=1 leaves cupy buffers untouched (no kernel launch)."""
+    import cupy
+
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    src = np.arange(16, dtype=np.uint8)
+    d_buf = cupy.asarray(src)
+    _swap_byte_lanes(d_buf, 1)
+    np.testing.assert_array_equal(d_buf.get(), src)