diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index bd7d97fe..4b27f245 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -1617,7 +1617,9 @@ def read_geotiff_gpu(source: str, *,
 
     from ._reader import (
         _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
+        _resolve_masked_fill,
     )
+    from ._compression import COMPRESSION_LERC
     from ._header import (
         parse_header, parse_all_ifds, select_overview_ifd, validate_tile_layout,
     )
@@ -1709,6 +1711,16 @@ def read_geotiff_gpu(source: str, *,
     # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline;
     # the CPU reader fills them with nodata and copies onto the GPU.
     has_sparse_tile = any(bc == 0 for bc in byte_counts)
+    # LERC tiles can carry a per-pixel valid mask that GDAL writes
+    # zero-filled in the data array.  Compute the nodata fill the same
+    # way the CPU reader does so the GPU decode path can restore it
+    # post-assembly (mirrors PR #1529 for the CPU path). Only the
+    # chunky (planar=1) GPU path threads masked_fill into its kernel
+    # call below; the planar=2 per-band branch falls back to the CPU
+    # reader for masked pixels (rare in practice -- LERC files
+    # typically use chunky layout).
+    masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype)
+                   if compression == COMPRESSION_LERC else None)
 
     # PlanarConfiguration=2 (separate bands): each band has its own list
     # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU
@@ -1795,6 +1807,7 @@ def _read_once():
                 tw, th, width, height,
                 compression, predictor, file_dtype, samples,
                 byte_order=header.byte_order,
+                masked_fill=masked_fill,
             )
         except Exception as e:
             if gpu == 'strict':
@@ -1826,6 +1839,7 @@ def _read_once():
                 tw, th, width, height,
                 compression, predictor, file_dtype, samples,
                 byte_order=header.byte_order,
+                masked_fill=masked_fill,
             )
         except Exception as e:
             if gpu == 'strict':
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index b2c51e09..ac8ba4bb 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -1452,6 +1452,7 @@ def gpu_decode_tiles_from_file(
     dtype: np.dtype,
     samples: int = 1,
     byte_order: str = '<',
+    masked_fill=None,
 ):
     """Decode tiles from a file, using GDS if available.
 
@@ -1496,7 +1497,7 @@ def gpu_decode_tiles_from_file(
     return gpu_decode_tiles(
         compressed_tiles, tile_width, tile_height,
         image_width, image_height, compression, predictor, dtype, samples,
-        byte_order=byte_order)
+        byte_order=byte_order, masked_fill=masked_fill)
 
 
 def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression):
@@ -1711,6 +1712,7 @@ def gpu_decode_tiles(
     dtype: np.dtype,
     samples: int = 1,
     byte_order: str = '<',
+    masked_fill=None,
 ):
     """Decode and assemble TIFF tiles entirely on GPU.
 
@@ -1730,6 +1732,12 @@ def gpu_decode_tiles(
         Output pixel dtype.
     samples : int
         Samples per pixel.
+    masked_fill : scalar or None
+        Value to write into pixels that LERC reports as invalid.  Only
+        consulted for LERC-compressed inputs (tag 34887).  ``None``
+        leaves any masked pixels at LERC's zero fill.  Use
+        ``_resolve_masked_fill(ifd.nodata_str, dtype)`` from
+        ``_reader.py`` to mirror the CPU reader's nodata semantics.
 
     Returns
     -------
@@ -1742,6 +1750,12 @@ def gpu_decode_tiles(
     bytes_per_pixel = dtype.itemsize * samples
     tile_bytes = tile_width * tile_height * bytes_per_pixel
 
+    # Per-tile LERC valid masks; populated only by the LERC branch
+    # below.  Kept as a flat local so the post-assembly fill block at
+    # the end of this function can find it regardless of which decode
+    # branch ran.
+    _lerc_masks: list[np.ndarray | None] | None = None
+
     # Try nvCOMP batch decompression first (much faster if available)
     nvcomp_result = _try_nvcomp_batch_decompress(
         compressed_tiles, tile_bytes, compression)
@@ -1869,19 +1883,34 @@ def gpu_decode_tiles(
             d_decomp_offsets = cupy.asarray(decomp_offsets)
 
     elif compression == 34887:  # LERC
-        from ._compression import lerc_decompress
+        from ._compression import lerc_decompress_with_mask
         raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8)
+        # Per-tile valid masks captured from LERC.  None entries mean
+        # "all valid" (LERC returned no mask, or an all-True mask).
+        # The host-side mask buffer is materialised lazily: it stays
+        # None until at least one tile reports a real partial mask.
+        per_tile_masks: list[np.ndarray | None] = [None] * n_tiles
+        any_lerc_mask = False
         for i, tile in enumerate(compressed_tiles):
             start = i * tile_bytes
-            chunk = np.frombuffer(
-                lerc_decompress(tile, tile_width, tile_height, samples),
-                dtype=np.uint8)
+            decoded_bytes, valid_mask = lerc_decompress_with_mask(tile)
+            chunk = np.frombuffer(decoded_bytes, dtype=np.uint8)
             raw_host[start:start + min(len(chunk), tile_bytes)] = \
                 chunk[:tile_bytes] if len(chunk) >= tile_bytes else \
                 np.pad(chunk, (0, tile_bytes - len(chunk)))
+            if valid_mask is not None:
+                per_tile_masks[i] = np.asarray(valid_mask)
+                any_lerc_mask = True
         d_decomp = cupy.asarray(raw_host)
         decomp_offsets = np.arange(n_tiles, dtype=np.int64) * tile_bytes
         d_decomp_offsets = cupy.asarray(decomp_offsets)
+        if any_lerc_mask:
+            # Stash per-tile masks for the post-assembly fill pass below.
+            # Stored in a ``_lerc_masks`` local that the LERC fill block
+            # picks up after predictor decode and tile assembly.
+            _lerc_masks = per_tile_masks
+        else:
+            _lerc_masks = None
 
     elif compression == 1:  # Uncompressed
         raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8)
@@ -1965,6 +1994,80 @@ def gpu_decode_tiles(
     if byte_order == '>' and dtype.itemsize > 1 and predictor != 2:
         # cupy.ndarray has no .byteswap(), so use the dtype-view helper.
         out = _xp_byteswap(out)
+
+    # LERC valid-mask fill: GDAL writes LERC TIFFs with masked pixels
+    # zero-filled in the data array, so without restoring nodata here a
+    # masked pixel reads back as a real zero measurement.  Mirrors the
+    # CPU path in ``_decode_strip_or_tile`` (PR #1529).
+    if _lerc_masks is not None and masked_fill is not None:
+        out = _apply_lerc_mask_fill(
+            out, _lerc_masks, tile_width, tile_height,
+            image_width, image_height, samples, masked_fill)
+    return out
+
+
+def _apply_lerc_mask_fill(out, per_tile_masks, tile_width, tile_height,
+                          image_width, image_height, samples, masked_fill):
+    """Write *masked_fill* into LERC-invalid positions of an assembled image.
+
+    Each tile contributes either an ``(h, w)``/``(h, w, samples)`` valid
+    mask (1=valid, 0=invalid) or ``None`` for "all valid".  The host
+    builds a single assembled boolean invalid-mask sized to the output
+    image, transfers it to the GPU once, and uses it to overwrite
+    masked positions on device.
+    """
+    import cupy
+
+    n_tiles = len(per_tile_masks)
+    tiles_across = math.ceil(image_width / tile_width)
+    tiles_down = math.ceil(image_height / tile_height)
+    if n_tiles != tiles_across * tiles_down:
+        # Defensive: tile grid mismatch means we cannot place masks
+        # safely.  Skip the fill rather than corrupt the output.
+        return out
+
+    invalid = np.zeros((image_height, image_width), dtype=bool)
+    for idx, mask in enumerate(per_tile_masks):
+        if mask is None:
+            continue
+        tr = idx // tiles_across
+        tc = idx % tiles_across
+        y0 = tr * tile_height
+        x0 = tc * tile_width
+        # Trim the tile mask to the visible portion of the image so
+        # right- and bottom-edge tile padding does not leak into the
+        # assembled mask.
+        h = min(tile_height, image_height - y0)
+        w = min(tile_width, image_width - x0)
+        if h <= 0 or w <= 0:
+            continue
+        m = np.asarray(mask)
+        # LERC may report the mask as (h, w) or (h, w, samples); for the
+        # invalid-fill we collapse multi-sample masks via "any sample
+        # invalid".
+        if m.ndim == 3:
+            m2 = (m == 0).any(axis=2)
+        else:
+            m2 = (m == 0)
+        invalid[y0:y0 + h, x0:x0 + w] = m2[:h, :w]
+
+    if not invalid.any():
+        return out
+
+    # Account for the boolean mask buffer up front so a borderline-OK
+    # decode doesn't tip into CUDA OOM on the mask transfer. Boolean
+    # indexing on cupy materialises a temporary index array (typically
+    # int64 of length sum(invalid)); cap that at the worst case of
+    # one int64 per pixel so the budget covers both buffers.
+    _check_gpu_memory(invalid.nbytes, what="LERC valid-mask buffer")
+    _check_gpu_memory(invalid.size * 8, what="LERC mask index buffer")
+
+    d_invalid = cupy.asarray(invalid)
+    if out.ndim == 3:
+        # Broadcast (H, W) mask across the sample axis.
+        out[d_invalid, ...] = out.dtype.type(masked_fill)
+    else:
+        out[d_invalid] = out.dtype.type(masked_fill)
     return out
 
 
diff --git a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
new file mode 100644
index 00000000..0f39697b
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
@@ -0,0 +1,181 @@
+"""GPU follow-up to PR #1529 (LERC valid-mask on decode).
+
+The CPU LERC reader honours the LERC valid-mask and writes the file's
+nodata sentinel into masked pixels.  The GPU LERC tile-decode path used
+to discard the mask, so masked pixels read back as LERC's zero fill
+(real-looking measurements at z == 0) on GPU but as NaN/sentinel on
+CPU.  These tests confirm the GPU path now matches the CPU path for
+representative LERC mask combinations.
+
+Mirrors the structure of ``test_lerc_valid_mask.py`` but compares
+``read_geotiff_gpu`` output to ``read_to_array`` output for each case.
+"""
+from __future__ import annotations
+
+import importlib.util
+
+import numpy as np
+import pytest
+
+lerc = pytest.importorskip("lerc")
+
+from xrspatial.geotiff._compression import LERC_AVAILABLE  # noqa: E402
+
+
+def _gpu_available() -> bool:
+    """True if cupy is importable and CUDA is initialised."""
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+_HAS_GPU = _gpu_available()
+_gpu_only = pytest.mark.skipif(
+    not (_HAS_GPU and LERC_AVAILABLE),
+    reason="cupy + CUDA + lerc required",
+)
+
+
+@pytest.fixture
+def lerc_writer_with_mask(monkeypatch):
+    """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass.
+
+    The xrspatial writer hard-codes ``hasMask=False`` in its call to
+    ``lerc.encode``.  Tests inject a per-tile mask through this holder's
+    ``invalid`` predicate so the masked pixels survive the encode and
+    show up at decode time.  Same pattern as the CPU test fixture in
+    ``test_lerc_valid_mask.py``.
+    """
+    holder = {"invalid": None}
+
+    def _patched(data, width, height, samples=1,
+                 dtype=np.dtype('float32'), max_z_error=0.0):
+        if samples == 1:
+            arr = np.frombuffer(data, dtype=dtype).reshape(height, width)
+        else:
+            arr = np.frombuffer(data, dtype=dtype).reshape(
+                height, width, samples)
+        invalid_pred = holder["invalid"]
+        if invalid_pred is None:
+            mask = None
+            has_mask = False
+        else:
+            invalid = invalid_pred(arr)
+            mask = np.where(invalid, np.uint8(0), np.uint8(1))
+            has_mask = True
+        result = lerc.encode(arr, samples, has_mask, mask, max_z_error, 1)
+        if result[0] != 0:
+            raise RuntimeError(
+                f"LERC encode failed with error code {result[0]}")
+        return bytes(result[2])
+
+    monkeypatch.setattr(
+        "xrspatial.geotiff._compression.lerc_compress", _patched,
+    )
+    return holder
+
+
+def _read_cpu_gpu(path):
+    """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``."""
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    cpu, _geo = read_to_array(path)
+    gpu_da = read_geotiff_gpu(path, gpu='strict')
+    gpu_host = gpu_da.data.get()
+    return cpu, gpu_host
+
+
+@_gpu_only
+class TestGpuLercValidMask:
+    """End-to-end TIFF round-trips comparing GPU vs CPU output."""
+
+    def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask):
+        """Float32 LERC + NaN nodata: GPU output matches CPU output."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
+        invalid_positions = {(0, 1), (5, 4)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_nan_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=float("nan"))
+
+        cpu, gpu = _read_cpu_gpu(path)
+        # NaN positions
+        for (r, c) in invalid_positions:
+            assert np.isnan(cpu[r, c])
+            assert np.isnan(gpu[r, c])
+        # Valid positions agree exactly
+        cpu_valid = np.where(np.isnan(cpu), 0.0, cpu)
+        gpu_valid = np.where(np.isnan(gpu), 0.0, gpu)
+        np.testing.assert_array_equal(cpu_valid, gpu_valid)
+
+    def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
+        """Float32 LERC + sentinel nodata (-9999): GPU matches CPU."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
+        invalid_positions = {(0, 1), (3, 3), (7, 7)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=-9999.0)
+
+        cpu, gpu = _read_cpu_gpu(path)
+        np.testing.assert_array_equal(cpu, gpu)
+        for (r, c) in invalid_positions:
+            assert gpu[r, c] == np.float32(-9999.0)
+
+    def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
+        """Uint16 LERC + sentinel nodata (65535): GPU matches CPU."""
+        from xrspatial.geotiff._writer import write
+
+        arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8)
+        invalid_positions = {(0, 1), (4, 4)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_uint16_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=65535)
+
+        cpu, gpu = _read_cpu_gpu(path)
+        np.testing.assert_array_equal(cpu, gpu)
+        for (r, c) in invalid_positions:
+            assert gpu[r, c] == np.uint16(65535)
+
+    def test_no_mask_roundtrip_bitexact(self, tmp_path):
+        """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(64, dtype=np.float32).reshape(8, 8)
+        path = str(tmp_path / "lerc_no_mask_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8)
+
+        cpu, gpu = _read_cpu_gpu(path)
+        np.testing.assert_array_equal(cpu, arr)
+        np.testing.assert_array_equal(gpu, arr)