Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,13 +509,18 @@ def open_geotiff(source, *, dtype=None, window=None,
attrs['colormap'] = _tv
break

# Apply nodata mask: replace nodata sentinel values with NaN
# Apply nodata mask: replace nodata sentinel values with NaN.
# ``arr`` came from ``read_to_array``, which returns a freshly
# allocated buffer from ``_read_tiles`` / ``_read_strips`` (and is
# ``np.ascontiguousarray``-wrapped if the orientation tag triggered
# a remap). Nothing else holds a reference here, so the in-place
# write is safe; an extra ``arr.copy()`` would just double peak
# memory for a multi-MB raster.
nodata = geo_info.nodata
if nodata is not None:
attrs['nodata'] = nodata
if arr.dtype.kind == 'f':
if not np.isnan(nodata):
arr = arr.copy()
arr[arr == arr.dtype.type(nodata)] = np.nan
elif arr.dtype.kind in ('u', 'i'):
# Integer arrays: convert to float to represent NaN
Expand Down Expand Up @@ -984,6 +989,11 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path, *,

# Restore NaN pixels to the nodata sentinel value so the written file
# has sentinel values matching the GDAL_NODATA tag.
#
# The defensive ``arr.copy()`` here is intentional: ``arr`` may be
# ``np.asarray(raw)`` of a caller-owned numpy DataArray (a view,
# not a copy) or ``np.moveaxis(arr, 0, -1)`` of one (also a view).
# Mutating without a copy would corrupt the user's input buffer.
if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata):
nan_mask = np.isnan(arr)
if nan_mask.any():
Expand Down Expand Up @@ -1043,7 +1053,12 @@ def _write_single_tile(chunk_data, path, geo_transform, epsg, wkt,
elif arr.dtype == np.bool_:
arr = arr.astype(np.uint8)

# Restore NaN to nodata sentinel
# Restore NaN to nodata sentinel.
#
# The defensive ``arr.copy()`` here is intentional: ``arr`` came
# from ``np.asarray(chunk_data)`` where ``chunk_data`` may be a
# caller-owned numpy buffer. Mutating without a copy would corrupt
# the user's input.
if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata):
nan_mask = np.isnan(arr)
if nan_mask.any():
Expand Down Expand Up @@ -1444,8 +1459,12 @@ def _read(http_meta):
overview_level=overview_level,
band=band)
if nodata is not None:
# ``arr`` was just decoded by ``_fetch_decode_cog_http_tiles``
# or ``read_to_array``; both return freshly-allocated buffers
# that nothing else references, so the in-place sentinel
# rewrite is safe. Skip the defensive ``arr.copy()`` to
# avoid a peak-memory doubler on every dask chunk.
if arr.dtype.kind == 'f' and not np.isnan(nodata):
arr = arr.copy()
arr[arr == arr.dtype.type(nodata)] = np.nan
elif arr.dtype.kind in ('u', 'i'):
mask = arr == arr.dtype.type(int(nodata))
Expand Down
226 changes: 226 additions & 0 deletions xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
"""Regression tests for issue #1553.

Two ``arr = arr.copy()`` calls on freshly-allocated buffers were
removed from the read path in ``xrspatial/geotiff/__init__.py``
(``_geotiff_to_xarray`` and ``_delayed_read_window``). The copies
were unnecessary because the source array was uniquely owned at that
point, so dropping them halves peak memory for a nodata-bearing read
without changing observed values.

These tests exercise the float and integer nodata sentinel paths on
both tiled and strip TIFFs, and on the dask-chunked read path that
flows through ``_delayed_read_window``. They confirm:

- Sentinel pixels become NaN on read.
- Non-sentinel pixels round-trip exactly.
- The dropped copies do not alias any caller-visible buffer (a second
read of the same file returns the expected mask, proving we did not
smear NaNs into shared state across calls).
"""
from __future__ import annotations

import numpy as np
import xarray as xr

from xrspatial.geotiff import open_geotiff, to_geotiff


def _make_float_with_sentinel(h=24, w=24, sentinel=-9999.0,
dtype=np.float32):
"""Float array with a few sentinel pixels at known positions."""
rng = np.random.default_rng(seed=1553)
arr = rng.uniform(0.0, 100.0, size=(h, w)).astype(dtype)
arr[0, 0] = sentinel
arr[5, 7] = sentinel
arr[-1, -1] = sentinel
return arr


def _make_uint16_with_sentinel(h=24, w=24, sentinel=65535):
"""uint16 array with a few sentinel pixels at known positions."""
rng = np.random.default_rng(seed=1554)
arr = rng.integers(0, 1000, size=(h, w), dtype=np.uint16)
arr[0, 0] = sentinel
arr[5, 7] = sentinel
arr[-1, -1] = sentinel
return arr


def test_float_sentinel_strip_tiff_read(tmp_path):
"""Strip-organized float32 TIFF with sentinel nodata reads correctly."""
src = _make_float_with_sentinel()
path = str(tmp_path / 'issue_1553_float_strip.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': -9999.0})
to_geotiff(da, path, nodata=-9999.0, tiled=False,
compression='deflate')

out = open_geotiff(path)
expected_mask = (src == np.float32(-9999.0))
np.testing.assert_array_equal(np.isnan(out.data), expected_mask)
finite = ~expected_mask
np.testing.assert_allclose(out.data[finite], src[finite])
assert out.attrs.get('nodata') == -9999.0


def test_float_sentinel_tiled_tiff_read(tmp_path):
"""Tiled float32 TIFF with sentinel nodata reads correctly."""
src = _make_float_with_sentinel(h=64, w=64)
path = str(tmp_path / 'issue_1553_float_tiled.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': -9999.0})
to_geotiff(da, path, nodata=-9999.0, tiled=True, tile_size=16,
compression='deflate')

out = open_geotiff(path)
expected_mask = (src == np.float32(-9999.0))
np.testing.assert_array_equal(np.isnan(out.data), expected_mask)
finite = ~expected_mask
np.testing.assert_allclose(out.data[finite], src[finite])


def test_uint16_sentinel_tiled_tiff_read(tmp_path):
"""Tiled uint16 TIFF with sentinel nodata is promoted to float+NaN."""
src = _make_uint16_with_sentinel(h=48, w=48)
path = str(tmp_path / 'issue_1553_uint16_tiled.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': 65535})
to_geotiff(da, path, nodata=65535, tiled=True, tile_size=16,
compression='deflate')

out = open_geotiff(path)
assert out.dtype.kind == 'f'
expected_mask = (src == 65535)
np.testing.assert_array_equal(np.isnan(out.data), expected_mask)
finite = ~expected_mask
np.testing.assert_array_equal(out.data[finite].astype(np.uint16),
src[finite])


def test_repeat_reads_independent(tmp_path):
"""Repeated reads return independent arrays with the correct mask.

If the dropped ``.copy()`` had been protecting against shared
state across reads, the second read would either see corrupted
values (NaN bleeding into other slots) or share a buffer with the
first call. We mutate the first result and then re-read to verify
the file's contents are not affected.
"""
src = _make_float_with_sentinel()
path = str(tmp_path / 'issue_1553_repeat_reads.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': -9999.0})
to_geotiff(da, path, nodata=-9999.0, compression='deflate')

first = open_geotiff(path)
expected_mask = (src == np.float32(-9999.0))
np.testing.assert_array_equal(np.isnan(first.data), expected_mask)

# Mutate the first read in place. If subsequent reads share state
# with this buffer, the second read will look corrupted.
first.data[1, 1] = np.nan
first.data[2, 2] = 12345.0

second = open_geotiff(path)
np.testing.assert_array_equal(np.isnan(second.data), expected_mask)
finite = ~expected_mask
np.testing.assert_allclose(second.data[finite], src[finite])


def test_dask_chunked_float_sentinel_read(tmp_path):
"""Dask-chunked read of a float TIFF with sentinel nodata.

Exercises ``_delayed_read_window`` — the second site where the
defensive copy was dropped. Each chunk runs the per-window nodata
rewrite path in a worker task.
"""
src = _make_float_with_sentinel(h=64, w=64)
path = str(tmp_path / 'issue_1553_dask_float.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': -9999.0})
to_geotiff(da, path, nodata=-9999.0, tiled=True, tile_size=16,
compression='deflate')

out = open_geotiff(path, chunks=16)
materialised = out.compute().data
expected_mask = (src == np.float32(-9999.0))
np.testing.assert_array_equal(np.isnan(materialised), expected_mask)
finite = ~expected_mask
np.testing.assert_allclose(materialised[finite], src[finite])


def test_dask_chunked_uint16_sentinel_read(tmp_path):
"""Dask-chunked read of a uint16 TIFF promotes to float+NaN per chunk."""
src = _make_uint16_with_sentinel(h=64, w=64)
path = str(tmp_path / 'issue_1553_dask_uint16.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': 65535})
to_geotiff(da, path, nodata=65535, tiled=True, tile_size=16,
compression='deflate')

out = open_geotiff(path, chunks=16)
materialised = out.compute().data
assert materialised.dtype.kind == 'f'
expected_mask = (src == 65535)
np.testing.assert_array_equal(np.isnan(materialised), expected_mask)
finite = ~expected_mask
np.testing.assert_array_equal(materialised[finite].astype(np.uint16),
src[finite])


def test_writer_does_not_mutate_caller_input(tmp_path):
"""``to_geotiff`` must not mutate the caller's input array.

Covers the defensive copy at the ``to_geotiff`` entry; the sibling
test below covers the second kept copy in ``_write_single_tile``
(the .vrt tiled-output path).
"""
Comment thread
brendancol marked this conversation as resolved.
src = np.array([
[1.0, 2.0, np.nan],
[np.nan, 5.0, 6.0],
[7.0, 8.0, 9.0],
], dtype=np.float32)
snapshot = src.copy()
path = str(tmp_path / 'issue_1553_writer_no_mutate.tif')
da = xr.DataArray(src, dims=('y', 'x'),
attrs={'crs': 4326, 'nodata': -9999.0})
to_geotiff(da, path, nodata=-9999.0, compression='deflate')

# Caller's buffer must still hold its original NaNs and finite
# values; the writer must not have stamped the sentinel value into
# the user's array in place.
np.testing.assert_array_equal(np.isnan(src), np.isnan(snapshot))
finite = ~np.isnan(snapshot)
np.testing.assert_array_equal(src[finite], snapshot[finite])


def test_write_single_tile_does_not_mutate_caller_input(tmp_path):
"""``_write_single_tile`` must not mutate the caller's array either.

The .vrt tiled-output path goes through ``_write_single_tile`` per
chunk. That helper has its own defensive copy at the
NaN -> sentinel rewrite. Direct invocation pins it: pass a numpy
buffer with NaNs, request an integer nodata sentinel, then assert
the source still has NaNs in the same places.
"""
from xrspatial.geotiff import _write_single_tile

src = np.array([
[1.0, np.nan, 3.0],
[4.0, 5.0, np.nan],
[np.nan, 8.0, 9.0],
], dtype=np.float32)
snapshot = src.copy()

out_path = str(tmp_path / 'issue_1553_single_tile_no_mutate.tif')
_write_single_tile(
src, out_path,
geo_transform=None, epsg=4326, wkt=None,
nodata=-9999.0, compression='deflate', compression_level=None,
tile_size=16, predictor=False, bigtiff=False,
)

# Source must still hold its original NaNs and finite values.
np.testing.assert_array_equal(np.isnan(src), np.isnan(snapshot))
finite = ~np.isnan(snapshot)
np.testing.assert_array_equal(src[finite], snapshot[finite])
Loading