From abdec47f2c3de514a02d14f08fffe3fc097ed729 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:37:49 -0700 Subject: [PATCH 01/16] wip --- cuda_core/cuda/core/_memory/_buffer.pxd | 1 + cuda_core/cuda/core/_memory/_buffer.pyx | 284 ++++++++++++++++++ cuda_core/docs/source/release/0.7.x-notes.rst | 5 + cuda_core/tests/test_memory.py | 127 ++++++++ 4 files changed, 417 insertions(+) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 91c0cfe24a..04b5707e18 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -12,6 +12,7 @@ cdef struct _MemAttrs: int device_id bint is_device_accessible bint is_host_accessible + bint is_managed cdef class Buffer: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 83009f74ae..686585b527 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -72,6 +72,194 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`. """ + +cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( + "device", + "host", + "host_numa", + "host_numa_current", +) + +cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { + "device": "CU_MEM_LOCATION_TYPE_DEVICE", + "host": "CU_MEM_LOCATION_TYPE_HOST", + "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", + "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", +} + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", + "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + "unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( + "set_accessed_by", + "unset_accessed_by", +)) + + +cdef inline object _managed_location_enum(str location_type): + cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] + if not hasattr(driver.CUmemLocationType, attr_name): + raise RuntimeError( + f"Managed-memory location type {location_type!r} is not supported by the " + f"installed cuda.bindings package." + ) + return getattr(driver.CUmemLocationType, attr_name) + + +cdef inline object _make_managed_location(str location_type, int location_id): + cdef object location = driver.CUmemLocation() + location.type = _managed_location_enum(location_type) + if location_type == "host": + location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) + elif location_type == "host_numa_current": + location.id = 0 + else: + location.id = location_id + return location + + +cdef inline tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + if alias.startswith("cu_mem_advise_"): + continue + if advice == getattr(driver.CUmem_advise, attr_name): + return alias, advice + raise ValueError(f"Unsupported advice value: {advice!r}") + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef inline object _normalize_managed_location( + object location, + object location_type, + str what, + bint allow_none=False, + bint allow_host=True, + bint allow_host_numa=True, + bint allow_host_numa_current=True, +): + cdef object loc_type + cdef int loc_id + + if isinstance(location, Device): + location = (location).device_id + + if location_type is not None and not isinstance(location_type, str): + raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") + + loc_type = None if location_type is None else (location_type).lower() + if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: + raise ValueError( + f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " + f"or None, got {location_type!r}" + ) + + if loc_type is None: + if location is None: + if allow_none: + return _make_managed_location("host", -1) + raise ValueError(f"{what} requires a location") + if not isinstance(location, int): + raise TypeError( + f"{what} location must be a Device, int, or None, got {type(location).__name__}" + ) + loc_id = location + if loc_id == -1: + loc_type = "host" + elif loc_id >= 0: + loc_type = "device" + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" + ) + elif loc_type == "device": + if isinstance(location, int) and location >= 0: + loc_id = location + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" + ) + return _make_managed_location(loc_type, loc_id) + elif loc_type == "host": + if location not in (None, -1): + raise ValueError( + f"{what} location must be None or -1 when location_type is 'host', got {location!r}" + ) + if not allow_host: + raise ValueError(f"{what} does not support location_type='host'") + return _make_managed_location(loc_type, -1) + elif loc_type == "host_numa": + if not allow_host_numa: + raise ValueError(f"{what} does not support location_type='host_numa'") + if not isinstance(location, int) or location < 0: + raise ValueError( + f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" + ) + return _make_managed_location(loc_type, location) + else: + if not allow_host_numa_current: + raise ValueError(f"{what} does not support location_type='host_numa_current'") + if location is not None: + raise ValueError( + f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" + ) + return _make_managed_location(loc_type, 0) + + if loc_type == "host" and not allow_host: + raise ValueError(f"{what} does not support host locations") + if loc_type == "host_numa" and not allow_host_numa: + raise ValueError(f"{what} does not support location_type='host_numa'") + if loc_type == "host_numa_current" and not allow_host_numa_current: + raise ValueError(f"{what} does not support location_type='host_numa_current'") + return _make_managed_location(loc_type, loc_id) + + +cdef inline void _require_managed_buffer(Buffer self, str what): + _init_mem_attrs(self) + if not self._mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory buffer") + + +cdef inline void _require_managed_discard_prefetch_support(): + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + raise RuntimeError( + "Buffer.discard_prefetch requires cuda.bindings support for " + "cuMemDiscardAndPrefetchBatchAsync" + ) + cdef class Buffer: """Represent a handle to allocated memory. @@ -293,6 +481,99 @@ cdef class Buffer: finally: PyBuffer_Release(&buf) + def advise( + self, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + location_type: str | None = None, + ): + """Apply a managed-memory advice to this buffer. + + This method is only valid for buffers backed by managed memory. + + Parameters + ---------- + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, + ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + _require_managed_buffer(self, "Buffer.advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "Buffer.advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allow_host=True, + allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, + allow_host_numa_current=advice_name == "set_preferred_location", + ) + handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + + def prefetch( + self, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + location_type: str | None = None, + ): + """Prefetch this managed-memory buffer to a target location.""" + cdef Stream s = Stream_accept(stream) + _require_managed_buffer(self, "Buffer.prefetch") + location = _normalize_managed_location( + location, + location_type, + "Buffer.prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + + def discard_prefetch( + self, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + location_type: str | None = None, + ): + """Discard this managed-memory buffer and prefetch it to a target location.""" + cdef Stream s = Stream_accept(stream) + _require_managed_buffer(self, "Buffer.discard_prefetch") + _require_managed_discard_prefetch_support() + location = _normalize_managed_location( + location, + location_type, + "Buffer.discard_prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [self.handle], + [self._size], + 1, + [location], + [0], + 1, + 0, + s.handle, + ) + ) + def __dlpack__( self, *, @@ -453,6 +734,7 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 + out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -461,10 +743,12 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id + out.is_managed = is_managed != 0 elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id + out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 98551603b6..18b3bede36 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,6 +35,11 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. +- Added managed-memory controls on :class:`Buffer`: ``advise()``, + ``prefetch()``, and ``discard_prefetch()``. These methods validate that the + underlying allocation is managed memory and then forward to the corresponding + CUDA driver operations for range advice and migration. + - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. When ``ipc_enabled=True`` and ``numa_id`` is not set, the NUMA node is automatically derived from the diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0473d2d183..dd146785ec 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1134,6 +1134,133 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +def _get_mem_range_attr(buffer, attribute, data_size): + return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) + + +def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("discard-prefetch requires cuda.bindings support") + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(4096) + stream = device.create_stream() + + buffer.advise("set_read_mostly") + assert _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + 4, + ) == 1 + + buffer.advise("set_preferred_location", device, location_type="device") + preferred_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, + 4, + ) + preferred_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + 4, + ) + assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert preferred_id == device.device_id + + buffer.prefetch(-1, stream=stream) + stream.sync() + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST) + + buffer.discard_prefetch(device, stream=stream) + stream.sync() + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + last_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert last_id == device.device_id + + buffer.close() + + +def test_managed_buffer_operations_support_external_managed_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + + buffer.prefetch(device, stream=stream) + stream.sync() + + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + last_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert last_id == device.device_id + + buffer.close() + + +def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): + device = Device() + device.set_current() + + buffer = DummyDeviceMemoryResource(device).allocate(4096) + stream = device.create_stream() + + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.advise("set_read_mostly") + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.prefetch(device, stream=stream) + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.discard_prefetch(device, stream=stream) + + buffer.close() + + +def test_managed_buffer_operation_validation(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(4096) + stream = device.create_stream() + + with pytest.raises(ValueError, match="requires a location"): + buffer.prefetch(stream=stream) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + buffer.advise("set_accessed_by", 0, location_type="host_numa") + with pytest.raises(ValueError, match="location must be None or -1"): + buffer.prefetch(0, stream=stream, location_type="host") + + buffer.close() + + def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch From c418050043ef38cc15a74e733d9038d564068c0d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:44:49 -0700 Subject: [PATCH 02/16] wip --- cuda_core/tests/test_memory.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index dd146785ec..44d50e356c 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1151,11 +1151,14 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): stream = device.create_stream() buffer.advise("set_read_mostly") - assert _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - 4, - ) == 1 + assert ( + _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + 4, + ) + == 1 + ) buffer.advise("set_preferred_location", device, location_type="device") preferred_type = _get_mem_range_attr( From b879fa5b13922b2a41122f31751cd11c0c1fbaee Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:51:36 -0700 Subject: [PATCH 03/16] fixing ci compiler errors --- cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 686585b527..05a1667b3f 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,7 +36,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._device import Device @@ -175,7 +175,7 @@ cdef inline object _normalize_managed_location( cdef int loc_id if isinstance(location, Device): - location = (location).device_id + location = location.device_id if location_type is not None and not isinstance(location_type, str): raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") From 04ee3de1859c91158f30a7bffd3246024d422f0e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 09:07:10 -0700 Subject: [PATCH 04/16] skipping tests that aren't supported --- cuda_core/tests/test_memory.py | 130 ++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 44d50e356c..95c6e6e964 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1138,18 +1138,70 @@ def _get_mem_range_attr(buffer, attribute, data_size): return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) -def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() +def _skip_if_managed_allocation_unsupported(device): + try: + if not device.properties.managed_memory: + pytest.skip("Device does not support managed memory operations") + except AttributeError: + pytest.skip("Managed-memory buffer operations require CUDA support") + +def _skip_if_managed_location_ops_unsupported(device): + _skip_if_managed_allocation_unsupported(device) + try: + if not device.properties.concurrent_managed_access: + pytest.skip("Device does not support concurrent managed memory access") + except AttributeError: + pytest.skip("Managed-memory location operations require CUDA support") + + +def _skip_if_managed_discard_prefetch_unsupported(device): + _skip_if_managed_location_ops_unsupported(device) if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): pytest.skip("discard-prefetch requires cuda.bindings support") + visible_devices = Device.get_all_devices() + if not all(dev.properties.concurrent_managed_access for dev in visible_devices): + pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") + + +def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() buffer = mr.allocate(4096) stream = device.create_stream() + buffer.prefetch(-1, stream=stream) + stream.sync() + last_location = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 4, + ) + assert last_location == -1 + + buffer.prefetch(device, stream=stream) + stream.sync() + last_location = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 4, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer.advise("set_read_mostly") assert ( _get_mem_range_attr( @@ -1160,70 +1212,60 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): == 1 ) - buffer.advise("set_preferred_location", device, location_type="device") - preferred_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, - 4, - ) - preferred_id = _get_mem_range_attr( + # cuda.bindings currently exposes the combined location attributes for + # cuMemRangeGetAttribute, so use the legacy location query here. + buffer.advise("set_preferred_location", location_type="host") + preferred_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, 4, ) - assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert preferred_id == device.device_id + assert preferred_location == -1 - buffer.prefetch(-1, stream=stream) - stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST) + buffer.close() - buffer.discard_prefetch(device, stream=stream) + +def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + + buffer.prefetch(device, stream=stream) stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - last_id = _get_mem_range_attr( + + last_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, 4, ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert last_id == device.device_id + assert last_location == device.device_id buffer.close() -def test_managed_buffer_operations_support_external_managed_allocations(init_cuda): +def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() - skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(4096) stream = device.create_stream() - buffer.prefetch(device, stream=stream) + buffer.prefetch(-1, stream=stream) stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - last_id = _get_mem_range_attr( + buffer.discard_prefetch(device, stream=stream) + stream.sync() + + last_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, 4, ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert last_id == device.device_id + assert last_location == device.device_id buffer.close() From 9ab3f465d1c7d072a6dd9c6b8b70a9b47a24f3d8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 09:34:29 -0700 Subject: [PATCH 05/16] cu12 support --- cuda_core/cuda/core/_memory/_buffer.pyx | 40 ++++++++++++++++++-- cuda_core/tests/test_memory.py | 50 ++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 05a1667b3f..4460de900d 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,7 +36,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return from cuda.core._device import Device @@ -247,6 +247,20 @@ cdef inline object _normalize_managed_location( return _make_managed_location(loc_type, loc_id) +cdef inline bint _managed_location_uses_v2_bindings(): + # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. + return get_binding_version() >= (13, 0) + + +cdef inline int _managed_location_to_legacy_device(object location, str what): + cdef object loc_type = location.type + if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"): + return location.id + raise RuntimeError( + f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" + ) + + cdef inline void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: @@ -518,7 +532,17 @@ cdef class Buffer: allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, allow_host_numa_current=advice_name == "set_preferred_location", ) - handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + self.handle, + self._size, + advice, + _managed_location_to_legacy_device(location, "Buffer.advise"), + ) + ) def prefetch( self, @@ -539,7 +563,17 @@ cdef class Buffer: allow_host_numa=True, allow_host_numa_current=True, ) - handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + else: + handle_return( + driver.cuMemPrefetchAsync( + self.handle, + self._size, + _managed_location_to_legacy_device(location, "Buffer.prefetch"), + s.handle, + ) + ) def discard_prefetch( self, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 95c6e6e964..380b581e7b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -43,7 +43,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor +from cuda.core._memory import IPCBufferDescriptor, _buffer from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1270,6 +1270,54 @@ def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(i buffer.close() +def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + calls = [] + + def fake_cuMemAdvise(ptr, size, advice, location): + calls.append((ptr, size, advice, location)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) + + buffer.advise("set_read_mostly") + + assert len(calls) == 1 + assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1)) + + buffer.close() + + +def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + calls = [] + + def fake_cuMemPrefetchAsync(ptr, size, location, hstream): + calls.append((ptr, size, location, hstream)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + + buffer.prefetch(device, stream=stream) + + assert len(calls) == 1 + assert calls[0][2] == device.device_id + assert int(calls[0][3]) == int(stream.handle) + + buffer.close() + + def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): device = Device() device.set_current() From a948066ab2fc6fda3dfb74516538091e96e68746 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 16:45:51 -0700 Subject: [PATCH 06/16] Moving to function from Buffer class methods to free standing functions in the cuda.core.managed_memory namespace --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/_memory/_buffer.pyx | 322 +++++++++++------- cuda_core/cuda/core/experimental/__init__.py | 3 +- cuda_core/cuda/core/managed_memory.py | 9 + cuda_core/docs/source/api.rst | 13 + cuda_core/docs/source/release/0.7.x-notes.rst | 10 +- cuda_core/pixi.lock | 18 +- .../test_experimental_backward_compat.py | 7 + cuda_core/tests/test_memory.py | 137 +++++--- 9 files changed, 335 insertions(+), 186 deletions(-) create mode 100644 cuda_core/cuda/core/managed_memory.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 139078e86e..c55c0786ed 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graph import ( diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 8ae6d22ee5..4663302b34 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -113,6 +113,13 @@ cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( "unset_accessed_by", )) +cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 +cdef int _HOST_NUMA_CURRENT_ID = 0 +cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 +cdef size_t _SINGLE_RANGE_COUNT = 1 +cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 +cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 + cdef inline object _managed_location_enum(str location_type): cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] @@ -130,7 +137,7 @@ cdef inline object _make_managed_location(str location_type, int location_id): if location_type == "host": location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) elif location_type == "host_numa_current": - location.id = 0 + location.id = _HOST_NUMA_CURRENT_ID else: location.id = location_id return location @@ -236,7 +243,7 @@ cdef inline object _normalize_managed_location( raise ValueError( f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" ) - return _make_managed_location(loc_type, 0) + return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) if loc_type == "host" and not allow_host: raise ValueError(f"{what} does not support host locations") @@ -264,16 +271,206 @@ cdef inline int _managed_location_to_legacy_device(object location, str what): cdef inline void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory buffer") + raise ValueError(f"{what} requires a managed-memory allocation") -cdef inline void _require_managed_discard_prefetch_support(): +cdef inline void _require_managed_discard_prefetch_support(str what): if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): raise RuntimeError( - "Buffer.discard_prefetch requires cuda.bindings support for " - "cuMemDiscardAndPrefetchBatchAsync" + f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" ) + +cdef inline tuple _managed_range_from_buffer( + Buffer buffer, + int size, + str what, +): + if size != _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} does not accept size= when target is a Buffer") + _require_managed_buffer(buffer, what) + return buffer.handle, buffer._size + + +cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef inline tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allow_host=True, + allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, + allow_host_numa_current=advice_name == "set_preferred_location", + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location.""" + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location.""" + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + batch_ptr = driver.CUdeviceptr(int(ptr)) + _require_managed_discard_prefetch_support("discard_prefetch") + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + cdef class Buffer: """Represent a handle to allocated memory. @@ -502,119 +699,6 @@ cdef class Buffer: finally: PyBuffer_Release(&buf) - def advise( - self, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, - *, - location_type: str | None = None, - ): - """Apply a managed-memory advice to this buffer. - - This method is only valid for buffers backed by managed memory. - - Parameters - ---------- - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, - ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef str advice_name - _require_managed_buffer(self, "Buffer.advise") - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "Buffer.advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allow_host=True, - allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, - allow_host_numa_current=advice_name == "set_preferred_location", - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - self.handle, - self._size, - advice, - _managed_location_to_legacy_device(location, "Buffer.advise"), - ) - ) - - def prefetch( - self, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - location_type: str | None = None, - ): - """Prefetch this managed-memory buffer to a target location.""" - cdef Stream s = Stream_accept(stream) - _require_managed_buffer(self, "Buffer.prefetch") - location = _normalize_managed_location( - location, - location_type, - "Buffer.prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) - else: - handle_return( - driver.cuMemPrefetchAsync( - self.handle, - self._size, - _managed_location_to_legacy_device(location, "Buffer.prefetch"), - s.handle, - ) - ) - - def discard_prefetch( - self, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - location_type: str | None = None, - ): - """Discard this managed-memory buffer and prefetch it to a target location.""" - cdef Stream s = Stream_accept(stream) - _require_managed_buffer(self, "Buffer.discard_prefetch") - _require_managed_discard_prefetch_support() - location = _normalize_managed_location( - location, - location_type, - "Buffer.discard_prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [self.handle], - [self._size], - 1, - [location], - [0], - 1, - 0, - s.handle, - ) - ) - def __dlpack__( self, *, diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index e7989f0f26..83fb1c7581 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,9 +38,10 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils # Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory __import__("sys").modules[__spec__.name + ".utils"] = utils diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py new file mode 100644 index 0000000000..f11aabcd19 --- /dev/null +++ b/cuda_core/cuda/core/managed_memory.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Managed-memory range operations.""" + +from cuda.core._memory._buffer import advise, discard_prefetch, prefetch + +__all__ = ["advise", "prefetch", "discard_prefetch"] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index fa7ce48eb5..4d63bbcf88 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -62,6 +62,19 @@ CUDA runtime on other non-blocking streams. +.. module:: cuda.core.managed_memory + +Managed memory +-------------- + +.. autosummary:: + :toctree: generated/ + + advise + prefetch + discard_prefetch + + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 18b3bede36..186e3181f1 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,10 +35,12 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. -- Added managed-memory controls on :class:`Buffer`: ``advise()``, - ``prefetch()``, and ``discard_prefetch()``. These methods validate that the - underlying allocation is managed memory and then forward to the corresponding - CUDA driver operations for range advice and migration. +- Added managed-memory range operations under :mod:`cuda.core.managed_memory`: + ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free + functions accept either a managed :class:`Buffer` or a raw pointer plus + ``size=``, validate that the target allocation is managed memory, and then + forward to the corresponding CUDA driver operations for range advice and + migration. - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. When ``ipc_enabled=True`` and diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 78da9addb5..e2f8b7b0c2 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -2598,7 +2598,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2625,7 +2625,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2653,7 +2653,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2794,7 +2794,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2817,7 +2817,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2840,7 +2840,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2862,7 +2862,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2884,7 +2884,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2906,7 +2906,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index c3215b056a..82e2cdd5be 100644 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -38,6 +38,7 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "Device") assert hasattr(cuda.core.experimental, "Stream") assert hasattr(cuda.core.experimental, "Buffer") + assert hasattr(cuda.core.experimental, "managed_memory") assert hasattr(cuda.core.experimental, "system") # Test 2: Direct imports - should emit deprecation warning @@ -73,6 +74,7 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Linker is cuda.core.Linker # Compare singletons + assert cuda.core.experimental.managed_memory is cuda.core.managed_memory assert cuda.core.experimental.system is cuda.core.system # Test 4: Utils module works @@ -88,6 +90,11 @@ def test_experimental_backward_compatibility(): assert StridedMemoryView is not None assert args_viewable_as_strided_memory is not None + from cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch + + assert advise is not None + assert prefetch is not None + assert discard_prefetch is not None # Test 5: Options classes are accessible assert hasattr(cuda.core.experimental, "EventOptions") diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 380b581e7b..927014826a 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,6 +38,7 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, + managed_memory, ) from cuda.core import ( system as ccx_system, @@ -48,6 +49,12 @@ from cuda.core.utils import StridedMemoryView POOL_SIZE = 2097152 # 2MB size +_MANAGED_TEST_ALLOCATION_SIZE = 4096 +_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 +_READ_MOSTLY_ENABLED = 1 +_HOST_LOCATION_ID = -1 +_INVALID_HOST_DEVICE_ORDINAL = 0 +_LEGACY_BINDINGS_VERSION = (12, 9) class DummyDeviceMemoryResource(MemoryResource): @@ -1138,6 +1145,10 @@ def _get_mem_range_attr(buffer, attribute, data_size): return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) +def _get_int_mem_range_attr(buffer, attribute): + return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) + + def _skip_if_managed_allocation_unsupported(device): try: if not device.properties.managed_memory: @@ -1165,140 +1176,134 @@ def _skip_if_managed_discard_prefetch_unsupported(device): pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") -def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda): +def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): device = Device() skip_if_managed_memory_unsupported(device) device.set_current() mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(4096) + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(-1, stream=stream) + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) - assert last_location == -1 + assert last_location == _HOST_LOCATION_ID - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda): +def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - buffer.advise("set_read_mostly") + managed_memory.advise(buffer, "set_read_mostly") assert ( - _get_mem_range_attr( + _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - 4, ) - == 1 + == _READ_MOSTLY_ENABLED ) # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. - buffer.advise("set_preferred_location", location_type="host") - preferred_location = _get_mem_range_attr( + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, - 4, ) - assert preferred_location == -1 + assert preferred_location == _HOST_LOCATION_ID buffer.close() -def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda): +def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda): +def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(-1, stream=stream) + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - buffer.discard_prefetch(device, stream=stream) + managed_memory.discard_prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): +def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) calls = [] def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) - buffer.advise("set_read_mostly") + managed_memory.advise(buffer, "set_read_mostly") assert len(calls) == 1 - assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1)) + assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID)) buffer.close() -def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): +def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() calls = [] @@ -1306,10 +1311,10 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) assert len(calls) == 1 assert calls[0][2] == device.device_id @@ -1318,38 +1323,66 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): buffer.close() -def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): +def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() - buffer = DummyDeviceMemoryResource(device).allocate(4096) + buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.advise("set_read_mostly") - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.prefetch(device, stream=stream) - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.discard_prefetch(device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.advise(buffer, "set_read_mostly") + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.prefetch(buffer, device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.discard_prefetch(buffer, device, stream=stream) buffer.close() -def test_managed_buffer_operation_validation(init_cuda): +def test_managed_memory_operation_validation(init_cuda): device = Device() skip_if_managed_memory_unsupported(device) device.set_current() mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(4096) + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() with pytest.raises(ValueError, match="requires a location"): - buffer.prefetch(stream=stream) + managed_memory.prefetch(buffer, stream=stream) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - buffer.advise("set_accessed_by", 0, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") with pytest.raises(ValueError, match="location must be None or -1"): - buffer.prefetch(0, stream=stream, location_type="host") + managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host") + + buffer.close() + + +def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id buffer.close() From 14575991d65ca85973a4f1dc61f068efc4fc3293 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 16:46:20 -0700 Subject: [PATCH 07/16] precommit format --- cuda_core/cuda/core/managed_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index f11aabcd19..f5bb09c13d 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -6,4 +6,4 @@ from cuda.core._memory._buffer import advise, discard_prefetch, prefetch -__all__ = ["advise", "prefetch", "discard_prefetch"] +__all__ = ["advise", "discard_prefetch", "prefetch"] From acb402478cac58689f069e0836819b2e91010c09 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 17:30:41 -0700 Subject: [PATCH 08/16] iterating on implementation --- cuda_bindings/pixi.lock | 86 ++++++++++++------------- cuda_core/cuda/core/_memory/_buffer.pyx | 63 ++++++++++++++---- cuda_core/tests/test_memory.py | 85 ++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 56 deletions(-) diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index b01d6eec69..237a169580 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -1081,21 +1081,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.2.20-h7938cbb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-12.9.27-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.9.86-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-12.9.86-h69a702a_6.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.3-py314h1807b08_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda @@ -1134,7 +1134,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.14.1.1-hbc026e6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -1160,8 +1160,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-12.9.82-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-12.9.86-hecca717_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2025.2.0-hb617929_1.conda @@ -1264,7 +1264,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - conda: . - build: py314hb727236_0 + build: py314ha6d028f_0 - conda: ../cuda_pathfinder linux-aarch64: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2 @@ -1460,21 +1460,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h5782bbf_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/conda-gcc-specs-15.2.0-hd546029_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-12.9.27-h57928b3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-12.9.86-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-12.9.86-h719f0c7_6.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-12.9.79-h57928b3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-13.2.27-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.2.51-h719f0c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.2.20-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.3-py314h344ed54_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda @@ -1520,8 +1520,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-5_hf9ab0e9_mkl.conda - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-h2466b09_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-12.9.82-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-12.9.86-hac47afa_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-13.2.51-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6-h6a83c73_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.53-h7351971_0.conda @@ -1583,7 +1583,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-h534d264_6.conda - conda: . - build: py314h5e6f764_0 + build: py314h356c398_0 - conda: ../cuda_pathfinder packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -2154,7 +2154,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2182,7 +2182,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2209,7 +2209,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2237,7 +2237,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2265,7 +2265,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2293,7 +2293,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 4663302b34..829e05b3ad 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -205,9 +205,11 @@ cdef inline object _normalize_managed_location( ) loc_id = location if loc_id == -1: - loc_type = "host" + if not allow_host: + raise ValueError(f"{what} does not support host locations") + return _make_managed_location("host", -1) elif loc_id >= 0: - loc_type = "device" + return _make_managed_location("device", loc_id) else: raise ValueError( f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" @@ -245,23 +247,22 @@ cdef inline object _normalize_managed_location( ) return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - if loc_type == "host" and not allow_host: - raise ValueError(f"{what} does not support host locations") - if loc_type == "host_numa" and not allow_host_numa: - raise ValueError(f"{what} does not support location_type='host_numa'") - if loc_type == "host_numa_current" and not allow_host_numa_current: - raise ValueError(f"{what} does not support location_type='host_numa_current'") - return _make_managed_location(loc_type, loc_id) - cdef inline bint _managed_location_uses_v2_bindings(): # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. return get_binding_version() >= (13, 0) +cdef object _LEGACY_LOC_DEVICE = None +cdef object _LEGACY_LOC_HOST = None + cdef inline int _managed_location_to_legacy_device(object location, str what): + global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST + if _LEGACY_LOC_DEVICE is None: + _LEGACY_LOC_DEVICE = _managed_location_enum("device") + _LEGACY_LOC_HOST = _managed_location_enum("host") cdef object loc_type = location.type - if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"): + if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: return location.id raise RuntimeError( f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" @@ -396,7 +397,25 @@ def prefetch( int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): - """Prefetch a managed-memory allocation range to a target location.""" + """Prefetch a managed-memory allocation range to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous prefetch. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ cdef Stream s = Stream_accept(stream) cdef object ptr cdef size_t nbytes @@ -440,7 +459,25 @@ def discard_prefetch( int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): - """Discard a managed-memory allocation range and prefetch it to a target location.""" + """Discard a managed-memory allocation range and prefetch it to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for discard_prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous operation. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 927014826a..ea827818ac 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1359,6 +1359,91 @@ def test_managed_memory_operation_validation(init_cuda): buffer.close() +def test_managed_memory_advise_location_validation(init_cuda): + """Verify doc-specified location constraints for each advice kind.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + # set_read_mostly works without a location (location is ignored) + managed_memory.advise(buffer, "set_read_mostly") + + # set_preferred_location requires a location; device ordinal works + managed_memory.advise(buffer, "set_preferred_location", device.device_id) + + # set_preferred_location with host location_type + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + + # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa") + + # set_accessed_by with host_numa_current also raises ValueError + with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): + managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current") + + # Inferred location from int: -1 maps to host, 0 maps to device + managed_memory.advise(buffer, "set_preferred_location", -1) + managed_memory.advise(buffer, "set_preferred_location", 0) + + buffer.close() + + +def test_managed_memory_advise_accepts_enum_value(init_cuda): + """advise() accepts CUmem_advise enum values directly, not just string aliases.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY + managed_memory.advise(buffer, advice_enum) + + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + buffer.close() + + +def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): + """advise() raises TypeError when size= is given with a Buffer target.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(TypeError, match="does not accept size="): + managed_memory.advise(buffer, "set_read_mostly", size=1024) + + buffer.close() + + +def test_managed_memory_advise_invalid_advice_values(init_cuda): + """advise() rejects invalid advice strings and wrong types.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(ValueError, match="advice must be one of"): + managed_memory.advise(buffer, "not_a_real_advice") + + with pytest.raises(TypeError, match="advice must be"): + managed_memory.advise(buffer, 42) + + buffer.close() + + def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) From d10ab07e2f402628b83b08e07d95da39c4f2b634 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 18:13:36 -0700 Subject: [PATCH 09/16] Simplify managed-memory helpers: remove long-form aliases, cache lookups, fix docs - Remove duplicate long-form "cu_mem_advise_*" string aliases from _MANAGED_ADVICE_ALIASES; users pass short strings or the enum directly - Replace 4 boolean allow_* params in _normalize_managed_location with a single allowed_loctypes frozenset driven by _MANAGED_ADVICE_ALLOWED_LOCTYPES - Cache immutable runtime checks: CU_DEVICE_CPU, v2 bindings flag, discard_prefetch support, and advice enum-to-alias reverse map - Collapse hasattr+getattr to single getattr in _managed_location_enum - Move _require_managed_discard_prefetch_support to top of discard_prefetch for fail-fast behavior - Fix docs build: reset Sphinx module scope after managed_memory section in api.rst so subsequent sections resolve under cuda.core - Add discard_prefetch pool-allocation test and comment on _get_mem_range_attr Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 94 ++++++++++++++----------- cuda_core/docs/source/api.rst | 2 + cuda_core/tests/test_memory.py | 26 +++++++ 3 files changed, 79 insertions(+), 43 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 829e05b3ad..d280b4ea2b 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -89,17 +89,11 @@ cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { cdef dict _MANAGED_ADVICE_ALIASES = { "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", - "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", - "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", - "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", - "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", - "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", - "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", } cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( @@ -108,10 +102,18 @@ cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( "unset_preferred_location", )) -cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( - "set_accessed_by", - "unset_accessed_by", -)) +cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) +cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) +cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) + +cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { + "set_read_mostly": _DEVICE_HOST_NUMA, + "unset_read_mostly": _DEVICE_HOST_NUMA, + "set_preferred_location": _ALL_LOCATION_TYPES, + "unset_preferred_location": _DEVICE_HOST_NUMA, + "set_accessed_by": _DEVICE_HOST_ONLY, + "unset_accessed_by": _DEVICE_HOST_ONLY, +} cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 cdef int _HOST_NUMA_CURRENT_ID = 0 @@ -120,22 +122,32 @@ cdef size_t _SINGLE_RANGE_COUNT = 1 cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 +# Lazily cached values for immutable runtime properties. +cdef object _CU_DEVICE_CPU = None +cdef dict _ADVICE_ENUM_TO_ALIAS = None +cdef int _V2_BINDINGS = -1 +cdef int _DISCARD_PREFETCH_SUPPORTED = -1 + cdef inline object _managed_location_enum(str location_type): cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] - if not hasattr(driver.CUmemLocationType, attr_name): + cdef object result = getattr(driver.CUmemLocationType, attr_name, None) + if result is None: raise RuntimeError( f"Managed-memory location type {location_type!r} is not supported by the " f"installed cuda.bindings package." ) - return getattr(driver.CUmemLocationType, attr_name) + return result cdef inline object _make_managed_location(str location_type, int location_id): + global _CU_DEVICE_CPU cdef object location = driver.CUmemLocation() location.type = _managed_location_enum(location_type) if location_type == "host": - location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) + if _CU_DEVICE_CPU is None: + _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) + location.id = _CU_DEVICE_CPU elif location_type == "host_numa_current": location.id = _HOST_NUMA_CURRENT_ID else: @@ -157,12 +169,17 @@ cdef inline tuple _normalize_managed_advice(object advice): return alias, getattr(driver.CUmem_advise, attr_name) if isinstance(advice, driver.CUmem_advise): - for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): - if alias.startswith("cu_mem_advise_"): - continue - if advice == getattr(driver.CUmem_advise, attr_name): - return alias, advice - raise ValueError(f"Unsupported advice value: {advice!r}") + global _ADVICE_ENUM_TO_ALIAS + if _ADVICE_ENUM_TO_ALIAS is None: + _ADVICE_ENUM_TO_ALIAS = {} + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + enum_val = getattr(driver.CUmem_advise, attr_name, None) + if enum_val is not None: + _ADVICE_ENUM_TO_ALIAS[enum_val] = alias + alias = _ADVICE_ENUM_TO_ALIAS.get(advice) + if alias is None: + raise ValueError(f"Unsupported advice value: {advice!r}") + return alias, advice raise TypeError( "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" @@ -174,9 +191,7 @@ cdef inline object _normalize_managed_location( object location_type, str what, bint allow_none=False, - bint allow_host=True, - bint allow_host_numa=True, - bint allow_host_numa_current=True, + frozenset allowed_loctypes=_ALL_LOCATION_TYPES, ): cdef object loc_type cdef int loc_id @@ -194,6 +209,9 @@ cdef inline object _normalize_managed_location( f"or None, got {location_type!r}" ) + if loc_type is not None and loc_type not in allowed_loctypes: + raise ValueError(f"{what} does not support location_type='{loc_type}'") + if loc_type is None: if location is None: if allow_none: @@ -205,7 +223,7 @@ cdef inline object _normalize_managed_location( ) loc_id = location if loc_id == -1: - if not allow_host: + if "host" not in allowed_loctypes: raise ValueError(f"{what} does not support host locations") return _make_managed_location("host", -1) elif loc_id >= 0: @@ -227,20 +245,14 @@ cdef inline object _normalize_managed_location( raise ValueError( f"{what} location must be None or -1 when location_type is 'host', got {location!r}" ) - if not allow_host: - raise ValueError(f"{what} does not support location_type='host'") return _make_managed_location(loc_type, -1) elif loc_type == "host_numa": - if not allow_host_numa: - raise ValueError(f"{what} does not support location_type='host_numa'") if not isinstance(location, int) or location < 0: raise ValueError( f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" ) return _make_managed_location(loc_type, location) else: - if not allow_host_numa_current: - raise ValueError(f"{what} does not support location_type='host_numa_current'") if location is not None: raise ValueError( f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" @@ -250,7 +262,10 @@ cdef inline object _normalize_managed_location( cdef inline bint _managed_location_uses_v2_bindings(): # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. - return get_binding_version() >= (13, 0) + global _V2_BINDINGS + if _V2_BINDINGS < 0: + _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + return _V2_BINDINGS != 0 cdef object _LEGACY_LOC_DEVICE = None @@ -276,7 +291,10 @@ cdef inline void _require_managed_buffer(Buffer self, str what): cdef inline void _require_managed_discard_prefetch_support(str what): - if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + global _DISCARD_PREFETCH_SUPPORTED + if _DISCARD_PREFETCH_SUPPORTED < 0: + _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 + if not _DISCARD_PREFETCH_SUPPORTED: raise RuntimeError( f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" ) @@ -372,9 +390,7 @@ def advise( location_type, "advise", allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allow_host=True, - allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, - allow_host_numa_current=advice_name == "set_preferred_location", + allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], ) if _managed_location_uses_v2_bindings(): handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) @@ -425,10 +441,6 @@ def prefetch( location, location_type, "prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, ) if _managed_location_uses_v2_bindings(): handle_return( @@ -478,6 +490,7 @@ def discard_prefetch( Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + _require_managed_discard_prefetch_support("discard_prefetch") cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr @@ -485,15 +498,10 @@ def discard_prefetch( ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") batch_ptr = driver.CUdeviceptr(int(ptr)) - _require_managed_discard_prefetch_support("discard_prefetch") location = _normalize_managed_location( location, location_type, "discard_prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, ) handle_return( driver.cuMemDiscardAndPrefetchBatchAsync( diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 4d63bbcf88..7bf59ae495 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -74,6 +74,8 @@ Managed memory prefetch discard_prefetch +.. module:: cuda.core + :no-index: CUDA compilation toolchain -------------------------- diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index ea827818ac..5296ea344a 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1142,6 +1142,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): def _get_mem_range_attr(buffer, attribute, data_size): + # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) @@ -1252,6 +1253,31 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda buffer.close() +def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + managed_memory.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_discard_prefetch_unsupported(device) From c250c92e47393fa6cb0e6611245c5a4dd0c3b6cf Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 09:21:11 -0700 Subject: [PATCH 10/16] fix(test): reset _V2_BINDINGS cache so legacy-signature tests take the legacy path The _V2_BINDINGS cache in _buffer.pyx persists across tests, so monkeypatching get_binding_version alone is insufficient when earlier tests have already populated the cache with the v2 value. Promote _V2_BINDINGS from cdef int to a Python-level variable so tests can monkeypatch it directly via monkeypatch.setattr, and reset it to -1 in both legacy-signature tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 2 +- cuda_core/tests/test_memory.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 6f5809e06c..d109de2ac4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -124,7 +124,7 @@ cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 # Lazily cached values for immutable runtime properties. cdef object _CU_DEVICE_CPU = None cdef dict _ADVICE_ENUM_TO_ALIAS = None -cdef int _V2_BINDINGS = -1 +_V2_BINDINGS = -1 cdef int _DISCARD_PREFETCH_SUPPORTED = -1 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9cd3209d8d..411a3c6cb5 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1314,6 +1314,7 @@ def fake_cuMemAdvise(ptr, size, advice, location): return (driver.CUresult.CUDA_SUCCESS,) monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) managed_memory.advise(buffer, "set_read_mostly") @@ -1338,6 +1339,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): return (driver.CUresult.CUDA_SUCCESS,) monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) managed_memory.prefetch(buffer, device, stream=stream) From 89329d9c6eff581445b4806fe0217e598a2313fa Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 10:18:41 -0700 Subject: [PATCH 11/16] fix(test): require concurrent_managed_access for advise tests that hit real hardware These three tests call cuMemAdvise on real CUDA devices and verify memory range attributes. On devices without concurrent_managed_access (e.g. Windows/WDDM), set_read_mostly silently no-ops and set_preferred_location fails with CUDA_ERROR_INVALID_DEVICE. Use the stricter _skip_if_managed_location_ops_unsupported guard, matching the pattern already used by test_managed_memory_functions_accept_raw_pointer_ranges. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/tests/test_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 411a3c6cb5..56c505fbe6 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1207,7 +1207,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): device = Device() - _skip_if_managed_allocation_unsupported(device) + _skip_if_managed_location_ops_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -1390,7 +1390,7 @@ def test_managed_memory_operation_validation(init_cuda): def test_managed_memory_advise_location_validation(init_cuda): """Verify doc-specified location constraints for each advice kind.""" device = Device() - _skip_if_managed_allocation_unsupported(device) + _skip_if_managed_location_ops_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -1422,7 +1422,7 @@ def test_managed_memory_advise_location_validation(init_cuda): def test_managed_memory_advise_accepts_enum_value(init_cuda): """advise() accepts CUmem_advise enum values directly, not just string aliases.""" device = Device() - _skip_if_managed_allocation_unsupported(device) + _skip_if_managed_location_ops_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) From 8a75d1bf1f1172e4681bb232a22f00ff9567d5d8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 11:23:53 -0700 Subject: [PATCH 12/16] fix: validate managed buffer before checking discard_prefetch bindings support Reorder checks in discard_prefetch so _normalize_managed_target_range runs before _require_managed_discard_prefetch_support. This ensures non-managed buffers raise ValueError before the RuntimeError for missing cuMemDiscardAndPrefetchBatchAsync support. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index d109de2ac4..ffd82facb5 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -489,13 +489,13 @@ def discard_prefetch( Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ - _require_managed_discard_prefetch_support("discard_prefetch") - cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr cdef size_t nbytes ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + _require_managed_discard_prefetch_support("discard_prefetch") + cdef Stream s = Stream_accept(stream) batch_ptr = driver.CUdeviceptr(int(ptr)) location = _normalize_managed_location( location, From 9e9b1e0914d30f855389a349cf8d41d134b1c4dc Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 14:08:24 -0700 Subject: [PATCH 13/16] refactor: extract managed memory ops into dedicated _managed_memory_ops module Move advise, prefetch, and discard_prefetch functions and their helpers out of _buffer.pyx into a new _managed_memory_ops Cython module to improve separation of concerns. Expose _init_mem_attrs and _query_memory_attrs as non-inline cdef functions in _buffer.pxd so the new module can reuse them. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pxd | 8 + cuda_core/cuda/core/_memory/_buffer.pyx | 449 +---------------- .../cuda/core/_memory/_managed_memory_ops.pxd | 6 + .../cuda/core/_memory/_managed_memory_ops.pyx | 458 ++++++++++++++++++ cuda_core/cuda/core/managed_memory.py | 2 +- cuda_core/tests/test_memory.py | 14 +- 6 files changed, 483 insertions(+), 454 deletions(-) create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 04b5707e18..9065da77eb 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport DevicePtrHandle from cuda.core._stream cimport Stream @@ -38,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle( MemoryResource mr, object ipc_descriptor = * ) + +# Memory attribute query helpers (used by _managed_memory_ops) +cdef void _init_mem_attrs(Buffer self) +cdef int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr, +) except -1 nogil diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index ffd82facb5..104252a62b 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -35,7 +35,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._device import Device @@ -72,449 +72,6 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting """ -cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( - "device", - "host", - "host_numa", - "host_numa_current", -) - -cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { - "device": "CU_MEM_LOCATION_TYPE_DEVICE", - "host": "CU_MEM_LOCATION_TYPE_HOST", - "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", - "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", -} - -cdef dict _MANAGED_ADVICE_ALIASES = { - "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", - "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", - "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", - "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", - "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", - "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", -} - -cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( - "set_read_mostly", - "unset_read_mostly", - "unset_preferred_location", -)) - -cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) -cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) -cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) - -cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { - "set_read_mostly": _DEVICE_HOST_NUMA, - "unset_read_mostly": _DEVICE_HOST_NUMA, - "set_preferred_location": _ALL_LOCATION_TYPES, - "unset_preferred_location": _DEVICE_HOST_NUMA, - "set_accessed_by": _DEVICE_HOST_ONLY, - "unset_accessed_by": _DEVICE_HOST_ONLY, -} - -cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 -cdef int _HOST_NUMA_CURRENT_ID = 0 -cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 -cdef size_t _SINGLE_RANGE_COUNT = 1 -cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 -cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 - -# Lazily cached values for immutable runtime properties. -cdef object _CU_DEVICE_CPU = None -cdef dict _ADVICE_ENUM_TO_ALIAS = None -_V2_BINDINGS = -1 -cdef int _DISCARD_PREFETCH_SUPPORTED = -1 - - -cdef inline object _managed_location_enum(str location_type): - cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] - cdef object result = getattr(driver.CUmemLocationType, attr_name, None) - if result is None: - raise RuntimeError( - f"Managed-memory location type {location_type!r} is not supported by the " - f"installed cuda.bindings package." - ) - return result - - -cdef inline object _make_managed_location(str location_type, int location_id): - global _CU_DEVICE_CPU - cdef object location = driver.CUmemLocation() - location.type = _managed_location_enum(location_type) - if location_type == "host": - if _CU_DEVICE_CPU is None: - _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) - location.id = _CU_DEVICE_CPU - elif location_type == "host_numa_current": - location.id = _HOST_NUMA_CURRENT_ID - else: - location.id = location_id - return location - - -cdef inline tuple _normalize_managed_advice(object advice): - cdef str alias - cdef str attr_name - if isinstance(advice, str): - alias = advice.lower() - attr_name = _MANAGED_ADVICE_ALIASES.get(alias) - if attr_name is None: - raise ValueError( - "advice must be one of " - f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" - ) - return alias, getattr(driver.CUmem_advise, attr_name) - - if isinstance(advice, driver.CUmem_advise): - global _ADVICE_ENUM_TO_ALIAS - if _ADVICE_ENUM_TO_ALIAS is None: - _ADVICE_ENUM_TO_ALIAS = {} - for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): - enum_val = getattr(driver.CUmem_advise, attr_name, None) - if enum_val is not None: - _ADVICE_ENUM_TO_ALIAS[enum_val] = alias - alias = _ADVICE_ENUM_TO_ALIAS.get(advice) - if alias is None: - raise ValueError(f"Unsupported advice value: {advice!r}") - return alias, advice - - raise TypeError( - "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" - ) - - -cdef inline object _normalize_managed_location( - object location, - object location_type, - str what, - bint allow_none=False, - frozenset allowed_loctypes=_ALL_LOCATION_TYPES, -): - cdef object loc_type - cdef int loc_id - - if isinstance(location, Device): - location = location.device_id - - if location_type is not None and not isinstance(location_type, str): - raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") - - loc_type = None if location_type is None else (location_type).lower() - if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: - raise ValueError( - f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " - f"or None, got {location_type!r}" - ) - - if loc_type is not None and loc_type not in allowed_loctypes: - raise ValueError(f"{what} does not support location_type='{loc_type}'") - - if loc_type is None: - if location is None: - if allow_none: - return _make_managed_location("host", -1) - raise ValueError(f"{what} requires a location") - if not isinstance(location, int): - raise TypeError( - f"{what} location must be a Device, int, or None, got {type(location).__name__}" - ) - loc_id = location - if loc_id == -1: - if "host" not in allowed_loctypes: - raise ValueError(f"{what} does not support host locations") - return _make_managed_location("host", -1) - elif loc_id >= 0: - return _make_managed_location("device", loc_id) - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" - ) - elif loc_type == "device": - if isinstance(location, int) and location >= 0: - loc_id = location - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" - ) - return _make_managed_location(loc_type, loc_id) - elif loc_type == "host": - if location not in (None, -1): - raise ValueError( - f"{what} location must be None or -1 when location_type is 'host', got {location!r}" - ) - return _make_managed_location(loc_type, -1) - elif loc_type == "host_numa": - if not isinstance(location, int) or location < 0: - raise ValueError( - f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" - ) - return _make_managed_location(loc_type, location) - else: - if location is not None: - raise ValueError( - f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" - ) - return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - - -cdef inline bint _managed_location_uses_v2_bindings(): - # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. - global _V2_BINDINGS - if _V2_BINDINGS < 0: - _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 - return _V2_BINDINGS != 0 - - -cdef object _LEGACY_LOC_DEVICE = None -cdef object _LEGACY_LOC_HOST = None - -cdef inline int _managed_location_to_legacy_device(object location, str what): - global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST - if _LEGACY_LOC_DEVICE is None: - _LEGACY_LOC_DEVICE = _managed_location_enum("device") - _LEGACY_LOC_HOST = _managed_location_enum("host") - cdef object loc_type = location.type - if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: - return location.id - raise RuntimeError( - f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" - ) - - -cdef inline void _require_managed_buffer(Buffer self, str what): - _init_mem_attrs(self) - if not self._mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - - -cdef inline void _require_managed_discard_prefetch_support(str what): - global _DISCARD_PREFETCH_SUPPORTED - if _DISCARD_PREFETCH_SUPPORTED < 0: - _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 - if not _DISCARD_PREFETCH_SUPPORTED: - raise RuntimeError( - f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" - ) - - -cdef inline tuple _managed_range_from_buffer( - Buffer buffer, - int size, - str what, -): - if size != _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} does not accept size= when target is a Buffer") - _require_managed_buffer(buffer, what) - return buffer.handle, buffer._size - - -cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0: - cdef object ptr_obj - try: - ptr_obj = int(target) - except Exception as exc: - raise TypeError( - f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" - ) from exc - if ptr_obj < 0: - raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") - return ptr_obj - - -cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1: - cdef _MemAttrs mem_attrs - with nogil: - _query_memory_attrs(mem_attrs, ptr) - if not mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - return 0 - - -cdef inline tuple _normalize_managed_target_range( - object target, - int size, - str what, -): - cdef uintptr_t ptr - - if isinstance(target, Buffer): - return _managed_range_from_buffer(target, size, what) - - if size == _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} requires size= when target is a raw pointer") - ptr = _coerce_raw_pointer(target, what) - _require_managed_pointer(ptr, what) - return ptr, size - - -def advise( - target, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, - *, - int size=_MANAGED_SIZE_NOT_PROVIDED, - location_type: str | None = None, -): - """Apply managed-memory advice to an allocation range. - - Parameters - ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef str advice_name - cdef object ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "advise") - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - ptr, - nbytes, - advice, - _managed_location_to_legacy_device(location, "advise"), - ) - ) - - -def prefetch( - target, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, - location_type: str | None = None, -): - """Prefetch a managed-memory allocation range to a target location. - - Parameters - ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous prefetch. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef Stream s = Stream_accept(stream) - cdef object ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") - location = _normalize_managed_location( - location, - location_type, - "prefetch", - ) - if _managed_location_uses_v2_bindings(): - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - location, - _MANAGED_OPERATION_FLAGS, - s.handle, - ) - ) - else: - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - _managed_location_to_legacy_device(location, "prefetch"), - s.handle, - ) - ) - - -def discard_prefetch( - target, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, - location_type: str | None = None, -): - """Discard a managed-memory allocation range and prefetch it to a target location. - - Parameters - ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for discard_prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous operation. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef object ptr - cdef object batch_ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") - _require_managed_discard_prefetch_support("discard_prefetch") - cdef Stream s = Stream_accept(stream) - batch_ptr = driver.CUdeviceptr(int(ptr)) - location = _normalize_managed_location( - location, - location_type, - "discard_prefetch", - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [batch_ptr], - [nbytes], - _SINGLE_RANGE_COUNT, - [location], - [_FIRST_PREFETCH_LOCATION_INDEX], - _SINGLE_PREFETCH_LOCATION_COUNT, - _MANAGED_OPERATION_FLAGS, - s.handle, - ) - ) - cdef class Buffer: """Represent a handle to allocated memory. @@ -864,14 +421,14 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ -cdef inline void _init_mem_attrs(Buffer self): +cdef void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) self._mem_attrs_inited = True -cdef inline int _query_memory_attrs( +cdef int _query_memory_attrs( _MemAttrs& out, cydriver.CUdeviceptr ptr ) except -1 nogil: diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd new file mode 100644 index 0000000000..a7019c784d --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# Managed-memory operation helpers (advise, prefetch, discard_prefetch). +# The public API is exposed via def functions; no cdef declarations needed. diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx new file mode 100644 index 0000000000..649c2cbe72 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -0,0 +1,458 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver +from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs +from cuda.core._stream cimport Stream, Stream_accept + +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._device import Device + + +cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( + "device", + "host", + "host_numa", + "host_numa_current", +) + +cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { + "device": "CU_MEM_LOCATION_TYPE_DEVICE", + "host": "CU_MEM_LOCATION_TYPE_HOST", + "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", + "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", +} + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + "unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) +cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) +cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) + +cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { + "set_read_mostly": _DEVICE_HOST_NUMA, + "unset_read_mostly": _DEVICE_HOST_NUMA, + "set_preferred_location": _ALL_LOCATION_TYPES, + "unset_preferred_location": _DEVICE_HOST_NUMA, + "set_accessed_by": _DEVICE_HOST_ONLY, + "unset_accessed_by": _DEVICE_HOST_ONLY, +} + +cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 +cdef int _HOST_NUMA_CURRENT_ID = 0 +cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 +cdef size_t _SINGLE_RANGE_COUNT = 1 +cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 +cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 + +# Lazily cached values for immutable runtime properties. +cdef object _CU_DEVICE_CPU = None +cdef dict _ADVICE_ENUM_TO_ALIAS = None +_V2_BINDINGS = -1 +cdef int _DISCARD_PREFETCH_SUPPORTED = -1 + + +cdef object _managed_location_enum(str location_type): + cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] + cdef object result = getattr(driver.CUmemLocationType, attr_name, None) + if result is None: + raise RuntimeError( + f"Managed-memory location type {location_type!r} is not supported by the " + f"installed cuda.bindings package." + ) + return result + + +cdef object _make_managed_location(str location_type, int location_id): + global _CU_DEVICE_CPU + cdef object location = driver.CUmemLocation() + location.type = _managed_location_enum(location_type) + if location_type == "host": + if _CU_DEVICE_CPU is None: + _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) + location.id = _CU_DEVICE_CPU + elif location_type == "host_numa_current": + location.id = _HOST_NUMA_CURRENT_ID + else: + location.id = location_id + return location + + +cdef tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + global _ADVICE_ENUM_TO_ALIAS + if _ADVICE_ENUM_TO_ALIAS is None: + _ADVICE_ENUM_TO_ALIAS = {} + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + enum_val = getattr(driver.CUmem_advise, attr_name, None) + if enum_val is not None: + _ADVICE_ENUM_TO_ALIAS[enum_val] = alias + alias = _ADVICE_ENUM_TO_ALIAS.get(advice) + if alias is None: + raise ValueError(f"Unsupported advice value: {advice!r}") + return alias, advice + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef object _normalize_managed_location( + object location, + object location_type, + str what, + bint allow_none=False, + frozenset allowed_loctypes=_ALL_LOCATION_TYPES, +): + cdef object loc_type + cdef int loc_id + + if isinstance(location, Device): + location = location.device_id + + if location_type is not None and not isinstance(location_type, str): + raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") + + loc_type = None if location_type is None else (location_type).lower() + if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: + raise ValueError( + f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " + f"or None, got {location_type!r}" + ) + + if loc_type is not None and loc_type not in allowed_loctypes: + raise ValueError(f"{what} does not support location_type='{loc_type}'") + + if loc_type is None: + if location is None: + if allow_none: + return _make_managed_location("host", -1) + raise ValueError(f"{what} requires a location") + if not isinstance(location, int): + raise TypeError( + f"{what} location must be a Device, int, or None, got {type(location).__name__}" + ) + loc_id = location + if loc_id == -1: + if "host" not in allowed_loctypes: + raise ValueError(f"{what} does not support host locations") + return _make_managed_location("host", -1) + elif loc_id >= 0: + return _make_managed_location("device", loc_id) + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" + ) + elif loc_type == "device": + if isinstance(location, int) and location >= 0: + loc_id = location + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" + ) + return _make_managed_location(loc_type, loc_id) + elif loc_type == "host": + if location not in (None, -1): + raise ValueError( + f"{what} location must be None or -1 when location_type is 'host', got {location!r}" + ) + return _make_managed_location(loc_type, -1) + elif loc_type == "host_numa": + if not isinstance(location, int) or location < 0: + raise ValueError( + f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" + ) + return _make_managed_location(loc_type, location) + else: + if location is not None: + raise ValueError( + f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" + ) + return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) + + +cdef bint _managed_location_uses_v2_bindings(): + # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. + global _V2_BINDINGS + if _V2_BINDINGS < 0: + _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + return _V2_BINDINGS != 0 + + +cdef object _LEGACY_LOC_DEVICE = None +cdef object _LEGACY_LOC_HOST = None + +cdef int _managed_location_to_legacy_device(object location, str what): + global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST + if _LEGACY_LOC_DEVICE is None: + _LEGACY_LOC_DEVICE = _managed_location_enum("device") + _LEGACY_LOC_HOST = _managed_location_enum("host") + cdef object loc_type = location.type + if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: + return location.id + raise RuntimeError( + f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" + ) + + +cdef void _require_managed_buffer(Buffer self, str what): + _init_mem_attrs(self) + if not self._mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + + +cdef void _require_managed_discard_prefetch_support(str what): + global _DISCARD_PREFETCH_SUPPORTED + if _DISCARD_PREFETCH_SUPPORTED < 0: + _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 + if not _DISCARD_PREFETCH_SUPPORTED: + raise RuntimeError( + f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" + ) + + +cdef tuple _managed_range_from_buffer( + Buffer buffer, + int size, + str what, +): + if size != _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} does not accept size= when target is a Buffer") + _require_managed_buffer(buffer, what) + return buffer.handle, buffer._size + + +cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous prefetch. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for discard_prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous operation. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + _require_managed_discard_prefetch_support("discard_prefetch") + cdef Stream s = Stream_accept(stream) + batch_ptr = driver.CUdeviceptr(int(ptr)) + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index f5bb09c13d..005c9ec3cf 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -4,6 +4,6 @@ """Managed-memory range operations.""" -from cuda.core._memory._buffer import advise, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch __all__ = ["advise", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 56c505fbe6..544b7afc03 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -44,7 +44,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor, _buffer +from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1313,9 +1313,9 @@ def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) - monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) managed_memory.advise(buffer, "set_read_mostly") @@ -1338,9 +1338,9 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) - monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) managed_memory.prefetch(buffer, device, stream=stream) From 90f07117615a25b45baf9722c3c1f0835c85d1c5 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 14:16:38 -0700 Subject: [PATCH 14/16] pre-commit fix --- cuda_core/cuda/core/_memory/_buffer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 104252a62b..e47f3f4926 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -35,7 +35,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.cuda_utils import driver from cuda.core._device import Device From b4d252cdb5a8899d775db185d0cc9ec92c9cd474 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 19 Mar 2026 11:07:46 -0700 Subject: [PATCH 15/16] Removing blank file --- cuda_core/cuda/core/_memory/_managed_memory_ops.pxd | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd deleted file mode 100644 index a7019c784d..0000000000 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -# Managed-memory operation helpers (advise, prefetch, discard_prefetch). -# The public API is exposed via def functions; no cdef declarations needed. From faaa1d881363eb4ea5d3d13cf0a21b433cdcd61f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 19 Mar 2026 13:15:08 -0700 Subject: [PATCH 16/16] wip --- .../cuda/core/_memory/_managed_memory_ops.pyx | 117 +++++------------- cuda_core/tests/test_memory.py | 42 ------- 2 files changed, 29 insertions(+), 130 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 649c2cbe72..04dc33ed75 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,10 +4,7 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t - -from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs +from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return @@ -56,7 +53,6 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { "unset_accessed_by": _DEVICE_HOST_ONLY, } -cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 cdef int _HOST_NUMA_CURRENT_ID = 0 cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 cdef size_t _SINGLE_RANGE_COUNT = 1 @@ -241,71 +237,19 @@ cdef void _require_managed_discard_prefetch_support(str what): ) -cdef tuple _managed_range_from_buffer( - Buffer buffer, - int size, - str what, -): - if size != _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} does not accept size= when target is a Buffer") - _require_managed_buffer(buffer, what) - return buffer.handle, buffer._size - - -cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 0: - cdef object ptr_obj - try: - ptr_obj = int(target) - except Exception as exc: - raise TypeError( - f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" - ) from exc - if ptr_obj < 0: - raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") - return ptr_obj - - -cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: - cdef _MemAttrs mem_attrs - with nogil: - _query_memory_attrs(mem_attrs, ptr) - if not mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - return 0 - - -cdef tuple _normalize_managed_target_range( - object target, - int size, - str what, -): - cdef uintptr_t ptr - - if isinstance(target, Buffer): - return _managed_range_from_buffer(target, size, what) - - if size == _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} requires size= when target is a raw pointer") - ptr = _coerce_raw_pointer(target, what) - _require_managed_pointer(ptr, what) - return ptr, size - - def advise( - target, + target: Buffer, advice: driver.CUmem_advise | str, location: Device | int | None = None, *, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Apply managed-memory advice to an allocation range. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. advice : :obj:`~driver.CUmem_advise` | str Managed-memory advice to apply. String aliases such as ``"set_read_mostly"``, ``"set_preferred_location"``, and @@ -314,17 +258,18 @@ def advise( Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None`` for advice values that ignore location. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + if not isinstance(target, Buffer): + raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "advise") cdef str advice_name - cdef object ptr - cdef size_t nbytes + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size - ptr, nbytes = _normalize_managed_target_range(target, size, "advise") advice_name, advice = _normalize_managed_advice(advice) location = _normalize_managed_location( location, @@ -347,37 +292,36 @@ def advise( def prefetch( - target, + target: Buffer, location: Device | int | None = None, *, stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Prefetch a managed-memory allocation range to a target location. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. location : :obj:`~_device.Device` | int | None, optional Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None``. A location is required for prefetch. stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous prefetch. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + if not isinstance(target, Buffer): + raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr - cdef size_t nbytes + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size - ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") location = _normalize_managed_location( location, location_type, @@ -405,40 +349,37 @@ def prefetch( def discard_prefetch( - target, + target: Buffer, location: Device | int | None = None, *, stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Discard a managed-memory allocation range and prefetch it to a target location. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. location : :obj:`~_device.Device` | int | None, optional Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None``. A location is required for discard_prefetch. stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous operation. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ - cdef object ptr - cdef object batch_ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + if not isinstance(target, Buffer): + raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "discard_prefetch") _require_managed_discard_prefetch_support("discard_prefetch") cdef Stream s = Stream_accept(stream) - batch_ptr = driver.CUdeviceptr(int(ptr)) + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size + cdef object batch_ptr = driver.CUdeviceptr(int(ptr)) location = _normalize_managed_location( location, location_type, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 544b7afc03..dbb5ac6d8c 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1441,20 +1441,6 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda): buffer.close() -def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): - """advise() raises TypeError when size= is given with a Buffer target.""" - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - with pytest.raises(TypeError, match="does not accept size="): - managed_memory.advise(buffer, "set_read_mostly", size=1024) - - buffer.close() - - def test_managed_memory_advise_invalid_advice_values(init_cuda): """advise() rejects invalid advice strings and wrong types.""" device = Device() @@ -1472,34 +1458,6 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda): buffer.close() -def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) - assert ( - _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED - ) - - managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) - stream.sync() - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch