diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index b01d6eec69..237a169580 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -1081,21 +1081,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.2.20-h7938cbb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-12.9.27-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.9.86-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-12.9.86-h69a702a_6.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.3-py314h1807b08_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda @@ -1134,7 +1134,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.14.1.1-hbc026e6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -1160,8 +1160,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-12.9.82-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-12.9.86-hecca717_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2025.2.0-hb617929_1.conda @@ -1264,7 +1264,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - conda: . 
- build: py314hb727236_0 + build: py314ha6d028f_0 - conda: ../cuda_pathfinder linux-aarch64: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2 @@ -1460,21 +1460,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h5782bbf_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/conda-gcc-specs-15.2.0-hd546029_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-12.9.27-h57928b3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-12.9.86-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-12.9.86-h719f0c7_6.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-12.9.79-h57928b3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-13.2.27-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.2.51-h719f0c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.2.20-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.3-py314h344ed54_0.conda - 
conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda @@ -1520,8 +1520,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-5_hf9ab0e9_mkl.conda - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-h2466b09_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-12.9.82-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-12.9.86-hac47afa_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-13.2.51-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6-h6a83c73_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.53-h7351971_0.conda @@ -1583,7 +1583,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-h534d264_6.conda - conda: . - build: py314h5e6f764_0 + build: py314h356c398_0 - conda: ../cuda_pathfinder packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -2154,7 +2154,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2182,7 +2182,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2209,7 +2209,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2237,7 +2237,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2265,7 +2265,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2293,7 +2293,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 139078e86e..c55c0786ed 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graph import ( diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 91c0cfe24a..9065da77eb 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport DevicePtrHandle from cuda.core._stream cimport Stream @@ -12,6 +13,7 @@ cdef struct _MemAttrs: int device_id bint 
is_device_accessible bint is_host_accessible + bint is_managed cdef class Buffer: @@ -37,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle( MemoryResource mr, object ipc_descriptor = * ) + +# Memory attribute query helpers (used by _managed_memory_ops) +cdef void _init_mem_attrs(Buffer self) +cdef int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr, +) except -1 nogil diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index b836972f5f..e47f3f4926 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -71,6 +71,7 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`. """ + cdef class Buffer: """Represent a handle to allocated memory. @@ -420,14 +421,14 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ -cdef inline void _init_mem_attrs(Buffer self): +cdef void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) self._mem_attrs_inited = True -cdef inline int _query_memory_attrs( +cdef int _query_memory_attrs( _MemAttrs& out, cydriver.CUdeviceptr ptr ) except -1 nogil: @@ -459,6 +460,7 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 + out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -467,10 +469,12 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id + out.is_managed = is_managed != 0 elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id + out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd new file mode 100644 index 0000000000..a7019c784d --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# Managed-memory operation helpers (advise, prefetch, discard_prefetch). +# The public API is exposed via def functions; no cdef declarations needed. diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx new file mode 100644 index 0000000000..649c2cbe72 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -0,0 +1,458 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver +from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs +from cuda.core._stream cimport Stream, Stream_accept + +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._device import Device + + +cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( + "device", + "host", + "host_numa", + "host_numa_current", +) + +cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { + "device": "CU_MEM_LOCATION_TYPE_DEVICE", + "host": "CU_MEM_LOCATION_TYPE_HOST", + "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", + "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", +} + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + "unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) +cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) +cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) + +cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { + "set_read_mostly": _DEVICE_HOST_NUMA, + "unset_read_mostly": _DEVICE_HOST_NUMA, + "set_preferred_location": _ALL_LOCATION_TYPES, + "unset_preferred_location": _DEVICE_HOST_NUMA, + "set_accessed_by": _DEVICE_HOST_ONLY, + "unset_accessed_by": _DEVICE_HOST_ONLY, +} + +cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 +cdef int _HOST_NUMA_CURRENT_ID = 0 +cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 +cdef size_t _SINGLE_RANGE_COUNT = 1 +cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 +cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 + +# Lazily cached values for immutable runtime properties. +cdef object _CU_DEVICE_CPU = None +cdef dict _ADVICE_ENUM_TO_ALIAS = None +_V2_BINDINGS = -1 +cdef int _DISCARD_PREFETCH_SUPPORTED = -1 + + +cdef object _managed_location_enum(str location_type): + cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] + cdef object result = getattr(driver.CUmemLocationType, attr_name, None) + if result is None: + raise RuntimeError( + f"Managed-memory location type {location_type!r} is not supported by the " + f"installed cuda.bindings package." 
+ ) + return result + + +cdef object _make_managed_location(str location_type, int location_id): + global _CU_DEVICE_CPU + cdef object location = driver.CUmemLocation() + location.type = _managed_location_enum(location_type) + if location_type == "host": + if _CU_DEVICE_CPU is None: + _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) + location.id = _CU_DEVICE_CPU + elif location_type == "host_numa_current": + location.id = _HOST_NUMA_CURRENT_ID + else: + location.id = location_id + return location + + +cdef tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + global _ADVICE_ENUM_TO_ALIAS + if _ADVICE_ENUM_TO_ALIAS is None: + _ADVICE_ENUM_TO_ALIAS = {} + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + enum_val = getattr(driver.CUmem_advise, attr_name, None) + if enum_val is not None: + _ADVICE_ENUM_TO_ALIAS[enum_val] = alias + alias = _ADVICE_ENUM_TO_ALIAS.get(advice) + if alias is None: + raise ValueError(f"Unsupported advice value: {advice!r}") + return alias, advice + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef object _normalize_managed_location( + object location, + object location_type, + str what, + bint allow_none=False, + frozenset allowed_loctypes=_ALL_LOCATION_TYPES, +): + cdef object loc_type + cdef int loc_id + + if isinstance(location, Device): + location = location.device_id + + if location_type is not None and not isinstance(location_type, str): + raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") + + loc_type = None if location_type is None else (location_type).lower() + if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: + raise ValueError( + f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " + f"or None, got {location_type!r}" + ) + + if loc_type is not None and loc_type not in allowed_loctypes: + raise ValueError(f"{what} does not support location_type='{loc_type}'") + + if loc_type is None: + if location is None: + if allow_none: + return _make_managed_location("host", -1) + raise ValueError(f"{what} requires a location") + if not isinstance(location, int): + raise TypeError( + f"{what} location must be a Device, int, or None, got {type(location).__name__}" + ) + loc_id = location + if loc_id == -1: + if "host" not in allowed_loctypes: + raise ValueError(f"{what} does not support host locations") + return _make_managed_location("host", -1) + elif loc_id >= 0: + return _make_managed_location("device", loc_id) + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" + ) + elif loc_type == "device": + if isinstance(location, int) and location >= 0: + loc_id = location + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" + ) + return _make_managed_location(loc_type, loc_id) + elif loc_type == "host": + if location not in (None, -1): + raise ValueError( + f"{what} location must be None or -1 when location_type is 'host', got {location!r}" + ) + return 
_make_managed_location(loc_type, -1) + elif loc_type == "host_numa": + if not isinstance(location, int) or location < 0: + raise ValueError( + f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" + ) + return _make_managed_location(loc_type, location) + else: + if location is not None: + raise ValueError( + f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" + ) + return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) + + +cdef bint _managed_location_uses_v2_bindings(): + # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. + global _V2_BINDINGS + if _V2_BINDINGS < 0: + _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + return _V2_BINDINGS != 0 + + +cdef object _LEGACY_LOC_DEVICE = None +cdef object _LEGACY_LOC_HOST = None + +cdef int _managed_location_to_legacy_device(object location, str what): + global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST + if _LEGACY_LOC_DEVICE is None: + _LEGACY_LOC_DEVICE = _managed_location_enum("device") + _LEGACY_LOC_HOST = _managed_location_enum("host") + cdef object loc_type = location.type + if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: + return location.id + raise RuntimeError( + f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" + ) + + +cdef void _require_managed_buffer(Buffer self, str what): + _init_mem_attrs(self) + if not self._mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + + +cdef void _require_managed_discard_prefetch_support(str what): + global _DISCARD_PREFETCH_SUPPORTED + if _DISCARD_PREFETCH_SUPPORTED < 0: + _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 + if not _DISCARD_PREFETCH_SUPPORTED: + raise RuntimeError( + f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" + ) + + +cdef tuple _managed_range_from_buffer( + Buffer buffer, + int size, + str what, +): + if size != _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} does not accept size= when target is a Buffer") + _require_managed_buffer(buffer, what) + return buffer.handle, buffer._size + + +cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. 
+ + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous prefetch. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. 
This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for discard_prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous operation. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + _require_managed_discard_prefetch_support("discard_prefetch") + cdef Stream s = Stream_accept(stream) + batch_ptr = driver.CUdeviceptr(int(ptr)) + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index e7989f0f26..83fb1c7581 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,9 +38,10 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils # Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory __import__("sys").modules[__spec__.name + ".utils"] = utils diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py new file mode 100644 index 0000000000..005c9ec3cf --- /dev/null +++ b/cuda_core/cuda/core/managed_memory.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Managed-memory range operations.""" + +from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch + +__all__ = ["advise", "discard_prefetch", "prefetch"] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 86a83c4e86..29fd9bf62a 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -62,6 +62,21 @@ CUDA runtime on other non-blocking streams. +.. module:: cuda.core.managed_memory + +Managed memory +-------------- + +.. autosummary:: + :toctree: generated/ + + advise + prefetch + discard_prefetch + +.. module:: cuda.core + :no-index: + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 98551603b6..186e3181f1 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,6 +35,13 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. +- Added managed-memory range operations under :mod:`cuda.core.managed_memory`: + ``advise()``, ``prefetch()``, and ``discard_prefetch()``. 
These free + functions accept either a managed :class:`Buffer` or a raw pointer plus + ``size=``, validate that the target allocation is managed memory, and then + forward to the corresponding CUDA driver operations for range advice and + migration. + - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. When ``ipc_enabled=True`` and ``numa_id`` is not set, the NUMA node is automatically derived from the diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 78da9addb5..e2f8b7b0c2 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -2598,7 +2598,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2625,7 +2625,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2653,7 +2653,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2794,7 +2794,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2817,7 +2817,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2840,7 +2840,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2862,7 +2862,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2884,7 +2884,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2906,7 +2906,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index c3215b056a..82e2cdd5be 100644 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -38,6 +38,7 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "Device") assert hasattr(cuda.core.experimental, "Stream") assert hasattr(cuda.core.experimental, "Buffer") + assert hasattr(cuda.core.experimental, "managed_memory") assert hasattr(cuda.core.experimental, "system") # Test 2: Direct imports - should emit deprecation warning @@ -73,6 +74,7 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Linker is cuda.core.Linker # Compare singletons + assert cuda.core.experimental.managed_memory is cuda.core.managed_memory assert cuda.core.experimental.system is cuda.core.system # Test 4: Utils module works @@ -88,6 +90,11 @@ def test_experimental_backward_compatibility(): assert StridedMemoryView is not None assert args_viewable_as_strided_memory is not None + from cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch + + assert advise is not None + assert prefetch is not None + assert discard_prefetch is not None # Test 
5: Options classes are accessible assert hasattr(cuda.core.experimental, "EventOptions") diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8005d3ce6c..544b7afc03 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,16 +38,23 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, + managed_memory, ) from cuda.core import ( system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor +from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView POOL_SIZE = 2097152 # 2MB size +_MANAGED_TEST_ALLOCATION_SIZE = 4096 +_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 +_READ_MOSTLY_ENABLED = 1 +_HOST_LOCATION_ID = -1 +_INVALID_HOST_DEVICE_ORDINAL = 0 +_LEGACY_BINDINGS_VERSION = (12, 9) class DummyDeviceMemoryResource(MemoryResource): @@ -1134,6 +1141,365 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +def _get_mem_range_attr(buffer, attribute, data_size): + # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. + return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) + + +def _get_int_mem_range_attr(buffer, attribute): + return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) + + +def _skip_if_managed_allocation_unsupported(device): + try: + if not device.properties.managed_memory: + pytest.skip("Device does not support managed memory operations") + except AttributeError: + pytest.skip("Managed-memory buffer operations require CUDA support") + + +def _skip_if_managed_location_ops_unsupported(device): + _skip_if_managed_allocation_unsupported(device) + try: + if not device.properties.concurrent_managed_access: + pytest.skip("Device does not support concurrent managed memory access") + except AttributeError: + pytest.skip("Managed-memory location operations require CUDA support") + + +def _skip_if_managed_discard_prefetch_unsupported(device): + _skip_if_managed_location_ops_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("discard-prefetch requires cuda.bindings support") + + visible_devices = Device.get_all_devices() + if not all(dev.properties.concurrent_managed_access for dev in visible_devices): + pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") + + +def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == _HOST_LOCATION_ID + + managed_memory.prefetch(buffer, device, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + 
device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + managed_memory.advise(buffer, "set_read_mostly") + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + # cuda.bindings currently exposes the combined location attributes for + # cuMemRangeGetAttribute, so use the legacy location query here. + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + preferred_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, + ) + assert preferred_location == _HOST_LOCATION_ID + + buffer.close() + + +def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + managed_memory.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + managed_memory.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + calls = [] + + def fake_cuMemAdvise(ptr, size, advice, location): + calls.append((ptr, size, advice, location)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) + + managed_memory.advise(buffer, "set_read_mostly") + + assert len(calls) == 1 + assert calls[0][3] == int(getattr(driver, 
"CU_DEVICE_CPU", _HOST_LOCATION_ID)) + + buffer.close() + + +def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + calls = [] + + def fake_cuMemPrefetchAsync(ptr, size, location, hstream): + calls.append((ptr, size, location, hstream)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + + managed_memory.prefetch(buffer, device, stream=stream) + + assert len(calls) == 1 + assert calls[0][2] == device.device_id + assert int(calls[0][3]) == int(stream.handle) + + buffer.close() + + +def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): + device = Device() + device.set_current() + + buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.advise(buffer, "set_read_mostly") + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.prefetch(buffer, device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.discard_prefetch(buffer, device, stream=stream) + + buffer.close() + + +def test_managed_memory_operation_validation(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + with pytest.raises(ValueError, match="requires a location"): + managed_memory.prefetch(buffer, stream=stream) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") + with pytest.raises(ValueError, match="location must be None or -1"): + managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host") + + buffer.close() + + +def test_managed_memory_advise_location_validation(init_cuda): + """Verify doc-specified location constraints for each advice kind.""" + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + # set_read_mostly works without a location (location is ignored) + managed_memory.advise(buffer, "set_read_mostly") + + # set_preferred_location requires a location; device ordinal works + managed_memory.advise(buffer, "set_preferred_location", device.device_id) + + # set_preferred_location with host location_type + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + + # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa") + + # set_accessed_by with host_numa_current also raises ValueError + with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): + 
managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current") + + # Inferred location from int: -1 maps to host, 0 maps to device + managed_memory.advise(buffer, "set_preferred_location", -1) + managed_memory.advise(buffer, "set_preferred_location", 0) + + buffer.close() + + +def test_managed_memory_advise_accepts_enum_value(init_cuda): + """advise() accepts CUmem_advise enum values directly, not just string aliases.""" + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY + managed_memory.advise(buffer, advice_enum) + + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + buffer.close() + + +def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): + """advise() raises TypeError when size= is given with a Buffer target.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(TypeError, match="does not accept size="): + managed_memory.advise(buffer, "set_read_mostly", size=1024) + + buffer.close() + + +def test_managed_memory_advise_invalid_advice_values(init_cuda): + """advise() rejects invalid advice strings and wrong types.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(ValueError, match="advice must be one of"): + managed_memory.advise(buffer, "not_a_real_advice") + + with pytest.raises(TypeError, match="advice must be"): + managed_memory.advise(buffer, 42) + + buffer.close() + + +def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch