IntelPython · antonwolfy · Jan 15, 2026 · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026
@@ -41,6 +41,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Aligned the signature of `dpnp.reshape` function with Python array API by making `shape` a required argument [#2673](https://github.com/IntelPython/dpnp/pull/2673)
 * Unified `dpnp` public API exports by consolidating function exports in `__init__.py` and removing wildcard imports [#2665](https://github.com/IntelPython/dpnp/pull/2665) [#2666](https://github.com/IntelPython/dpnp/pull/2666)
 * Updated tests to reflect the new scalar conversion rules for non-0D `usm_ndarray` [#2694](https://github.com/IntelPython/dpnp/pull/2694)
+* Compiled indexing extension with `-fno-sycl-id-queries-fit-in-int` to support huge arrays [#2721](https://github.com/IntelPython/dpnp/pull/2721)
 
 ### Deprecated
 

@@ -84,6 +84,7 @@ else()
     )
 endif()
 
+target_compile_options(${python_module_name} PUBLIC -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
 
 if(DPNP_GENERATE_COVERAGE)

@@ -1,4 +1,4 @@
-import unittest
+from __future__ import annotations
 
 import pytest
 
@@ -8,7 +8,7 @@
 pytest.skip("CArray is not supported", allow_module_level=True)
 
 
-class TestCArray(unittest.TestCase):
+class TestCArray:
 
     def test_size(self):
         x = cupy.arange(3).astype("i")
@@ -63,39 +63,38 @@ def test_getitem_idx(self):
         testing.assert_array_equal(y, x)
 
 
-@testing.parameterize(
-    {"size": 2**31 - 1024},
-    {"size": 2**31},
-    {"size": 2**31 + 1024},
-    {"size": 2**32 - 1024},
-    {"size": 2**32},
-    {"size": 2**32 + 1024},
+@pytest.mark.parametrize(
+    "size",
+    [2**31 - 1024, 2**31, 2**31 + 1024, 2**32 - 1024, 2**32, 2**32 + 1024],
 )
-@testing.slow
-class TestCArray32BitBoundary(unittest.TestCase):
+@pytest.mark.slow
+@pytest.mark.thread_unsafe(reason="too large allocations")
+class TestCArray32BitBoundary:
     # This test case is intended to confirm CArray indexing work correctly
     # with input/output arrays whose size is so large that it crosses the
     # 32-bit boundary (in terms of both number of elements and size in bytes).
     # This test requires approx. 8 GiB GPU memory to run.
     # See https://github.com/cupy/cupy/pull/882 for detailed discussions.
-
-    def tearDown(self):
-        # Free huge memory for slow test
+    def teardown_method(self):
         cupy.get_default_memory_pool().free_all_blocks()
 
     # HIP is known to fail with sizes > 2**32-1024
-    @unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
-    def test(self):
+    @pytest.mark.skipif(
+        cupy.cuda.runtime.is_hip, reason="HIP does not support this"
+    )
+    def test(self, size):
         # Elementwise
-        a = cupy.full((1, self.size), 7, dtype=cupy.int8)
+        a = cupy.full((1, size), 7, dtype=cupy.int8)
         # Reduction
         result = a.sum(axis=0, dtype=cupy.int8)
         # Explicitly specify the dtype to absorb Linux/Windows difference.
-        assert result.sum(dtype=cupy.int64) == self.size * 7
+        assert result.sum(dtype=cupy.int64) == size * 7
 
     # HIP is known to fail with sizes > 2**32-1024
-    @unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
-    def test_assign(self):
-        a = cupy.zeros(self.size, dtype=cupy.int8)
+    @pytest.mark.skipif(
+        cupy.cuda.runtime.is_hip, reason="HIP does not support this"
+    )
+    def test_assign(self, size):
+        a = cupy.zeros(size, dtype=cupy.int8)
         a[-1] = 1.0
         assert a.sum() == 1
@@ -8,6 +8,10 @@
 
 import dpnp as cupy
 from dpnp.tests.third_party.cupy import testing
+from dpnp.tests.third_party.cupy.testing._protocol_helpers import (
+    DummyObjectWithCudaArrayInterface,
+    DummyObjectWithCuPyGetNDArray,
+)
 
 
 class TestSize(unittest.TestCase):
@@ -37,6 +41,7 @@ def test_size_axis_error(self, dtype):
 
     @testing.numpy_cupy_equal()
     @testing.slow
+    # @pytest.mark.thread_unsafe(reason="Allocation too large.")
     def test_size_huge(self, xp):
         a = xp.ndarray(2**32, "b")  # 4 GiB
         return xp.size(a)
@@ -95,33 +100,44 @@ def test_cupy_ndarray(self, dtype):
         for v in (arr, (arr, arr)):
             assert cupy.min_scalar_type(v) is arr.dtype
 
-
-@testing.parameterize(
-    *testing.product(
-        {
-            "cxx": (None, "--std=c++14"),
-        }
+    @pytest.mark.parametrize(
+        "cupy_like",
+        [
+            DummyObjectWithCuPyGetNDArray,
+            DummyObjectWithCudaArrayInterface,
+        ],
     )
-)
-@pytest.mark.skip("compiling cupy headers are not supported")
-class TestCuPyHeaders(unittest.TestCase):
+    def test_cupy_likes_and_nested(self, cupy_like):
+        arr = cupy.array([[-1, 1]], dtype="int8")
 
-    def setUp(self):
+        obj = cupy_like(arr)
+        assert cupy.min_scalar_type(obj) is arr.dtype
+        if cupy_like is DummyObjectWithCuPyGetNDArray:
+            # __cupy_get_ndarray__ path currently assumes .shape and .dtype
+            obj.shape = arr.shape
+            obj.dtype = arr.dtype
+        assert cupy.min_scalar_type([obj, obj]) is arr.dtype
+
+
+@pytest.mark.skip("compiling cupy headers are not supported")
+class TestCuPyHeaders:
+    def setup_method(self):
         self.temporary_cache_dir_context = test_raw.use_temporary_cache_dir()
         self.cache_dir = self.temporary_cache_dir_context.__enter__()
         self.header = "\n".join(
             ["#include <" + h + ">" for h in core._cupy_header_list]
         )
 
-    def tearDown(self):
+    def teardown_method(self):
         self.temporary_cache_dir_context.__exit__(*sys.exc_info())
 
-    def test_compiling_core_header(self):
+    @pytest.mark.parametrize("cxx", (None, "--std=c++17"))
+    def test_compiling_core_header(self, cxx):
         code = r"""
         extern "C" __global__ void _test_ker_() { }
         """
         code = self.header + code
-        options = () if self.cxx is None else (self.cxx,)
+        options = () if cxx is None else (cxx,)
         ker = cupy.RawKernel(
             code, "_test_ker_", options=options, backend="nvrtc"
         )

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import sys
 import unittest
 from itertools import combinations
@@ -19,17 +21,18 @@
 # This test class and its children below only test if CUB backend can be used
 # or not; they don't verify its correctness as it's already extensively covered
 # by existing tests
-@unittest.skipIf(_environment.get_cub_path() is None, "CUB not found")
 class CubReductionTestBase(unittest.TestCase):
     """
     Note: call self.can_use() when arrays are already allocated, otherwise
     call self._test_can_use().
     """
 
     def setUp(self):
+        if _environment.get_cub_path() is None:
+            pytest.skip("CUB not found")
         if cupy.cuda.runtime.is_hip:
             if _environment.get_hipcc_path() is None:
-                self.skipTest("hipcc is not found")
+                pytest.skip("hipcc is not found")
 
         self.can_use = cupy._core._cub_reduction._can_use_cub_block_reduction
 

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import dpctl
 import dpctl.tensor._dlpack as dlp
 import numpy
@@ -60,6 +62,8 @@ class TestNewDLPackConversion:
     def pool(self, request):
         self.memory = request.param
         if self.memory == "managed":
+            # if cuda.runtime.is_hip:
+            #     pytest.skip("HIP does not support managed memory")
             old_pool = cupy.get_default_memory_pool()
             new_pool = cuda.MemoryPool(cuda.malloc_managed)
             cuda.set_allocator(new_pool.malloc)
@@ -201,6 +205,8 @@ def test_conversion_device_to_cpu(self):
     @pytest.mark.skip("due to dpctl-2213")
     def test_stream(self):
         allowed_streams = ["null", True]
+        # if not cuda.runtime.is_hip:
+        #     allowed_streams.append("ptds")
 
         # stream order is automatically established via DLPack protocol
         for src_s in [self._get_stream(s) for s in allowed_streams]:
@@ -226,18 +232,18 @@ class TestDLTensorMemory:
 
     @pytest.fixture
     def pool(self):
-        pass
+        # old_pool = cupy.get_default_memory_pool()
+        # pool = cupy.cuda.MemoryPool()
+        # cupy.cuda.set_allocator(pool.malloc)
 
-    #     old_pool = cupy.get_default_memory_pool()
-    #     pool = cupy.cuda.MemoryPool()
-    #     cupy.cuda.set_allocator(pool.malloc)
+        # yield pool
 
-    # yield pool
-
-    #     pool.free_all_blocks()
-    #     cupy.cuda.set_allocator(old_pool.malloc)
+        # pool.free_all_blocks()
+        # cupy.cuda.set_allocator(old_pool.malloc)
+        pass
 
     @pytest.mark.parametrize("max_version", [None, (1, 0)])
+    # @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
     def test_deleter(self, pool, max_version):
         # memory is freed when tensor is deleted, as it's not consumed
         array = cupy.empty(10)
@@ -252,6 +258,7 @@ def test_deleter(self, pool, max_version):
         # assert pool.n_free_blocks() == 1
 
     @pytest.mark.parametrize("max_version", [None, (1, 0)])
+    # @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
     def test_deleter2(self, pool, max_version):
         # memory is freed when array2 is deleted, as tensor is consumed
         array = cupy.empty(10)

@@ -246,6 +246,11 @@ def test_copy_multi_device_non_contiguous_K(self):
     # See cupy/cupy#5004
     @pytest.mark.skip("RawKernel() is not supported")
     @testing.multi_gpu(2)
+    # @pytest.mark.xfail(
+    #     runtime.is_hip,
+    #     reason='ROCm may work differently in async D2D copy with streams')
+    # @pytest.mark.thread_unsafe(
+    #     reason="order is unclear multithread. Also, hard crash in threaded!")
     def test_copy_multi_device_with_stream(self):
         # Kernel that takes long enough then finally writes values.
         src = _test_copy_multi_device_with_stream_src
@@ -430,21 +435,6 @@ def test_cuda_array_interface_stream(self):
                 assert iface["stream"] == stream.ptr
 
 
-@pytest.mark.skip("CUDA interface is not supported")
-class TestNdarrayCudaInterfaceNoneCUDA(unittest.TestCase):
-
-    def setUp(self):
-        self.arr = cupy.zeros(shape=(2, 3), dtype=cupy.float64)
-
-    def test_cuda_array_interface_hasattr(self):
-        assert not hasattr(self.arr, "__cuda_array_interface__")
-
-    def test_cuda_array_interface_getattr(self):
-        with pytest.raises(AttributeError) as e:
-            getattr(self.arr, "__cuda_array_interface__")
-        assert "HIP" in str(e.value)
-
-
 @testing.parameterize(
     *testing.product(
         {

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import itertools
 
 import numpy
@@ -435,6 +437,15 @@ def test_invalid_adv_getitem(self):
                 a[self.indexes]
 
 
+class TestArrayBadDTypeIndexAdvGetitem:
+    @pytest.mark.parametrize("dtype", [object, "i,i", "float32", "str"])
+    def test_bad_dtype_adv_getitem(self, dtype):
+        # Test various bad dtypes, supported by CuPy or not.
+        a = cupy.arange(10)
+        with pytest.raises(IndexError, match="arrays used as indices"):
+            a[numpy.array([1, 2], dtype=dtype)]
+
+
 @testing.parameterize(
     {"shape": (0,), "indexes": ([False],)},
     {
@@ -950,6 +961,60 @@ class TestArrayAdvancedIndexingSetitemTranspose:
     def test_adv_setitem_transp(self, xp):
         shape = (2, 3, 4)
         a = xp.zeros(shape).transpose(0, 2, 1)
-        slices = (xp.array([1, 0]), slice(None), xp.array([2, 1]))
+        slices = (numpy.array([1, 0]), slice(None), numpy.array([2, 1]))
         a[slices] = 1
         return a
+
+
+class TestHugeArrays:
+    # These tests require a lot of memory
+    @testing.slow
+    def test_advanced(self):
+        try:
+            arr = cupy.ones((1, 2**30), dtype=cupy.int8)
+            idx = cupy.zeros(3, dtype=cupy.int32)
+            res = arr[idx, :]
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 3 * 2**30
+            del res
+
+            arr[idx, :] = cupy.array([[3], [3], [3]], dtype=cupy.int8)
+            # All elements must now be 3 (duplicate write order is unspecified)
+            assert arr.sum() == 2**30 * 3
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_take_array(self):
+        try:
+            arr = cupy.ones((1, 2**32), dtype=cupy.int8)
+            arr[0, 2**30] = 0  # We should see each of these once
+            arr[0, -1] = 0
+            res = arr.take(cupy.array([0, 0]), axis=0)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2 * (2**32 - 2)
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_take_scalar(self):
+        try:
+            arr = cupy.ones((1, 2**32), dtype=cupy.int8)
+            arr[0, 2**30] = 0  # We should see each of these once
+            arr[0, -1] = 0
+            res = arr.take(0, axis=0)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2**32 - 2
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_choose(self):
+        try:
+            choices = cupy.zeros((2, 2**31), dtype=cupy.int8)
+            choices[1, :] = 1
+            res = choices[1, :].choose(choices)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2**31
+        except MemoryError:
+            pytest.skip("out of memory in test.")
-Original file line number
+Diff line change
@@ Expand Up / @@ -84,6 +84,7 @@ else() @@
         )
     endif()
+    target_compile_options(${python_module_name} PUBLIC -fno-sycl-id-queries-fit-in-int)
     target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
     if(DPNP_GENERATE_COVERAGE)
@@ Expand Down @@