Skip to content
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
* Aligned the signature of `dpnp.reshape` function with Python array API by making `shape` a required argument [#2673](https://github.com/IntelPython/dpnp/pull/2673)
* Unified `dpnp` public API exports by consolidating function exports in `__init__.py` and removing wildcard imports [#2665](https://github.com/IntelPython/dpnp/pull/2665) [#2666](https://github.com/IntelPython/dpnp/pull/2666)
* Updated tests to reflect the new scalar conversion rules for non-0D `usm_ndarray` [#2694](https://github.com/IntelPython/dpnp/pull/2694)
* Compiled the indexing extension with `-fno-sycl-id-queries-fit-in-int` to support huge arrays [#2721](https://github.com/IntelPython/dpnp/pull/2721)

### Deprecated

Expand Down
1 change: 1 addition & 0 deletions dpnp/backend/extensions/indexing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ else()
)
endif()

target_compile_options(${python_module_name} PUBLIC -fno-sycl-id-queries-fit-in-int)
target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)

if(DPNP_GENERATE_COVERAGE)
Expand Down
41 changes: 20 additions & 21 deletions dpnp/tests/third_party/cupy/core_tests/test_carray.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import unittest
from __future__ import annotations

import pytest

Expand All @@ -8,7 +8,7 @@
pytest.skip("CArray is not supported", allow_module_level=True)


class TestCArray(unittest.TestCase):
class TestCArray:

def test_size(self):
x = cupy.arange(3).astype("i")
Expand Down Expand Up @@ -63,39 +63,38 @@ def test_getitem_idx(self):
testing.assert_array_equal(y, x)


@testing.parameterize(
{"size": 2**31 - 1024},
{"size": 2**31},
{"size": 2**31 + 1024},
{"size": 2**32 - 1024},
{"size": 2**32},
{"size": 2**32 + 1024},
@pytest.mark.parametrize(
"size",
[2**31 - 1024, 2**31, 2**31 + 1024, 2**32 - 1024, 2**32, 2**32 + 1024],
)
@testing.slow
class TestCArray32BitBoundary(unittest.TestCase):
@pytest.mark.slow
@pytest.mark.thread_unsafe(reason="too large allocations")
class TestCArray32BitBoundary:
# This test case is intended to confirm that CArray indexing works correctly
# with input/output arrays whose size is so large that it crosses the
# 32-bit boundary (in terms of both number of elements and size in bytes).
# This test requires approx. 8 GiB GPU memory to run.
# See https://github.com/cupy/cupy/pull/882 for detailed discussions.

def tearDown(self):
# Free huge memory for slow test
def teardown_method(self):
cupy.get_default_memory_pool().free_all_blocks()

# HIP is known to fail with sizes > 2**32-1024
@unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
def test(self):
@pytest.mark.skipif(
cupy.cuda.runtime.is_hip, reason="HIP does not support this"
)
def test(self, size):
# Elementwise
a = cupy.full((1, self.size), 7, dtype=cupy.int8)
a = cupy.full((1, size), 7, dtype=cupy.int8)
# Reduction
result = a.sum(axis=0, dtype=cupy.int8)
# Explicitly specify the dtype to absorb Linux/Windows difference.
assert result.sum(dtype=cupy.int64) == self.size * 7
assert result.sum(dtype=cupy.int64) == size * 7

# HIP is known to fail with sizes > 2**32-1024
@unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
def test_assign(self):
a = cupy.zeros(self.size, dtype=cupy.int8)
@pytest.mark.skipif(
cupy.cuda.runtime.is_hip, reason="HIP does not support this"
)
def test_assign(self, size):
a = cupy.zeros(size, dtype=cupy.int8)
a[-1] = 1.0
assert a.sum() == 1
42 changes: 29 additions & 13 deletions dpnp/tests/third_party/cupy/core_tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@

import dpnp as cupy
from dpnp.tests.third_party.cupy import testing
from dpnp.tests.third_party.cupy.testing._protocol_helpers import (
DummyObjectWithCudaArrayInterface,
DummyObjectWithCuPyGetNDArray,
)


class TestSize(unittest.TestCase):
Expand Down Expand Up @@ -37,6 +41,7 @@ def test_size_axis_error(self, dtype):

@testing.numpy_cupy_equal()
@testing.slow
# @pytest.mark.thread_unsafe(reason="Allocation too large.")
def test_size_huge(self, xp):
a = xp.ndarray(2**32, "b") # 4 GiB
return xp.size(a)
Expand Down Expand Up @@ -95,33 +100,44 @@ def test_cupy_ndarray(self, dtype):
for v in (arr, (arr, arr)):
assert cupy.min_scalar_type(v) is arr.dtype


@testing.parameterize(
*testing.product(
{
"cxx": (None, "--std=c++14"),
}
@pytest.mark.parametrize(
"cupy_like",
[
DummyObjectWithCuPyGetNDArray,
DummyObjectWithCudaArrayInterface,
],
)
)
@pytest.mark.skip("compiling cupy headers are not supported")
class TestCuPyHeaders(unittest.TestCase):
def test_cupy_likes_and_nested(self, cupy_like):
arr = cupy.array([[-1, 1]], dtype="int8")

def setUp(self):
obj = cupy_like(arr)
assert cupy.min_scalar_type(obj) is arr.dtype
if cupy_like is DummyObjectWithCuPyGetNDArray:
# __cupy_get_ndarray__ path currently assumes .shape and .dtype
obj.shape = arr.shape
obj.dtype = arr.dtype
assert cupy.min_scalar_type([obj, obj]) is arr.dtype


@pytest.mark.skip("compiling cupy headers are not supported")
class TestCuPyHeaders:
def setup_method(self):
self.temporary_cache_dir_context = test_raw.use_temporary_cache_dir()
self.cache_dir = self.temporary_cache_dir_context.__enter__()
self.header = "\n".join(
["#include <" + h + ">" for h in core._cupy_header_list]
)

def tearDown(self):
def teardown_method(self):
self.temporary_cache_dir_context.__exit__(*sys.exc_info())

def test_compiling_core_header(self):
@pytest.mark.parametrize("cxx", (None, "--std=c++17"))
def test_compiling_core_header(self, cxx):
code = r"""
extern "C" __global__ void _test_ker_() { }
"""
code = self.header + code
options = () if self.cxx is None else (self.cxx,)
options = () if cxx is None else (cxx,)
ker = cupy.RawKernel(
code, "_test_ker_", options=options, backend="nvrtc"
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import sys
import unittest
from itertools import combinations
Expand All @@ -19,17 +21,18 @@
# This test class and its children below only test if CUB backend can be used
# or not; they don't verify its correctness as it's already extensively covered
# by existing tests
@unittest.skipIf(_environment.get_cub_path() is None, "CUB not found")
class CubReductionTestBase(unittest.TestCase):
"""
Note: call self.can_use() when arrays are already allocated, otherwise
call self._test_can_use().
"""

def setUp(self):
if _environment.get_cub_path() is None:
pytest.skip("CUB not found")
if cupy.cuda.runtime.is_hip:
if _environment.get_hipcc_path() is None:
self.skipTest("hipcc is not found")
pytest.skip("hipcc is not found")

self.can_use = cupy._core._cub_reduction._can_use_cub_block_reduction

Expand Down
23 changes: 15 additions & 8 deletions dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import dpctl
import dpctl.tensor._dlpack as dlp
import numpy
Expand Down Expand Up @@ -60,6 +62,8 @@ class TestNewDLPackConversion:
def pool(self, request):
self.memory = request.param
if self.memory == "managed":
# if cuda.runtime.is_hip:
# pytest.skip("HIP does not support managed memory")
old_pool = cupy.get_default_memory_pool()
new_pool = cuda.MemoryPool(cuda.malloc_managed)
cuda.set_allocator(new_pool.malloc)
Expand Down Expand Up @@ -201,6 +205,8 @@ def test_conversion_device_to_cpu(self):
@pytest.mark.skip("due to dpctl-2213")
def test_stream(self):
allowed_streams = ["null", True]
# if not cuda.runtime.is_hip:
# allowed_streams.append("ptds")

# stream order is automatically established via DLPack protocol
for src_s in [self._get_stream(s) for s in allowed_streams]:
Expand All @@ -226,18 +232,18 @@ class TestDLTensorMemory:

@pytest.fixture
def pool(self):
pass
# old_pool = cupy.get_default_memory_pool()
# pool = cupy.cuda.MemoryPool()
# cupy.cuda.set_allocator(pool.malloc)

# old_pool = cupy.get_default_memory_pool()
# pool = cupy.cuda.MemoryPool()
# cupy.cuda.set_allocator(pool.malloc)
# yield pool

# yield pool

# pool.free_all_blocks()
# cupy.cuda.set_allocator(old_pool.malloc)
# pool.free_all_blocks()
# cupy.cuda.set_allocator(old_pool.malloc)
pass

@pytest.mark.parametrize("max_version", [None, (1, 0)])
# @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
def test_deleter(self, pool, max_version):
# memory is freed when tensor is deleted, as it's not consumed
array = cupy.empty(10)
Expand All @@ -252,6 +258,7 @@ def test_deleter(self, pool, max_version):
# assert pool.n_free_blocks() == 1

@pytest.mark.parametrize("max_version", [None, (1, 0)])
# @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
def test_deleter2(self, pool, max_version):
# memory is freed when array2 is deleted, as tensor is consumed
array = cupy.empty(10)
Expand Down
20 changes: 5 additions & 15 deletions dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ def test_copy_multi_device_non_contiguous_K(self):
# See cupy/cupy#5004
@pytest.mark.skip("RawKernel() is not supported")
@testing.multi_gpu(2)
# @pytest.mark.xfail(
# runtime.is_hip,
# reason='ROCm may work differently in async D2D copy with streams')
# @pytest.mark.thread_unsafe(
# reason="order is unclear multithread. Also, hard crash in threaded!")
def test_copy_multi_device_with_stream(self):
# Kernel that takes long enough then finally writes values.
src = _test_copy_multi_device_with_stream_src
Expand Down Expand Up @@ -430,21 +435,6 @@ def test_cuda_array_interface_stream(self):
assert iface["stream"] == stream.ptr


@pytest.mark.skip("CUDA interface is not supported")
class TestNdarrayCudaInterfaceNoneCUDA(unittest.TestCase):

def setUp(self):
self.arr = cupy.zeros(shape=(2, 3), dtype=cupy.float64)

def test_cuda_array_interface_hasattr(self):
assert not hasattr(self.arr, "__cuda_array_interface__")

def test_cuda_array_interface_getattr(self):
with pytest.raises(AttributeError) as e:
getattr(self.arr, "__cuda_array_interface__")
assert "HIP" in str(e.value)


@testing.parameterize(
*testing.product(
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import itertools

import numpy
Expand Down Expand Up @@ -435,6 +437,15 @@ def test_invalid_adv_getitem(self):
a[self.indexes]


class TestArrayBadDTypeIndexAdvGetitem:
    """Advanced getitem must reject index arrays with a non-index dtype."""

    @pytest.mark.parametrize("dtype", [object, "i,i", "float32", "str"])
    def test_bad_dtype_adv_getitem(self, dtype):
        # Test various bad dtypes, supported by CuPy or not.
        target = cupy.arange(10)
        # Build the host-side index first so that only the indexing
        # expression itself is exercised inside the ``raises`` context.
        bad_index = numpy.array([1, 2], dtype=dtype)
        with pytest.raises(IndexError, match="arrays used as indices"):
            target[bad_index]


@testing.parameterize(
{"shape": (0,), "indexes": ([False],)},
{
Expand Down Expand Up @@ -950,6 +961,60 @@ class TestArrayAdvancedIndexingSetitemTranspose:
def test_adv_setitem_transp(self, xp):
shape = (2, 3, 4)
a = xp.zeros(shape).transpose(0, 2, 1)
slices = (xp.array([1, 0]), slice(None), xp.array([2, 1]))
slices = (numpy.array([1, 0]), slice(None), numpy.array([2, 1]))
a[slices] = 1
return a


class TestHugeArrays:
    """Indexing smoke tests on int8 arrays holding 2**30 to 2**32 elements.

    These tests require a lot of memory. Every test converts a
    ``MemoryError`` raised anywhere in its body into a skip.
    """

    @testing.slow
    def test_advanced(self):
        ncols = 2**30
        try:
            data = cupy.ones((1, ncols), dtype=cupy.int8)
            rows = cupy.zeros(3, dtype=cupy.int32)

            gathered = data[rows, :]
            # sanity check, we mostly care about it not crashing.
            assert gathered.sum() == 3 * ncols
            # Drop the reference to the gathered result before the write
            # below.
            del gathered

            data[rows, :] = cupy.array([[3], [3], [3]], dtype=cupy.int8)
            # Check 3 got written (order may not be strictly guaranteed)
            assert data.sum() == ncols * 3
        except MemoryError:
            pytest.skip("out of memory in test.")

    @testing.slow
    def test_take_array(self):
        ncols = 2**32
        try:
            data = cupy.ones((1, ncols), dtype=cupy.int8)
            # Zero two positions; we should see each of these once per
            # taken row.
            data[0, 2**30] = 0
            data[0, -1] = 0

            taken = data.take(cupy.array([0, 0]), axis=0)
            # sanity check, we mostly care about it not crashing.
            assert taken.sum() == 2 * (ncols - 2)
        except MemoryError:
            pytest.skip("out of memory in test.")

    @testing.slow
    def test_take_scalar(self):
        ncols = 2**32
        try:
            data = cupy.ones((1, ncols), dtype=cupy.int8)
            # Zero two positions; we should see each of these once.
            data[0, 2**30] = 0
            data[0, -1] = 0

            taken = data.take(0, axis=0)
            # sanity check, we mostly care about it not crashing.
            assert taken.sum() == ncols - 2
        except MemoryError:
            pytest.skip("out of memory in test.")

    @testing.slow
    def test_choose(self):
        ncols = 2**31
        try:
            choices = cupy.zeros((2, ncols), dtype=cupy.int8)
            choices[1, :] = 1

            # Row 1 (all ones) is used as the selector, so every element
            # is picked from ``choices[1]``.
            selector = choices[1, :]
            picked = selector.choose(choices)
            # sanity check, we mostly care about it not crashing.
            assert picked.sum() == ncols
        except MemoryError:
            pytest.skip("out of memory in test.")
Loading
Loading