From 484ea2269f6d6272c85026d3354505f709c4e916 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 25 Jun 2026 14:09:43 +0200
Subject: [PATCH 01/37] Fix intersect1d crash with empty arrays

---
 .../cupy/logic_tests/test_truth.py            | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/logic_tests/test_truth.py b/dpnp/tests/third_party/cupy/logic_tests/test_truth.py
index 0ba353972109..0170f83a6dea 100644
--- a/dpnp/tests/third_party/cupy/logic_tests/test_truth.py
+++ b/dpnp/tests/third_party/cupy/logic_tests/test_truth.py
@@ -265,6 +265,36 @@ def test_multiple_instances(self, xp, dtype):
         b = xp.array([4, 6, 2, 5, 7, 6], dtype=dtype)
         return xp.intersect1d(a, b, return_indices=True)
 
+    @testing.numpy_cupy_array_equal()
+    def test_intersect1d_both_empty(self, xp):
+        return xp.intersect1d(xp.array([]), xp.array([]))
+
+    @testing.for_all_dtypes()
+    @testing.numpy_cupy_array_equal()
+    def test_intersect1d_empty_array(self, xp, dtype):
+        a = xp.array([], dtype=dtype)
+        b = xp.array([0], dtype=dtype)
+        return xp.intersect1d(a, b, return_indices=True)
+
+    @testing.for_all_dtypes()
+    @testing.numpy_cupy_array_equal()
+    def test_intersect1d_second_empty_array(self, xp, dtype):
+        a = xp.array([0], dtype=dtype)
+        b = xp.array([], dtype=dtype)
+        return xp.intersect1d(a, b, return_indices=True)
+
+    @testing.numpy_cupy_array_equal()
+    def test_intersect1d_mixed_dtypes_empty(self, xp):
+        a = xp.array([0], dtype=xp.int64)
+        b = xp.array([], dtype=xp.float64)
+        return xp.intersect1d(a, b)
+
+    @testing.numpy_cupy_array_equal()
+    def test_intersect1d_mixed_dtypes_empty_with_indices(self, xp):
+        a = xp.array([0], dtype=xp.int64)
+        b = xp.array([], dtype=xp.float64)
+        return xp.intersect1d(a, b, return_indices=True)
+
 
 @pytest.mark.skip("union1d() is not supported yet")
 class TestUnion1d:

From 853c88a09e84a7b9d5479488315aac8bd62c35be Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 25 Jun 2026 14:26:37 +0200
Subject: [PATCH 02/37] Make meshgrid return a tuple not list

---
 dpnp/tests/third_party/cupy/creation_tests/test_ranges.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/creation_tests/test_ranges.py b/dpnp/tests/third_party/cupy/creation_tests/test_ranges.py
index ce716b10dd37..636d4387f4d3 100644
--- a/dpnp/tests/third_party/cupy/creation_tests/test_ranges.py
+++ b/dpnp/tests/third_party/cupy/creation_tests/test_ranges.py
@@ -346,13 +346,16 @@ def test_meshgrid0(self, dtype):
         )
         assert out == ()
 
+    @testing.with_requires("numpy>=2.5")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_array_equal()
     def test_meshgrid1(self, xp, dtype):
         x = xp.arange(2).astype(dtype)
-        return xp.meshgrid(
+        result = xp.meshgrid(
             x, indexing=self.indexing, sparse=self.sparse, copy=self.copy
         )
+        assert isinstance(result, tuple)
+        return result
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_array_equal()

From 45f7db981cbc59637f529d71b8e219b0b0af0fad Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 25 Jun 2026 15:03:23 +0200
Subject: [PATCH 03/37] Relax dtype check in views (including zero-copy array
 constructors)

---
 .../cupy/creation_tests/test_basic.py         | 20 +++++++
 .../cupy/creation_tests/test_from_data.py     | 60 +++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/creation_tests/test_basic.py b/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
index a9e382d22798..05b2e5e711b2 100644
--- a/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
+++ b/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
@@ -542,3 +542,23 @@ def test_full_like_reshape_cupy_only(self, dtype):
         c = cupy.full(self.shape, 1, dtype=dtype)
 
         testing.assert_array_equal(b, c)
+
+
+@pytest.mark.skip("void dtypes are not supported")
+class TestDTypeUnchecked:
+    def test_void_dtype(self):
+        arr = cupy.zeros(3, dtype="V10")
+        assert not arr.get().view("uint8").any()
+
+        np_arr = numpy.array([b"1", b"2", b"3"], dtype="V10")
+        arr = cupy.array(np_arr)
+        testing.assert_array_equal(arr.get(), np_arr)
+
+    def test_subarray_rejected(self):
+        with pytest.raises(ValueError, match="Unsupported dtype"):
+            cupy.empty(3, dtype="3i")
+
+    def test_empty_void_rejected(self):
+        # We could try to allow V0 explicitly, but for now...
+        with pytest.raises(ValueError, match="Unsupported dtype"):
+            cupy.empty(3, dtype="V")
diff --git a/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py b/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py
index a2496c855b02..04b3c4446a7b 100644
--- a/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py
+++ b/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py
@@ -716,6 +716,66 @@ def test_big_endian(self):
             cupy.asarray(b)
 
 
+@pytest.mark.skip("CUDA array interface is not supported")
+class TestCudaArrayInterfaceNonBuiltinDtype:
+    # CuPy can accept non-builtin dtypes as containers when wrapping
+    # existing GPU memory via CAI. See cupy/cupy#9709 and cupy/cupy#9712
+    # for details.
+
+    @pytest.mark.parametrize("ver", range(max_cuda_array_interface_version + 1))
+    @pytest.mark.parametrize("strides", [False, None, True])
+    @pytest.mark.parametrize(
+        "typestr",
+        [
+            "<M8[s]",
+            "<M8[ms]",
+            "<M8[us]",
+            "<M8[ns]",
+            "<m8[s]",
+            "<m8[ms]",
+            "<m8[us]",
+            "<m8[ns]",
+        ],
+    )
+    def test_datetime_timedelta(self, typestr, ver, strides):
+        dtype = numpy.dtype(typestr)
+        a = cupy.zeros(4, dtype="i8")
+        b = DummyObjectWithCudaArrayInterface(
+            (a.shape, a.strides, dtype.str, dtype.descr, a.data.ptr),
+            ver,
+            strides,
+        )
+        c = cupy.asarray(b)
+        assert c.dtype == dtype
+        assert c.shape == (4,)
+
+    @pytest.mark.parametrize("ver", range(max_cuda_array_interface_version + 1))
+    @pytest.mark.parametrize("strides", [False, None, True])
+    def test_structured(self, ver, strides):
+        dtype = numpy.dtype("i4,f8")
+        a = cupy.zeros(3, dtype=dtype)
+        b = DummyObjectWithCudaArrayInterface(a, ver, strides)
+        c = cupy.asarray(b)
+        # CAI typestr for structured dtypes is |V<n>; field info is lost
+        assert c.dtype == numpy.dtype(dtype.str)
+        assert c.shape == (3,)
+
+    @pytest.mark.parametrize("ver", range(max_cuda_array_interface_version + 1))
+    @pytest.mark.parametrize("strides", [False, None, True])
+    def test_datetime64_view(self, ver, strides):
+        dtype = numpy.dtype("datetime64[ns]")
+        a = cupy.zeros(4, dtype="i8")
+        b = DummyObjectWithCudaArrayInterface(
+            (a.shape, a.strides, dtype.str, dtype.descr, a.data.ptr),
+            ver,
+            strides,
+        )
+        c = cupy.asarray(b)
+        # Should be able to view back as int64
+        d = c.view("i8")
+        assert d.dtype == numpy.dtype("i8")
+
+
 @testing.parameterize(
     *testing.product(
         {

From 4a6385622997fee946682122420eaa4fe7c440e5 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 25 Jun 2026 15:03:48 +0200
Subject: [PATCH 04/37] Implemented inverted_cdf method for cp.quantiles and
 percentiles

---
 dpnp/tests/third_party/cupy/statistics_tests/test_order.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_order.py b/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
index f35617e18619..ba15fc16b078 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
@@ -12,7 +12,7 @@
 from dpnp.tests.third_party.cupy import testing
 
 _all_methods = (
-    # 'inverted_cdf',               # TODO(takagi) Not implemented
+    "inverted_cdf",
     # 'averaged_inverted_cdf',      # TODO(takagi) Not implemented
     # 'closest_observation',        # TODO(takagi) Not implemented
     # 'interpolated_inverted_cdf',  # TODO(takagi) Not implemented

From ae14f468618a386bb670e45b0904c75711e2a77a Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 10:53:52 +0200
Subject: [PATCH 05/37] Allow cupy.ndarray as repeats argument to cupy.repeat

---
 .../cupy/manipulation_tests/test_tiling.py    | 239 ++++++++++++++++--
 1 file changed, 224 insertions(+), 15 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py
index a8a1f06da47f..a37544656735 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py
@@ -24,21 +24,6 @@ def test_array_repeat(self, xp):
         return xp.repeat(x, self.repeats, self.axis)
 
 
-class TestRepeatRepeatsNdarray(unittest.TestCase):
-
-    def test_func(self):
-        a = testing.shaped_arange((2, 3, 4), cupy)
-        repeats = cupy.array([2, 3], dtype=cupy.int32)
-        with pytest.raises(ValueError, match=r"repeats"):
-            cupy.repeat(a, repeats)
-
-    def test_method(self):
-        a = testing.shaped_arange((2, 3, 4), cupy)
-        repeats = cupy.array([2, 3], dtype=cupy.int32)
-        with pytest.raises(ValueError, match=r"repeats"):
-            a.repeat(repeats)
-
-
 @testing.parameterize(
     {"repeats": [2], "axis": None},
     {"repeats": [2], "axis": 1},
@@ -100,6 +85,230 @@ def test_repeat_failure(self):
                 xp.repeat(x, self.repeats, self.axis)
 
 
+@testing.parameterize(
+    # 1-D
+    {"shape": (6,), "reps": [1, 3, 2, 1, 1, 2], "axis": None},
+    {"shape": (6,), "reps": [2], "axis": None},
+    # 2-D
+    {"shape": (2, 3), "reps": [2, 1], "axis": 0},
+    {"shape": (2, 3), "reps": [1, 3, 2], "axis": 1},
+    {"shape": (2, 3), "reps": [2], "axis": 0},
+    {"shape": (2, 3), "reps": [2], "axis": 1},
+    # 3-D
+    {"shape": (2, 3, 4), "reps": [1, 2, 3, 4], "axis": 2},
+    {"shape": (2, 3, 4), "reps": [0, 3], "axis": 0},
+    {"shape": (2, 3, 4), "reps": [1, 2, 3], "axis": 1},
+    {"shape": (2, 3, 4), "reps": [4], "axis": 2},
+    # negative axis
+    {"shape": (2, 3, 4), "reps": [1, 2, 3, 4], "axis": -1},
+    {"shape": (2, 3, 4), "reps": [1, 2, 3], "axis": -2},
+    # axis=None
+    {"shape": (2, 3), "reps": [1, 2, 3, 4, 5, 0], "axis": None},
+    {"shape": (4,), "reps": [0, 0, 0, 0], "axis": None},
+    {"shape": (4,), "reps": [5, 0, 3, 1], "axis": None},
+    # zeros in reps
+    {"shape": (4,), "reps": [0, 2, 0, 1], "axis": 0},
+    {"shape": (2, 3), "reps": [0, 3, 0], "axis": 1},
+    # broadcast
+    {"shape": (2, 3), "reps": [0], "axis": 0},
+    {"shape": (2, 3), "reps": [1], "axis": 1},
+    {"shape": (3, 4), "reps": [2], "axis": None},
+    # 4-D
+    {"shape": (2, 3, 4, 5), "reps": [2, 1, 3], "axis": 1},
+    # empty
+    {"shape": (0, 3), "reps": [2], "axis": 0},
+    {"shape": (2, 3), "reps": [0, 0, 0], "axis": 1},
+)
+class TestRepeatNdarrayRepeats:
+    """ndarray repeats matches numpy for diverse shapes, axes, and reps."""
+
+    @testing.numpy_cupy_array_equal()
+    def test_repeat(self, xp):
+        x = testing.shaped_arange(self.shape, xp)
+        return xp.repeat(x, xp.array(self.reps), self.axis)
+
+
+@testing.parameterize(
+    *[
+        {"rep_dtype": d}
+        for d in [
+            numpy.int8,
+            numpy.int16,
+            numpy.int32,
+            numpy.int64,
+            numpy.uint8,
+            numpy.uint16,
+            numpy.uint32,
+        ]
+    ]
+)
+class TestRepeatNdarrayRepsDtype:
+    """Various integer dtypes for reps are accepted."""
+
+    @testing.numpy_cupy_array_equal()
+    def test_repeat(self, xp):
+        x = testing.shaped_arange((4,), xp)
+        return xp.repeat(x, xp.array([1, 2, 3, 4], dtype=self.rep_dtype), 0)
+
+
+@testing.parameterize(
+    *[
+        {"a_dtype": d}
+        for d in [
+            numpy.bool_,
+            numpy.int32,
+            numpy.float32,
+            numpy.float64,
+            numpy.complex64,
+        ]
+    ]
+)
+class TestRepeatNdarrayArrayDtype:
+    """Output dtype matches input dtype."""
+
+    @testing.numpy_cupy_array_equal()
+    def test_dtype_preserved(self, xp):
+        x = testing.shaped_arange((3, 4), xp, dtype=self.a_dtype)
+        return xp.repeat(x, xp.array([1, 2, 3, 4]), axis=1)
+
+
+class TestRepeatNdarrayNonContiguous:
+
+    @testing.numpy_cupy_array_equal()
+    def test_transposed(self, xp):
+        x = testing.shaped_arange((4, 3), xp).T
+        return xp.repeat(x, xp.array([2, 1, 3, 0]), axis=1)
+
+    @testing.numpy_cupy_array_equal()
+    def test_strided(self, xp):
+        x = testing.shaped_arange((3, 8), xp)[:, ::2]
+        return xp.repeat(x, xp.array([1, 2, 3, 0]), axis=1)
+
+    @testing.numpy_cupy_array_equal()
+    def test_reversed(self, xp):
+        x = testing.shaped_arange((5,), xp)[::-1]
+        return xp.repeat(x, xp.array([0, 1, 2, 1, 0]))
+
+
+class TestRepeatNdarrayDtypeEdges:
+
+    @testing.numpy_cupy_array_equal()
+    def test_bool_perelement(self, xp):
+        return xp.repeat(xp.arange(3), xp.array([True, False, True]))
+
+    @testing.numpy_cupy_array_equal()
+    def test_bool_broadcast(self, xp):
+        return xp.repeat(
+            testing.shaped_arange((3, 4), xp), xp.array([True]), axis=0
+        )
+
+    @testing.numpy_cupy_array_equal()
+    def test_uint32_accepted(self, xp):
+        return xp.repeat(
+            xp.arange(4), xp.array([1, 2, 3, 4], dtype=numpy.uint32)
+        )
+
+
+class TestRepeatNdarrayLarge:
+
+    @testing.numpy_cupy_array_equal()
+    def test_large_single(self, xp):
+        return xp.repeat(
+            testing.shaped_arange((3,), xp), xp.array([0, 100000, 0])
+        )
+
+    @testing.numpy_cupy_array_equal()
+    def test_large_broadcast(self, xp):
+        return xp.repeat(testing.shaped_arange((3,), xp), xp.array([50000]))
+
+
+class TestRepeatScalarEquivalence:
+    """All scalar-like repeats inputs produce identical results."""
+
+    def _check_all_equal(self, a, n, axis):
+        expected = cupy.array(numpy.repeat(cupy.asnumpy(a), n, axis))
+        for form in [
+            n,
+            [n],
+            cupy.array([n]),
+            cupy.array(n),
+        ]:  # numpy.intp(n) is not supported
+            testing.assert_array_equal(cupy.repeat(a, form, axis), expected)
+
+    def test_equivalence(self):
+        a = cupy.arange(6).reshape(2, 3)
+        for n, axis in [(3, None), (2, 0), (4, 1), (0, 0), (1, 0)]:
+            self._check_all_equal(a, n, axis)
+
+    def test_negative_raises(self):
+        a = cupy.arange(3)
+        for form in [
+            -1,
+            [-1],
+            cupy.array([-1]),
+            cupy.array(-1),
+        ]:  # numpy.intp(n) is not supported
+            with pytest.raises(ValueError, match=r"positive"):
+                cupy.repeat(a, form)
+
+    @pytest.mark.skip("dpnp.repeat does not accept numpy scalars as repeats")
+    def test_numpy_scalar_accepted(self):
+        a = cupy.arange(3)
+        testing.assert_array_equal(
+            cupy.repeat(a, numpy.int64(2)), cupy.repeat(a, 2)
+        )
+
+    def test_numpy_ndarray_rejected(self):
+        a = cupy.arange(3)
+        with pytest.raises(TypeError, match="numpy.ndarray"):
+            cupy.repeat(a, numpy.array([1, 2, 3]))
+
+
+class TestRepeatNdarrayErrors:
+
+    def test_length_mismatch(self):
+        with pytest.raises(ValueError, match=r"must be broadcastable"):
+            cupy.repeat(cupy.arange(4), cupy.array([1, 2]), axis=0)
+
+    def test_negative(self):
+        with pytest.raises(ValueError, match=r"positive"):
+            cupy.repeat(cupy.arange(3), cupy.array([-1, 1, 2]))
+
+    def test_float_dtype_matches_numpy(self):
+        # Both NumPy and CuPy raise TypeError for unsafe cast
+        for xp in (numpy, cupy):
+            with pytest.raises(TypeError):
+                xp.repeat(xp.arange(3), xp.array([1.0, 1.0, 1.0]), 0)
+
+    @pytest.mark.skip("dpnp accepts uint64 repeats, unlike numpy")
+    def test_uint64_matches_numpy(self):
+        # Both reject uint64 (unsigned → signed is unsafe)
+        for xp in (numpy, cupy):
+            with pytest.raises(TypeError):
+                xp.repeat(xp.arange(3), xp.array([1, 2, 3], dtype=numpy.uint64))
+
+    def test_ndim_gt1_matches_numpy(self):
+        for xp in (numpy, cupy):
+            with pytest.raises(ValueError):
+                xp.repeat(xp.arange(6), xp.array([[1, 2, 3, 4, 5, 6]]))
+
+    @pytest.mark.skip("different message for nested lists")
+    def test_ndim_gt1_list_rejected(self):
+        with pytest.raises(ValueError, match=r"too deep"):
+            cupy.repeat(cupy.arange(6), [[1, 2, 3, 4, 5, 6]])
+
+    def test_bad_axis(self):
+        with pytest.raises(Exception):
+            cupy.repeat(
+                cupy.arange(12).reshape(3, 4), cupy.array([1, 2, 3]), axis=5
+            )
+
+    def test_method_interface(self):
+        a = cupy.arange(4)
+        reps = cupy.array([1, 2, 0, 3])
+        testing.assert_array_equal(a.repeat(reps), cupy.repeat(a, reps))
+
+
 @testing.parameterize(
     {"reps": 0},
     {"reps": 1},

From c0de37dd880d9a56ff16ae2015ba7bf777f21cc3 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 11:14:44 +0200
Subject: [PATCH 06/37] Fix ZeroDivisionError when sorting along zero-length
 axis

---
 dpnp/tests/third_party/cupy/sorting_tests/test_sort.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
index ba64ef949cb0..3bf1c405d0d0 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
@@ -37,6 +37,11 @@ def test_external_sort_zero_dim(self):
             with pytest.raises(AxisError):
                 xp.sort(a)
 
+    @testing.numpy_cupy_array_equal()
+    def test_sort_zero_length_axis(self, xp):
+        """Sorting along a zero-length axis is a no-op (#9816)."""
+        return xp.sort(xp.empty((2, 0)), axis=-1)
+
     @testing.numpy_cupy_array_equal()
     def test_sort_two_or_more_dim(self, xp):
         a = testing.shaped_random((2, 3, 3), xp)

From d4f36c928efeecf4f382926110171d34c85ac357 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 11:40:55 +0200
Subject: [PATCH 07/37] Fix integer comparisons

---
 .../cupy/logic_tests/test_comparison.py       | 68 ++++++++++++++++---
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py b/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py
index 5215191987b4..91a9dc727924 100644
--- a/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py
+++ b/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py
@@ -36,16 +36,18 @@ def test_equal(self):
         self.check_binary("equal")
 
 
-class TestComparisonOperator(unittest.TestCase):
+operators = [
+    operator.lt,
+    operator.le,
+    operator.eq,
+    operator.ne,
+    operator.gt,
+    operator.ge,
+]
 
-    operators = [
-        operator.lt,
-        operator.le,
-        operator.eq,
-        operator.ne,
-        operator.gt,
-        operator.ge,
-    ]
+
+class TestComparisonOperator:
+    operators = operators
 
     @testing.for_all_dtypes(no_complex=True)
     @testing.numpy_cupy_array_equal()
@@ -75,6 +77,52 @@ def test_binary_array_pyscalar(self, xp, dtype):
         b = 3
         return [op(a, b) for op in self.operators]
 
+    @pytest.mark.skip("SAT-8549")
+    @pytest.mark.parametrize(
+        "dtype", [numpy.int8, numpy.int64, numpy.uint8, numpy.uint64]
+    )
+    @pytest.mark.parametrize("scalar", [-1, 0, 2**32, 2**63, 2**64 - 1])
+    @pytest.mark.parametrize("op", operators)
+    @testing.numpy_cupy_array_equal()
+    @numpy.errstate(over="ignore")
+    def test_binary_array_pyscalar_int(self, xp, dtype, scalar, op):
+        # This test also checks large mixed unsigned/signed comparisons.
+        min_, max_ = numpy.iinfo(dtype).min, numpy.iinfo(dtype).max
+
+        a = xp.array([min_, 0, max_, xp.dtype(dtype).type(0) - 1], dtype=dtype)
+        b = scalar
+        return [op(a, b), op(b, a)]
+
+    @pytest.mark.parametrize(
+        "dtype", [numpy.float16, numpy.float32, numpy.float64]
+    )
+    @pytest.mark.parametrize(
+        "scalar", [-1, 0, 2**32, 2**31 - 1, 2**31 + 1, 2**63, 2**64 - 1]
+    )
+    @pytest.mark.parametrize("op", operators)
+    @testing.numpy_cupy_array_equal()
+    @numpy.errstate(over="ignore")
+    def test_binary_array_pyscalar_int_and_float(self, xp, dtype, scalar, op):
+        a = xp.array(
+            [-1, 0, 2**31 - 1, 2**31 + 1, 2**32, 2**63 - 1, 2**62, 2**62 + 1]
+        )
+        a = a.astype(dtype)  # cast (overflow OK)
+        b = scalar
+        return [op(a, b), op(b, a)]
+
+    @pytest.mark.skip("SAT-8549")
+    @pytest.mark.parametrize(
+        "scalar,safe_scalar",
+        [(2**63, 2), (2**63 + 100, 2), (-(2**63), -1)],
+    )
+    @pytest.mark.parametrize("op", operators)
+    def test_binary_array_pyscalar_int_and_bool(self, scalar, safe_scalar, op):
+        # As of 2.5, NumPy uses the default integer and fails for very large
+        # Python scalars. But CuPy uses uint64 and succeeds.
+        a = cupy.array([True, False])
+        testing.assert_array_equal(op(a, scalar), op(a, safe_scalar))
+        testing.assert_array_equal(op(scalar, a), op(safe_scalar, a))
+
 
 class TestArrayEqual(unittest.TestCase):
 
@@ -207,6 +255,7 @@ def test_allclose_finite(self, xp, dtype):
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_equal()
+    @numpy.errstate(over="ignore")
     def test_allclose_min_int(self, xp, dtype):
         a = xp.array([0]).astype(dtype)
         b = xp.array([numpy.iinfo("i").min]).astype(dtype)
@@ -253,6 +302,7 @@ def test_is_close_finite(self, xp, dtype):
 
     @testing.for_all_dtypes(no_complex=True)
     @testing.numpy_cupy_array_equal()
+    @numpy.errstate(over="ignore")
     def test_is_close_min_int(self, xp, dtype):
         # In numpy<1.10 this test fails when dtype is bool
         a = xp.array([0]).astype(dtype)

From 3516f892f600e3fb21ef8cfbdd567a281fadaf89 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 11:48:33 +0200
Subject: [PATCH 08/37] Fix delete incompatibilities with NumPy

---
 .../manipulation_tests/test_add_remove.py     | 48 ++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_add_remove.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_add_remove.py
index 31bbc9691889..ba29b67ba6e3 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_add_remove.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_add_remove.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+import dpnp as cupy
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 from dpnp.tests.third_party.cupy.testing._loops import (
@@ -10,7 +11,7 @@
 )
 
 
-class TestDelete(unittest.TestCase):
+class TestDelete:
 
     @testing.numpy_cupy_array_equal()
     def test_delete_with_no_axis(self, xp):
@@ -56,6 +57,51 @@ def test_delete_with_indices_as_int(self, xp):
         #    pytest.xfail("HIP may have a bug")
         return xp.delete(arr, indices)
 
+    def test_delete_array_like_input(self):
+        arr = [[0, 1, 2], [3, 4, 5]]
+        with pytest.raises((TypeError, ValueError)):
+            cupy.delete(arr, [1], axis=1)
+
+    @pytest.mark.parametrize(
+        "make_obj",
+        [
+            pytest.param(lambda xp: [], id="empty_list"),
+            pytest.param(lambda xp: (), id="empty_tuple"),
+            pytest.param(lambda xp: [0, 2], id="int_list"),
+            pytest.param(
+                lambda xp: [True, False, True], id="matching_bool_list"
+            ),
+            pytest.param(lambda xp: (0, True), id="mixed_tuple"),
+            pytest.param(
+                lambda xp: xp.array([], dtype=xp.int_), id="empty_int_array"
+            ),
+            pytest.param(lambda xp: xp.array([0, 2]), id="int_array"),
+            pytest.param(
+                lambda xp: xp.array([-1, -3]), id="negative_int_array"
+            ),
+            # The following raise ValueError (wrong-size or scalar bool masks).
+            pytest.param(lambda xp: True, id="scalar_true"),
+            pytest.param(lambda xp: False, id="scalar_false"),
+            pytest.param(lambda xp: [True, False], id="wrong_size_bool_list"),
+            pytest.param(lambda xp: xp.array(True), id="zerodim_bool_array"),
+            pytest.param(
+                lambda xp: xp.array([True, False]), id="wrong_size_bool_array"
+            ),
+            pytest.param(
+                lambda xp: xp.array([], dtype=xp.bool_), id="empty_bool_array"
+            ),
+            # The following raise IndexError (non-integer index arrays).
+            pytest.param(lambda xp: xp.array([1.5]), id="float_array_single"),
+            pytest.param(
+                lambda xp: xp.array([0.0, 2.0]), id="float_array_multi"
+            ),
+        ],
+    )
+    @testing.numpy_cupy_array_equal(accept_error=(ValueError, IndexError))
+    def test_delete_obj_variants(self, xp, make_obj):
+        arr = xp.array([10, 20, 30])
+        return xp.delete(arr, make_obj(xp))
+
 
 class TestAppend(unittest.TestCase):
 

From 062a6e61d834b15cd41359bdff77a2b47d827227 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 11:52:19 +0200
Subject: [PATCH 09/37] Add fast-path for gufunc (specifically matmul)

---
 .../cupy/core_tests/test_gufuncs.py           | 219 +++++++++++++++++-
 1 file changed, 211 insertions(+), 8 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_gufuncs.py b/dpnp/tests/third_party/cupy/core_tests/test_gufuncs.py
index 977313e266f9..c14511df0753 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_gufuncs.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_gufuncs.py
@@ -13,16 +13,39 @@ class TestGUFuncSignature:
     @pytest.mark.parametrize(
         "signature",
         [
-            ("(i,j)->(i,j)", [("i", "j")], [("i", "j")]),
-            ("->(i)", [()], [("i",)]),
-            ("(i,j),(j,k)->(k,l)", [("i", "j"), ("j", "k")], [("k", "l")]),
-            ("()->()", [()], [()]),
+            (
+                "(i,j)->(i,j)",
+                [(("i", False, False), ("j", False, False))],
+                [(("i", False, False), ("j", False, False))],
+                2,
+            ),
+            ("->(i)", [()], [(("i", False, False),)], 1),
+            (
+                "(i,j),(j,k)->(k,l)",
+                [
+                    (("i", False, False), ("j", False, False)),
+                    (("j", False, False), ("k", False, False)),
+                ],
+                [(("k", False, False), ("l", False, False))],
+                4,
+            ),
+            ("()->()", [()], [()], 0),
+            (
+                "(i?,j|1),(i?,j)->(i?,j)",
+                [
+                    (("i", True, False), ("j", False, True)),
+                    (("i", True, False), ("j", False, False)),
+                ],
+                [(("i", True, False), ("j", False, False))],
+                2,
+            ),
         ],
     )
     def test_signature_parsing(self, signature):
-        i, o = cupy._core._gufuncs._parse_gufunc_signature(signature[0])
+        i, o, n_cd = cupy._core._gufuncs._parse_gufunc_signature(signature[0])
         assert i == signature[1]
         assert o == signature[2]
+        assert n_cd == signature[3]
 
     @pytest.mark.parametrize(
         "signature",
@@ -53,6 +76,15 @@ def func(x):
 
         return _GUFunc(func, signature)
 
+    def _get_gufunc_scalar_supports_all(self, signature):
+        def func(x, out=None):
+            # Does not use keepdims, but gufunc supports it.
+            return x.sum(axis=-1, out=out)
+
+        return _GUFunc(
+            func, signature, supports_batched=True, supports_out=True
+        )
+
     @pytest.mark.parametrize(
         "axes",
         [
@@ -101,14 +133,61 @@ def test_axes_selection_single(self, xp, axes):
         else:
             return numpy.moveaxis(x, axes[0], axes[1])
 
+    @pytest.mark.parametrize(
+        "axes",
+        [
+            [(0, 1), (0, 1), (0, 1)],
+            [(0, 1), (0, 1), (1, 0)],
+            [(-2, -1), (-3, 0), (-1, -3)],
+        ],
+    )
+    @pytest.mark.parametrize("use_out", [True, False])
+    @testing.numpy_cupy_array_equal()
+    def test_axes_matmul(self, xp, axes, use_out):
+        # Do not use a weird shape, but rather rely on each
+        # arange transpose giving a unique result.
+        x = testing.shaped_arange((3, 3, 3, 3), xp=xp)
+        y = testing.shaped_arange((3, 3, 3, 3), xp=xp)
+        if use_out:
+            out = xp.empty((3, 3, 3, 3))
+        else:
+            out = None
+
+        return xp.matmul(x, y, axes=axes, out=out)
+
+    @pytest.mark.parametrize("ax,outer_ax", [(0, 1), (1, 0), ((-1,), 0)])
+    @testing.numpy_cupy_array_equal(accept_error=numpy.exceptions.AxisError)
+    def test_axes_single_matmul(self, xp, ax, outer_ax):
+        # We do not allow this (just as NumPy), although it may be possible
+        # to define it in principle.
+        x = xp.ones((2, 3))
+        y = xp.ones((2, 3))
+        xp.matmul(x, y, axes=[ax] * 2 + [()])
+        # no return, should raise error.
+
+    @pytest.mark.parametrize("axis", [0, 1, 2, 3])
+    @pytest.mark.parametrize("keepdims", [True, False])
+    @testing.numpy_cupy_array_equal()
+    def test_axis(self, xp, axis, keepdims):
+        x = testing.shaped_arange((2, 3, 4, 5), xp=xp)
+        if xp is cupy:
+            return self._get_gufunc_scalar("(i)->()")(
+                x, axis=axis, keepdims=keepdims
+            )
+        else:
+            return x.sum(axis=axis, keepdims=keepdims)
+
     @pytest.mark.parametrize("axis", [0, 1, 2, 3])
+    @pytest.mark.parametrize("keepdims", [True, False])
     @testing.numpy_cupy_array_equal()
-    def test_axis(self, xp, axis):
+    def test_axis_full_core_support(self, xp, axis, keepdims):
         x = testing.shaped_arange((2, 3, 4, 5), xp=xp)
         if xp is cupy:
-            return self._get_gufunc_scalar("(i)->()")(x, axis=axis)
+            return self._get_gufunc_scalar_supports_all("(i)->()")(
+                x, axis=axis, keepdims=keepdims
+            )
         else:
-            return x.sum(axis=axis)
+            return x.sum(axis=axis, keepdims=keepdims)
 
     def test_axis_invalid(self):
         x = testing.shaped_arange((2, 3, 4, 5))
@@ -306,3 +385,127 @@ def default(x, y):
         y = x
         with pytest.raises(TypeError):
             gu_func(x, y, casting="unsafe", signature=sig)
+
+
+class TestGUFuncOptional:
+    def _get_gufunc_ridiculous_optional(self):
+        signature = "(a?,b,c,d?),(i?,j?,k,l)->(b,c,a?,d?,k,l,j?,i?)"
+
+        def func(x, y):
+            # The ufunc is always passed all dimensions (filled in with 1)
+            # if omitted and optional.
+            res_shape = x.shape[1:-1] + (x.shape[0], x.shape[-1])
+            res_shape += y.shape[2:] + (y.shape[1], y.shape[0])
+            return cupy.ones(res_shape)
+
+        return _GUFunc(func, signature)
+
+    def _get_forbidden_optional(self):
+        signature = "(a?,b?),(b,a?)->(a?,b?)"
+
+        def func(x, y):
+            raise RuntimeError("this will not be called")
+
+        return _GUFunc(func, signature)
+
+    @pytest.mark.parametrize(
+        "x_ndim, y_ndim",
+        [
+            (2, 2),
+            (3, 2),
+            (2, 3),
+            (3, 3),
+            (4, 2),
+            (2, 4),
+            (4, 3),
+            (3, 4),
+            (4, 4),
+            (6, 6),
+        ],
+    )
+    def test_ridiculous_optional(self, x_ndim, y_ndim):
+        gufunc = self._get_gufunc_ridiculous_optional()
+
+        x_shape = tuple(range(1, x_ndim + 1))
+        y_shape = tuple(range(1, y_ndim + 1))
+        x = cupy.ones(x_shape)
+        y = cupy.ones(y_shape)
+        # Succeeds if the correct `func` above matches with allocated output.
+        res = gufunc(x, y)
+
+        if x_ndim == 6 and y_ndim == 6:
+            # only this case has extra leading (outer) dimensions
+            x_shape = x_shape[2:]
+            y_shape = y_shape[2:]
+            outer_shape = (1, 2)
+        else:
+            outer_shape = ()
+
+        # Check that the result shape is actually what we expect it to be.
+        if x.ndim == 2:  # b, c
+            core_shape = x_shape
+        elif x.ndim == 3:  # b, c, d -> b, c, d
+            core_shape = x_shape[:-1] + (x_shape[-1],)
+        else:  # a, b, c, d -> b, c, a, d
+            core_shape = x_shape[1:-1] + (x_shape[0], x_shape[-1])
+
+        if y.ndim == 2:  # k, l
+            core_shape += y_shape
+        elif y.ndim == 3:  # j, k, l -> k, l, j
+            core_shape += y_shape[1:] + (y_shape[0],)
+        else:  # i, j, k, l -> k, l, j, i
+            core_shape += y_shape[2:] + (y_shape[1], y_shape[0])
+
+        assert res.shape == outer_shape + core_shape
+
+    def test_forbidden_optional(self):
+        gufunc = self._get_forbidden_optional()
+        x = cupy.ones(2)
+        y = cupy.ones((2, 2))
+        with pytest.raises(ValueError):
+            # first op is missing `a` at the front but second is not
+            gufunc(x, y)
+
+        with pytest.raises(ValueError):
+            # second op is missing `a` at the end but first is not
+            gufunc(y, x)
+
+
+class TestGUFuncBroadcastable:
+    def _get_gufunc(self):
+        def func(x, y):
+            shape = cupy.broadcast_shapes(x.shape, y.shape)
+            return cupy.ones(shape)
+
+        return _GUFunc(func, "(i|1,j|1),(i|1,j)->(i,j)")
+
+    @pytest.mark.parametrize(
+        "x_shape, y_shape",
+        [
+            ((2, 1), (2, 3)),
+            ((1, 1), (2, 1)),
+            ((2, 3), (1, 3)),
+            ((1, 1), (1, 1)),
+        ],
+    )
+    def test_broadcastable(self, x_shape, y_shape):
+        func = self._get_gufunc()
+        x = cupy.ones(x_shape)
+        y = cupy.ones(y_shape)
+
+        res = func(x, y)
+        assert res.shape == cupy.broadcast_shapes(x_shape, y_shape)
+
+    @pytest.mark.parametrize(
+        "x_shape, y_shape",
+        [
+            ((2, 3), (2, 1)),  # second operand's j == 1 is not broadcastable
+        ],
+    )
+    def test_not_broadcastable(self, x_shape, y_shape):
+        func = self._get_gufunc()
+        x = cupy.ones(x_shape)
+        y = cupy.ones(y_shape)
+
+        with pytest.raises(ValueError):
+            func(x, y)

From b432cae4bcea76e5196bdcd529be1e2d00639814 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:17:24 +0200
Subject: [PATCH 10/37] Do not unload modules/code that have been used

---
 .../cupy/core_tests/test_multithreading.py    | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 dpnp/tests/third_party/cupy/core_tests/test_multithreading.py

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py b/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py
new file mode 100644
index 000000000000..c5e77d38daa0
--- /dev/null
+++ b/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+import concurrent.futures
+import threading
+
+import pytest
+
+import dpnp as cupy
+
+pytest.skip(
+    "ElementwiseKernel / create_ufunc internals are not supported",
+    allow_module_level=True,
+)
+
+# thread_unsafe marker requires pytest-run-parallel, which is not used by dpnp
+# pytestmark = pytest.mark.thread_unsafe(
+#     reason="tests in this module are already explicitly multi-threaded"
+# )
+
+
+def run_threaded(
+    func,
+    max_workers=8,
+    pass_count=False,
+    pass_barrier=False,
+    outer_iterations=1,
+    prepare_args=None,
+):
+    """Runs a function many times in parallel
+
+    This function has been taken from NumPy:
+    https://github.com/numpy/numpy/blob/a90ef57574c501a780fe834123b20fcea1329f90/numpy/testing/_private/utils.py#L2807
+    """
+    for _ in range(outer_iterations):
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=max_workers
+        ) as tpe:
+            if prepare_args is None:
+                args = []
+            else:
+                args = prepare_args()
+            if pass_barrier:
+                barrier = threading.Barrier(max_workers)
+                args.append(barrier)
+            if pass_count:
+                all_args = [(func, i, *args) for i in range(max_workers)]
+            else:
+                all_args = [(func, *args) for i in range(max_workers)]
+            try:
+                futures = []
+                for arg in all_args:
+                    futures.append(tpe.submit(*arg))
+            except RuntimeError as e:
+                pytest.skip(
+                    f"Spawning {max_workers} threads failed with "
+                    f"error {e!r} (likely due to resource limits on "
+                    "the system running the tests)"
+                )
+            finally:
+                if len(futures) < max_workers and pass_barrier:
+                    barrier.abort()
+            for f in futures:
+                f.result()
+
+
+@pytest.mark.slow
+def test_elementwise_kernel_cache():
+    """Checks that a thread always uses the same compiled kernel
+    which means that we don't unload a kernel that was ever used.
+
+    This matters for graph capture, although in some cases just
+    unloading a module may create problems and that isn't prevented.
+    I.e. a race can still mean that multiple threads compile the same code.
+    When this happens, we test that one version is used everywhere.
+    """
+
+    def prepare_args():
+        kernel = cupy.ElementwiseKernel("T x", "T y", "y = x;")
+        assert not kernel._elementwise_kernel_memo
+        arr = cupy.ones(10)
+        return [kernel, arr]
+
+    def func(kernel, arr, barrier):
+        barrier.wait()
+        kernel(arr)
+        assert len(kernel._elementwise_kernel_memo) == 1
+        cached_obj1 = next(iter(kernel._elementwise_kernel_memo.values()))
+        kernel(arr)
+        assert len(kernel._elementwise_kernel_memo) == 1
+        cached_obj2 = next(iter(kernel._elementwise_kernel_memo.values()))
+        assert cached_obj1 is cached_obj2
+
+    run_threaded(
+        func, outer_iterations=20, pass_barrier=True, prepare_args=prepare_args
+    )
+
+
+@pytest.mark.slow
+def test_ufunc_kernel_cache():
+    # See test_elementwise_kernel_cache for more details.
+    def prepare_args():
+        ufunc = cupy._core.create_ufunc("cache_test", ("d->d",), "out0 = in0")
+        assert not ufunc._kernel_memo
+        arr = cupy.ones(10)
+        return [ufunc, arr]
+
+    def func(ufunc, arr, barrier):
+        barrier.wait()
+        ufunc(arr)
+        assert len(ufunc._kernel_memo) == 1
+        cached_obj1 = next(iter(ufunc._kernel_memo.values()))
+        ufunc(arr)
+        assert len(ufunc._kernel_memo) == 1
+        cached_obj2 = next(iter(ufunc._kernel_memo.values()))
+        assert cached_obj1 is cached_obj2
+
+    run_threaded(
+        func, outer_iterations=20, pass_barrier=True, prepare_args=prepare_args
+    )

From 51f12ef5d8018d2ab6ccbdde673d1d44fb89b5f0 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:24:28 +0200
Subject: [PATCH 11/37] Support cp.from_dlpack with ml_dtypes.bfloat16
 Optionally

---
 dpnp/tests/third_party/cupy/core_tests/test_dlpack.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index eb9e958fad0b..5515886bf0c4 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -21,6 +21,9 @@ def _gen_array(dtype, alloc_q=None):
         array = numpy.random.random((2, 3))
     elif dtype == cupy.bool_:
         array = numpy.random.randint(0, 2, size=(2, 3))
+    # bfloat16 is not supported by dpnp
+    # elif dtype.name == "bfloat16":
+    #     array = numpy.random.rand(2, 3)
     else:
         assert False, f"unrecognized dtype: {dtype}"
     return cupy.asarray(array, sycl_queue=alloc_q).astype(dtype)
@@ -89,6 +92,14 @@ def test_conversion(self, dtype):
         testing.assert_array_equal(orig_array, out_array)
         testing.assert_array_equal(orig_array.data.ptr, out_array.data.ptr)
 
+    @pytest.mark.skip("bfloat16 dtype is not supported")
+    def test_conversion_bfloat16(self):
+        ml_dtypes = pytest.importorskip("ml_dtypes")
+        orig_array = _gen_array(numpy.dtype(ml_dtypes.bfloat16))
+        out_array = cupy.from_dlpack(orig_array)
+        testing.assert_array_equal(orig_array, out_array)
+        testing.assert_array_equal(orig_array.data.ptr, out_array.data.ptr)
+
     @pytest.mark.skip("no limitations in from_dlpack()")
     def test_from_dlpack_and_conv_errors(self):
         orig_array = _gen_array("int8")

From 8409000a480e90d957ec4b967ec037bf1133777c Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:26:42 +0200
Subject: [PATCH 12/37] Deprecate jitify=True support (and jitify=False)

---
 .../third_party/cupy/core_tests/test_raw.py   | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_raw.py b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
index 59581674a856..b00d56c85c3a 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_raw.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
@@ -1176,6 +1176,7 @@ def test_compile_module(self):
     # Finally, we test NVCC
     {"backend": "nvcc", "in_memory": False},
 )
+@pytest.mark.filterwarnings("ignore:.*jitify=False:DeprecationWarning")
 class TestRaw(_TestRawBase, unittest.TestCase):
     pass
 
@@ -1196,6 +1197,7 @@ class TestRaw(_TestRawBase, unittest.TestCase):
 @pytest.mark.thread_unsafe(
     reason="Jitify seems to have problems, skip as largely unmaintained."
 )
+@pytest.mark.filterwarnings("ignore:jitify=True:DeprecationWarning")
 class TestRawWithJitify(_TestRawBase, unittest.TestCase):
     pass
 
@@ -1512,6 +1514,7 @@ def test_jitify5(self):
 
 @unittest.skipIf(cupy.cuda.runtime.is_hip, "Jitify does not support ROCm/HIP")
 @testing.slow
+@pytest.mark.filterwarnings("ignore:.*jitify=False:DeprecationWarning")
 class TestRawJitifyNoJitify(_TestRawJitify, unittest.TestCase):
     jitify = False
 
@@ -1521,5 +1524,29 @@ class TestRawJitifyNoJitify(_TestRawJitify, unittest.TestCase):
 @pytest.mark.thread_unsafe(
     reason="Jitify seems to have problems, skip as largely unmaintained."
 )
+@pytest.mark.filterwarnings("ignore:jitify=True:DeprecationWarning")
 class TestRawJitifyJitify(_TestRawJitify, unittest.TestCase):
     jitify = True
+
+
+@pytest.mark.parametrize(
+    "jitify,match",
+    [(True, ".*"), (False, "Avoid passing.*jitify=False")],
+)
+@unittest.skipIf(cupy.cuda.runtime.is_hip, "Jitify does not support ROCm/HIP")
+@testing.slow
+@pytest.mark.thread_unsafe(reason="uses temporary cache dir")
+@use_temporary_cache_dir()
+def test_jitify_deprecation_warning(jitify, match):
+    with pytest.warns(DeprecationWarning, match=match):
+        cupy.RawKernel(
+            _test_source1, "test_sum", backend="nvrtc", jitify=jitify
+        )
+
+    with pytest.warns(DeprecationWarning, match=match):
+        cupy.RawModule(code=_test_source1, backend="nvrtc", jitify=jitify)
+
+    # Not technically part of the rawkernel, but test warning in compile here:
+    with pytest.warns(DeprecationWarning, match=match):
+        # compiler is not imported in dpnp (module is skipped)
+        compiler.compile_using_nvrtc("", options=(), jitify=jitify)

From cfebec00a95d20523802103622e6392651ffa0a3 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:31:43 +0200
Subject: [PATCH 13/37] Slightly bump SVD test tolerance (but tighten it for
 float64)

---
 .../tests/third_party/cupy/linalg_tests/test_decomposition.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py b/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
index 697e4ee7988d..5f4e9d48e9ce 100644
--- a/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
+++ b/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
@@ -298,7 +298,9 @@ def check_usv(self, shape, dtype):
                 )
             else:
                 a_gpu_usv = cupy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu)
-        testing.assert_allclose(a_gpu, a_gpu_usv, rtol=1e-4, atol=1e-4)
+
+        tol = numpy.finfo(a_gpu_usv.dtype).eps * 1024
+        testing.assert_allclose(a_gpu, a_gpu_usv, rtol=tol, atol=tol)
 
         # assert unitary
         u_len = u_gpu.shape[-1]

From 73935f46a36298e06366f8e5349bcf36b76f961d Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:34:58 +0200
Subject: [PATCH 14/37] Implement kernel cache save/load abstraction

---
 .../third_party/cupy/core_tests/test_raw.py   |  6 +-
 .../third_party/cupy/cuda_tests/__init__.py   |  0
 .../cupy/cuda_tests/test_compiler_cache.py    | 98 +++++++++++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 dpnp/tests/third_party/cupy/cuda_tests/__init__.py
 create mode 100644 dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_raw.py b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
index b00d56c85c3a..2d3b8c153b86 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_raw.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
@@ -356,11 +356,11 @@
 def use_temporary_cache_dir():
     # Note uses mock, so not thread-safe (except at class/method level)
     # tempdir fixture could be used instead.
-    target1 = "cupy.cuda.compiler.get_cache_dir"
+    target1 = "cupy.cuda.compiler._kernel_cache_backend._cache_dir"
     target2 = "cupy.cuda.compiler._empty_file_preprocess_cache"
     temp_cache = {}
     with tempfile.TemporaryDirectory() as path:
-        with mock.patch(target1, lambda: path):
+        with mock.patch(target1, path):
             with mock.patch(target2, temp_cache):
                 yield path
 
@@ -391,7 +391,7 @@ def find_nvcc_ver():
     cmd = cupy.cuda.get_nvcc_path().split()
     cmd += ["--version"]
 
-    output = compiler._run_cc(cmd, cupy.cuda.compiler.get_cache_dir(), "nvcc")
+    output = compiler._run_cc(cmd, None, "nvcc")
     match = re.search(nvcc_ver_pattern, output)
     assert match
 
diff --git a/dpnp/tests/third_party/cupy/cuda_tests/__init__.py b/dpnp/tests/third_party/cupy/cuda_tests/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py b/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py
new file mode 100644
index 000000000000..3e21e7bd8cc5
--- /dev/null
+++ b/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import os
+import tempfile
+
+import pytest
+
+pytest.skip(
+    "cupy.cuda._compiler_cache (kernel cache backend) is not supported",
+    allow_module_level=True,
+)
+
+# _compiler_cache is a CuPy CUDA internal with no dpnp equivalent
+# from cupy.cuda._compiler_cache import (
+#     DiskKernelCacheBackend,
+#     _hash_length,
+#     _default_cache_dir,
+# )
+
+
+class TestDiskKernelCacheBackend:
+    """Tests for DiskKernelCacheBackend implementation."""
+
+    def test_init_cache_dir(self):
+        """Test initialization with default cache directory."""
+        backend = DiskKernelCacheBackend()
+        cupy_cache_dir = os.environ.get("CUPY_CACHE_DIR")
+        if cupy_cache_dir is None:
+            assert backend._cache_dir == _default_cache_dir
+        else:
+            assert backend._cache_dir == cupy_cache_dir
+        assert os.path.isdir(backend._cache_dir)
+
+    def test_init_custom_cache_dir(self):
+        """Test initialization with custom cache directory."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_dir = os.path.join(tmpdir, "custom_cache")
+            backend = DiskKernelCacheBackend(cache_dir=cache_dir)
+            assert backend._cache_dir == cache_dir
+            assert os.path.isdir(cache_dir)
+
+    def test_save_and_load(self):
+        """Test basic save and load operations."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+
+            name = "test_kernel.cubin"
+            cubin = b"compiled_kernel_binary"
+            source = 'extern "C" __global__ void test() {}'
+
+            # Save the kernel
+            backend.save(name, cubin, source)
+
+            # Load it back
+            loaded_cubin = backend.load(name)
+            assert loaded_cubin == cubin
+
+    def test_load_nonexistent_file(self):
+        """Test loading a file that doesn't exist."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+
+            result = backend.load("nonexistent.cubin")
+            assert result is None
+
+    def test_load_file_too_short(self):
+        """Test loading a file that's too short to contain a hash."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+
+            # Write a file with less than _hash_length bytes
+            name = "short.cubin"
+            data = b"too_short"
+            assert len(data) < _hash_length
+            path = os.path.join(tmpdir, name)
+            with open(path, "wb") as f:
+                f.write(data)
+
+            result = backend.load(name)
+            assert result is None
+
+    def test_load_corrupted_hash(self):
+        """Test that corrupted cache files are rejected."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+
+            name = "corrupted.cubin"
+            path = os.path.join(tmpdir, name)
+
+            # Write file with wrong hash
+            cubin = b"kernel_data"
+            wrong_hash = b"0" * _hash_length  # Wrong hash
+            with open(path, "wb") as f:
+                f.write(wrong_hash + cubin)
+
+            # Load should return None due to hash mismatch
+            result = backend.load(name)
+            assert result is None

From 1629b040313b54d63288990732317a3aa6ccc091 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:39:08 +0200
Subject: [PATCH 15/37] Update test_assumed_runtime_version

---
 .../cupy/cuda_tests/test_runtime.py           | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py

diff --git a/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py b/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py
new file mode 100644
index 000000000000..0e432da5b653
--- /dev/null
+++ b/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import pickle
+import sys
+
+import pytest
+
+import dpnp as cupy
+
+pytest.skip(
+    "cupy.cuda.runtime / nvrtc internals are not supported",
+    allow_module_level=True,
+)
+
+# CUDA runtime internals have no dpnp equivalent
+# from cupy.cuda import driver
+# from cupy.cuda import nvrtc
+# from cupy.cuda import runtime
+
+
+class TestExceptionPicklable:
+
+    def test(self):
+        e1 = runtime.CUDARuntimeError(1)
+        e2 = pickle.loads(pickle.dumps(e1))
+        assert e1.args == e2.args
+        assert str(e1) == str(e2)
+
+
+class TestMemPool:
+
+    @pytest.mark.skipif(
+        runtime.is_hip, reason="HIP does not support async allocator"
+    )
+    @pytest.mark.skipif(
+        driver._is_cuda_python() and runtime.runtimeGetVersion() < 11020,
+        reason="cudaMemPool_t is supported since CUDA 11.2",
+    )
+    @pytest.mark.skipif(
+        not driver._is_cuda_python() and driver.get_build_version() < 11020,
+        reason="cudaMemPool_t is supported since CUDA 11.2",
+    )
+    @pytest.mark.skipif(
+        runtime.deviceGetAttribute(runtime.cudaDevAttrMemoryPoolsSupported, 0)
+        == 0,
+        reason="cudaMemPool_t is not supported on device 0",
+    )
+    def test_mallocFromPoolAsync(self):
+        # also test create/destroy a pool
+        props = runtime.MemPoolProps(
+            runtime.cudaMemAllocationTypePinned,
+            runtime.cudaMemHandleTypeNone,
+            runtime.cudaMemLocationTypeDevice,
+            0,
+        )  # on device 0
+        pool = runtime.memPoolCreate(props)
+        assert pool > 0
+        s = cupy.cuda.Stream()
+        ptr = runtime.mallocFromPoolAsync(128, pool, s.ptr)
+        assert ptr > 0
+        runtime.freeAsync(ptr, s.ptr)
+        runtime.memPoolDestroy(pool)
+
+
+@pytest.mark.skipif(
+    runtime.is_hip, reason="This assumption is correct only in CUDA"
+)
+def test_assumed_runtime_version():
+    # When CUDA Python is enabled, CuPy calculates the CUDA runtime version
+    # from NVRTC version. This test ensures that the assumption is correct
+    # by running the same logic in non-CUDA Python environment.
+    # When this fails, `runtime.runtimeGetVersion()` logic needs to be fixed.
+    major, minor = nvrtc.getVersion()
+    local_ver = runtime._getLocalRuntimeVersion()
+    # On Windows, starting from CUDA 13.0, cudaRuntimeGetVersion() always
+    # returns major * 1000 regardless of the minor version (nvbugs 5955788,
+    # 5523579). Accept either form on Windows + CUDA >= 13.
+    if sys.platform == "win32" and major >= 13:
+        assert local_ver in (major * 1000, major * 1000 + minor * 10)
+    else:
+        assert local_ver == major * 1000 + minor * 10
+
+
+def test_major_version():
+    major = runtime._getCUDAMajorVersion()
+    if runtime.is_hip:
+        assert major == 0
+    else:
+        assert 10 < major < 20

From f55b32802887d3a961f41a2d3e39ddeafe49eaf7 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:42:06 +0200
Subject: [PATCH 16/37] Restructure SingleDeviceMemoryPool and locking

---
 .../cupy/core_tests/test_multithreading.py    | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py b/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py
index c5e77d38daa0..ee577b187cff 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_multithreading.py
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
 import concurrent.futures
+import gc
+import random
 import threading
 
 import pytest
 
 import dpnp as cupy
 
+# cupy.cuda.memory.alloc has no dpnp equivalent
+# from cupy.cuda.memory import alloc
+
 pytest.skip(
     "ElementwiseKernel / create_ufunc internals are not supported",
     allow_module_level=True,
@@ -117,3 +122,51 @@ def func(ufunc, arr, barrier):
     run_threaded(
         func, outer_iterations=20, pass_barrier=True, prepare_args=prepare_args
     )
+
+
+@pytest.mark.slow
+# NOTE: With clean=False, this test can OOM, since the cycles may not
+# be cleaned up sufficiently in the `gc.collect()` we do on OOM.
+@pytest.mark.parametrize("clean", [True, False])
+def test_default_memory_pool_threaded(clean, iterations=500):
+    # This test is designed to stress-test the memory pool: we
+    # create various usage patterns and mix them in a threaded way.
+    # To seriously stress-test it, make `iterations` very large and watch
+    # the long-term behavior.
+
+    def random_allocation():
+        # choose a random allocation size, hopefully this will (occasionally)
+        # lead to allocations being split.
+        size = random.randint(1, 50_000)
+        return alloc(size)
+
+    def make_allocations():
+        allocations = []
+        for i in range(random.randint(1, 50)):
+            allocations.append(random_allocation())
+
+        # And now let's make a few that can't be cleaned up easily.
+        first = [None, random_allocation()]
+        curr = first
+        for i in range(2, 50):
+            node = [curr, random_allocation()]
+            curr = node
+
+        first[0] = curr  # close the circle
+
+        return allocations
+
+    def func():
+        for i in range(iterations):  # increase to test for longer
+            _ = make_allocations()
+            # once in a while, we either collect or free all blocks
+            # to stress those paths more. But hitting the high-water mark
+            # with clean=False is also interesting.
+            if clean:
+                if i % 10 == 0:
+                    gc.collect()
+                elif i % 10 == 5:
+                    cupy.get_default_memory_pool().free_all_blocks()
+            _ = make_allocations()
+
+    run_threaded(func)

From 8f6ae4747f169e4f07098419d0687c4abfd4b115 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:43:58 +0200
Subject: [PATCH 17/37] Make sure local cache is warmed up at job start time

---
 .../cupy/cuda_tests/test_compiler_cache.py    | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py b/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py
index 3e21e7bd8cc5..a95a496ec10f 100644
--- a/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py
+++ b/dpnp/tests/third_party/cupy/cuda_tests/test_compiler_cache.py
@@ -96,3 +96,37 @@ def test_load_corrupted_hash(self):
             # Load should return None due to hash mismatch
             result = backend.load(name)
             assert result is None
+
+    def test_encode_decode_roundtrip(self):
+        """Test that _encode_cubin/_decode_cubin form a correct round-trip."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+            cubin = b"some_kernel_binary"
+            encoded = backend._encode_cubin(cubin)
+            # Encoded form is larger and starts with the ASCII hash
+            assert len(encoded) == _hash_length + len(cubin)
+            assert encoded[_hash_length:] == cubin
+            # Decode recovers the original cubin
+            assert backend._decode_cubin(encoded) == cubin
+
+    def test_decode_cubin_too_short(self):
+        """Test that _decode_cubin returns None for data shorter than hash."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+            assert backend._decode_cubin(b"short") is None
+
+    def test_decode_cubin_bad_hash(self):
+        """Test that _decode_cubin returns None when hash does not match."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+            bad_data = b"0" * _hash_length + b"kernel_data"
+            assert backend._decode_cubin(bad_data) is None
+
+    def test_write_encoded_readable_by_load(self):
+        """Test that _write_encoded writes data that load() can read back."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            backend = DiskKernelCacheBackend(cache_dir=tmpdir)
+            cubin = b"another_kernel_binary"
+            name = "test.cubin"
+            backend._write_encoded(name, backend._encode_cubin(cubin))
+            assert backend.load(name) == cubin

From b13072dbde672bb4e1fa4f28c4d5cfa2c2225ff8 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:56:54 +0200
Subject: [PATCH 18/37] Skip many tests when running with pytest-run-parallel

---
 .../cupy/creation_tests/test_basic.py         |  8 ++++
 .../cupy/creation_tests/test_from_data.py     |  2 +
 .../third_party/cupy/fft_tests/test_fft.py    | 16 ++++++++
 .../cupy/manipulation_tests/test_basic.py     | 10 +++++
 .../cupy/manipulation_tests/test_join.py      |  2 +
 .../cupy/math_tests/test_sumprod.py           | 14 ++++++-
 .../cupy/random_tests/common_distributions.py | 38 ++++++++++---------
 .../cupy/random_tests/test_generator.py       | 26 ++++++++-----
 .../cupy/random_tests/test_generator_api.py   |  6 ++-
 .../cupy/random_tests/test_permutations.py    |  4 ++
 .../cupy/random_tests/test_random.py          |  2 +
 .../cupy/sorting_tests/test_search.py         |  6 +++
 .../third_party/cupy/test_numpy_interop.py    |  2 +
 13 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/creation_tests/test_basic.py b/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
index 05b2e5e711b2..c6d5f371e52f 100644
--- a/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
+++ b/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
@@ -24,6 +24,8 @@ def test_empty(self, xp, dtype, order):
         return a
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="too large allocations")
     def test_empty_huge_size(self):
         a = cupy.empty((1024, 2048, 1024), dtype="b")
         a.fill(123)
@@ -33,6 +35,8 @@ def test_empty_huge_size(self):
         # cupy.get_default_memory_pool().free_all_blocks()
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="too large allocations")
     def test_empty_huge_size_fill0(self):
         a = cupy.empty((1024, 2048, 1024), dtype="b")
         a.fill(0)
@@ -66,6 +70,8 @@ def test_empty_int(self, xp, dtype, order):
         return a
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="too large allocations")
     def test_empty_int_huge_size(self):
         a = cupy.empty(2**31, dtype="b")
         a.fill(123)
@@ -75,6 +81,8 @@ def test_empty_int_huge_size(self):
         cupy.get_default_memory_pool().free_all_blocks()
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="too large allocations")
     def test_empty_int_huge_size_fill0(self):
         a = cupy.empty(2**31, dtype="b")
         a.fill(0)
diff --git a/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py b/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py
index 04b3c4446a7b..11ea8a75f20a 100644
--- a/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py
+++ b/dpnp/tests/third_party/cupy/creation_tests/test_from_data.py
@@ -799,6 +799,8 @@ def test_masked_array(self, dtype):
 
 # marked slow as either numpy or cupy could go OOM in this test
 @testing.slow
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="too large allocations")
 @pytest.mark.skip("CUDA array interface is not supported")
 class TestCudaArrayInterfaceBigArray(unittest.TestCase):
     def test_with_over_size_array(self):
diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_fft.py b/dpnp/tests/third_party/cupy/fft_tests/test_fft.py
index 369409ba001c..969bb0ae721d 100644
--- a/dpnp/tests/third_party/cupy/fft_tests/test_fft.py
+++ b/dpnp/tests/third_party/cupy/fft_tests/test_fft.py
@@ -281,6 +281,8 @@ def test_ifft(self, xp, dtype):
 
 @pytest.mark.skip("default FFT function is not supported")
 @testing.with_requires("numpy>=2.0")
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestDefaultPlanType:
 
     @nd_planning_states()
@@ -404,6 +406,8 @@ def test_fft_allocate(self):
         )
     )
 )
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestFft2:
 
     @nd_planning_states()
@@ -496,6 +500,8 @@ def test_ifft2(self, xp, dtype, order, enable_nd):
         )
     )
 )
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestFftn:
 
     @nd_planning_states()
@@ -584,6 +590,8 @@ def test_ifftn(self, xp, dtype, order, enable_nd):
         )
     )
 )
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestPlanCtxManagerFftn:
 
     @pytest.fixture(autouse=True)
@@ -851,6 +859,8 @@ def test_fft_error_on_wrong_plan(self, dtype):
     )
 )
 @pytest.mark.skip("default FFT function is not supported")
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestFftnContiguity:
 
     @nd_planning_states([True])
@@ -1120,6 +1130,8 @@ def test_irfft2(self, dtype):
         )
     )
 )
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestRfftn:
 
     @nd_planning_states()
@@ -1189,6 +1201,8 @@ def test_irfftn(self, xp, dtype, order, enable_nd):
     )
 )
 @pytest.mark.skip("get_fft_plan() is not supported")
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestPlanCtxManagerRfftn:
 
     @pytest.fixture(autouse=True)
@@ -1269,6 +1283,8 @@ def test_irfftn(self, xp, dtype, enable_nd):
     )
 )
 @pytest.mark.skip("default FFT function is not supported")
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="`nd_planning_states` is not thread-safe")
 class TestRfftnContiguity:
 
     @nd_planning_states([True])
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_basic.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_basic.py
index 5b0b486d5c81..1d790eeb7597 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_basic.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_basic.py
@@ -154,18 +154,24 @@ def get_numpy():
     @pytest.mark.skip("multi GPU is not supported")
     @testing.multi_gpu(2)
     @testing.for_all_dtypes()
+    # cupy._util.PerformanceWarning has no dpnp equivalent
+    # @pytest.mark.filterwarnings("ignore::cupy._util.PerformanceWarning")
     def test_copyto_where_multigpu_raises(self, dtype):
         self._check_copyto_where_multigpu_raises(dtype, 2)
 
     @pytest.mark.skip("multi GPU is not supported")
     @testing.multi_gpu(4)
     @testing.for_all_dtypes()
+    # cupy._util.PerformanceWarning has no dpnp equivalent
+    # @pytest.mark.filterwarnings("ignore::cupy._util.PerformanceWarning")
     def test_copyto_where_multigpu_raises_4(self, dtype):
         self._check_copyto_where_multigpu_raises(dtype, 4)
 
     @pytest.mark.skip("multi GPU is not supported")
     @testing.multi_gpu(6)
     @testing.for_all_dtypes()
+    # cupy._util.PerformanceWarning has no dpnp equivalent
+    # @pytest.mark.filterwarnings("ignore::cupy._util.PerformanceWarning")
     def test_copyto_where_multigpu_raises_6(self, dtype):
         self._check_copyto_where_multigpu_raises(dtype, 6)
 
@@ -173,6 +179,8 @@ def test_copyto_where_multigpu_raises_6(self, dtype):
     @testing.multi_gpu(2)
     @testing.for_all_dtypes()
     @testing.numpy_cupy_array_equal()
+    # cupy._util.PerformanceWarning has no dpnp equivalent
+    # @pytest.mark.filterwarnings("ignore::cupy._util.PerformanceWarning")
     def test_copyto_multigpu(self, xp, dtype):
         with cuda.Device(0):
             a = testing.shaped_arange((2, 3, 4), xp, dtype)
@@ -184,6 +192,8 @@ def test_copyto_multigpu(self, xp, dtype):
     @pytest.mark.skip("multi GPU is not supported")
     @testing.multi_gpu(2)
     @testing.for_all_dtypes()
+    # cupy._util.PerformanceWarning has no dpnp equivalent
+    # @pytest.mark.filterwarnings("ignore::cupy._util.PerformanceWarning")
     def test_copyto_multigpu_noncontinguous(self, dtype):
         with cuda.Device(0):
             src = testing.shaped_arange((2, 3, 4), cupy, dtype)
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py
index 9e8a6b027e85..09c874661aec 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py
@@ -135,6 +135,8 @@ def test_concatenate_many_multi_dtype(self, xp):
         return xp.concatenate((a, b) * 1024, axis=1)
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="too large allocations")
     def test_concatenate_32bit_boundary(self):
         a = cupy.zeros((2**30,), dtype=cupy.int8)
         b = cupy.zeros((2**30,), dtype=cupy.int8)
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index b1c1e569ae2f..6d8c803e567f 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -65,6 +65,8 @@ def test_sum_axis(self, xp, dtype):
 
     @testing.slow
     @testing.numpy_cupy_allclose()
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="too large allocations")
     def test_sum_axis_huge(self, xp):
         a = testing.shaped_random((2048, 1, 1024), xp, "b")
         a = xp.broadcast_to(a, (2048, 1024, 1024))
@@ -232,6 +234,8 @@ def setUp(self):
         _acc.set_routine_accelerators(old_routine_accelerators)
         _acc.set_reduction_accelerators(old_reduction_accelerators)
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
     @testing.for_contiguous_axes()
     # sum supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
@@ -283,6 +287,8 @@ def test_cub_sum_empty_axis(self, xp, dtype):
             a = xp.asfortranarray(a)
         return a.sum(axis=())
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
     @testing.for_contiguous_axes()
     # prod supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
@@ -325,6 +331,8 @@ def test_cub_prod(self, xp, dtype, axis):
 
     # TODO(leofang): test axis after support is added
     # don't test float16 as it's not as accurate?
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
     @testing.for_dtypes("bhilBHILfdF")
     @testing.numpy_cupy_allclose(rtol=1e-4)
     def test_cub_cumsum(self, xp, dtype):
@@ -350,6 +358,8 @@ def test_cub_cumsum(self, xp, dtype):
 
     # TODO(leofang): test axis after support is added
     # don't test float16 as it's not as accurate?
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
     @testing.for_dtypes("bhilBHILfdF")
     @testing.numpy_cupy_allclose(rtol=1e-4)
     def test_cub_cumprod(self, xp, dtype):
@@ -400,8 +410,8 @@ def _mitigate_cumprod(self, xp, dtype, result):
 @pytest.mark.skip("cutensor is not supported")
 class TestCuTensorReduction:
 
-    @pytest.fixture(autouse=True)
-    def setUp(self):
+    @pytest.fixture(autouse=True, scope="class")
+    def setup(self):
         old_accelerators = cupy._core.get_routine_accelerators()
         cupy._core.set_routine_accelerators(["cutensor"])
         yield
diff --git a/dpnp/tests/third_party/cupy/random_tests/common_distributions.py b/dpnp/tests/third_party/cupy/random_tests/common_distributions.py
index 9c500d17258b..c7f7f9776316 100644
--- a/dpnp/tests/third_party/cupy/random_tests/common_distributions.py
+++ b/dpnp/tests/third_party/cupy/random_tests/common_distributions.py
@@ -38,28 +38,29 @@ class BaseGeneratorTestCase(unittest.TestCase):
     target_method = None
 
     def get_rng(self, xp, seed):
-        pass
+        raise NotImplementedError
 
-    def set_rng_seed(self, seed):
-        pass
+    def set_rng_seed(self, rng, seed):
+        raise NotImplementedError
 
-    def setUp(self):
-        self.__seed = testing.generate_seed()
+    def _get_rng_and_seed(self):
+        seed = testing.generate_seed()
         # rng will be a new or old generator API object
-        self.rng = self.get_rng(cupy, self.__seed)
+        rng = self.get_rng(cupy, seed)
+        return rng, seed
 
-    def _get_generator_func(self, *args, **kwargs):
+    def _get_generator_func(self, rng, *args, **kwargs):
         assert isinstance(
             self.target_method, str
         ), "generate_method must be overridden"
-        f = getattr(self.rng, self.target_method)
+        f = getattr(rng, self.target_method)
         return lambda: f(*args, **kwargs)
 
-    def _generate_check_repro(self, func, seed):
+    def _generate_check_repro(self, func, rng, seed):
         # Sample a random array while checking reproducibility
-        self.set_rng_seed(seed)
+        self.set_rng_seed(rng, seed)
         x = func()
-        self.set_rng_seed(seed)
+        self.set_rng_seed(rng, seed)
         y = func()
         testing.assert_array_equal(
             x, y, "Randomly generated arrays with the same seed did not match"
@@ -69,8 +70,9 @@ def _generate_check_repro(self, func, seed):
     def generate(self, *args, **kwargs):
         # Pick one sample from generator.
         # Reproducibility is checked by repeating seed-and-sample cycle twice.
-        func = self._get_generator_func(*args, **kwargs)
-        return self._generate_check_repro(func, self.__seed)
+        rng, seed = self._get_rng_and_seed()
+        func = self._get_generator_func(rng, *args, **kwargs)
+        return self._generate_check_repro(func, rng, seed)
 
     def generate_many(self, *args, **kwargs):
         # Pick many samples from generator.
@@ -78,12 +80,13 @@ def generate_many(self, *args, **kwargs):
         # because it's very slow to set seed every time.
         _count = kwargs.pop("_count", None)
         assert _count is not None, "_count is required"
-        func = self._get_generator_func(*args, **kwargs)
+        rng, seed = self._get_rng_and_seed()
+        func = self._get_generator_func(rng, *args, **kwargs)
 
         if _count == 0:
             return []
 
-        vals = [self._generate_check_repro(func, self.__seed)]
+        vals = [self._generate_check_repro(func, rng, seed)]
         for _ in range(1, _count):
             vals.append(func())
         return vals
@@ -99,7 +102,8 @@ def _check_ks(
         assert "size" in kwargs
 
         # cupy
-        func = self._get_generator_func(*args, **kwargs)
+        rng, seed = self._get_rng_and_seed()
+        func = self._get_generator_func(rng, *args, **kwargs)
         vals_cupy = func()
         assert vals_cupy.size > 0
         count = 1 + (cupy_len - 1) // vals_cupy.size
@@ -111,7 +115,7 @@ def _check_ks(
         # numpy
         kwargs["size"] = numpy_len
         dtype = kwargs.pop("dtype", None)
-        numpy_rng = self.get_rng(numpy, self.__seed)
+        numpy_rng = self.get_rng(numpy, seed)
         vals_numpy = getattr(numpy_rng, self.target_method)(*args, **kwargs)
         if dtype is not None:
             vals_numpy = vals_numpy.astype(dtype, copy=False)
diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator.py b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
index 23a86d88d8ff..5bef94c6d57d 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
@@ -82,8 +82,8 @@ class RandomGeneratorTestCase(common_distributions.BaseGeneratorTestCase):
     def get_rng(self, xp, seed):
         return xp.random.RandomState(seed=seed)
 
-    def set_rng_seed(self, seed):
-        self.rng.seed(seed)
+    def set_rng_seed(self, rng, seed):
+        rng.seed(seed)
 
 
 def _xp_random(xp, method_name):
@@ -103,12 +103,8 @@ def f(*args, **kwargs):
 
 @testing.fix_random()
 class TestRandomState(unittest.TestCase):
-
-    def setUp(self):
-        self.rs = _generator.RandomState(seed=testing.generate_seed())
-
     def check_seed(self, seed):
-        rs = self.rs
+        rs = _generator.RandomState(seed=testing.generate_seed())
 
         rs.seed(seed)
         xs1 = [rs.uniform() for _ in range(100)]
@@ -131,13 +127,15 @@ def test_seed_not_none(self, dtype):
 
     @testing.for_dtypes([numpy.complex128])
     def test_seed_invalid_type_complex(self, dtype):
+        rs = _generator.RandomState(seed=testing.generate_seed())
         with self.assertRaises(TypeError):
-            self.rs.seed(dtype(0))
+            rs.seed(dtype(0))
 
     @testing.for_float_dtypes()
     def test_seed_invalid_type_float(self, dtype):
+        rs = _generator.RandomState(seed=testing.generate_seed())
         with self.assertRaises(TypeError):
-            self.rs.seed(dtype(0))
+            rs.seed(dtype(0))
 
     def test_array_seed(self):
         self.check_seed(numpy.random.randint(0, 2**31, size=40))
@@ -1263,12 +1261,16 @@ def test_choice_invalid_value(self):
 
 class TestResetStates(unittest.TestCase):
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="mutates global _generator.")
     def test_reset_states(self):
         _generator._random_states = "dummy"
         _generator.reset_states()
         assert {} == _generator._random_states
 
 
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="mutates global _generator.")
 class TestGetRandomState(unittest.TestCase):
 
     def setUp(self):
@@ -1294,6 +1296,8 @@ def test_get_random_state_memoized(self):
         assert "expected" == rs
 
 
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="mutates global _generator.")
 class TestSetRandomState(unittest.TestCase):
 
     def setUp(self):
@@ -1335,6 +1339,8 @@ def test_triangular(self):
         )
 
 
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="Mutates global rng instance.")
 class TestRandomStateThreadSafe(unittest.TestCase):
 
     def setUp(self):
@@ -1389,6 +1395,8 @@ def _f(func, args=()):
         assert cupy.random.get_random_state() is rs
 
 
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="mutates global random states")
 class TestGetRandomState2(unittest.TestCase):
 
     def setUp(self):
diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py b/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
index 74c689ce664a..1bd858c7f0b8 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
@@ -26,8 +26,8 @@ def get_rng(self, xp, seed):
         else:
             return numpy.random.Generator(numpy.random.MT19937(seed))
 
-    def set_rng_seed(self, seed):
-        self.rng.bit_generator = random._bit_generator.Philox4x3210(seed=seed)
+    def set_rng_seed(self, rng, seed):
+        rng.bit_generator = random._bit_generator.Philox4x3210(seed=seed)
 
 
 class InvalidOutsMixin:
@@ -333,6 +333,8 @@ class TestDrichlet(common_distributions.Dirichlet, GeneratorTestCase):
 
 @testing.slow
 class TestLarge:
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="allocates large memory")
     def test_large(self):
         gen = random.Generator(random.XORWOW(1234))
         gen.random(2**31 + 1, dtype=cupy.int8)
diff --git a/dpnp/tests/third_party/cupy/random_tests/test_permutations.py b/dpnp/tests/third_party/cupy/random_tests/test_permutations.py
index 9a2d3b19f90f..ac36d985b2c0 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_permutations.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_permutations.py
@@ -66,6 +66,8 @@ def test_permutation_sort_ndim(self, dtype):
 
     # Test seed
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="relies on global random state")
     @testing.for_all_dtypes()
     def test_permutation_seed1(self, dtype):
         flag = cupy.issubdtype(dtype, cupy.unsignedinteger)
@@ -128,6 +130,8 @@ def test_shuffle_sort_ndim(self, dtype):
 
     # Test seed
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="relies on global random state")
     @testing.for_all_dtypes()
     def test_shuffle_seed1(self, dtype):
         flag = cupy.issubdtype(dtype, cupy.unsignedinteger)
diff --git a/dpnp/tests/third_party/cupy/random_tests/test_random.py b/dpnp/tests/third_party/cupy/random_tests/test_random.py
index 60b1f391dce4..994667bdcde8 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_random.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_random.py
@@ -9,6 +9,8 @@
 @pytest.mark.skip("random.get_random_state() is not supported yet")
 class TestResetSeed(unittest.TestCase):
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="uses global random state")
     @testing.for_float_dtypes(no_float16=True)
     def test_reset_seed(self, dtype):
         rs = random.get_random_state()
diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_search.py b/dpnp/tests/third_party/cupy/sorting_tests/test_search.py
index 70b076ce1aaa..1e41e8853012 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_search.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_search.py
@@ -83,6 +83,8 @@ def test_argmax_zero_size_axis1(self, xp, dtype):
         return a.argmax(axis=1)
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="allocation too large.")
     def test_argmax_int32_overflow(self):
         a = testing.shaped_arange((2**32 + 1,), cupy, numpy.float64)
         assert a.argmax().item() == 2**32
@@ -162,6 +164,8 @@ def test_argmin_zero_size_axis1(self, xp, dtype):
         return a.argmin(axis=1)
 
     @testing.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="allocation too large.")
     def test_argmin_int32_overflow(self):
         a = testing.shaped_arange((2**32 + 1,), cupy, numpy.float64)
         cupy.negative(a, out=a)
@@ -186,6 +190,8 @@ def _skip_cuda90(dtype):
         }
     )
 )
+# thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+# @pytest.mark.thread_unsafe(reason="unsafe setUp and counts function calls.")
 @pytest.mark.skip("The CUB routine is not enabled")
 class TestCubReduction:
 
diff --git a/dpnp/tests/third_party/cupy/test_numpy_interop.py b/dpnp/tests/third_party/cupy/test_numpy_interop.py
index 0409c3fdaadc..c9f86d9a5353 100644
--- a/dpnp/tests/third_party/cupy/test_numpy_interop.py
+++ b/dpnp/tests/third_party/cupy/test_numpy_interop.py
@@ -164,6 +164,8 @@ def test_asnumpy_out(self):
         reason="blocking or not is irrelevant when zero-copy is on",
     )
     @pytest.mark.parametrize("blocking", (True, False))
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="allocation too large.")
     def test_asnumpy_blocking(self, blocking):
         prefactor = 4
         a = cupy.random.random(

From 4975bfa7621a34593e27c1d682162aaa8e99467c Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 12:59:45 +0200
Subject: [PATCH 19/37] Prevent hypergeometric infinite loops and other
 consequences of invalid inputs

---
 .../cupy/random_tests/test_generator.py       | 40 +++++++++++++++++++
 .../cupy/random_tests/test_generator_api.py   | 40 +++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator.py b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
index 5bef94c6d57d..8b3f61c7b4de 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
@@ -238,6 +238,46 @@ class TestHypergeometric(
     pass
 
 
+class TestHypergeometricValidation:
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        self.rs = _generator.RandomState(seed=0)
+
+    def test_hypergeometric_ngood_negative(self):
+        with pytest.raises(ValueError):
+            self.rs.hypergeometric(-1, 10, 5, size=10)
+
+    def test_hypergeometric_nbad_negative(self):
+        with pytest.raises(ValueError):
+            self.rs.hypergeometric(10, -1, 5, size=10)
+
+    def test_hypergeometric_nsample_zero(self):
+        with pytest.raises(ValueError):
+            self.rs.hypergeometric(10, 10, 0, size=10)
+
+    def test_hypergeometric_nsample_negative(self):
+        with pytest.raises(ValueError):
+            self.rs.hypergeometric(10, 10, -1, size=10)
+
+    def test_hypergeometric_nsample_too_large(self):
+        with pytest.raises(ValueError):
+            self.rs.hypergeometric(5, 10, 16, size=10)
+
+    def test_hypergeometric_nsample_equals_total(self):
+        # nsample == ngood + nbad is valid (deterministic)
+        out = self.rs.hypergeometric(5, 10, 15, size=10)
+        testing.assert_array_equal(out, cupy.full(10, 5))
+
+    def test_hypergeometric_ngood_zero(self):
+        out = self.rs.hypergeometric(0, 10, 5, size=10)
+        testing.assert_array_equal(out, cupy.zeros(10))
+
+    def test_hypergeometric_nbad_zero(self):
+        out = self.rs.hypergeometric(5, 0, 5, size=10)
+        testing.assert_array_equal(out, cupy.full(10, 5))
+
+
 @testing.fix_random()
 class TestLaplace(RandomGeneratorTestCase):
 
diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py b/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
index 1bd858c7f0b8..3ed6032029d2 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
@@ -265,6 +265,46 @@ class TestHypergeometric(
     pass
 
 
+class TestHypergeometricValidation:
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        self.gen = random.default_rng(seed=0)
+
+    def test_hypergeometric_ngood_negative(self):
+        with pytest.raises(ValueError):
+            self.gen.hypergeometric(-1, 10, 5, size=10)
+
+    def test_hypergeometric_nbad_negative(self):
+        with pytest.raises(ValueError):
+            self.gen.hypergeometric(10, -1, 5, size=10)
+
+    def test_hypergeometric_nsample_negative(self):
+        with pytest.raises(ValueError):
+            self.gen.hypergeometric(10, 10, -1, size=10)
+
+    def test_hypergeometric_nsample_too_large(self):
+        with pytest.raises(ValueError):
+            self.gen.hypergeometric(5, 10, 16, size=10)
+
+    def test_hypergeometric_nsample_zero(self):
+        # Generator API allows nsample=0 (returns zeros), unlike legacy API
+        out = self.gen.hypergeometric(5, 10, 0, size=10)
+        testing.assert_array_equal(out, cupy.zeros(10, dtype=cupy.int64))
+
+    def test_hypergeometric_nsample_equals_total(self):
+        out = self.gen.hypergeometric(5, 10, 15, size=10)
+        testing.assert_array_equal(out, cupy.full(10, 5, dtype=cupy.int64))
+
+    def test_hypergeometric_ngood_zero(self):
+        out = self.gen.hypergeometric(0, 10, 5, size=10)
+        testing.assert_array_equal(out, cupy.zeros(10, dtype=cupy.int64))
+
+    def test_hypergeometric_nbad_zero(self):
+        out = self.gen.hypergeometric(5, 0, 5, size=10)
+        testing.assert_array_equal(out, cupy.full(10, 5, dtype=cupy.int64))
+
+
 @testing.parameterize(*common_distributions.power_params)
 @testing.fix_random()
 class TestPower(common_distributions.Power, GeneratorTestCase):

From 09f2ce6f6f1471d7ed0f9b0f13632046201b43db Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:02:32 +0200
Subject: [PATCH 20/37] Fixup some more tests (mainly cupyx) for free-threading

---
 dpnp/tests/third_party/cupy/core_tests/test_raw.py         | 6 ++++--
 dpnp/tests/third_party/cupy/statistics_tests/test_order.py | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_raw.py b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
index 2d3b8c153b86..e582a4f4f31e 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_raw.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
@@ -8,6 +8,7 @@
 import subprocess
 import sys
 import tempfile
+import threading
 import unittest
 from unittest import mock
 
@@ -580,8 +581,9 @@ def _generate_file(self, ext: str):
             code = compiler._convert_to_hip_source(_test_source5, None, False)
         # split() is needed because nvcc could come from the env var NVCC
         cmd = cc.split()
-        source = "{}/test_load_cubin.cu".format(self.cache_dir)
-        file_path = self.cache_dir + "test_load_cubin"
+        thread_id = threading.get_ident()
+        source = f"{self.cache_dir}/test_load_cubin_{thread_id}.cu"
+        file_path = self.cache_dir + f"test_load_cubin_{thread_id}"
         with open(source, "w") as f:
             f.write(code)
         if not cupy.cuda.runtime.is_hip:
diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_order.py b/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
index ba15fc16b078..97d464fa63f2 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
@@ -65,6 +65,8 @@ def test_percentile_unexpected_method(self, dtype):
 
     # See gh-4453
     @testing.for_float_dtypes()
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="allocator setting not thread-safe")
     def test_percentile_memory_access(self, dtype):
         # Create an allocator that guarantees array allocated in
         # cupy.percentile call will be followed by a NaN

From b10344f8ea62e70a1ba51fa1575fabd8c26e7797 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:06:02 +0200
Subject: [PATCH 21/37] Fix incomplete size guard for CUB segmented reduce and
 scan

---
 .../cupy/math_tests/test_sumprod.py           | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index 6d8c803e567f..86998a0adf1f 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -398,6 +398,123 @@ def _mitigate_cumprod(self, xp, dtype, result):
         return result
 
 
+INT32_MAX = numpy.iinfo(numpy.int32).max
+
+
+# CUB is not supported by dpnp; the original skipif on cupy.cuda.cub.available
+# cannot be evaluated (dpnp has no cupy.cuda), so skip unconditionally.
+# @pytest.mark.skipif(
+#     not cupy.cuda.cub.available, reason="The CUB routine is not enabled")
+@pytest.mark.skip("CUB reduction is not supported")
+@testing.slow
+class TestReductionSizeOverInt32Max:
+
+    @pytest.fixture(autouse=True)
+    def _cub_device_and_memory(self):
+        cupy.get_default_memory_pool().free_all_blocks()
+        cupy.get_default_pinned_memory_pool().free_all_blocks()
+        old_routine = _acc.get_routine_accelerators()
+        old_red = _acc.get_reduction_accelerators()
+        _acc.set_routine_accelerators(["cub"])
+        _acc.set_reduction_accelerators([])
+        yield
+        _acc.set_routine_accelerators(old_routine)
+        _acc.set_reduction_accelerators(old_red)
+        cupy.get_default_memory_pool().free_all_blocks()
+        cupy.get_default_pinned_memory_pool().free_all_blocks()
+
+    @pytest.mark.parametrize(
+        "shape,axis,dtype,part",
+        [
+            ((INT32_MAX + 1024,), None, "int8", "first_part"),
+            ((4, 2**30 + 512), 1, "float32", "second_part"),
+            ((INT32_MAX + 1024, 2), 0, "int8", "first_part"),
+            ((INT32_MAX + 1024, 2), 1, "int32", "second_part"),
+        ],
+    )
+    def test_reduce(self, shape, axis, dtype, part):
+        try:
+            a = cupy.ones(shape, dtype=dtype)
+            # Make first and last element along each slice interesting
+            if axis is None:
+                a[[0, -1]] = [3, -1]
+            elif axis == 0:
+                a[[0, -1], :] = [[3], [-1]]
+            else:
+                a[:, [0, -1]] = [[3, -1]]
+
+            # Test only half of the reductions per test for better speed
+            # (it is still very slow.)
+            if part == "first_part":
+                if axis is None:
+                    # Full reduction, one segment: a 3, a -1 and (size-2) ones
+                    assert a.sum() == a.size
+                    assert a.max() == 3
+                    assert a.argmin() == a.size - 1
+                else:
+                    s = a.sum(axis=axis)
+                    expected_sum = shape[axis]
+                    testing.assert_array_equal(
+                        s, cupy.full(s.shape, expected_sum, dtype=s.dtype)
+                    )
+                    testing.assert_array_equal(
+                        a.max(axis=axis), cupy.full(s.shape, 3, dtype=dtype)
+                    )
+                    testing.assert_array_equal(
+                        a.argmin(axis),
+                        cupy.full(s.shape, a.shape[axis] - 1),
+                    )
+            else:
+                if axis is None:
+                    # Full reduction, one segment: a 3, a -1 and (size-2) ones
+                    assert a.prod() == -3
+                    assert a.min() == -1
+                    assert a.argmax() == 0
+                else:
+                    p = a.prod(axis=axis)
+                    testing.assert_array_equal(
+                        p, cupy.full(p.shape, -3, dtype=p.dtype)
+                    )
+                    testing.assert_array_equal(
+                        a.min(axis=axis), cupy.full(p.shape, -1, dtype=dtype)
+                    )
+                    testing.assert_array_equal(
+                        a.argmax(axis), cupy.full(p.shape, 0)
+                    )
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @pytest.mark.parametrize("dtype", [numpy.int8, numpy.int32, numpy.float32])
+    def test_cumsum_size_over_int32_max(self, dtype):
+        """CUB device_scan with size > INT32_MAX."""
+        try:
+            n = INT32_MAX + 1024
+            a = cupy.ones(n, dtype=dtype)
+            a[0] = 3
+            a[-1] = -1
+            out = a.cumsum()
+            expected = n
+            if dtype in (numpy.float32, numpy.float64):
+                testing.assert_allclose(float(out[-1]), expected, rtol=2e-4)
+            else:
+                assert int(out[-1]) == expected
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @pytest.mark.parametrize("dtype", [numpy.int8, numpy.int32, numpy.float32])
+    def test_cumprod_size_over_int32_max(self, dtype):
+        """CUB device_scan (cumprod) with size > INT32_MAX."""
+        try:
+            n = INT32_MAX + 1024
+            a = cupy.ones(n, dtype=dtype)
+            a[0] = 2
+            a[-1] = 3
+            out = a.cumprod()
+            assert out[-1] == 6  # product of array
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+
 # This class compares cuTENSOR results against NumPy's
 @testing.parameterize(
     *testing.product(

From 4f5e9712ef170fcd7f24663075c091c2e09eeda6 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:17:37 +0200
Subject: [PATCH 22/37] Fix regression for 32bit index flag in .real and
 broadcast

---
 .../cupy/creation_tests/test_basic.py         | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/creation_tests/test_basic.py b/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
index c6d5f371e52f..9a5e349bf1a1 100644
--- a/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
+++ b/dpnp/tests/third_party/cupy/creation_tests/test_basic.py
@@ -337,6 +337,61 @@ def test_full_like_subok(self):
         with pytest.raises(NotImplementedError):
             cupy.full_like(a, 1, subok=True)
 
+    @pytest.mark.skip("_index_32_bits attribute is not supported by dpnp")
+    @pytest.mark.slow
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(reason="large allocations")
+    @pytest.mark.parametrize(
+        "arr_factory,expected",
+        [
+            (lambda: cupy.empty(2**31 - 1, dtype=cupy.int8), True),
+            (lambda: cupy.empty(2**31, dtype=cupy.int8), True),
+            (lambda: cupy.empty(2**31 + 1, dtype=cupy.int8)[::2], False),
+            (lambda: cupy.empty(2**31 // 8, dtype=cupy.complex64), True),
+            (lambda: cupy.empty(2**31 // 8 + 1, dtype=cupy.complex64), False),
+            # Regression test for gh-9750:
+            (lambda: cupy.empty(2**31 // 8, dtype=cupy.complex64).real, True),
+            (
+                lambda: cupy.empty(2**31 // 8 + 1, dtype=cupy.complex64).real,
+                False,
+            ),
+            # broadcasting also causes this, test both broadcast_to and normal:
+            (
+                lambda: cupy.broadcast_to(
+                    cupy.empty(2**30 + 1, dtype=cupy.int8), (2, 2**30 + 1)
+                ),
+                False,
+            ),
+            (
+                lambda: cupy.broadcast_arrays(
+                    cupy.empty(2**30 + 1, dtype=cupy.int8), cupy.empty((2, 1))
+                )[0],
+                False,
+            ),
+            # Also test raw "broadcasting path":
+            (
+                lambda: cupy.ndarray(
+                    shape=(2**30 + 1, 2), strides=(1, 0), dtype=cupy.int8
+                ),
+                False,
+            ),
+            # These ones are debatable, the start pointers are OK, but the
+            # range extends beyond 32bits on a byte level:
+            (lambda: cupy.empty((2**31 + 1) // 3, dtype="i1,i1,i1"), False),
+            # Same cupy.byte_bounds as above, but strided
+            # (size * itemsize is OK):
+            (
+                lambda: cupy.empty((2**31 + 1) // 3, dtype="i1,i1,i1")[
+                    ::2
+                ].view(),
+                False,
+            ),
+        ],
+    )
+    def test_index_32_bits(self, arr_factory, expected):
+        assert arr_factory()._index_32_bits == expected
+        cupy.get_default_memory_pool().free_all_blocks()
+
 
 @testing.parameterize(
     *testing.product(

From 764216be3126d2d57492ae57dd21cac3a01fca12 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:20:36 +0200
Subject: [PATCH 23/37] Skip test_solve_singular_empty on NumPy >= 2.4

---
 dpnp/tests/third_party/cupy/linalg_tests/test_solve.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
index bac6591bb7f0..856a8f66af2d 100644
--- a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
+++ b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
@@ -80,6 +80,7 @@ def check_shape(self, a_shape, b_shape, error_types):
     # NumPy with OpenBLAS returns an empty array
     # while numpy with OneMKL raises LinAlgError
     @pytest.mark.skip("Undefined behavior")
+    @testing.with_requires("numpy<2.4")
     @testing.numpy_cupy_allclose()
     def test_solve_singular_empty(self, xp):
         a = xp.zeros((3, 3))  # singular

From aeed1edfb32663d624d6aac654784b9a55e25fd6 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:23:58 +0200
Subject: [PATCH 24/37] Cherry pick rocm fixes

---
 .../third_party/cupy/random_tests/test_bit_generator.py  | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/random_tests/test_bit_generator.py b/dpnp/tests/third_party/cupy/random_tests/test_bit_generator.py
index a94202cf19b7..339b2457c490 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_bit_generator.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_bit_generator.py
@@ -49,9 +49,6 @@ def test_array_seed(self):
 
 @testing.with_requires("numpy>=1.17.0")
 @testing.fix_random()
-@pytest.mark.skipif(
-    cupy.cuda.runtime.is_hip, reason="HIP does not support this"
-)
 class TestBitGeneratorXORWOW(BitGeneratorTestCase, unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -60,9 +57,6 @@ def setUp(self):
 
 @testing.with_requires("numpy>=1.17.0")
 @testing.fix_random()
-@pytest.mark.skipif(
-    cupy.cuda.runtime.is_hip, reason="HIP does not support this"
-)
 class TestBitGeneratorMRG32k3a(BitGeneratorTestCase, unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -71,9 +65,6 @@ def setUp(self):
 
 @testing.with_requires("numpy>=1.17.0")
 @testing.fix_random()
-@pytest.mark.skipif(
-    cupy.cuda.runtime.is_hip, reason="HIP does not support this"
-)
 class TestBitGeneratorPhilox4x3210(BitGeneratorTestCase, unittest.TestCase):
     def setUp(self):
         super().setUp()

From 65197e2e14f7f4b65edaca0f94ea922ea1a1191f Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:25:53 +0200
Subject: [PATCH 25/37] Make cutensor bindings threadsafe (and some small
 fixes)

---
 dpnp/tests/third_party/cupy/core_tests/test_raw.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_raw.py b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
index e582a4f4f31e..43002cba815e 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_raw.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
@@ -450,12 +450,8 @@ def tearDown(self):
             # kernel uses nvcc, with which I/O cannot be avoided
             files = os.listdir(self.cache_dir)
             for f in files:
-                if f == "test_load_cubin.cu":
-                    count = 1
-                    break
-            else:
-                count = 0
-            assert len(files) == count
+                # only test_load_cubin_*.cu files should be present
+                assert re.match(r"test_load_cubin_(\d+)\.cu", f)
 
         self.in_memory_context.__exit__(*sys.exc_info())
         self.temporary_cache_dir_context.__exit__(*sys.exc_info())

From 6c56f3b6fc51fc1128e37a35a39430bc92b3f900 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:27:44 +0200
Subject: [PATCH 26/37] Validate hypergeometric inputs without syncing

---
 .../third_party/cupy/random_tests/test_generator.py   | 11 +++++++----
 .../cupy/random_tests/test_generator_api.py           | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator.py b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
index 8b3f61c7b4de..ecd545c7cd63 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
@@ -260,15 +260,18 @@ def test_hypergeometric_nsample_negative(self):
         with pytest.raises(ValueError):
             self.rs.hypergeometric(10, 10, -1, size=10)
 
-    def test_hypergeometric_nsample_too_large(self):
-        with pytest.raises(ValueError):
-            self.rs.hypergeometric(5, 10, 16, size=10)
-
     def test_hypergeometric_nsample_equals_total(self):
         # nsample == ngood + nbad is valid (deterministic)
         out = self.rs.hypergeometric(5, 10, 15, size=10)
         testing.assert_array_equal(out, cupy.full(10, 5))
 
+    def test_hypergeometric_nsample_exceeds_total(self):
+        # nsample > ngood + nbad would previously cause an infinite
+        # loop in the HRUA kernel. The kernel now routes this through
+        # the HYP path which handles it safely.
+        out = self.rs.hypergeometric(5, 10, 16, size=10)
+        testing.assert_array_equal(out, cupy.full(10, 5))
+
     def test_hypergeometric_ngood_zero(self):
         out = self.rs.hypergeometric(0, 10, 5, size=10)
         testing.assert_array_equal(out, cupy.zeros(10))
diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py b/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
index 3ed6032029d2..1e03d0a4246b 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator_api.py
@@ -283,10 +283,6 @@ def test_hypergeometric_nsample_negative(self):
         with pytest.raises(ValueError):
             self.gen.hypergeometric(10, 10, -1, size=10)
 
-    def test_hypergeometric_nsample_too_large(self):
-        with pytest.raises(ValueError):
-            self.gen.hypergeometric(5, 10, 16, size=10)
-
     def test_hypergeometric_nsample_zero(self):
         # Generator API allows nsample=0 (returns zeros), unlike legacy API
         out = self.gen.hypergeometric(5, 10, 0, size=10)
@@ -296,6 +292,13 @@ def test_hypergeometric_nsample_equals_total(self):
         out = self.gen.hypergeometric(5, 10, 15, size=10)
         testing.assert_array_equal(out, cupy.full(10, 5, dtype=cupy.int64))
 
+    def test_hypergeometric_nsample_exceeds_total(self):
+        # nsample > ngood + nbad would previously cause an infinite
+        # loop in the HRUA kernel. The kernel now routes this through
+        # the HYP path which handles it safely.
+        out = self.gen.hypergeometric(5, 10, 16, size=10)
+        testing.assert_array_equal(out, cupy.full(10, 5, dtype=cupy.int64))
+
     def test_hypergeometric_ngood_zero(self):
         out = self.gen.hypergeometric(0, 10, 5, size=10)
         testing.assert_array_equal(out, cupy.zeros(10, dtype=cupy.int64))

From b6a26dcf3632ddb8ce8baf712c8c43670a8732f1 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:29:37 +0200
Subject: [PATCH 27/37] Remove NumericTraits specializations for complex types

---
 dpnp/tests/third_party/cupy/math_tests/test_sumprod.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index 86998a0adf1f..c102a88a8153 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -333,7 +333,7 @@ def test_cub_prod(self, xp, dtype, axis):
     # don't test float16 as it's not as accurate?
     # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
     # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
-    @testing.for_dtypes("bhilBHILfdF")
+    @testing.for_dtypes("bhilBHILfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-4)
     def test_cub_cumsum(self, xp, dtype):
         if self.backend == "block":
@@ -360,7 +360,7 @@ def test_cub_cumsum(self, xp, dtype):
     # don't test float16 as it's not as accurate?
     # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
     # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
-    @testing.for_dtypes("bhilBHILfdF")
+    @testing.for_dtypes("bhilBHILfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-4)
     def test_cub_cumprod(self, xp, dtype):
         if self.backend == "block":

From 60dd6168622defb42e96a7786933c9b2cef82fc5 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:33:04 +0200
Subject: [PATCH 28/37] Fix silent corruption in thrust sort/argsort/lexsort
 under OOM

---
 .../cupy/sorting_tests/test_sort.py           | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
index 3bf1c405d0d0..9a6ab8981711 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
@@ -454,6 +454,88 @@ def test_sort_complex_nan(self, xp, dtype):
         return a, xp.sort_complex(a)
 
 
+@pytest.mark.skip(
+    "cupy.cuda.using_allocator / memory pool internals are not supported"
+)
+class TestThrustWorkspaceOOM:
+    """Regression tests for cupy/cupy#9894.
+
+    When thrust's workspace allocation fails, sort/argsort/lexsort must
+    raise ``MemoryError`` instead of silently producing corrupt results.
+
+    Each op may make several pre-thrust allocations (e.g. ``data.copy()``
+    and ``idx_array``) before reaching thrust.  Failing the *first*
+    allocation only exercises pre-existing OOM behavior, not this fix.
+    To target thrust's workspace specifically, we count allocations during
+    a successful run, then re-run with the *last* allocation forced to
+    fail.  Since thrust is called last in each routine, the final
+    allocation is always inside thrust's workspace request.
+    """
+
+    @staticmethod
+    def _verify_workspace_oom_raises(op):
+        pool = cupy.get_default_memory_pool()
+        n = [0]
+
+        def counting(size):
+            n[0] += 1
+            return pool.malloc(size)
+
+        with cupy.cuda.using_allocator(counting):
+            op()
+        assert n[0] >= 1, "expected at least one allocation"
+        total = n[0]
+
+        seen = [0]
+
+        def fail_on_last(size):
+            seen[0] += 1
+            if seen[0] >= total:
+                raise cupy.cuda.memory.OutOfMemoryError(size, 0, 0)
+            return pool.malloc(size)
+
+        with cupy.cuda.using_allocator(fail_on_last):
+            with pytest.raises(MemoryError):
+                op()
+
+    def test_sort_workspace_oom(self):
+        self._verify_workspace_oom_raises(
+            lambda: cupy.arange(100_000, dtype=cupy.float32).sort()
+        )
+
+    def test_argsort_workspace_oom(self):
+        self._verify_workspace_oom_raises(
+            lambda: cupy.arange(100_000, dtype=cupy.float32).argsort()
+        )
+
+    def test_lexsort_workspace_oom(self):
+        self._verify_workspace_oom_raises(
+            lambda: cupy.lexsort(
+                cupy.arange(100_000, dtype=cupy.float32).reshape(2, 50_000)
+            )
+        )
+
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(
+    #     reason="contextlib.redirect_stderr replaces sys.stderr globally")
+    def test_no_stderr_noise_on_workspace_oom(self):
+        # The thrust allocator's `noexcept`-driven stderr trace was
+        # confusing to users (cupy/cupy#9894).  After the fix, OOM produces a
+        # clean MemoryError with no "Exception ignored" trace and no
+        # OutOfMemoryError print on stderr.
+        import contextlib
+        import io
+
+        buf = io.StringIO()
+        with contextlib.redirect_stderr(buf):
+            self._verify_workspace_oom_raises(
+                lambda: cupy.arange(100_000, dtype=cupy.float32).sort()
+            )
+        stderr = buf.getvalue()
+        assert "Exception ignored" not in stderr, stderr
+        assert "OutOfMemoryError" not in stderr, stderr
+
+
 @testing.parameterize(
     *testing.product(
         {

From b72960efa54931eda1dae0b5ee5e5ecdfc214895 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:35:42 +0200
Subject: [PATCH 29/37] Remove test_assumed_runtime_version

---
 .../cupy/cuda_tests/test_runtime.py           | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py b/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py
index 0e432da5b653..d6a8b3bd5f57 100644
--- a/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py
+++ b/dpnp/tests/third_party/cupy/cuda_tests/test_runtime.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import pickle
-import sys
 
 import pytest
 
@@ -14,7 +13,6 @@
 
 # CUDA runtime internals have no dpnp equivalent
 # from cupy.cuda import driver
-# from cupy.cuda import nvrtc
 # from cupy.cuda import runtime
 
 
@@ -62,25 +60,6 @@ def test_mallocFromPoolAsync(self):
         runtime.memPoolDestroy(pool)
 
 
-@pytest.mark.skipif(
-    runtime.is_hip, reason="This assumption is correct only in CUDA"
-)
-def test_assumed_runtime_version():
-    # When CUDA Python is enabled, CuPy calculates the CUDA runtime version
-    # from NVRTC version. This test ensures that the assumption is correct
-    # by running the same logic in non-CUDA Python environment.
-    # When this fails, `runtime.runtimeGetVersion()` logic needs to be fixed.
-    major, minor = nvrtc.getVersion()
-    local_ver = runtime._getLocalRuntimeVersion()
-    # On Windows, starting from CUDA 13.0, cudaRuntimeGetVersion() always
-    # returns major * 1000 regardless of the minor version (nvbugs 5955788,
-    # 5523579). Accept either form on Windows + CUDA >= 13.
-    if sys.platform == "win32" and major >= 13:
-        assert local_ver in (major * 1000, major * 1000 + minor * 10)
-    else:
-        assert local_ver == major * 1000 + minor * 10
-
-
 def test_major_version():
     major = runtime._getCUDAMajorVersion()
     if runtime.is_hip:

From 1fc346184826853bc4a11bb9ff898115e0c109c8 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:40:06 +0200
Subject: [PATCH 30/37] Avoid hard pytest dependency in cupy.testing (and test)

---
 dpnp/tests/third_party/cupy/test_init.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/test_init.py b/dpnp/tests/third_party/cupy/test_init.py
index a4f1ad78f7f5..2a533e94dc89 100644
--- a/dpnp/tests/third_party/cupy/test_init.py
+++ b/dpnp/tests/third_party/cupy/test_init.py
@@ -57,6 +57,28 @@ def test_import_error(self):
         assert stdoutdata in (b"", b"RuntimeError\n")
 
 
+@pytest.mark.skip("dpnp has no public dpnp.testing module")
+def test_testing_import_does_not_require_pytest():
+    # cupy.testing import is lazy, but some environments tend to inspect
+    # it anyway.  Check that even an * import doesn't require pytest.
+    returncode, stdoutdata, stderrdata = _run_script("""
+import sys
+
+class BlockPytest:
+    def find_spec(self, fullname, path=None, target=None):
+        if fullname == 'pytest' or fullname.startswith('_pytest'):
+            raise ModuleNotFoundError(fullname)
+        return None
+import sys
+sys.meta_path.insert(0, BlockPytest())
+
+# non-lazy import should succeed even if pytest is not available
+from dpnp.testing import *
+""")
+    assert returncode == 0, "stderr: {!r}".format(stderrdata)
+    assert stdoutdata == b""
+
+
 # if not cupy.cuda.runtime.is_hip:
 #     visible = "CUDA_VISIBLE_DEVICES"
 # else:

From 5b5747b276fe1947f196a14e16620a503380a9f1 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:42:28 +0200
Subject: [PATCH 31/37] Advertise free-threading support and add linux CI run

---
 dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py b/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
index 2adcbfe6ed32..9765f4c10fa0 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
@@ -148,6 +148,9 @@ def test_can_use_cub_oversize_input4(self):
         b = cupy.empty((), dtype=cupy.int8)
         assert self.can_use([a], [b], (1,), (0,)) is None
 
+    # thread_unsafe marker requires pytest-run-parallel, not used by dpnp
+    # @pytest.mark.thread_unsafe(
+    #     reason="AssertFunctionIsCalled and accelerate mutation.")
     def test_can_use_accelerator_set_unset(self):
         # ensure we use CUB block reduction and not CUB device reduction
         old_routine_accelerators = _accelerator.get_routine_accelerators()

From 1f981c489dd0bfb42c7d81fadc00bd9ab71e52fc Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:44:32 +0200
Subject: [PATCH 32/37] Use cuda.pathfinder for CUDA component discovery

---
 .../third_party/cupy/core_tests/test_include.py     | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_include.py b/dpnp/tests/third_party/cupy/core_tests/test_include.py
index 1e738f7977bf..cc593d0c9a77 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_include.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_include.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-import os
-
 import pytest
 
 import dpnp as cupy
@@ -58,12 +56,17 @@ def _get_cuda_archs(self):
         return archs
 
     def _get_options(self):
-        return (
+        from cuda.pathfinder import find_nvidia_header_directory
+
+        include_dir = find_nvidia_header_directory("cudart")
+        opts = [
             "-std=c++17",
             *cupy._core.core._get_cccl_include_options(),
             "-I{}".format(cupy._core.core._get_header_dir_path()),
-            "-I{}".format(os.path.join(cupy.cuda.get_cuda_path(), "include")),
-        )
+        ]
+        if include_dir is not None:
+            opts.append("-I{}".format(include_dir))
+        return tuple(opts)
 
     def test_nvcc(self):
         options = self._get_options()

From e629924ea46635b12c622dc48b6e36b47f5622cc Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:51:22 +0200
Subject: [PATCH 33/37] Assert cupy.linalg.solve throws LinAlgError

---
 .../cupy/linalg_tests/test_solve.py           | 29 ++++++++++++++-----
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
index 856a8f66af2d..3b789480fd8b 100644
--- a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
+++ b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
@@ -76,18 +76,31 @@ def check_shape(self, a_shape, b_shape, error_types):
             with pytest.raises(error_type):
                 xp.linalg.solve(a, b)
 
-    # Undefined behavior is implementation-dependent:
-    # NumPy with OpenBLAS returns an empty array
-    # while numpy with OneMKL raises LinAlgError
-    @pytest.mark.skip("Undefined behavior")
-    @testing.with_requires("numpy<2.4")
+    def test_solve_singular_empty(self):
+        a = cupy.zeros((3, 3))  # singular
+        b = cupy.empty((3, 0))  # nrhs = 0
+        c = cupy.linalg.solve(a, b)
+        assert c.size == 0
+
     @testing.numpy_cupy_allclose()
-    def test_solve_singular_empty(self, xp):
-        a = xp.zeros((3, 3))  # singular
+    def test_solve_non_singular_empty(self, xp):
+        a = xp.eye(3)  # non-singular
         b = xp.empty((3, 0))  # nrhs = 0
-        # LinAlgError("Singular matrix") is not raised
         return xp.linalg.solve(a, b)
 
+    @pytest.mark.skip("cupyx.errstate(linalg='raise') is not supported")
+    def test_solve_singular_empty__assert_raises(self):
+        # OpenBLAS with NumPy 2.4.3 started raising a LinAlgError here,
+        # which seems correct.  We raise currently (do not test against
+        # NumPy as the behavior may depend on the BLAS version used)
+        a = cupy.zeros((3, 3))  # singular
+        b = cupy.empty((3, 0))  # nrhs = 0
+        # errstate is 'ignore' by default since enabling it causes
+        # synchronization
+        with cupyx.errstate(linalg="raise"):
+            with pytest.raises(numpy.linalg.LinAlgError):
+                cupy.linalg.solve(a, b)
+
     @testing.with_requires("numpy>=2.0")
     def test_invalid_shape(self):
         linalg_errors = {

From 52aaf4871543e42cee8ccc07cb4dccc13ddb83f5 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:54:54 +0200
Subject: [PATCH 34/37] Drop stale xfail on
 TestChoiceChi.test_goodness_of_fit_2

---
 dpnp/tests/third_party/cupy/random_tests/test_generator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator.py b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
index ecd545c7cd63..1cda3a3dc7cf 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
@@ -893,7 +893,6 @@ def test_goodness_of_fit(self):
         assert _hypothesis.chi_square_test(counts, expected)
 
     @_condition.repeat(3, 10)
-    # @pytest.mark.xfail(runtime.is_hip, reason="ROCm/HIP may have a bug")
     def test_goodness_of_fit_2(self):
         vals = self.generate(3, (5, 20), True, [0.3, 0.3, 0.4]).get()
         counts = numpy.histogram(vals, bins=numpy.arange(4))[0]

From f099e5900c87a3b4d6508e849f8389d89751ca3b Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 13:57:40 +0200
Subject: [PATCH 35/37] Make new pytest versions happy

---
 dpnp/tests/third_party/cupy/math_tests/test_sumprod.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index c102a88a8153..e6411e298fdd 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -528,7 +528,8 @@ def test_cumprod_size_over_int32_max(self, dtype):
 class TestCuTensorReduction:
 
     @pytest.fixture(autouse=True, scope="class")
-    def setup(self):
+    @classmethod
+    def setup(cls):
         old_accelerators = cupy._core.get_routine_accelerators()
         cupy._core.set_routine_accelerators(["cutensor"])
         yield

From 4787a251d55a99eda6476e65a916db3224c63e1e Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 14:15:12 +0200
Subject: [PATCH 36/37] Add pytest support to @testing.for_contiguous_axes
 decorator

---
 .../cupy/core_tests/test_cub_reduction.py     |  39 +-
 .../cupy/core_tests/test_ndarray_reduction.py | 351 ++++++++---------
 .../cupy/math_tests/test_sumprod.py           | 358 ++++++++----------
 dpnp/tests/third_party/cupy/testing/_loops.py |   6 +-
 4 files changed, 341 insertions(+), 413 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py b/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
index 9765f4c10fa0..0bbc1296a3f7 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import sys
-import unittest
 from itertools import combinations
 
 import pytest
@@ -21,13 +20,14 @@
 # This test class and its children below only test if CUB backend can be used
 # or not; they don't verify its correctness as it's already extensively covered
 # by existing tests
-class CubReductionTestBase(unittest.TestCase):
+class CubReductionTestBase:
     """
     Note: call self.can_use() when arrays are already allocated, otherwise
     call self._test_can_use().
     """
 
-    def setUp(self):
+    @pytest.fixture(autouse=True)
+    def configure(self):
         if _environment.get_cub_path() is None:
             pytest.skip("CUB not found")
         if cupy.cuda.runtime.is_hip:
@@ -38,8 +38,7 @@ def setUp(self):
 
         self.old_accelerators = _accelerator.get_reduction_accelerators()
         _accelerator.set_reduction_accelerators(["cub"])
-
-    def tearDown(self):
+        yield
         _accelerator.set_reduction_accelerators(self.old_accelerators)
 
     def _test_can_use(self, i_shape, o_shape, r_axis, o_axis, order, expected):
@@ -53,40 +52,32 @@ def _test_can_use(self, i_shape, o_shape, r_axis, o_axis, order, expected):
         assert result is expected
 
 
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(2,), (2, 3), (2, 3, 4), (2, 3, 4, 5)],
-            "order": ("C", "F"),
-        }
-    )
-)
+@pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4), (2, 3, 4, 5)])
+@pytest.mark.parametrize("order", ["C", "F"])
 class TestSimpleCubReductionKernelContiguity(CubReductionTestBase):
 
     @testing.for_contiguous_axes()
-    def test_can_use_cub_contiguous(self, axis):
+    def test_can_use_cub_contiguous(self, axis, shape, order):
         r_axis = axis
-        i_shape = self.shape
+        i_shape = shape
         o_axis = tuple(i for i in range(len(i_shape)) if i not in r_axis)
-        o_shape = tuple(self.shape[i] for i in o_axis)
-        self._test_can_use(i_shape, o_shape, r_axis, o_axis, self.order, True)
+        o_shape = tuple(shape[i] for i in o_axis)
+        self._test_can_use(i_shape, o_shape, r_axis, o_axis, order, True)
 
     @testing.for_contiguous_axes()
-    def test_can_use_cub_non_contiguous(self, axis):
+    def test_can_use_cub_non_contiguous(self, axis, shape, order):
         # array is contiguous, but reduce_axis is not
-        dim = len(self.shape)
+        dim = len(shape)
         r_dim = len(axis)
         non_contiguous_axes = [
             i for i in combinations(range(dim), r_dim) if i != axis
         ]
 
-        i_shape = self.shape
+        i_shape = shape
         for r_axis in non_contiguous_axes:
             o_axis = tuple(i for i in range(dim) if i not in r_axis)
-            o_shape = tuple(self.shape[i] for i in o_axis)
-            self._test_can_use(
-                i_shape, o_shape, r_axis, o_axis, self.order, False
-            )
+            o_shape = tuple(shape[i] for i in o_axis)
+            self._test_can_use(i_shape, o_shape, r_axis, o_axis, order, False)
 
 
 class TestSimpleCubReductionKernelMisc(CubReductionTestBase):
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py
index a0fa09d7661d..b774cdff4394 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from itertools import combinations
+
 import numpy
 import pytest
 
@@ -7,13 +9,7 @@
 from dpnp.tests.third_party.cupy import testing
 
 
-@testing.parameterize(
-    *testing.product(
-        {
-            "order": ("C", "F"),
-        }
-    )
-)
+@pytest.mark.parametrize("order", ["C", "F"])
 class TestArrayReduction:
 
     @pytest.fixture(scope="class")
@@ -21,17 +17,17 @@ class TestArrayReduction:
     def exclude_cutensor(cls):
         # cuTENSOR seems to have issues in handling inf/nan in reduction-based
         # routines, so we use this fixture to skip testing it
-        # self.old_routine_accelerators = _acc.get_routine_accelerators()
-        # self.old_reduction_accelerators = _acc.get_reduction_accelerators()
+        # old_routine_accelerators = _acc.get_routine_accelerators()
+        # old_reduction_accelerators = _acc.get_reduction_accelerators()
 
-        # rot_acc = self.old_routine_accelerators.copy()
+        # rot_acc = old_routine_accelerators.copy()
         # try:
         #     rot_acc.remove(_acc.ACCELERATOR_CUTENSOR)
         # except ValueError:
         #     pass
         # _acc.set_routine_accelerators(rot_acc)
 
-        # red_acc = self.old_reduction_accelerators.copy()
+        # red_acc = old_reduction_accelerators.copy()
         # try:
         #     red_acc.remove(_acc.ACCELERATOR_CUTENSOR)
         # except ValueError:
@@ -40,343 +36,318 @@ def exclude_cutensor(cls):
 
         # yield
 
-        # _acc.set_routine_accelerators(self.old_routine_accelerators)
-        # _acc.set_reduction_accelerators(self.old_reduction_accelerators)
+        # _acc.set_routine_accelerators(old_routine_accelerators)
+        # _acc.set_reduction_accelerators(old_reduction_accelerators)
         pass
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_all(self, xp, dtype):
-        a = testing.shaped_random((2, 3), xp, dtype, order=self.order)
+    def test_max_all(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3), xp, dtype, order=order)
         return a.max()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_all_keepdims(self, xp, dtype):
-        a = testing.shaped_random((2, 3), xp, dtype, order=self.order)
+    def test_max_all_keepdims(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3), xp, dtype, order=order)
         return a.max(keepdims=True)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_axis_large(self, xp, dtype):
-        a = testing.shaped_random((3, 1000), xp, dtype, order=self.order)
+    def test_max_axis_large(self, xp, dtype, order):
+        a = testing.shaped_random((3, 1000), xp, dtype, order=order)
         return a.max(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_axis0(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_max_axis0(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.max(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_axis1(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_max_axis1(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.max(axis=1)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_axis2(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_max_axis2(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.max(axis=2)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_multiple_axes(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_max_multiple_axes(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.max(axis=(1, 2))
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_multiple_axes_keepdims(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_max_multiple_axes_keepdims(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.max(axis=(1, 2), keepdims=True)
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_nan(self, xp, dtype, exclude_cutensor):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_max_nan(self, exclude_cutensor, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.max()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_nan_real(self, xp, dtype):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_max_nan_real(self, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.max()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_nan_imag(self, xp, dtype):
-        a = xp.array(
-            [float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=self.order
-        )
+    def test_max_nan_imag(self, xp, dtype, order):
+        a = xp.array([float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=order)
         return a.max()
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_max_inf(self, exclude_cutensor, xp, dtype):
+    def test_max_inf(self, exclude_cutensor, xp, dtype, order):
         # cupy/cupy#8180
-        a = xp.array([-float("inf"), -float("inf")], dtype, order=self.order)
+        a = xp.array([-float("inf"), -float("inf")], dtype, order=order)
         return a.max()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_all(self, xp, dtype):
-        a = testing.shaped_random((2, 3), xp, dtype, order=self.order)
+    def test_min_all(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3), xp, dtype, order=order)
         return a.min()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_all_keepdims(self, xp, dtype):
-        a = testing.shaped_random((2, 3), xp, dtype, order=self.order)
+    def test_min_all_keepdims(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3), xp, dtype, order=order)
         return a.min(keepdims=True)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_axis_large(self, xp, dtype):
-        a = testing.shaped_random((3, 1000), xp, dtype, order=self.order)
+    def test_min_axis_large(self, xp, dtype, order):
+        a = testing.shaped_random((3, 1000), xp, dtype, order=order)
         return a.min(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_axis0(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_min_axis0(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.min(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_axis1(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_min_axis1(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.min(axis=1)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_axis2(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_min_axis2(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.min(axis=2)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_multiple_axes(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_min_multiple_axes(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.min(axis=(1, 2))
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_multiple_axes_keepdims(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_min_multiple_axes_keepdims(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.min(axis=(1, 2), keepdims=True)
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_nan(self, xp, dtype, exclude_cutensor):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_min_nan(self, exclude_cutensor, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.min()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_nan_real(self, xp, dtype):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_min_nan_real(self, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.min()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_nan_imag(self, xp, dtype):
-        a = xp.array(
-            [float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=self.order
-        )
+    def test_min_nan_imag(self, xp, dtype, order):
+        a = xp.array([float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=order)
         return a.min()
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_min_inf(self, xp, dtype, exclude_cutensor):
+    def test_min_inf(self, exclude_cutensor, xp, dtype, order):
         # cupy/cupy#8180
-        a = xp.array([float("inf"), float("inf")], dtype, order=self.order)
+        a = xp.array([float("inf"), float("inf")], dtype, order=order)
         return a.min()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_all(self, xp, dtype):
-        a = testing.shaped_random((2, 3), xp, dtype, order=self.order)
+    def test_argmax_all(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3), xp, dtype, order=order)
         return a.argmax()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_axis_large(self, xp, dtype):
-        a = testing.shaped_random((3, 1000), xp, dtype, order=self.order)
+    def test_argmax_axis_large(self, xp, dtype, order):
+        a = testing.shaped_random((3, 1000), xp, dtype, order=order)
         return a.argmax(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_axis0(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_argmax_axis0(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.argmax(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_axis1(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_argmax_axis1(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.argmax(axis=1)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_axis2(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_argmax_axis2(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.argmax(axis=2)
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_nan(self, xp, dtype, exclude_cutensor):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_argmax_nan(self, exclude_cutensor, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.argmax()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_nan_real(self, xp, dtype):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_argmax_nan_real(self, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.argmax()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmax_nan_imag(self, xp, dtype):
-        a = xp.array(
-            [float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=self.order
-        )
+    def test_argmax_nan_imag(self, xp, dtype, order):
+        a = xp.array([float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=order)
         return a.argmax()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_all(self, xp, dtype):
-        a = testing.shaped_random((2, 3), xp, dtype, order=self.order)
+    def test_argmin_all(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3), xp, dtype, order=order)
         return a.argmin()
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_axis_large(self, xp, dtype):
-        a = testing.shaped_random((3, 1000), xp, dtype, order=self.order)
+    def test_argmin_axis_large(self, xp, dtype, order):
+        a = testing.shaped_random((3, 1000), xp, dtype, order=order)
         return a.argmin(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_axis0(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_argmin_axis0(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.argmin(axis=0)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_axis1(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_argmin_axis1(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.argmin(axis=1)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_axis2(self, xp, dtype):
-        a = testing.shaped_random((2, 3, 4), xp, dtype, order=self.order)
+    def test_argmin_axis2(self, xp, dtype, order):
+        a = testing.shaped_random((2, 3, 4), xp, dtype, order=order)
         return a.argmin(axis=2)
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_nan(self, xp, dtype, exclude_cutensor):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_argmin_nan(self, exclude_cutensor, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.argmin()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_nan_real(self, xp, dtype):
-        a = xp.array([float("nan"), 1, -1], dtype, order=self.order)
+    def test_argmin_nan_real(self, xp, dtype, order):
+        a = xp.array([float("nan"), 1, -1], dtype, order=order)
         return a.argmin()
 
     @testing.for_complex_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_argmin_nan_imag(self, xp, dtype):
-        a = xp.array(
-            [float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=self.order
-        )
+    def test_argmin_nan_imag(self, xp, dtype, order):
+        a = xp.array([float("nan") * 1.0j, 1.0j, -1.0j], dtype, order=order)
         return a.argmin()
 
 
-@testing.parameterize(
-    *testing.product(
-        {
-            # TODO(leofang): make a @testing.for_all_axes decorator
-            "shape_and_axis": [
-                ((), None),
-                ((0,), (0,)),
-                ((0, 2), (0,)),
-                ((0, 2), (1,)),
-                ((0, 2), (0, 1)),
-                ((2, 0), (0,)),
-                ((2, 0), (1,)),
-                ((2, 0), (0, 1)),
-                ((0, 2, 3), (0,)),
-                ((0, 2, 3), (1,)),
-                ((0, 2, 3), (2,)),
-                ((0, 2, 3), (0, 1)),
-                ((0, 2, 3), (1, 2)),
-                ((0, 2, 3), (0, 2)),
-                ((0, 2, 3), (0, 1, 2)),
-                ((2, 0, 3), (0,)),
-                ((2, 0, 3), (1,)),
-                ((2, 0, 3), (2,)),
-                ((2, 0, 3), (0, 1)),
-                ((2, 0, 3), (1, 2)),
-                ((2, 0, 3), (0, 2)),
-                ((2, 0, 3), (0, 1, 2)),
-                ((2, 3, 0), (0,)),
-                ((2, 3, 0), (1,)),
-                ((2, 3, 0), (2,)),
-                ((2, 3, 0), (0, 1)),
-                ((2, 3, 0), (1, 2)),
-                ((2, 3, 0), (0, 2)),
-                ((2, 3, 0), (0, 1, 2)),
-            ],
-            "order": ("C", "F"),
-            "func": ("min", "max", "argmax", "argmin"),
-        }
-    )
+def _axes_for_shape(shape):
+    """List every non-empty axis tuple for ``shape`` ([None] for 0-d)."""
+    if shape == ():
+        return [None]
+    dims = range(len(shape))
+    axes = []
+    for size in range(1, len(shape) + 1):
+        axes.extend(combinations(dims, size))
+    return axes
+
+
+@pytest.mark.parametrize(
+    "shape,axis",
+    [
+        (shape, axis)
+        for shape in [
+            (),
+            (0,),
+            (0, 2),
+            (2, 0),
+            (0, 2, 3),
+            (2, 0, 3),
+            (2, 3, 0),
+        ]
+        for axis in _axes_for_shape(shape)
+    ],
 )
+@pytest.mark.parametrize("order", ["C", "F"])
+@pytest.mark.parametrize("func", ["min", "max", "argmax", "argmin"])
 class TestArrayReductionZeroSize:
 
     @testing.numpy_cupy_allclose(
         contiguous_check=False, accept_error=ValueError
     )
-    def test_zero_size(self, xp):
-        shape, axis = self.shape_and_axis
+    def test_zero_size(self, xp, shape, axis, order, func):
         # NumPy only supports axis being an int
-        if self.func in ("argmax", "argmin"):
+        if func in ("argmax", "argmin"):
             if axis is not None and len(axis) == 1:
                 axis = axis[0]
             else:
-                pytest.skip(
-                    f"NumPy does not support axis={axis} for {self.func}"
-                )
+                pytest.skip(f"NumPy does not support axis={axis} for {func}")
         # dtype is irrelevant here, just pick one
-        a = testing.shaped_random(shape, xp, xp.float32, order=self.order)
-        return getattr(a, self.func)(axis=axis)
+        a = testing.shaped_random(shape, xp, xp.float32, order=order)
+        return getattr(a, func)(axis=axis)
 
 
 # This class compares CUB results against NumPy's. ("fallback" is CuPy's
 # original kernel, also tested here to reduce code duplication.)
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [
-                (10,),
-                (10, 20),
-                (10, 20, 30),
-                (10, 20, 30, 40),
-                # skip (2, 3, 0) because it would not hit the CUB code path
-                (0,),
-                (2, 0),
-                (0, 2),
-                (0, 2, 3),
-                (2, 3, 0),
-            ],
-            "order": ("C", "F"),
-            "backend": ("device", "block", "fallback"),
-        }
-    )
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (10,),
+        (10, 20),
+        (10, 20, 30),
+        (10, 20, 30, 40),
+        # skip (2, 3, 0) because it would not hit the CUB code path
+        (0,),
+        (2, 0),
+        (0, 2),
+        (0, 2, 3),
+        (2, 3, 0),
+    ],
 )
+@pytest.mark.parametrize("order", ["C", "F"])
+@pytest.mark.parametrize("backend", ["device", "block", "fallback"])
 @pytest.mark.skip("CUB reduction is not supported")
 # @pytest.mark.skipif(
 #     not cupy.cuda.cub.available, reason="The CUB routine is not enabled"
@@ -385,16 +356,16 @@ def test_zero_size(self, xp):
 class TestCubReduction:
 
     @pytest.fixture(autouse=True)
-    def setUp(self):
+    def setUp(self, backend):
         self.old_routine_accelerators = _acc.get_routine_accelerators()
         self.old_reduction_accelerators = _acc.get_reduction_accelerators()
-        if self.backend == "device":
+        if backend == "device":
             _acc.set_routine_accelerators(["cub"])
             _acc.set_reduction_accelerators([])
-        elif self.backend == "block":
+        elif backend == "block":
             _acc.set_routine_accelerators([])
             _acc.set_reduction_accelerators(["cub"])
-        elif self.backend == "fallback":
+        elif backend == "fallback":
             _acc.set_routine_accelerators([])
             _acc.set_reduction_accelerators([])
         yield
@@ -406,28 +377,28 @@ def setUp(self):
     @testing.numpy_cupy_allclose(
         contiguous_check=False, accept_error=ValueError
     )
-    def test_cub_min(self, xp, dtype, axis):
-        a = testing.shaped_random(self.shape, xp, dtype, order=self.order)
+    def test_cub_min(self, xp, dtype, axis, shape, order, backend):
+        a = testing.shaped_random(shape, xp, dtype, order=order)
 
         if xp is numpy:
             return a.min(axis=axis)
 
         # xp is cupy, first ensure we really use CUB
         ret = cupy.empty(())  # Cython checks return type, need to fool it
-        if self.backend == "device":
+        if backend == "device":
             func_name = "cupy._core._routines_statistics.cub."
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 func_name += "device_reduce"
             else:
                 func_name += "device_segmented_reduce"
             with testing.AssertFunctionIsCalled(func_name, return_value=ret):
                 a.min(axis=axis)
-        elif self.backend == "block":
+        elif backend == "block":
             # this is the only function we can mock; the rest is cdef'd
             func_name = "cupy._core._cub_reduction."
             func_name += "_SimpleCubReductionKernel_get_cached_function"
             func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 times_called = 2  # two passes
             else:
                 times_called = 1  # one pass
@@ -437,15 +408,17 @@ def test_cub_min(self, xp, dtype, axis):
                 func_name, wraps=func, times_called=times_called
             ):
                 a.min(axis=axis)
-        elif self.backend == "fallback":
+        elif backend == "fallback":
             pass
         # ...then perform the actual computation
         return a.min(axis=axis)
 
     @testing.for_all_dtypes(no_bool=True, no_float16=True)
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_cub_min_empty_axis(self, xp, dtype, contiguous_check=False):
-        a = testing.shaped_random(self.shape, xp, dtype, order=self.order)
+    def test_cub_min_empty_axis(
+        self, xp, dtype, shape, order, contiguous_check=False
+    ):
+        a = testing.shaped_random(shape, xp, dtype, order=order)
         return a.min(axis=())
 
     @testing.for_contiguous_axes()
@@ -453,28 +426,28 @@ def test_cub_min_empty_axis(self, xp, dtype, contiguous_check=False):
     @testing.numpy_cupy_allclose(
         contiguous_check=False, accept_error=ValueError
     )
-    def test_cub_max(self, xp, dtype, axis):
-        a = testing.shaped_random(self.shape, xp, dtype, order=self.order)
+    def test_cub_max(self, xp, dtype, axis, shape, order, backend):
+        a = testing.shaped_random(shape, xp, dtype, order=order)
 
         if xp is numpy:
             return a.max(axis=axis)
 
         # xp is cupy, first ensure we really use CUB
         ret = cupy.empty(())  # Cython checks return type, need to fool it
-        if self.backend == "device":
+        if backend == "device":
             func_name = "cupy._core._routines_statistics.cub."
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 func_name += "device_reduce"
             else:
                 func_name += "device_segmented_reduce"
             with testing.AssertFunctionIsCalled(func_name, return_value=ret):
                 a.max(axis=axis)
-        elif self.backend == "block":
+        elif backend == "block":
             # this is the only function we can mock; the rest is cdef'd
             func_name = "cupy._core._cub_reduction."
             func_name += "_SimpleCubReductionKernel_get_cached_function"
             func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 times_called = 2  # two passes
             else:
                 times_called = 1  # one pass
@@ -484,13 +457,13 @@ def test_cub_max(self, xp, dtype, axis):
                 func_name, wraps=func, times_called=times_called
             ):
                 a.max(axis=axis)
-        elif self.backend == "fallback":
+        elif backend == "fallback":
             pass
         # ...then perform the actual computation
         return a.max(axis=axis)
 
     @testing.for_all_dtypes(no_bool=True, no_float16=True)
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_cub_max_empty_axis(self, xp, dtype):
-        a = testing.shaped_random(self.shape, xp, dtype, order=self.order)
+    def test_cub_max_empty_axis(self, xp, dtype, shape, order):
+        a = testing.shaped_random(shape, xp, dtype, order=order)
         return a.max(axis=())
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index e6411e298fdd..7fc5a3b80d2e 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -1,4 +1,5 @@
 import math
+from itertools import product as iproduct
 
 import numpy
 import pytest
@@ -208,26 +209,22 @@ def test_prod_dtype(self, xp, src_dtype, dst_dtype):
 
 
 # This class compares CUB results against NumPy's
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)],
-            "order": ("C", "F"),
-            "backend": ("device", "block"),
-        }
-    )
+@pytest.mark.parametrize(
+    "shape", [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)]
 )
+@pytest.mark.parametrize("order", ["C", "F"])
+@pytest.mark.parametrize("backend", ["device", "block"])
 @pytest.mark.skip("_cub_reduction is not supported")
 class TestCubReduction:
 
     @pytest.fixture(autouse=True)
-    def setUp(self):
+    def setUp(self, backend):
         old_routine_accelerators = _acc.get_routine_accelerators()
         old_reduction_accelerators = _acc.get_reduction_accelerators()
-        if self.backend == "device":
+        if backend == "device":
             _acc.set_routine_accelerators(["cub"])
             _acc.set_reduction_accelerators([])
-        elif self.backend == "block":
+        elif backend == "block":
             _acc.set_routine_accelerators([])
             _acc.set_reduction_accelerators(["cub"])
         yield
@@ -240,11 +237,11 @@ def setUp(self):
     # sum supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-5)
-    def test_cub_sum(self, xp, dtype, axis):
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+    def test_cub_sum(self, xp, dtype, axis, shape, order, backend):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
 
         if xp is numpy:
@@ -252,20 +249,20 @@ def test_cub_sum(self, xp, dtype, axis):
 
         # xp is cupy, first ensure we really use CUB
         ret = cupy.empty(())  # Cython checks return type, need to fool it
-        if self.backend == "device":
+        if backend == "device":
             func_name = "cupy._core._routines_math.cub."
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 func_name += "device_reduce"
             else:
                 func_name += "device_segmented_reduce"
             with testing.AssertFunctionIsCalled(func_name, return_value=ret):
                 a.sum(axis=axis)
-        elif self.backend == "block":
+        elif backend == "block":
             # this is the only function we can mock; the rest is cdef'd
             func_name = "cupy._core._cub_reduction."
             func_name += "_SimpleCubReductionKernel_get_cached_function"
             func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 times_called = 2  # two passes
             else:
                 times_called = 1  # one pass
@@ -279,11 +276,11 @@ def test_cub_sum(self, xp, dtype, axis):
     # sum supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-5, contiguous_check=False)
-    def test_cub_sum_empty_axis(self, xp, dtype):
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+    def test_cub_sum_empty_axis(self, xp, dtype, shape, order, backend):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
         return a.sum(axis=())
 
@@ -293,11 +290,11 @@ def test_cub_sum_empty_axis(self, xp, dtype):
     # prod supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-5)
-    def test_cub_prod(self, xp, dtype, axis):
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+    def test_cub_prod(self, xp, dtype, axis, shape, order, backend):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
 
         if xp is numpy:
@@ -305,20 +302,20 @@ def test_cub_prod(self, xp, dtype, axis):
 
         # xp is cupy, first ensure we really use CUB
         ret = cupy.empty(())  # Cython checks return type, need to fool it
-        if self.backend == "device":
+        if backend == "device":
             func_name = "cupy._core._routines_math.cub."
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 func_name += "device_reduce"
             else:
                 func_name += "device_segmented_reduce"
             with testing.AssertFunctionIsCalled(func_name, return_value=ret):
                 a.prod(axis=axis)
-        elif self.backend == "block":
+        elif backend == "block":
             # this is the only function we can mock; the rest is cdef'd
             func_name = "cupy._core._cub_reduction."
             func_name += "_SimpleCubReductionKernel_get_cached_function"
             func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
-            if len(axis) == len(self.shape):
+            if len(axis) == len(shape):
                 times_called = 2  # two passes
             else:
                 times_called = 1  # one pass
@@ -335,14 +332,14 @@ def test_cub_prod(self, xp, dtype, axis):
     # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
     @testing.for_dtypes("bhilBHILfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-4)
-    def test_cub_cumsum(self, xp, dtype):
-        if self.backend == "block":
+    def test_cub_cumsum(self, xp, dtype, shape, order, backend):
+        if backend == "block":
             pytest.skip("does not support")
 
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
 
         if xp is numpy:
@@ -362,14 +359,14 @@ def test_cub_cumsum(self, xp, dtype):
     # @pytest.mark.thread_unsafe(reason="unsafe AssertFunctionIsCalled.")
     @testing.for_dtypes("bhilBHILfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-4)
-    def test_cub_cumprod(self, xp, dtype):
-        if self.backend == "block":
+    def test_cub_cumprod(self, xp, dtype, shape, order, backend):
+        if backend == "block":
             pytest.skip("does not support")
 
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
 
         if xp is numpy:
@@ -516,14 +513,10 @@ def test_cumprod_size_over_int32_max(self, dtype):
 
 
 # This class compares cuTENSOR results against NumPy's
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)],
-            "order": ("C", "F"),
-        }
-    )
+@pytest.mark.parametrize(
+    "shape", [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)]
 )
+@pytest.mark.parametrize("order", ["C", "F"])
 @pytest.mark.skip("cutensor is not supported")
 class TestCuTensorReduction:
 
@@ -539,11 +532,11 @@ def setup(cls):
     # sum supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-5, contiguous_check=False)
-    def test_cutensor_sum(self, xp, dtype, axis):
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+    def test_cutensor_sum(self, xp, dtype, axis, shape, order):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
 
         if xp is numpy:
@@ -560,134 +553,121 @@ def test_cutensor_sum(self, xp, dtype, axis):
     # sum supports less dtypes; don't test float16 as it's not as accurate?
     @testing.for_dtypes("qQfdFD")
     @testing.numpy_cupy_allclose(rtol=1e-5, contiguous_check=False)
-    def test_cutensor_sum_empty_axis(self, xp, dtype):
-        a = testing.shaped_random(self.shape, xp, dtype)
-        if self.order in ("c", "C"):
+    def test_cutensor_sum_empty_axis(self, xp, dtype, shape, order):
+        a = testing.shaped_random(shape, xp, dtype)
+        if order in ("c", "C"):
             a = xp.ascontiguousarray(a)
-        elif self.order in ("f", "F"):
+        elif order in ("f", "F"):
             a = xp.asfortranarray(a)
         return a.sum(axis=())
 
 
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(2, 3, 4), (20, 30, 40)],
-            "axis": [0, 1],
-            "transpose_axes": [True, False],
-            "keepdims": [True, False],
-            "func": ["nansum", "nanprod"],
-        }
-    )
-)
+@pytest.mark.parametrize("shape", [(2, 3, 4), (20, 30, 40)])
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("transpose_axes", [True, False])
+@pytest.mark.parametrize("keepdims", [True, False])
+@pytest.mark.parametrize("func", ["nansum", "nanprod"])
 class TestNansumNanprodLong:
 
-    def _do_transposed_axis_test(self):
-        return not self.transpose_axes and self.axis != 1
+    def _do_transposed_axis_test(self, transpose_axes, axis):
+        return not transpose_axes and axis != 1
 
-    def _numpy_nanprod_implemented(self):
+    def _numpy_nanprod_implemented(self, func):
         return (
-            self.func == "nanprod"
+            func == "nanprod"
             and numpy.__version__ >= numpy.lib.NumpyVersion("1.10.0")
         )
 
-    def _test(self, xp, dtype):
-        shape = self.shape
+    def _test(self, xp, dtype, shape, axis, transpose_axes, keepdims, func):
         # Reduce the shape of the input array to avoid overflow warning
         # for nanprod with float32, shape=(20, 30, 40), axis=0 and transpose_axes=False
         if (
-            self.func == "nanprod"
+            func == "nanprod"
             and dtype == xp.float32
-            and self.shape == (20, 30, 40)
-            and self.axis == 0
-            and not self.transpose_axes
+            and shape == (20, 30, 40)
+            and axis == 0
+            and not transpose_axes
         ):
             shape = (10, 20, 30)
         a = testing.shaped_arange(shape, xp, dtype)
-        if self.transpose_axes:
+        if transpose_axes:
             a = a.transpose(2, 0, 1)
         if not issubclass(dtype, xp.integer):
             a[:, 1] = xp.nan
-        func = getattr(xp, self.func)
-        return func(a, axis=self.axis, keepdims=self.keepdims)
+        func = getattr(xp, func)
+        return func(a, axis=axis, keepdims=keepdims)
 
     @testing.for_all_dtypes(no_bool=True, no_float16=True)
     @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
-    def test_nansum_all(self, xp, dtype):
-        if (
-            not self._numpy_nanprod_implemented()
-            or not self._do_transposed_axis_test()
-        ):
+    def test_nansum_all(
+        self, xp, dtype, shape, axis, transpose_axes, keepdims, func
+    ):
+        if not self._numpy_nanprod_implemented(
+            func
+        ) or not self._do_transposed_axis_test(transpose_axes, axis):
             return xp.array(())
-        return self._test(xp, dtype)
+        return self._test(
+            xp, dtype, shape, axis, transpose_axes, keepdims, func
+        )
 
     @testing.for_all_dtypes(no_bool=True, no_float16=True)
     @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
-    def test_nansum_axis_transposed(self, xp, dtype):
-        if (
-            not self._numpy_nanprod_implemented()
-            or not self._do_transposed_axis_test()
-        ):
+    def test_nansum_axis_transposed(
+        self, xp, dtype, shape, axis, transpose_axes, keepdims, func
+    ):
+        if not self._numpy_nanprod_implemented(
+            func
+        ) or not self._do_transposed_axis_test(transpose_axes, axis):
             return xp.array(())
-        return self._test(xp, dtype)
+        return self._test(
+            xp, dtype, shape, axis, transpose_axes, keepdims, func
+        )
 
 
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(2, 3, 4), (20, 30, 40)],
-        }
-    )
-)
+@pytest.mark.parametrize("shape", [(2, 3, 4), (20, 30, 40)])
 class TestNansumNanprodExtra:
 
-    def test_nansum_axis_float16(self):
+    def test_nansum_axis_float16(self, shape):
         # Note that the above test example overflows in float16. We use a
         # smaller array instead, just return if array is too large.
-        if numpy.prod(self.shape) > 24:
+        if numpy.prod(shape) > 24:
             return
-        a = testing.shaped_arange(self.shape, dtype="e")
+        a = testing.shaped_arange(shape, dtype="e")
         a[:, 1] = cupy.nan
         sa = cupy.nansum(a, axis=1)
-        b = testing.shaped_arange(self.shape, numpy, dtype="f")
+        b = testing.shaped_arange(shape, numpy, dtype="f")
         b[:, 1] = numpy.nan
         sb = numpy.nansum(b, axis=1)
         testing.assert_allclose(sa, sb.astype("e"))
 
     @testing.for_all_dtypes(no_bool=True, no_float16=True)
     @testing.numpy_cupy_allclose()
-    def test_nansum_out(self, xp, dtype):
-        a = testing.shaped_arange(self.shape, xp, dtype)
+    def test_nansum_out(self, xp, dtype, shape):
+        a = testing.shaped_arange(shape, xp, dtype)
         if not issubclass(dtype, xp.integer):
             a[:, 1] = xp.nan
-        b = xp.empty((self.shape[0], self.shape[2]), dtype=dtype)
+        b = xp.empty((shape[0], shape[2]), dtype=dtype)
         xp.nansum(a, axis=1, out=b)
         return b
 
-    def test_nansum_out_wrong_shape(self):
-        a = testing.shaped_arange(self.shape)
+    def test_nansum_out_wrong_shape(self, shape):
+        a = testing.shaped_arange(shape)
         a[:, 1] = cupy.nan
         b = cupy.empty((2, 3))
         with pytest.raises(ValueError):
             cupy.nansum(a, axis=1, out=b)
 
 
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(2, 3, 4, 5), (20, 30, 40, 50)],
-            "axis": [(1, 3), (0, 2, 3)],
-        }
-    )
-)
+@pytest.mark.parametrize("shape", [(2, 3, 4, 5), (20, 30, 40, 50)])
+@pytest.mark.parametrize("axis", [(1, 3), (0, 2, 3)])
 class TestNansumNanprodAxes:
     @testing.for_all_dtypes(no_bool=True, no_float16=True)
     @testing.numpy_cupy_allclose(rtol=1e-6)
-    def test_nansum_axes(self, xp, dtype):
-        a = testing.shaped_arange(self.shape, xp, dtype)
+    def test_nansum_axes(self, xp, dtype, shape, axis):
+        a = testing.shaped_arange(shape, xp, dtype)
         if not issubclass(dtype, xp.integer):
             a[:, 1] = xp.nan
-        return xp.nansum(a, axis=self.axis)
+        return xp.nansum(a, axis=axis)
 
 
 class TestNansumNanprodHuge:
@@ -715,7 +695,6 @@ def test_nansum_axis_huge_halfnan(self, xp):
 axes = [0, 1, 2]
 
 
-@testing.parameterize(*testing.product({"axis": axes}))
 class TestCumsum:
 
     def _cumsum(self, xp, a, *args, **kwargs):
@@ -752,48 +731,53 @@ def test_cumsum_2dim(self, xp, dtype):
         a = testing.shaped_arange((4, 5), xp, dtype)
         return self._cumsum(xp, a)
 
+    @pytest.mark.parametrize("axis", axes)
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_cumsum_axis(self, xp, dtype):
+    def test_cumsum_axis(self, xp, dtype, axis):
         n = len(axes)
         a = testing.shaped_arange(tuple(range(4, 4 + n)), xp, dtype)
-        return self._cumsum(xp, a, axis=self.axis)
+        return self._cumsum(xp, a, axis=axis)
 
+    @pytest.mark.parametrize("axis", axes)
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    def test_cumsum_axis_out(self, xp, dtype):
+    def test_cumsum_axis_out(self, xp, dtype, axis):
         n = len(axes)
         shape = tuple(range(4, 4 + n))
         a = testing.shaped_arange(shape, xp, dtype)
         out = xp.zeros(shape, dtype=dtype)
-        self._cumsum(xp, a, axis=self.axis, out=out)
+        self._cumsum(xp, a, axis=axis, out=out)
         return out
 
+    @pytest.mark.parametrize("axis", axes)
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    def test_cumsum_axis_out_noncontiguous(self, xp, dtype):
+    def test_cumsum_axis_out_noncontiguous(self, xp, dtype, axis):
         n = len(axes)
         shape = tuple(range(4, 4 + n))
         a = testing.shaped_arange(shape, xp, dtype)
         out = xp.zeros((8,) + shape[1:], dtype=dtype)[
             ::2
         ]  # Non contiguous view
-        self._cumsum(xp, a, axis=self.axis, out=out)
+        self._cumsum(xp, a, axis=axis, out=out)
         return out
 
+    @pytest.mark.parametrize("axis", axes)
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
-    def test_ndarray_cumsum_axis(self, xp, dtype):
+    def test_ndarray_cumsum_axis(self, xp, dtype, axis):
         n = len(axes)
         a = testing.shaped_arange(tuple(range(4, 4 + n)), xp, dtype)
-        return a.cumsum(axis=self.axis)
+        return a.cumsum(axis=axis)
 
+    @pytest.mark.parametrize("axis", axes)
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    def test_cumsum_axis_empty(self, xp, dtype):
+    def test_cumsum_axis_empty(self, xp, dtype, axis):
         n = len(axes)
         a = testing.shaped_arange(tuple(range(0, n)), xp, dtype)
-        return self._cumsum(xp, a, axis=self.axis)
+        return self._cumsum(xp, a, axis=axis)
 
     @testing.for_all_dtypes()
     def test_invalid_axis_lower1(self, dtype):
@@ -930,29 +914,21 @@ def test_cumprod_numpy_array(self, dtype):
 
 
 @pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
-@testing.parameterize(
-    *testing.product(
-        {
-            "shape": [(20,), (7, 6), (3, 4, 5)],
-            "axis": [None, 0, 1, 2],
-            "func": ("nancumsum", "nancumprod"),
-        }
-    )
-)
+@pytest.mark.parametrize("shape", [(20,), (7, 6), (3, 4, 5)])
+@pytest.mark.parametrize("axis", [None, 0, 1, 2])
+@pytest.mark.parametrize("func", ["nancumsum", "nancumprod"])
 class TestNanCumSumProd:
 
     zero_density = 0.25
 
-    def _make_array(self, dtype):
+    def _make_array(self, dtype, shape):
         dtype = numpy.dtype(dtype)
         if dtype.char in "efdFD":
             r_dtype = dtype.char.lower()
-            a = testing.shaped_random(self.shape, numpy, dtype=r_dtype, scale=1)
+            a = testing.shaped_random(shape, numpy, dtype=r_dtype, scale=1)
             if dtype.char in "FD":
                 ai = a
-                aj = testing.shaped_random(
-                    self.shape, numpy, dtype=r_dtype, scale=1
-                )
+                aj = testing.shaped_random(shape, numpy, dtype=r_dtype, scale=1)
                 ai[ai < math.sqrt(self.zero_density)] = 0
                 aj[aj < math.sqrt(self.zero_density)] = 0
                 a = ai + 1j * aj
@@ -960,30 +936,30 @@ def _make_array(self, dtype):
                 a[a < self.zero_density] = 0
             a = a / a
         else:
-            a = testing.shaped_random(self.shape, numpy, dtype=dtype)
+            a = testing.shaped_random(shape, numpy, dtype=dtype)
         return a
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    def test_nancumsumprod(self, xp, dtype):
-        if self.axis is not None and self.axis >= len(self.shape):
+    def test_nancumsumprod(self, xp, dtype, shape, axis, func):
+        if axis is not None and axis >= len(shape):
             pytest.skip()
-        a = xp.array(self._make_array(dtype))
-        out = getattr(xp, self.func)(a, axis=self.axis)
+        a = xp.array(self._make_array(dtype, shape))
+        out = getattr(xp, func)(a, axis=axis)
         return xp.ascontiguousarray(out)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
-    def test_nancumsumprod_out(self, xp, dtype):
+    def test_nancumsumprod_out(self, xp, dtype, shape, axis, func):
         dtype = numpy.dtype(dtype)
-        if self.axis is not None and self.axis >= len(self.shape):
+        if axis is not None and axis >= len(shape):
             pytest.skip()
-        if len(self.shape) > 1 and self.axis is None:
+        if len(shape) > 1 and axis is None:
             # Skip the cases where np.nancum{sum|prod} raise AssertionError.
             pytest.skip()
-        a = xp.array(self._make_array(dtype))
-        out = xp.empty(self.shape, dtype=dtype)
-        getattr(xp, self.func)(a, axis=self.axis, out=out)
+        a = xp.array(self._make_array(dtype, shape))
+        out = xp.empty(shape, dtype=dtype)
+        getattr(xp, func)(a, axis=axis, out=out)
         return xp.ascontiguousarray(out)
 
 
@@ -1057,35 +1033,30 @@ def test_diff_invalid_axis(self):
 
 
 # This class compares CUB results against NumPy's
-@testing.parameterize(
-    *testing.product_dict(
-        testing.product(
-            {
-                "shape": [()],
-                "axis": [None, ()],
-                "spacing": [(), (1.2,)],
-            }
-        )
-        + testing.product(
-            {
-                "shape": [(33,)],
-                "axis": [None, 0, -1, (0,)],
-                "spacing": [(), (1.2,), "sequence of int", "arrays"],
-            }
+@pytest.mark.parametrize(
+    "shape,axis,spacing",
+    list(iproduct([()], [None, ()], [(), (1.2,)]))
+    + list(
+        iproduct(
+            [(33,)],
+            [None, 0, -1, (0,)],
+            [(), (1.2,), "sequence of int", "arrays"],
         )
-        + testing.product(
-            {
-                "shape": [(10, 20), (10, 20, 30)],
-                "axis": [None, 0, -1, (0, -1), (1, 0)],
-                "spacing": [(), (1.2,), "sequence of int", "arrays", "mixed"],
-            }
-        ),
-        testing.product(
-            {
-                "edge_order": [1, 2],
-            }
-        ),
     )
+    + list(
+        iproduct(
+            [(10, 20), (10, 20, 30)],
+            [None, 0, -1, (0, -1), (1, 0)],
+            [(), (1.2,), "sequence of int", "arrays", "mixed"],
+        )
+    ),
+)
+@pytest.mark.parametrize(
+    "edge_order",
+    [
+        pytest.param(1, id="edge_order"),
+        pytest.param(2, id="edge_order"),
+    ],
 )
 class TestGradient:
 
@@ -1123,10 +1094,10 @@ def _gradient(self, xp, dtype, shape, spacing, axis, edge_order):
 
     @testing.for_dtypes("fFdD")
     @testing.numpy_cupy_allclose(atol=1e-6, rtol=1e-5)
-    def test_gradient_floating(self, xp, dtype):
-        return self._gradient(
-            xp, dtype, self.shape, self.spacing, self.axis, self.edge_order
-        )
+    def test_gradient_floating(
+        self, xp, dtype, shape, axis, spacing, edge_order
+    ):
+        return self._gradient(xp, dtype, shape, spacing, axis, edge_order)
 
     # unsigned int behavior fixed in 1.18.1
     # https://github.com/numpy/numpy/issues/15207
@@ -1135,20 +1106,13 @@ def test_gradient_floating(self, xp, dtype):
     @testing.numpy_cupy_allclose(
         atol=1e-6, rtol=1e-5, type_check=has_support_aspect64()
     )
-    def test_gradient_int(self, xp, dtype):
-        return self._gradient(
-            xp, dtype, self.shape, self.spacing, self.axis, self.edge_order
-        )
+    def test_gradient_int(self, xp, dtype, shape, axis, spacing, edge_order):
+        return self._gradient(xp, dtype, shape, spacing, axis, edge_order)
 
     @testing.numpy_cupy_allclose(atol=2e-2, rtol=1e-3)
-    def test_gradient_float16(self, xp):
+    def test_gradient_float16(self, xp, shape, axis, spacing, edge_order):
         return self._gradient(
-            xp,
-            numpy.float16,
-            self.shape,
-            self.spacing,
-            self.axis,
-            self.edge_order,
+            xp, numpy.float16, shape, spacing, axis, edge_order
         )
 
 
diff --git a/dpnp/tests/third_party/cupy/testing/_loops.py b/dpnp/tests/third_party/cupy/testing/_loops.py
index 03232642b221..c85481e20928 100644
--- a/dpnp/tests/third_party/cupy/testing/_loops.py
+++ b/dpnp/tests/third_party/cupy/testing/_loops.py
@@ -1523,8 +1523,8 @@ def for_contiguous_axes(name="axis"):
     def decorator(impl):
         @_wraps_partial(impl, name)
         def test_func(self, *args, **kw):
-            ndim = len(self.shape)
-            order = self.order
+            ndim = len(kw["shape"])
+            order = kw["order"]
             for i in range(ndim):
                 a = ()
                 if order in ("c", "C"):
@@ -1546,7 +1546,7 @@ def test_func(self, *args, **kw):
                         ", ndim is",
                         ndim,
                         ", shape is",
-                        self.shape,
+                        kw["shape"],
                         ", order is",
                         order,
                     )

From d02678ac3fde62bfc46798e17fe51a9a3fcdd681 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 26 Jun 2026 15:14:23 +0200
Subject: [PATCH 37/37] Update new tests to handle a device with no fp64
 support

---
 dpnp/tests/third_party/cupy/linalg_tests/test_solve.py      | 2 +-
 dpnp/tests/third_party/cupy/logic_tests/test_comparison.py  | 6 ++----
 .../third_party/cupy/manipulation_tests/test_tiling.py      | 3 ++-
 dpnp/tests/third_party/cupy/sorting_tests/test_sort.py      | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
index 3b789480fd8b..eeb5712d84ef 100644
--- a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
+++ b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
@@ -82,7 +82,7 @@ def test_solve_singular_empty(self):
         c = cupy.linalg.solve(a, b)
         assert c.size == 0
 
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
     def test_solve_non_singular_empty(self, xp):
         a = xp.eye(3)  # non-singular
         b = xp.empty((3, 0))  # nrhs = 0
diff --git a/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py b/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py
index 91a9dc727924..de788ba0d473 100644
--- a/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py
+++ b/dpnp/tests/third_party/cupy/logic_tests/test_comparison.py
@@ -93,16 +93,14 @@ def test_binary_array_pyscalar_int(self, xp, dtype, scalar, op):
         b = scalar
         return [op(a, b), op(b, a)]
 
-    @pytest.mark.parametrize(
-        "dtype", [numpy.float16, numpy.float32, numpy.float64]
-    )
     @pytest.mark.parametrize(
         "scalar", [-1, 0, 2**32, 2**31 - 1, 2**31 + 1, 2**63, 2**64 - 1]
     )
     @pytest.mark.parametrize("op", operators)
+    @testing.for_float_dtypes(no_float16=False)
     @testing.numpy_cupy_array_equal()
     @numpy.errstate(over="ignore")
-    def test_binary_array_pyscalar_int_and_float(self, xp, dtype, scalar, op):
+    def test_binary_array_pyscalar_int_and_float(self, xp, scalar, op, dtype):
         a = xp.array(
             [-1, 0, 2**31 - 1, 2**31 + 1, 2**32, 2**63 - 1, 2**62, 2**62 + 1]
         )
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py
index a37544656735..a0ea1f35865f 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_tiling.py
@@ -4,6 +4,7 @@
 import pytest
 
 import dpnp as cupy
+from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
 
@@ -166,7 +167,7 @@ def test_repeat(self, xp):
 class TestRepeatNdarrayArrayDtype:
     """Output dtype matches input dtype."""
 
-    @testing.numpy_cupy_array_equal()
+    @testing.numpy_cupy_array_equal(type_check=has_support_aspect64())
     def test_dtype_preserved(self, xp):
         x = testing.shaped_arange((3, 4), xp, dtype=self.a_dtype)
         return xp.repeat(x, xp.array([1, 2, 3, 4]), axis=1)
diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
index 9a6ab8981711..196f64fffa74 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
@@ -37,7 +37,7 @@ def test_external_sort_zero_dim(self):
             with pytest.raises(AxisError):
                 xp.sort(a)
 
-    @testing.numpy_cupy_array_equal()
+    @testing.numpy_cupy_array_equal(type_check=has_support_aspect64())
     def test_sort_zero_length_axis(self, xp):
         """Sorting along a zero-length axis is a no-op (#9816)."""
         return xp.sort(xp.empty((2, 0)), axis=-1)