diff --git a/odl/contrib/mrc/test/uncompr_bin_test.py b/odl/contrib/mrc/test/uncompr_bin_test.py
index 941f5189a9d..8219e8f2318 100644
--- a/odl/contrib/mrc/test/uncompr_bin_test.py
+++ b/odl/contrib/mrc/test/uncompr_bin_test.py
@@ -35,9 +35,9 @@
 # --- Tests --- #
 
 
-def test_uncompr_bin_io_without_header(shape, floating_dtype, order):
+def test_uncompr_bin_io_without_header(shape, odl_floating_dtype, order):
     """Test I/O bypassing the header processing."""
-    dtype = np.dtype(floating_dtype)
+    dtype = np.dtype(odl_floating_dtype)
     with tempfile.NamedTemporaryFile() as named_file:
         file = named_file.file
 
diff --git a/odl/discr/diff_ops.py b/odl/discr/diff_ops.py
index fad6dbb939f..e78c2b76dfa 100644
--- a/odl/discr/diff_ops.py
+++ b/odl/discr/diff_ops.py
@@ -14,7 +14,8 @@
 from odl.discr.lp_discr import DiscreteLp
 from odl.operator.tensor_ops import PointwiseTensorFieldOperator
 from odl.space import ProductSpace
-from odl.util import writable_array, signature_string, indent
+from odl.util import (
+    writable_array, asarray, array_module, signature_string, indent)
 
 
 __all__ = ('PartialDerivative', 'Gradient', 'Divergence', 'Laplacian')
@@ -137,9 +138,9 @@ def _call(self, x, out=None):
         if out is None:
             out = self.range.element()
 
-        # TODO: this pipes CUDA arrays through NumPy. Write native operator.
-        with writable_array(out) as out_arr:
-            finite_diff(x.asarray(), axis=self.axis, dx=self.dx,
+        impl = self.domain.impl
+        with writable_array(out, impl=impl) as out_arr:
+            finite_diff(x, axis=self.axis, dx=self.dx, impl=impl,
                         method=self.method, pad_mode=self.pad_mode,
                         pad_const=self.pad_const, out=out_arr)
         return out
@@ -190,8 +191,7 @@ def __repr__(self):
 
     def __str__(self):
         """Return ``str(self)``."""
-        dom_ran_str = '\n-->\n'.join([repr(self.domain), repr(self.range)])
-        return '{}:\n{}'.format(self.__class__.__name__, indent(dom_ran_str))
+        return repr(self)
 
 
 class Gradient(PointwiseTensorFieldOperator):
@@ -347,16 +347,15 @@ def _call(self, x, out=None):
         if out is None:
             out = self.range.element()
 
-        x_arr = x.asarray()
         ndim = self.domain.ndim
         dx = self.domain.cell_sides
+        impl = self.domain.impl
 
         for axis in range(ndim):
-            with writable_array(out[axis]) as out_arr:
-                finite_diff(x_arr, axis=axis, dx=dx[axis], method=self.method,
-                            pad_mode=self.pad_mode,
-                            pad_const=self.pad_const,
-                            out=out_arr)
+            with writable_array(out[axis], impl=impl) as out_arr:
+                finite_diff(x, axis=axis, dx=dx[axis], impl=impl,
+                            method=self.method, pad_mode=self.pad_mode,
+                            pad_const=self.pad_const, out=out_arr)
         return out
 
     def derivative(self, point=None):
@@ -414,8 +413,7 @@ def __repr__(self):
 
     def __str__(self):
         """Return ``str(self)``."""
-        dom_ran_str = '\n-->\n'.join([repr(self.domain), repr(self.range)])
-        return '{}:\n{}'.format(self.__class__.__name__, indent(dom_ran_str))
+        return repr(self)
 
 
 class Divergence(PointwiseTensorFieldOperator):
@@ -494,10 +492,12 @@ def __init__(self, domain=None, range=None, method='forward',
         ...                  [2., 3., 4., 5., 6.]])
         >>> f = div.domain.element([data, data])
         >>> div_f = div(f)
-        >>> print(div_f)
-        [[  2.,   2.,   2.,   2.,  -3.],
-         [  2.,   2.,   2.,   2.,  -4.],
-         [ -1.,  -2.,  -3.,  -4., -12.]]
+        >>> div_f
+        uniform_discr([ 0.,  0.], [ 3.,  5.], (3, 5)).element(
+            [[  2.,   2.,   2.,   2.,  -3.],
+             [  2.,   2.,   2.,   2.,  -4.],
+             [ -1.,  -2.,  -3.,  -4., -12.]]
+        )
 
         Verify adjoint:
 
@@ -559,11 +559,13 @@ def _call(self, x, out=None):
 
         ndim = self.range.ndim
         dx = self.range.cell_sides
+        impl = self.range.impl
 
-        tmp = np.empty(out.shape, out.dtype, order=out.space.default_order)
-        with writable_array(out) as out_arr:
+        tmp = array_module(impl).empty(
+            out.shape, out.dtype, order=out.space.default_order)
+        with writable_array(out, impl=impl) as out_arr:
             for axis in range(ndim):
-                finite_diff(x[axis], axis=axis, dx=dx[axis],
+                finite_diff(x[axis], axis=axis, dx=dx[axis], impl=impl,
                             method=self.method, pad_mode=self.pad_mode,
                             pad_const=self.pad_const,
                             out=tmp)
@@ -623,8 +625,7 @@ def __repr__(self):
 
     def __str__(self):
         """Return ``str(self)``."""
-        dom_ran_str = '\n-->\n'.join([repr(self.domain), repr(self.range)])
-        return '{}:\n{}'.format(self.__class__.__name__, indent(dom_ran_str))
+        return repr(self)
 
 
 class Laplacian(PointwiseTensorFieldOperator):
@@ -714,24 +715,23 @@ def _call(self, x, out=None):
         else:
             out.set_zero()
 
-        x_arr = x.asarray()
-        out_arr = out.asarray()
-        tmp = np.empty(out.shape, out.dtype, order=out.space.default_order)
-
         ndim = self.domain.ndim
         dx = self.domain.cell_sides
+        impl = self.domain.impl
+        tmp = array_module(impl).empty(
+            out.shape, out.dtype, order=out.space.default_order)
 
-        with writable_array(out) as out_arr:
+        with writable_array(out, impl=impl) as out_arr:
             for axis in range(ndim):
                 # TODO: this can be optimized
-                finite_diff(x_arr, axis=axis, dx=dx[axis] ** 2,
+                finite_diff(x, axis=axis, dx=dx[axis] ** 2, impl=impl,
                             method='forward',
                             pad_mode=self.pad_mode,
                             pad_const=self.pad_const, out=tmp)
 
                 out_arr += tmp
 
-                finite_diff(x_arr, axis=axis, dx=dx[axis] ** 2,
+                finite_diff(x, axis=axis, dx=dx[axis] ** 2, impl=impl,
                             method='backward',
                             pad_mode=self.pad_mode,
                             pad_const=self.pad_const, out=tmp)
@@ -781,11 +781,11 @@ def __repr__(self):
 
     def __str__(self):
         """Return ``str(self)``."""
-        dom_ran_str = '\n-->\n'.join([repr(self.domain), repr(self.range)])
-        return '{}:\n{}'.format(self.__class__.__name__, indent(dom_ran_str))
+        return repr(self)
 
 
-def finite_diff(f, axis, dx=1.0, method='forward', out=None, **kwargs):
+def finite_diff(f, axis, dx=1.0, method='forward', out=None, impl='numpy',
+                **kwargs):
     """Calculate the partial derivative of ``f`` along a given ``axis``.
 
     In the interior of the domain of f, the partial derivative is computed
@@ -819,6 +819,9 @@ def finite_diff(f, axis, dx=1.0, method='forward', out=None, **kwargs):
     out : `numpy.ndarray`, optional
          An N-dimensional array to which the output is written. Has to have
          the same shape as the input array ``f``.
+    impl : str, optional
+        Implementation backend for array manipulations. Usually handled by
+        the calling code.
     pad_mode : string, optional
         The padding mode to use outside the domain.
 
@@ -883,7 +886,7 @@ def finite_diff(f, axis, dx=1.0, method='forward', out=None, **kwargs):
     >>> out is finite_diff(f, axis=0, out=out)
     True
     """
-    f_arr = np.asarray(f)
+    f_arr = asarray(f, impl=impl)
     ndim = f_arr.ndim
 
     if f_arr.shape[axis] < 2:
@@ -912,8 +915,9 @@ def finite_diff(f, axis, dx=1.0, method='forward', out=None, **kwargs):
     pad_const = kwargs.pop('pad_const', 0)
     pad_const = f.dtype.type(pad_const)
 
+    arrmod = array_module(impl)
     if out is None:
-        out = np.empty_like(f_arr)
+        out = arrmod.empty_like(f_arr)
     else:
         if out.shape != f.shape:
             raise ValueError('expected output shape {}, got {}'
@@ -931,24 +935,24 @@ def finite_diff(f, axis, dx=1.0, method='forward', out=None, **kwargs):
 
     # create slice objects: initially all are [:, :, ..., :]
 
-    # Swap axes so that the axis of interest is first. This is a O(1)
+    # Swap axes so that the axis of interest is first. This is an O(1)
     # operation and is done to simplify the code below.
-    out, out_in = np.swapaxes(out, 0, axis), out
-    f_arr = np.swapaxes(f_arr, 0, axis)
+    out, out_in = arrmod.swapaxes(out, 0, axis), out
+    f_arr = arrmod.swapaxes(f_arr, 0, axis)
 
     # Interior of the domain of f
     if method == 'central':
         # 1D equivalent: out[1:-1] = (f[2:] - f[:-2])/2.0
-        np.subtract(f_arr[2:], f_arr[:-2], out=out[1:-1])
+        arrmod.subtract(f_arr[2:], f_arr[:-2], out=out[1:-1])
         out[1:-1] /= 2.0
 
     elif method == 'forward':
         # 1D equivalent: out[1:-1] = (f[2:] - f[1:-1])
-        np.subtract(f_arr[2:], f_arr[1:-1], out=out[1:-1])
+        arrmod.subtract(f_arr[2:], f_arr[1:-1], out=out[1:-1])
 
     elif method == 'backward':
         # 1D equivalent: out[1:-1] = (f[1:-1] - f[:-2])
-        np.subtract(f_arr[1:-1], f_arr[:-2], out=out[1:-1])
+        arrmod.subtract(f_arr[1:-1], f_arr[:-2], out=out[1:-1])
 
     # Boundaries
     if pad_mode == 'constant':
diff --git a/odl/discr/discretization.py b/odl/discr/discretization.py
index 14c5ec8515d..84c8c860bfd 100644
--- a/odl/discr/discretization.py
+++ b/odl/discr/discretization.py
@@ -331,16 +331,18 @@ def copy(self):
         """Create an identical (deep) copy of this element."""
         return self.space.element(self.tensor.copy())
 
-    def asarray(self, out=None):
-        """Extract the data of this array as a numpy array.
+    def asarray(self, out=None, impl='numpy'):
+        """Extract the data of this element as an array.
 
         Parameters
         ----------
-        out : `numpy.ndarray`, optional
-            Array in which the result should be written in-place.
-            Has to be contiguous and of the correct dtype.
+        out : ndarray, optional
+            Array into which the result should be written. Must be contiguous
+            and of the correct dtype.
+        impl : str, optional
+            Array backend for the output, used when ``out`` is not given.
         """
-        return self.tensor.asarray(out=out)
+        return self.tensor.asarray(out=out, impl=impl)
 
     def astype(self, dtype):
         """Return a copy of this element with new ``dtype``.
@@ -409,7 +411,7 @@ def __setitem__(self, indices, values):
             indices = indices.tensor
         if isinstance(values, type(self)):
             values = values.tensor
-        self.tensor.__setitem__(indices, values)
+        self.tensor[indices] = values
 
     def sampling(self, ufunc, **kwargs):
         """Sample a continuous function and assign to this element.
diff --git a/odl/discr/lp_discr.py b/odl/discr/lp_discr.py
index 54a849b2ff4..eaea13ef1c7 100644
--- a/odl/discr/lp_discr.py
+++ b/odl/discr/lp_discr.py
@@ -755,29 +755,6 @@ def conj(self, out=None):
             self.tensor.conj(out=out.tensor)
             return out
 
-    def __setitem__(self, indices, values):
-        """Set values of this element.
-
-        Parameters
-        ----------
-        indices : int or `slice`
-            The position(s) that should be set
-        values : scalar or `array-like`
-            Value(s) to be assigned.
-            If ``indices`` is an integer, ``values`` must be a scalar
-            value.
-            If ``indices`` is a slice, ``values`` must be
-            broadcastable to the size of the slice (same size,
-            shape ``(1,)`` or scalar).
-            For ``indices == slice(None)``, i.e. in the call
-            ``vec[:] = values``, a multi-dimensional array of correct
-            shape is allowed as ``values``.
-        """
-        if values in self.space:
-            self.tensor[indices] = values.tensor
-        else:
-            super(DiscreteLpElement, self).__setitem__(indices, values)
-
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         """Interface to Numpy's ufunc machinery.
 
diff --git a/odl/set/space.py b/odl/set/space.py
index 90effed68fd..a59c226b3ef 100644
--- a/odl/set/space.py
+++ b/odl/set/space.py
@@ -374,10 +374,10 @@ def __pow__(self, shape):
         >>> r2 ** 4
         ProductSpace(rn(2), 4)
 
-        Multiple powers work as expected:
+        Multiple powers work as expected (first entry is outermost power):
 
-        >>> r2 ** (4, 2)
-        ProductSpace(ProductSpace(rn(2), 4), 2)
+        >>> r2 ** (3, 4)
+        ProductSpace(ProductSpace(rn(2), 4), 3)
         """
         from odl.space import ProductSpace
 
@@ -387,7 +387,7 @@ def __pow__(self, shape):
             shape = tuple(shape)
 
         pspace = self
-        for n in shape:
+        for n in reversed(shape):
             pspace = ProductSpace(pspace, n)
 
         return pspace
diff --git a/odl/space/__init__.py b/odl/space/__init__.py
index 36e99548a05..4b317575d43 100644
--- a/odl/space/__init__.py
+++ b/odl/space/__init__.py
@@ -19,6 +19,9 @@
 from .npy_tensors import *
 __all__ += npy_tensors.__all__
 
+from .cupy_tensors import *
+__all__ += cupy_tensors.__all__
+
 from .pspace import *
 __all__ += pspace.__all__
 
diff --git a/odl/space/base_tensors.py b/odl/space/base_tensors.py
index c909e6b0b16..f88bd6336ee 100644
--- a/odl/space/base_tensors.py
+++ b/odl/space/base_tensors.py
@@ -9,7 +9,6 @@
 """Base classes for implementations of tensor spaces."""
 
 from __future__ import print_function, division, absolute_import
-from builtins import object
 from numbers import Integral
 import numpy as np
 
@@ -18,7 +17,8 @@
 from odl.util import (
     is_numeric_dtype, is_real_dtype, is_floating_dtype,
     is_real_floating_dtype, is_complex_floating_dtype, safe_int_conv,
-    array_str, dtype_str, signature_string, indent, writable_array)
+    array_str, dtype_str, signature_string, indent, writable_array,
+    none_context)
 from odl.util.ufuncs import TensorSpaceUfuncs
 from odl.util.utility import TYPE_MAP_R2C, TYPE_MAP_C2R
 
@@ -815,26 +815,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
 
         # --- Evaluate ufunc --- #
 
-        # Trivial context used to create a single code path for the ufunc
-        # evaluation. For `None` output parameter(s), this is used instead of
-        # `writable_array`.
-        class CtxNone(object):
-            """Trivial context manager class.
-
-            When used as ::
-
-                with CtxNone() as obj:
-                    # do stuff with `obj`
-
-            the returned ``obj`` is ``None``.
-            """
-            __enter__ = __exit__ = lambda *_: None
-
         if method == '__call__':
             if ufunc.nout == 1:
                 # Make context for output (trivial one returns `None`)
                 if out is None:
-                    out_ctx = CtxNone()
+                    out_ctx = none_context()
                 else:
                     out_ctx = writable_array(out, **array_kwargs)
 
@@ -851,11 +836,11 @@ class CtxNone(object):
                 if out1 is not None:
                     out1_ctx = writable_array(out1, **array_kwargs)
                 else:
-                    out1_ctx = CtxNone()
+                    out1_ctx = none_context()
                 if out2 is not None:
                     out2_ctx = writable_array(out2, **array_kwargs)
                 else:
-                    out2_ctx = CtxNone()
+                    out2_ctx = none_context()
 
                 # Evaluate ufunc
                 with out1_ctx as out1_arr, out2_ctx as out2_arr:
@@ -872,7 +857,7 @@ class CtxNone(object):
         else:  # method != '__call__'
             # Make context for output (trivial one returns `None`)
             if out is None:
-                out_ctx = CtxNone()
+                out_ctx = none_context()
             else:
                 out_ctx = writable_array(out, **array_kwargs)
 
diff --git a/odl/space/cupy_tensors.py b/odl/space/cupy_tensors.py
new file mode 100644
index 00000000000..6d534f63f6b
--- /dev/null
+++ b/odl/space/cupy_tensors.py
@@ -0,0 +1,2819 @@
+# Copyright 2014-2017 The ODL contributors
+#
+# This file is part of ODL.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+
+"""Implementation of tensor spaces using CuPy.
+
+See https://cupy.chainer.org/ or https://github.com/cupy/cupy for details
+on the backend.
+"""
+
+from __future__ import print_function, division, absolute_import
+import numpy as np
+import warnings
+
+from odl.set import RealNumbers, ComplexNumbers
+from odl.space.base_tensors import TensorSpace, Tensor
+from odl.space.weighting import (
+    Weighting, ArrayWeighting, ConstWeighting,
+    CustomInner, CustomNorm, CustomDist)
+from odl.util import (
+    array_str, dtype_str, is_floating_dtype, is_numeric_dtype, is_real_dtype,
+    real_dtype, signature_string, indent)
+
+try:
+    import cupy
+except ImportError:
+    cupy = None
+    CUPY_AVAILABLE = False
+else:
+    _maj = int(cupy.__version__.split('.')[0])
+    if _maj < 2:
+        raise warnings.warn(
+            'your version {} of CuPy is not supported; please upgrade to '
+            'version 2.0.0 or higher'.format(cupy.__version__), RuntimeWarning)
+    CUPY_AVAILABLE = True
+
+
+__all__ = ('CupyTensorSpace',)
+
+
+# --- Space method implementations --- #
+
+
+if CUPY_AVAILABLE:
+    lico = cupy.ElementwiseKernel(in_params='X a, X x, Y b, Y y',
+                                  out_params='Z z',
+                                  operation='z = a * x + b * y;',
+                                  name='lico')
+
+    all_equal = cupy.ReductionKernel(in_params='T x, T y',
+                                     out_params='R res',
+                                     map_expr='x == y',
+                                     reduce_expr='a && b',
+                                     post_map_expr='res = a',
+                                     identity='1',
+                                     name='all_equal')
+
+
+def _fallback_scal(a, x):
+    """Fallback implementation of ``scal`` when cuBLAS is not applicable."""
+    x *= a
+    return x
+
+
+def _fallback_axpy(a, x, y):
+    """Fallback implementation of ``axpy`` when cuBLAS is not applicable."""
+    return lico(a, x, 1, y, y)
+
+
+def _get_flat_inc(arr1, arr2=None):
+    """Return flat index increment(s) for cuBLAS, raise if not applicable.
+
+    This function checks if the array stride(s) allow usage of cuBLAS
+    and returns the ``incx`` (and ``incy``) parameters needed for the
+    cuBLAS functions. If the array strides ar such that cuBLAS cannot
+    be applied, a ``ValueError`` is raised, triggering a fallback
+    implementation.
+
+    For ``arr2 is None``, the conditions to be fulfilled are
+
+    - the strides do not contain 0 and
+    - the memory of the array has constant stride.
+
+    The second point applies to
+
+    - contiguous arrays,
+    - chunks of arrays along the **slowest-varying axis** (``arr[1:5, ...]``
+      for C-contiguous ``arr``), and
+    - strided slices along the **fastest-varying axis** (``arr[..., ::2]``
+      for C-contiguous ``arr``).
+
+    For ``arr2 is not None``, both arrays must
+
+    - fulfill the ``arr2 is None`` conditions individually,
+    - have the same total size, and
+    - the axis order of both must be the same in the sense that the same
+      index array sorts the strides of both arrays in ascending order.
+
+    Parameters
+    ----------
+    arr1 : cupy.core.core.ndarray
+        Array to check for compatibility.
+    arr2 : cupy.core.core.ndarray, optional
+        Second array to check for compatibility, by itself and with ``arr1``.
+
+    Returns
+    -------
+    flat_inc1 : int
+        Memory stride (in terms of elements, not bytes) of ``arr1``.
+    flat_inc2 : int or None, optional
+        Memory stride of ``arr2``. Returned only if ``arr2`` was provided.
+
+    Raises
+    ------
+    ValueError
+        If the conditions for cuBLAS compatibility are not met.
+
+    Examples
+    --------
+    >>> arr_c = cupy.zeros((4, 4, 4))
+    >>> arr_c.strides
+    (128, 32, 8)
+    >>> arr_f = cupy.asfortranarray(arr_c)
+    >>> arr_f.strides
+    (8, 32, 128)
+
+    Contiguous arrays are compatible by and with themselves, but not with
+    arrays that are contiguous in a different ordering:
+
+    >>> _get_flat_inc(arr_c)
+    1
+    >>> _get_flat_inc(arr_c, arr_c)
+    (1, 1)
+    >>> _get_flat_inc(arr_f)
+    1
+    >>> _get_flat_inc(arr_f, arr_f)
+    (1, 1)
+    >>> _get_flat_inc(arr_c, arr_f)
+    Traceback (most recent call last):
+        ...
+    ValueError
+
+    Slicing in the **fastest** axis is allowed as it results in a constant
+    stride in the flat memory. Slicing with stride in any other axis is
+    results in incompatibility.
+
+    C ordering (last axis fastest):
+
+    >>> half_arr_0_c = cupy.zeros((2, 4, 4))
+    >>> half_arr_1_c = cupy.zeros((4, 2, 4))
+    >>> half_arr_2_c = cupy.zeros((4, 4, 2))
+    >>> _get_flat_inc(arr_c[::2, :, :], half_arr_0_c)
+    Traceback (most recent call last):
+        ...
+    ValueError
+    >>> _get_flat_inc(arr_c[:, ::2, :], half_arr_1_c)
+    Traceback (most recent call last):
+        ...
+    ValueError
+    >>> _get_flat_inc(arr_c[:, :, ::2], half_arr_2_c)
+    (2, 1)
+
+    Fortran ordering (first axis fastest):
+
+    >>> half_arr_0_f = cupy.asfortranarray(half_arr_0_c)
+    >>> half_arr_1_f = cupy.asfortranarray(half_arr_1_c)
+    >>> half_arr_2_f = cupy.asfortranarray(half_arr_2_c)
+    >>> _get_flat_inc(arr_f[::2, :, :], half_arr_0_f)
+    (2, 1)
+    >>> _get_flat_inc(arr_f[:, ::2, :], half_arr_1_f)
+    Traceback (most recent call last):
+        ...
+    ValueError
+    >>> _get_flat_inc(arr_f[:, :, ::2], half_arr_2_f)
+    Traceback (most recent call last):
+        ...
+    ValueError
+
+    Axes swapped (middle axis fastest):
+
+    >>> arr_s = cupy.swapaxes(arr_c, 1, 2)
+    >>> arr_s.strides
+    (128, 8, 32)
+    >>> half_arr_0_s = cupy.swapaxes(half_arr_0_c, 1, 2)
+    >>> half_arr_1_s = cupy.swapaxes(half_arr_1_c, 1, 2)
+    >>> half_arr_2_s = cupy.swapaxes(half_arr_2_c, 1, 2)
+    >>> _get_flat_inc(arr_s[:, :, ::2], half_arr_0_s)
+    Traceback (most recent call last):
+        ...
+    ValueError
+    >>> _get_flat_inc(arr_s[:, ::2, :], half_arr_1_s)
+    (2, 1)
+    >>> _get_flat_inc(arr_s[:, :, ::2], half_arr_2_s)
+    Traceback (most recent call last):
+        ...
+    ValueError
+    """
+    # Zero strides not allowed
+    if 0 in arr1.strides:
+        raise ValueError
+    if arr2 is not None and 0 in arr2.strides:
+        raise ValueError
+
+    # Candidate for flat_inc of array 1
+    arr1_flat_inc = min(arr1.strides) // arr1.itemsize
+
+    # Check if the strides are as in a contiguous array (after reordering
+    # the axes), except for the fastest axis. We allow arbitrary axis order
+    # for operations on the whole array at once, as long as it can be
+    # indexed with a single flat index and stride.
+    arr1_ax_order = np.argsort(arr1.strides)  # ascending
+    arr1_sorted_shape = np.take(arr1.shape, arr1_ax_order)
+    arr1_sorted_shape[0] *= arr1_flat_inc
+    arr1_elem_strides = np.take(arr1.strides, arr1_ax_order) // arr1.itemsize
+    if np.any(np.cumprod(arr1_sorted_shape[:-1]) != arr1_elem_strides[1:]):
+        raise ValueError
+
+    if arr2 is None:
+        return arr1_flat_inc
+    else:
+        arr2_flat_inc = _get_flat_inc(arr2)
+        if arr1.size != arr2.size:
+            raise ValueError
+        if np.any(np.diff(np.take(arr2.strides, arr1_ax_order)) < 0):
+            # Strides of arr2 are not sorted by axis order of arr1
+            raise ValueError
+        return arr1_flat_inc, arr2_flat_inc
+
+
+def _cublas_func(name, dtype):
+    """Return the specified cupy.cuda.cublas function for a given dtype.
+
+    Parameters
+    ----------
+    name : str
+        Raw function name without prefix, e.g., ``'axpy'``.
+    dtype :
+        Numpy dtype specifier for which the cuBLAS function should be
+        used. Must be single or double precision float or complex.
+
+    Raises
+    ------
+    ValueError
+        If the data type is not supported by cuBLAS.
+    """
+    dtype, dtype_in = np.dtype(dtype), dtype
+    if dtype == 'float32':
+        prefix = 's'
+    elif dtype == 'float64':
+        prefix = 'd'
+    elif dtype == 'complex64':
+        prefix = 'c'
+    elif dtype == 'complex128':
+        prefix = 'z'
+    else:
+        raise ValueError('dtype {!r} not supported by cuBLAS'.format(dtype_in))
+
+    return getattr(cupy.cuda.cublas, prefix + name)
+
+
+def _get_scal(arr):
+    """Return a ``scal`` implementation suitable for the input.
+
+    If possible, a cuBLAS implementation is returned, otherwise a fallback.
+
+    In general, cuBLAS requires single or double precision float or complex
+    data type, and operates on linear memory with constant stride.
+    The latter is the case for
+
+    - contiguous arrays,
+    - chunks of arrays along the **slowest-varying axis** (``arr[1:5, ...]``
+      for C-contiguous ``arr``), and
+    - strided slices along the **fastest-varying axis** (``arr[..., ::2]``
+      for C-contiguous ``arr``).
+
+    Note that CuPy's cuBLAS wrapper does not necessarily implement all
+    functions for all four data types. See
+    https://github.com/cupy/cupy/blob/master/cupy/cuda/cublas.pyx
+    for details.
+
+    Parameters
+    ----------
+    arr : cupy.core.core.ndarray
+        Array to which ``scal`` should be applied.
+
+    Returns
+    -------
+    scal : callable
+        Implementation of ``x <- a * x``.
+    """
+    try:
+        incx = _get_flat_inc(arr)
+    except ValueError:
+        return _fallback_scal
+
+    try:
+        _cublas_scal_impl = _cublas_func('scal', arr.dtype)
+    except (ValueError, AttributeError):
+        return _fallback_scal
+
+    def _cublas_scal(a, x):
+        """Implement ``x <- a * x`` with constant ``a``."""
+        return _cublas_scal_impl(
+            x.data.device.cublas_handle, x.size, a, x.data.ptr, incx)
+
+    return _cublas_scal
+
+
+def _get_axpy(arr1, arr2):
+    """Return an ``axpy`` implementation suitable for the inputs.
+
+    If possible, a cuBLAS implementation is returned, otherwise a fallback.
+
+    In general, cuBLAS requires single or double precision float or complex
+    data type, and operates on linear memory with constant stride.
+    The latter is the case for
+
+    - contiguous arrays,
+    - chunks of arrays along the **slowest-varying axis** (``arr[1:5, ...]``
+      for C-contiguous ``arr``), and
+    - strided slices along the **fastest-varying axis** (``arr[..., ::2]``
+      for C-contiguous ``arr``).
+
+    Furthermore, the two arrays must
+
+    - be on the same device,
+    - have the same total size, and
+    - the axis order of both must be the same in the sense that the same
+      index array sorts the strides of both arrays in ascending order.
+
+    Note that CuPy's cuBLAS wrapper does not necessarily implement all
+    functions for all four data types. See
+    https://github.com/cupy/cupy/blob/master/cupy/cuda/cublas.pyx
+    for details.
+
+    Parameters
+    ----------
+    arr1, arr2 : cupy.core.core.ndarray
+        Arrays to which ``axpy`` should be applied.
+
+    Returns
+    -------
+    axpy : callable
+        Implementation of ``y <- a * x + y``.
+    """
+    if arr1.dtype != arr2.dtype or arr1.device != arr2.device:
+        return _fallback_axpy
+
+    try:
+        incx1, incx2 = _get_flat_inc(arr1, arr2)
+    except ValueError:
+        return _fallback_axpy
+
+    try:
+        _cublas_axpy = _cublas_func('axpy', arr1.dtype)
+    except (ValueError, AttributeError):
+        return _fallback_axpy
+
+    def axpy(a, x, y):
+        """Implement ``y <- a * x + y`` with constant ``a``."""
+        return _cublas_axpy(
+            x.data.device.cublas_handle, x.size, a, x.data.ptr, incx1,
+            y.data.ptr, incx2)
+
+    axpy.__name__ = axpy.__qualname__ = '_cublas_axpy'
+    return axpy
+
+
+def _lincomb_impl(a, x1, b, x2, out):
+    """Linear combination implementation, assuming types have been checked.
+
+    This implementation is a highly optimized, considering all special
+    cases of array alignment and special scalar values 0 and 1 separately.
+    """
+    scal1 = _get_scal(x1.data)
+    scal2 = _get_scal(x2.data)
+    axpy = _get_axpy(x1.data, x2.data)
+
+    if a == 0 and b == 0:
+        # out <- 0
+        out.data.fill(0)
+
+    elif a == 0:
+        # Compute out <- b * x2
+        if out is x2:
+            # out <- b * out
+            if b == 1:
+                pass
+            else:
+                scal2(b, out.data)
+        else:
+            # out <- b * x2
+            if b == 1:
+                out.data[:] = x2.data
+            else:
+                cupy.multiply(b, x2.data, out=out.data)
+
+    elif b == 0:
+        # Compute out <- a * x1
+        if out is x1:
+            # out <- a * out
+            if a == 1:
+                pass
+            else:
+                scal1(a, out.data)
+        else:
+            # out <- a * x1
+            if a == 1:
+                out.data[:] = x1.data
+            else:
+                cupy.multiply(a, x1.data, out=out.data)
+
+    else:
+        # Compute out <- a * x1 + b * x2
+        # Optimize a number of alignment options. We know that a and b
+        # are nonzero.
+        if out is x1 and out is x2:
+            # out <-- (a + b) * out
+            if a + b == 0:
+                out.data.fill(0)
+            elif a + b == 1:
+                pass
+            else:
+                scal1(a + b, out.data)
+        elif out is x1 and a == 1:
+            # out <-- out + b * x2
+            axpy(b, x2.data, out.data)
+        elif out is x2 and b == 1:
+            # out <-- a * x1 + out
+            axpy(a, x1.data, out.data)
+        else:
+            # out <-- a * x1 + b * x2
+            # No optimization for other cases of a and b; alignment doesn't
+            # matter anymore.
+            lico(a, x1.data, b, x2.data, out.data)
+
+
+def _python_scalar(arr):
+    """Convert a CuPy array to a Python scalar.
+
+    The shape of the array must be ``()`` or ``(1,)``, otherwise an
+    error is raised.
+    """
+    if arr.dtype.kind == 'f':
+        return float(cupy.asnumpy(arr))
+    elif arr.dtype.kind == 'c':
+        return complex(cupy.asnumpy(arr))
+    elif arr.dtype.kind in ('u', 'i'):
+        return int(cupy.asnumpy(arr))
+    elif arr.dtype.kind == 'b':
+        return bool(cupy.asnumpy(arr))
+    else:
+        raise RuntimeError("no conversion for dtype {}"
+                           "".format(arr.dtype))
+
+
+# --- Space and element classes --- #
+
+
+class CupyTensorSpace(TensorSpace):
+
+    """Tensor space implemented with CUDA arrays using the CuPy library.
+
+    This space implements tensors of arbitrary rank over a `Field` ``F``,
+    which is either the real or complex numbers.
+
+    Its elements are represented as instances of the
+    `CupyTensor` class.
+
+    See https://github.com/cupy/cupy for details on the backend.
+    """
+
+    def __init__(self, shape, dtype='float64', device=None, **kwargs):
+        """Initialize a new instance.
+
+        Parameters
+        ----------
+        shape : positive int or sequence of positive ints
+            Number of entries per axis for elements in this space.
+            A single integer results in a space with rank 1, i.e., 1 axis.
+        dtype :
+            Data type of each element. Can be provided in any
+            way the `numpy.dtype` function understands, e.g.
+            as built-in type or as a string.
+            See `available_dtypes` for the list of supported scalar data types.
+        device : int, optional
+            ID of the GPU device where elements should be created.
+            For ``None``, the default device is chosen, which usually
+            has ID 0.
+        weighting : optional
+            Use weighted inner product, norm, and dist. The following
+            types are supported:
+
+            `Weighting`: Use this weighting as-is.
+            Compatibility with this space's elements is not checked
+            during init.
+
+            float: Weighting by a constant
+
+            array-like: Pointwise weighting by an array of the same
+            `shape` as the space.
+
+            sequence of 1D array-likes: Per-axis (tensor product) weighting
+            using broadcasting multiplication in each axis. ``None``
+            entries cause the corresponding axis to be skipped.
+
+            This option cannot be combined with ``dist``,
+            ``norm`` or ``inner``.
+
+            Default: Each point has weight 1.0 (no weighting)
+
+        exponent : positive float, optional
+            Exponent of the norm. For values other than 2.0, no
+            inner product is defined.
+
+            This option is ignored if ``dist``, ``norm`` or
+            ``inner`` is given.
+
+            Default: 2.0
+
+        Other Parameters
+        ----------------
+        dist : callable, optional
+            The distance function defining a metric on the space.
+            It must accept two `CupyTensor` arguments and
+            fulfill the following mathematical conditions for any
+            three vectors ``x, y, z``:
+
+            - ``dist(x, y) >= 0``
+            - ``dist(x, y) = 0``  if and only if  ``x = y``
+            - ``dist(x, y) = dist(y, x)``
+            - ``dist(x, y) <= dist(x, z) + dist(z, y)``
+
+            This option cannot be combined with ``weight``,
+            ``norm`` or ``inner``.
+
+        norm : callable, optional
+            The norm implementation. It must accept an
+            `CupyTensor` argument, return a float and satisfy the
+            following conditions for all vectors ``x, y`` and scalars
+            ``s``:
+
+            - ``||x|| >= 0``
+            - ``||x|| = 0``  if and only if  ``x = 0``
+            - ``||s * x|| = |s| * ||x||``
+            - ``||x + y|| <= ||x|| + ||y||``
+
+            By default, ``norm(x)`` is calculated as ``inner(x, x)``.
+
+            This option cannot be combined with ``weight``,
+            ``dist`` or ``inner``.
+
+        inner : callable, optional
+            The inner product implementation. It must accept two
+            `CupyTensor` arguments, return a element from
+            the field of the space (real or complex number) and
+            satisfy the following conditions for all vectors
+            ``x, y, z`` and scalars ``s``:
+
+            - ``<x, y> = conj(<y, x>)``
+            - ``<s*x + y, z> = s * <x, z> + <y, z>``
+            - ``<x, x> = 0``  if and only if  ``x = 0``
+
+            This option cannot be combined with ``weight``,
+            ``dist`` or ``norm``.
+
+        kwargs :
+            Further keyword arguments are passed to the weighting
+            classes.
+
+        Examples
+        --------
+        Initialization with the class constructor:
+
+        >>> space = CupyTensorSpace(3, 'float')
+        >>> space
+        rn(3, impl='cupy')
+        >>> space.shape
+        (3,)
+        >>> space.dtype
+        dtype('float64')
+
+        A more convenient way is to use the factory functions with the
+        ``impl='cupy'`` option:
+
+        >>> space = odl.rn(3, impl='cupy', weighting=[1, 2, 3])
+        >>> space
+        rn(3, impl='cupy', weighting=[1, 2, 3])
+        >>> space = odl.tensor_space((2, 3), impl='cupy', dtype=int)
+        >>> space
+        tensor_space((2, 3), 'int', impl='cupy')
+        """
+        super(CupyTensorSpace, self).__init__(shape, dtype)
+        if self.dtype.char not in self.available_dtypes():
+            raise ValueError('`dtype` {!r} not supported'.format(dtype))
+
+        if device is None:
+            self.__device = cupy.cuda.get_device_id()
+        else:
+            self.__device = int(device)
+
+        dist = kwargs.pop('dist', None)
+        norm = kwargs.pop('norm', None)
+        inner = kwargs.pop('inner', None)
+        weighting = kwargs.pop('weighting', None)
+        exponent = kwargs.pop('exponent', None)
+
+        # Check validity of option combination (3 or 4 out of 4 must be None)
+        if sum(x is None for x in (dist, norm, inner, weighting)) < 3:
+            raise ValueError('invalid combination of options `weighting`, '
+                             '`dist`, `norm` and `inner`')
+        if (any(x is not None for x in (dist, norm, inner)) and
+                exponent is not None):
+            raise ValueError('`exponent` cannot be used together with '
+                             '`dist`, `norm` and `inner`')
+
+        # Set the weighting
+        if weighting is not None:
+            if isinstance(weighting, Weighting):
+                if weighting.impl != 'cupy':
+                    raise ValueError("`weighting.impl` must be 'cupy', "
+                                     '`got {!r}'.format(weighting.impl))
+                if exponent is not None and weighting.exponent != exponent:
+                    raise ValueError('`weighting.exponent` conflicts with '
+                                     '`exponent`: {} != {}'
+                                     ''.format(weighting.exponent, exponent))
+                self.__weighting = weighting
+            else:
+                self.__weighting = _weighting(weighting, exponent)
+
+            # Check (afterwards) that the weighting input was sane
+            if isinstance(self.weighting, CupyTensorSpaceArrayWeighting):
+                if not np.can_cast(self.weighting.array.dtype, self.dtype):
+                    raise ValueError(
+                        'cannot cast from `weighting` data type {} to '
+                        'the space `dtype` {}'
+                        ''.format(dtype_str(self.weighting.array.dtype),
+                                  dtype_str(self.dtype)))
+                if self.weighting.array.shape != self.shape:
+                    raise ValueError('array-like weights must have same '
+                                     'shape {} as this space, got {}'
+                                     ''.format(self.shape,
+                                               self.weighting.array.shape))
+
+        elif dist is not None:
+            self.__weighting = CupyTensorSpaceCustomDist(dist)
+        elif norm is not None:
+            self.__weighting = CupyTensorSpaceCustomNorm(norm)
+        elif inner is not None:
+            self.__weighting = CupyTensorSpaceCustomInner(inner)
+        else:  # all None -> no weighing
+            self.__weighting = _weighting(1.0, exponent)
+
+    @property
+    def device(self):
+        """The GPU device ID of this tensor space."""
+        return self.__device
+
+    @property
+    def impl(self):
+        """Implementation back-end of this space: ``'cupy'``."""
+        return 'cupy'
+
+    @property
+    def default_order(self):
+        """Default storage order for new elements in this space: ``'C'``."""
+        return 'C'
+
+    @property
+    def weighting(self):
+        """This space's weighting scheme."""
+        return self.__weighting
+
+    @property
+    def is_weighted(self):
+        """Return ``True`` if the space is not weighted by constant 1.0."""
+        return not (
+            isinstance(self.weighting, CupyTensorSpaceConstWeighting) and
+            self.weighting.const == 1.0)
+
+    @property
+    def exponent(self):
+        """Exponent of the norm and distance."""
+        return self.weighting.exponent
+
+    def element(self, inp=None, order=None):
+        """Create a new element.
+
+        Parameters
+        ----------
+        inp : `array-like`, optional
+            Input used to initialize the new element.
+
+            If ``inp`` is `None`, an empty element is created with no
+            guarantee of its state (memory allocation only).
+            The new element will use ``order`` as storage order if
+            provided, otherwise `default_order`.
+
+            Otherwise, a copy is avoided whenever possible. This requires
+            correct `shape` and `dtype`, and if ``order`` is provided,
+            also contiguousness in that ordering. If any of these
+            conditions is not met, a copy is made.
+
+        order : {None, 'C', 'F'}, optional
+            Storage order of the returned element. For ``'C'`` and ``'F'``,
+            contiguous memory in the respective ordering is enforced.
+            The default ``None`` enforces no contiguousness.
+
+        Returns
+        -------
+        element : `CupyTensor`
+            The new element created (from ``inp``).
+
+        Notes
+        -----
+        This method preserves "array views" of correct size and type,
+        see the examples below.
+
+        Examples
+        --------
+        >>> space = odl.rn((2, 3), impl='cupy')
+
+        Create an empty element:
+
+        >>> empty = space.element()
+        >>> empty.shape
+        (2, 3)
+
+        Initialization during creation:
+
+        >>> x = space.element([[1, 2, 3],
+        ...                    [4, 5, 6]])
+        >>> x
+        rn((2, 3), impl='cupy').element(
+            [[ 1.,  2.,  3.],
+             [ 4.,  5.,  6.]]
+        )
+        """
+        if order is not None:
+            # Need to keep `order=None` intact
+            order, order_in = str(order).upper(), order
+            if order not in ('C', 'F'):
+                raise ValueError('`order` {!r} not understood'
+                                 ''.format(order_in))
+
+        with cupy.cuda.Device(self.device):
+            if inp is None:
+                if order is None:
+                    # TODO: remove when fix is available
+                    # `order=None` is not understood by cupy 2.1.0, for new
+                    # elements it means to use default order. See
+                    # https://github.com/cupy/cupy/issues/590
+                    # (fixed on master)
+                    order = self.default_order
+                arr = cupy.empty(self.shape, dtype=self.dtype, order=order)
+
+            else:
+                if inp in self and order is None:
+                    # Short-circuit for space elements and no enforced ordering
+                    return inp
+
+                if hasattr(inp, 'shape') and inp.shape != self.shape:
+                    raise ValueError('`inp` must have shape {}, got shape {}'
+                                     ''.format(self.shape, inp.shape))
+
+                if isinstance(inp, cupy.ndarray):
+                    # Copies to current device if necessary
+                    # TODO: remove first case when `order=None` fix is
+                    # available (see above)
+                    if order is None:
+                        arr = inp.astype(self.dtype, copy=False)
+                    else:
+                        arr = inp.astype(self.dtype, order=order, copy=False)
+                else:
+                    # TODO: remove first case when `order=None` fix is
+                    # available (see above)
+                    if order is None:
+                        arr = cupy.array(inp, copy=False, dtype=self.dtype,
+                                         ndmin=self.ndim)
+                    else:
+                        arr = cupy.array(inp, copy=False, dtype=self.dtype,
+                                         ndmin=self.ndim, order=order)
+
+                # If the result has a 0 stride, make a copy since it would
+                # produce all kinds of nasty problems. This happens for e.g.
+                # results of `broadcast_to()`.
+                if 0 in arr.strides:
+                    arr = arr.copy()
+
+                # Make sure the shape is ok
+                if arr.shape != self.shape:
+                    raise ValueError('`inp` must have shape {}, got shape {}'
+                                     ''.format(self.shape, arr.shape))
+
+            return self.element_type(self, arr)
+
+    def zero(self):
+        """Create a tensor filled with zeros.
+
+        Examples
+        --------
+        >>> space = odl.rn(3, impl='cupy')
+        >>> x = space.zero()
+        >>> x
+        rn(3, impl='cupy').element([ 0.,  0.,  0.])
+        """
+        with cupy.cuda.Device(self.device):
+            arr = cupy.zeros(self.shape, dtype=self.dtype)
+        return self.element(arr)
+
+    def one(self):
+        """Create a tensor filled with ones.
+
+        Examples
+        --------
+        >>> space = odl.rn(3, impl='cupy')
+        >>> x = space.one()
+        >>> x
+        rn(3, impl='cupy').element([ 1.,  1.,  1.])
+        """
+        with cupy.cuda.Device(self.device):
+            arr = cupy.ones(self.shape, dtype=self.dtype)
+        return self.element(arr)
+
+    def __eq__(self, other):
+        """Return ``self == other``.
+
+        Returns
+        -------
+        equals : bool
+            ``True`` if ``other`` is an instance of this space's type
+            with the same `shape`,  `dtype`, `device` and
+            `weighting`, ``False`` otherwise.
+
+        Examples
+        --------
+        >>> space = odl.rn(2, impl='cupy')
+        >>> same_space = odl.rn(2, exponent=2, impl='cupy')
+        >>> same_space == space
+        True
+
+        Different `shape`, `exponent`, `dtype` or `impl`
+        all result in different spaces:
+
+        >>> diff_space = odl.rn((2, 3), impl='cupy')
+        >>> diff_space == space
+        False
+        >>> diff_space = odl.rn(2, exponent=1, impl='cupy')
+        >>> diff_space == space
+        False
+        >>> diff_space = odl.rn(2, dtype='float32', impl='cupy')
+        >>> diff_space == space
+        False
+        >>> diff_space = odl.rn(2, impl='numpy')
+        >>> diff_space == space
+        False
+        >>> space == object
+        False
+
+        A `CupyTensorSpace` with the same properties is considered
+        equal:
+
+        >>> same_space = odl.CupyTensorSpace(2, dtype='float64')
+        >>> same_space == space
+        True
+        """
+        return (super(CupyTensorSpace, self).__eq__(other) and
+                self.device == other.device and
+                self.weighting == other.weighting)
+
+    def __hash__(self):
+        """Return ``hash(self)``."""
+        return hash((super(CupyTensorSpace, self).__hash__(),
+                     self.device,
+                     self.weighting))
+
+    def _lincomb(self, a, x1, b, x2, out):
+        """Linear combination of ``x1`` and ``x2``.
+
+        Calculate ``out = a*x1 + b*x2`` using optimized BLAS
+        routines if possible.
+
+        Parameters
+        ----------
+        a, b : `TensorSpace.field` elements
+            Scalars to multiply ``x1`` and ``x2`` with.
+        x1, x2 : `CupyTensor`
+            Summands in the linear combination.
+        out : `CupyTensor`
+            Tensor to which the result is written.
+
+        Returns
+        -------
+        None
+
+        Examples
+        --------
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([4, 5, 6])
+        >>> out = r3.element()
+        >>> result = r3.lincomb(2, x, -1, y, out)
+        >>> result
+        rn(3, impl='cupy').element([-2., -1.,  0.])
+        >>> result is out
+        True
+        """
+        _lincomb_impl(a, x1, b, x2, out)
+
+    def _dist(self, x1, x2):
+        """Calculate the distance between two tensors.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Tensors whose mutual distance is calculated.
+
+        Returns
+        -------
+        dist : float
+            Distance between the tensors.
+
+        Examples
+        --------
+        The default case is the Euclidean distance:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([4, 2, -1])
+        >>> r3.dist(x, y)  # 3^2 + 4^2 = 25
+        5.0
+
+        Taking a different exponent or a weighting is also possible
+        during space creation:
+
+        >>> r3 = odl.rn(3, impl='cupy', exponent=1)
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([4, 2, -1])
+        >>> r3.dist(x, y)  # 3 + 4 = 7
+        7.0
+
+        >>> r3 = odl.rn(3, impl='cupy', weighting=2, exponent=1)
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([4, 2, -1])
+        >>> r3.dist(x, y)  # 2*3 + 2*4 = 14
+        14.0
+        """
+        return self.weighting.dist(x1, x2)
+
+    def _norm(self, x):
+        """Calculate the norm of a tensor.
+
+        Parameters
+        ----------
+        x : `CupyTensor`
+            The tensor whose norm is calculated.
+
+        Returns
+        -------
+        norm : float
+            Norm of the tensor.
+
+        Examples
+        --------
+        The default case is the Euclidean norm:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([3, 4, 0])
+        >>> r3.norm(x)  # 3^2 + 4^2 = 25
+        5.0
+
+        Taking a different exponent or a weighting is also possible
+        during space creation:
+
+        >>> r3 = odl.rn(3, impl='cupy', exponent=1)
+        >>> x = r3.element([3, 4, 0])
+        >>> r3.norm(x)  # 3 + 4 = 7
+        7.0
+
+        >>> r3 = odl.rn(3, impl='cupy', weighting=2, exponent=1)
+        >>> x = r3.element([3, 4, 0])
+        >>> r3.norm(x)  # 2*3 + 2*4 = 14
+        14.0
+        """
+        return self.weighting.norm(x)
+
+    def _inner(self, x1, x2):
+        """Raw inner product of two tensors.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            The tensors whose inner product is calculated.
+
+        Returns
+        -------
+        inner : `field` element
+            Inner product of the tensors.
+
+        Examples
+        --------
+        The default case is the dot product:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([-1, 0, 1])
+        >>> r3.inner(x, y)  # 1*(-1) + 2*0 + 3*1 = 2
+        2.0
+
+        Taking a different weighting is also possible during space
+        creation:
+
+        >>> r3 = odl.rn(3, impl='cupy', weighting=2)
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([-1, 0, 1])
+        >>> r3.inner(x, y)  # 2 * 1*(-1) + 2 * 2*0 + 2 * 3*1 = 4
+        4.0
+        """
+        return self.weighting.inner(x1, x2)
+
+    def _multiply(self, x1, x2, out):
+        """Entry-wise product of two tensors, assigned to out.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Factors in the product.
+        out : `CupyTensor`
+            Tensor to which the result is written.
+
+        Examples
+        --------
+        Out-of-place evaluation:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([-1, 0, 1])
+        >>> r3.multiply(x, y)
+        rn(3, impl='cupy').element([-1.,  0.,  3.])
+
+        In-place:
+
+        >>> out = r3.element()
+        >>> result = r3.multiply(x, y, out=out)
+        >>> result
+        rn(3, impl='cupy').element([-1.,  0.,  3.])
+        >>> result is out
+        True
+        """
+        x1.ufuncs.multiply(x2, out=out)
+
+    def _divide(self, x1, x2, out):
+        """Entry-wise division of two tensors, assigned to out.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Dividend and divisor in the quotient.
+        out : `CupyTensor`
+            Tensor to which the result is written.
+
+        Examples
+        --------
+        Out-of-place evaluation:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([-1, 2, 1])
+        >>> r3.divide(x, y)
+        rn(3, impl='cupy').element([-1.,  1.,  3.])
+
+        In-place:
+
+        >>> out = r3.element()
+        >>> result = r3.divide(x, y, out=out)
+        >>> result
+        rn(3, impl='cupy').element([-1.,  1.,  3.])
+        >>> result is out
+        True
+        """
+        x1.ufuncs.divide(x2, out=out)
+
+    def __repr__(self):
+        """Return ``repr(self)``."""
+        if self.ndim == 1:
+            posargs = [self.size]
+        else:
+            posargs = [self.shape]
+
+        if self.is_real:
+            constructor_name = 'rn'
+        elif self.is_complex:
+            constructor_name = 'cn'
+        else:
+            constructor_name = 'tensor_space'
+
+        if (constructor_name == 'tensor_space' or
+                (not self.is_real and not self.is_complex) or
+                self.dtype != self.default_dtype(self.field)):
+            posargs.append(dtype_str(self.dtype))
+
+        optargs = [('impl', self.impl, 'numpy'),  # for the helper functions
+                   ('device', self.device, cupy.cuda.get_device_id())]
+        inner_str = signature_string(posargs, optargs)
+        weight_str = self.weighting.repr_part
+        if weight_str:
+            inner_str += ', ' + weight_str
+
+        return '{}({})'.format(constructor_name, inner_str)
+
+    @property
+    def element_type(self):
+        """`CupyTensor`"""
+        return CupyTensor
+
+    @staticmethod
+    def available_dtypes():
+        """Return the data types available for this space."""
+        dtypes = (np.sctypes['float'] +
+                  np.sctypes['complex'] +
+                  np.sctypes['int'] +
+                  np.sctypes['uint'] +
+                  [bool])
+
+        # Convert to dtypes and remove duplicates
+        dtypes = tuple(set(np.dtype(dtype) for dtype in dtypes))
+
+        # Remove float128 and complex256 if they exist
+        if hasattr(np, 'float128'):
+            dtypes.remove(np.float128)
+        if hasattr(np, 'complex256'):
+            dtypes.remove(np.complex256)
+
+        return dtypes
+
+    @staticmethod
+    def default_dtype(field=None):
+        """Return the default data type of this space type for a given field.
+
+        Parameters
+        ----------
+        field : `Field`, optional
+            Set of numbers to be represented by a data type.
+            Currently supported : `RealNumbers`, `ComplexNumbers`.
+            Default: `RealNumbers`
+
+        Returns
+        -------
+        dtype : `numpy.dtype`
+            Numpy data type specifier. The returned defaults are:
+
+            - ``RealNumbers()`` or ``None`` : ``np.dtype('float64')``
+            - ``ComplexNumbers()`` : ``np.dtype('complex128')``
+
+            These choices correspond to the defaults of the CuPy library.
+        """
+        if field is None or field == RealNumbers():
+            return np.dtype('float64')
+        elif field == ComplexNumbers():
+            return np.dtype('complex128')
+        else:
+            raise ValueError('no default data type defined for field {}.'
+                             ''.format(field))
+
+
+class CupyTensor(Tensor):
+
+    """Representation of an `CupyTensorSpace` element."""
+
+    def __init__(self, space, data):
+        """Initialize a new instance."""
+        super(CupyTensor, self).__init__(space)
+        self.__data = data
+
+    @property
+    def data(self):
+        """Raw `cupy.core.core.ndarray` representing the data."""
+        return self.__data
+
+    @property
+    def ndim(self):
+        """Number of axes (=dimensions) of this tensor."""
+        return self.space.ndim
+
+    @property
+    def device(self):
+        """The GPU device on which this tensor lies."""
+        return self.space.device
+
+    def asarray(self, out=None, impl='numpy'):
+        """Extract the data of this element as an ndarray.
+
+        This method is invoked when calling `numpy.asarray` on this tensor.
+
+        Parameters
+        ----------
+        out : ndarray, optional
+            Array into which the result should be written. Must be contiguous
+            and of the correct dtype.
+        impl : str, optional
+            Array backend for the output, used when ``out`` is not given.
+
+        Returns
+        -------
+        asarray : ndarray
+            Array of the same `dtype` and `shape` this tensor.
+            If ``out`` was given, the returned object is a reference to it.
+
+        Examples
+        --------
+        By default, a new array is created:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> x.asarray()
+        array([ 1.,  2.,  3.])
+        >>> int_spc = odl.tensor_space(3, impl='cupy', dtype=int)
+        >>> x = int_spc.element([1, 2, 3])
+        >>> x.asarray()
+        array([1, 2, 3])
+        >>> tensors = odl.rn((2, 3), impl='cupy', dtype='float32')
+        >>> x = tensors.element([[1, 2, 3],
+        ...                      [4, 5, 6]])
+        >>> x.asarray()
+        array([[ 1.,  2.,  3.],
+               [ 4.,  5.,  6.]], dtype=float32)
+
+        Using the out parameter, the array can be filled in-place:
+
+        >>> out = np.empty((2, 3), dtype='float32')
+        >>> result = x.asarray(out=out)
+        >>> out
+        array([[ 1.,  2.,  3.],
+               [ 4.,  5.,  6.]], dtype=float32)
+        >>> result is out
+        True
+        """
+        impl, impl_in = str(impl).lower(), impl
+        if out is None:
+            if impl == 'numpy':
+                return cupy.asnumpy(self.data)
+            elif impl == 'cupy':
+                return self.data
+            else:
+                raise ValueError('`impl` {!r} not understood'.format(impl_in))
+        else:
+            if not (out.flags.c_contiguous or out.flags.f_contiguous):
+                raise ValueError('`out` must be contiguous')
+            if out.shape != self.shape:
+                raise ValueError('`out` must have shape {}, got shape {}'
+                                 ''.format(self.shape, out.shape))
+            if out.dtype != self.dtype:
+                raise ValueError('`out` must have dtype {}, got dtype {}'
+                                 ''.format(self.dtype, out.dtype))
+
+            if isinstance(out, np.ndarray):
+                if (self.data.flags.c_contiguous or
+                        self.data.flags.f_contiguous):
+                    # Use efficient copy for contiguous memory
+                    self.data.data.copy_to_host(
+                        out.ctypes.data_as(np.ctypeslib.ctypes.c_void_p),
+                        self.size * self.itemsize)
+                else:
+                    out[:] = cupy.asnumpy(self.data)
+            else:
+                out[:] = self.data
+
+            return out
+
+    @property
+    def data_ptr(self):
+        """Memory address of the data container as 64-bit integer.
+
+        Returns
+        -------
+        data_ptr : int
+            The data pointer is technically of type ``uintptr_t`` and gives the
+            index in bytes of the first element of the data storage.
+
+        Examples
+        --------
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.one()
+        >>> x.data_ptr  # doctest: +SKIP
+        47259975936
+        """
+        return self.data.ptr
+
+    def __eq__(self, other):
+        """Return ``self == other``.
+
+        Parameters
+        ----------
+        other :
+            Object to be compared with ``self``.
+
+        Returns
+        -------
+        equals : bool
+            ``True`` if all entries of ``other`` are equal to this
+            tensor's entries, ``False`` otherwise.
+
+        Examples
+        --------
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> same_x = r3.element([1, 2, 3])
+        >>> y = r3.element([-1, -2, -3])
+        >>> x == same_x
+        True
+        >>> x == y
+        False
+
+        Space membership matters:
+
+        >>> int_spc = odl.tensor_space(3, impl='cupy', dtype=int)
+        >>> x_int = int_spc.element([1, 2, 3])
+        >>> x == x_int
+        False
+        """
+        if other is self:
+            return True
+        elif other not in self.space:
+            return False
+        else:
+            out = cupy.empty((), dtype=bool)
+            return bool(all_equal(self.data, other.data, out))
+
+    def copy(self):
+        """Create an identical (deep) copy of this tensor.
+
+        Returns
+        -------
+        copy : `CupyTensor`
+            A deep copy.
+
+        Examples
+        --------
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = x.copy()
+        >>> y
+        rn(3, impl='cupy').element([ 1.,  2.,  3.])
+        >>> x == y
+        True
+        >>> x is y
+        False
+        """
+        return self.space.element(self.data.copy())
+
+    def __getitem__(self, indices):
+        """Access values of this tensor.
+
+        Parameters
+        ----------
+        indices : index expression
+            The position(s) that should be accessed.
+
+        Returns
+        -------
+        values : scalar or `cupy.core.core.ndarray`
+            The value(s) at the index (indices).
+
+        Examples
+        --------
+        Indexing rules follow roughly the Numpy style, as far (or "fancy")
+        as supported:
+
+        >>> r5 = odl.rn(5, impl='cupy')
+        >>> x = r5.element([1, 2, 3, 4, 5])
+        >>> x[1:4]
+        rn(3, impl='cupy').element([ 2.,  3.,  4.])
+        >>> x[::2]
+        rn(3, impl='cupy').element([ 1.,  3.,  5.])
+
+        If integers and slices are used, the returned "views" are writable,
+        hence modificatons alter the original array:
+
+        >>> view = x[1:4]
+        >>> view[:] = -1
+        >>> view
+        rn(3, impl='cupy').element([-1., -1., -1.])
+        >>> x
+        rn(5, impl='cupy').element([ 1., -1., -1., -1.,  5.])
+
+        For lists, index arrays or boolean arrays, a copy is made, i.e.,
+        changes do not affect the original array:
+
+        >>> idcs = [0, 3, 2, 1, 2]
+        >>> x = r5.element([1, 2, 3, 4, 5])
+        >>> x_part = x[[0, 3, 2, 1, 2]]
+        >>> x_part
+        rn(5, impl='cupy').element([ 1.,  4.,  3.,  2.,  3.])
+        >>> x_part[:] = 0
+        >>> x
+        rn(5, impl='cupy').element([ 1.,  2.,  3.,  4.,  5.])
+
+        Multi-indexing is also directly supported:
+
+        >>> tensors = odl.rn((2, 3), impl='cupy')
+        >>> x = tensors.element([[1, 2, 3],
+        ...                      [4, 5, 6]])
+        >>> x[1, 2]
+        6.0
+        >>> x[1]  # row with index 1
+        rn(3, impl='cupy').element([ 4.,  5.,  6.])
+        >>> view = x[:, ::2]
+        >>> view
+        rn((2, 2), impl='cupy').element(
+            [[ 1.,  3.],
+             [ 4.,  6.]]
+        )
+        >>> view[:] = [[0, 0],
+        ...            [0, 0]]
+        >>> x
+        rn((2, 3), impl='cupy').element(
+            [[ 0.,  2.,  0.],
+             [ 0.,  5.,  0.]]
+        )
+        """
+        if isinstance(indices, CupyTensor):
+            indices = indices.data
+
+        arr = self.data[indices]
+        if arr.shape == ():
+            return _python_scalar(arr)
+        else:
+            if is_numeric_dtype(self.dtype):
+                weighting = self.space.weighting
+            else:
+                weighting = None
+
+            kwargs = {}
+            if isinstance(weighting, CupyTensorSpaceArrayWeighting):
+                weighting = weighting.array[indices]
+            elif isinstance(weighting, CupyTensorSpaceConstWeighting):
+                # Axes were removed, cannot infer new constant
+                if arr.ndim != self.ndim:
+                    weighting = None
+                    kwargs['exponent'] = self.space.exponent
+
+            space = type(self.space)(arr.shape, dtype=self.dtype,
+                                     device=self.device, weighting=weighting,
+                                     **kwargs)
+            return space.element(arr)
+
+    def __setitem__(self, indices, values):
+        """Set values of this tensor.
+
+        Parameters
+        ----------
+        indices : index expression
+            The position(s) that should be accessed.
+        values : scalar or `array-like`
+            The value(s) that are to be assigned.
+
+            If ``indices`` is an int (1D) or a sequence of ints,
+            ``value`` must be scalar.
+
+            Otherwise, ``value`` must be broadcastable to the shape of
+            the sliced view according to the Numpy broadcasting rules.
+
+        Examples
+        --------
+        In 1D, Values can be set with scalars or arrays that match the
+        shape of the slice:
+
+        >>> r5 = odl.rn(5, impl='cupy')
+        >>> x = r5.element([1, 2, 3, 4, 5])
+        >>> x[1:4] = 0
+        >>> x
+        rn(5, impl='cupy').element([ 1.,  0.,  0.,  0.,  5.])
+        >>> x[1:4] = [-1, 1, -1]
+        >>> x
+        rn(5, impl='cupy').element([ 1., -1.,  1., -1.,  5.])
+        >>> y = r5.element([5, 5, 5, 8, 8])
+        >>> x[:] = y
+        >>> x
+        rn(5, impl='cupy').element([ 5.,  5.,  5.,  8.,  8.])
+
+        In higher dimensions, broadcasting can be applied to assign
+        values:
+
+        >>> tensors = odl.rn((2, 3), impl='cupy')
+        >>> x = tensors.element([[1, 2, 3],
+        ...                      [4, 5, 6]])
+        >>> x[:] = [[6], [3]]  # rhs mimics (2, 1) shape
+        >>> x
+        rn((2, 3), impl='cupy').element(
+            [[ 6.,  6.,  6.],
+             [ 3.,  3.,  3.]]
+        )
+
+        Be aware of unsafe casts and over-/underflows, there
+        will be warnings at maximum.
+
+        >>> int_r3 = odl.tensor_space(3, impl='cupy', dtype='uint32')
+        >>> x = int_r3.element([1, 2, 3])
+        >>> x[0] = -1
+        >>> x[0]
+        4294967295
+        """
+        if isinstance(indices, CupyTensor):
+            indices = indices.data
+
+        if isinstance(values, CupyTensor):
+            self.data[indices] = values.data
+        elif isinstance(values, cupy.ndarray) or np.isscalar(values):
+            self.data[indices] = values
+        else:
+            values = cupy.array(values, dtype=self.dtype, copy=False)
+            self.data[indices] = values
+
+    def __int__(self):
+        """Return ``int(self)``.
+
+        Returns
+        -------
+        int : int
+            Integer representing this tensor.
+
+        Raises
+        ------
+        TypeError
+            If the tensor is of `size` != 1.
+        """
+        if self.size != 1:
+            raise TypeError('only size 1 tensors can be converted to int')
+        return int(self[(0,) * self.ndim])
+
+    def __long__(self):
+        """Return ``long(self)``.
+
+        The `long` method is only available in Python 2.
+
+        Returns
+        -------
+        long : `long`
+            Integer representing this tensor.
+
+        Raises
+        ------
+        TypeError
+            If the tensor is of `size` != 1.
+        """
+        if self.size != 1:
+            raise TypeError('only size 1 tensors can be converted to long')
+        return long(self[(0,) * self.ndim])
+
+    def __float__(self):
+        """Return ``float(self)``.
+
+        Returns
+        -------
+        float : float
+            Floating point number representing this tensor.
+
+        Raises
+        ------
+        TypeError
+            If the tensor is of `size` != 1.
+        """
+        if self.size != 1:
+            raise TypeError('only size 1 tensors can be converted to float')
+        return float(self[(0,) * self.ndim])
+
+    def __complex__(self):
+        """Return ``complex(self)``.
+
+        Returns
+        -------
+        complex : `complex`
+            Complex floating point number representing this tensor.
+
+        Raises
+        ------
+        TypeError
+            If the tensor is of `size` != 1.
+        """
+        if self.size != 1:
+            raise TypeError('only size 1 tensors can be converted to complex')
+        return complex(self[(0,) * self.ndim])
+
+    def __str__(self):
+        """Return ``str(self)``."""
+        return str(self.data)
+
+    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+        """Interface to Numpy's ufunc machinery.
+
+        This method is called by Numpy version 1.13 and higher as a single
+        point for the ufunc dispatch logic. An object implementing
+        ``__array_ufunc__`` takes over control when a `numpy.ufunc` is
+        called on it, allowing it to use custom implementations and
+        output types.
+
+        This includes handling of in-place arithmetic like
+        ``npy_array += custom_obj``. In this case, the custom object's
+        ``__array_ufunc__`` takes precedence over the baseline
+        `numpy.ndarray` implementation. It will be called with
+        ``npy_array`` as ``out`` argument, which ensures that the
+        returned object is a Numpy array. For this to work properly,
+        ``__array_ufunc__`` has to accept Numpy arrays as ``out`` arguments.
+        The same holds analogously for GPU arrays.
+
+        See the `corresponding NEP`_ and the `interface documentation`_
+        for further details. See also the `general documentation on
+        Numpy ufuncs`_.
+
+        .. warning::
+            Apart from ``'__call__'`` (invoked by, e.g., ``np.add(x, y))``,
+            CuPy has no native implementation of ufunc methods like
+            ``'reduce'`` or ``'accumulate'``. We manually implement the
+            mappings (covering most use cases)
+
+            - ``np.add.reduce`` -> ``cupy.sum``
+            - ``np.add.accumulate`` -> ``cupy.cumsum``
+            - ``np.multiply.reduce`` -> ``cupy.prod``
+            - ``np.multiply.accumulate`` -> ``cupy.cumprod``.
+
+            **All other such methods will run Numpy code and be slow**!
+
+            Please consult the `CuPy documentation on ufuncs
+            <https://docs-cupy.chainer.org/en/stable/reference/ufunc.html>`_
+            to check the current state of the library.
+
+        .. note::
+            When an ``out`` parameter is specified, and (one of) it has
+            type `numpy.ndarray`, the inputs are converted to Numpy
+            arrays, and the Numpy ufunc is invoked.
+
+        .. note::
+            When using operations that alter the shape (like ``reduce``),
+            or the data type (can be any of the methods),
+            the resulting array is wrapped in a space of the same
+            type as ``self.space``, propagating space properties like
+            `exponent` or `weighting` as closely as possible.
+
+        Parameters
+        ----------
+        ufunc : `numpy.ufunc`
+            Ufunc that should be called on ``self``.
+        method : str
+            Method on ``ufunc`` that should be called on ``self``.
+            Possible values:
+
+            ``'__call__'``, ``'accumulate'``, ``'at'``, ``'outer'``,
+            ``'reduce'``, ``'reduceat'``
+
+        input1, ..., inputN :
+            Positional arguments to ``ufunc.method``.
+        force_native : bool, optional
+            If ``True``, raise a ``ValueError`` if there is no native CuPy
+            function available for the given ``ufunc``, ``method`` and inputs.
+            Default : ``False``
+        kwargs :
+            Further keyword arguments passed to ``ufunc.method``.
+
+        Returns
+        -------
+        ufunc_result : `CupyTensor`, `numpy.ndarray` or tuple
+            Result of the ufunc evaluation. If no ``out`` keyword argument
+            was given, the result is a `Tensor` or a tuple
+            of such, depending on the number of outputs of ``ufunc``.
+            If ``out`` was provided, the returned object or tuple entries
+            refer(s) to ``out``.
+
+        Examples
+        --------
+        We apply `numpy.add` to ODL tensors:
+
+        >>> r3 = odl.rn(3, impl='cupy')
+        >>> x = r3.element([1, 2, 3])
+        >>> y = r3.element([-1, -2, -3])
+        >>> x.__array_ufunc__(np.add, '__call__', x, y)
+        rn(3, impl='cupy').element([ 0.,  0.,  0.])
+        >>> np.add(x, y)  # same mechanism for Numpy >= 1.13
+        rn(3, impl='cupy').element([ 0.,  0.,  0.])
+
+        As ``out``, a Numpy array or an ODL tensor can be given (wrapped
+        in a sequence):
+
+        >>> out = r3.element()
+        >>> res = x.__array_ufunc__(np.add, '__call__', x, y, out=(out,))
+        >>> out
+        rn(3, impl='cupy').element([ 0.,  0.,  0.])
+        >>> res is out
+        True
+        >>> out_arr = np.empty(3)
+        >>> res = x.__array_ufunc__(np.add, '__call__', x, y, out=(out_arr,))
+        >>> out_arr
+        array([ 0.,  0.,  0.])
+        >>> res is out_arr
+        True
+
+        With multiple dimensions:
+
+        >>> r23 = odl.rn((2, 3), impl='cupy')
+        >>> x = y = r23.one()
+        >>> x.__array_ufunc__(np.add, '__call__', x, y)
+        rn((2, 3), impl='cupy').element(
+            [[ 2.,  2.,  2.],
+             [ 2.,  2.,  2.]]
+        )
+
+        The ``ufunc.accumulate`` method retains the original `shape` and
+        `dtype`. The latter can be changed with the ``dtype`` parameter:
+
+        >>> x = r3.element([1, 2, 3])
+        >>> x.__array_ufunc__(np.add, 'accumulate', x)
+        rn(3, impl='cupy').element([ 1.,  3.,  6.])
+        >>> np.add.accumulate(x)  # same mechanism for Numpy >= 1.13
+        rn(3, impl='cupy').element([ 1.,  3.,  6.])
+
+        For multi-dimensional tensors, an optional ``axis`` parameter
+        can be provided:
+
+        >>> z = r23.one()
+        >>> z.__array_ufunc__(np.add, 'accumulate', z, axis=1)
+        rn((2, 3), impl='cupy').element(
+            [[ 1.,  2.,  3.],
+             [ 1.,  2.,  3.]]
+        )
+
+        The ``ufunc.at`` method operates in-place. Here we add the second
+        operand ``[5, 10]`` to ``x`` at indices ``[0, 2]``:
+
+        >>> x = r3.element([1, 2, 3])
+        >>> x.__array_ufunc__(np.add, 'at', x, [0, 2], [5, 10])
+        >>> x
+        rn(3, impl='cupy').element([  6.,   2.,  13.])
+
+        For outer-product-type operations, i.e., operations where the result
+        shape is the sum of the individual shapes, the ``ufunc.outer``
+        method can be used:
+
+        >>> x = odl.rn(2, impl='cupy').element([0, 3])
+        >>> y = odl.rn(3, impl='cupy').element([1, 2, 3])
+        >>> x.__array_ufunc__(np.add, 'outer', x, y)
+        rn((2, 3), impl='cupy').element(
+            [[ 1.,  2.,  3.],
+             [ 4.,  5.,  6.]]
+        )
+        >>> y.__array_ufunc__(np.add, 'outer', y, x)
+        rn((3, 2), impl='cupy').element(
+            [[ 1.,  4.],
+             [ 2.,  5.],
+             [ 3.,  6.]]
+        )
+
+        Using ``ufunc.reduce`` produces a scalar, which can be avoided with
+        ``keepdims=True``:
+
+        >>> x = r3.element([1, 2, 3])
+        >>> x.__array_ufunc__(np.add, 'reduce', x)
+        6.0
+        >>> x.__array_ufunc__(np.add, 'reduce', x, keepdims=True)
+        rn(1, impl='cupy').element([ 6.])
+
+        In multiple dimensions, ``axis`` can be provided for reduction over
+        selected axes:
+
+        >>> z = r23.element([[1, 2, 3],
+        ...                  [4, 5, 6]])
+        >>> z.__array_ufunc__(np.add, 'reduce', z, axis=1)
+        rn(2, impl='cupy').element([  6.,  15.])
+
+        Finally, ``add.reduceat`` is a combination of ``reduce`` and
+        ``at`` with rather flexible and complex semantics (see the
+        `reduceat documentation`_ for details):
+
+        >>> x = r3.element([1, 2, 3])
+        >>> x.__array_ufunc__(np.add, 'reduceat', x, [0, 1])
+        rn(2, impl='cupy').element([ 1.,  5.])
+
+        References
+        ----------
+        .. _corresponding NEP:
+           https://github.com/numpy/numpy/blob/master/doc/neps/\
+ufunc-overrides.rst
+
+        .. _interface documentation:
+           https://github.com/charris/numpy/blob/master/doc/source/reference/\
+arrays.classes.rst#special-attributes-and-methods
+
+        .. _general documentation on Numpy ufuncs:
+           https://docs.scipy.org/doc/numpy/reference/ufuncs.html
+
+        .. _reduceat documentation:
+           https://docs.scipy.org/doc/numpy/reference/generated/\
+numpy.ufunc.reduceat.html
+        """
+        # --- Process `out` --- #
+
+        # Unwrap out if provided. The output parameters are all wrapped
+        # in one tuple, even if there is only one.
+        out_tuple = kwargs.pop('out', ())
+
+        # Check number of `out` args, depending on `method`
+        if method == '__call__' and len(out_tuple) not in (0, ufunc.nout):
+            raise ValueError(
+                "need 0 or {} `out` arguments for `method='__call__'`, "
+                'got {}'.format(ufunc.nout, len(out_tuple)))
+        elif method != '__call__' and len(out_tuple) not in (0, 1):
+            raise ValueError(
+                "need 0 or 1 `out` arguments for `method={!r}`, "
+                'got {}'.format(method, len(out_tuple)))
+
+        # Catch wrong number of inputs
+        if method == '__call__' and len(inputs) != ufunc.nin:
+            raise ValueError(
+                "need {} inputs for `method='__call__'`, got {}"
+                ''.format(ufunc.nin, len(inputs)))
+
+        # We allow our own tensors, the data container type and
+        # `numpy.ndarray` objects as `out` (see docs for reason for the
+        # latter)
+        valid_types = (type(self), type(self.data), np.ndarray)
+        if not all(isinstance(o, valid_types) or o is None
+                   for o in out_tuple):
+            return NotImplemented
+
+        # Assign to `out` or `out1` and `out2`, respectively, unwrapping the
+        # data container
+        out = out1 = out2 = None
+        if len(out_tuple) == 1:
+            if isinstance(out_tuple[0], type(self)):
+                out = out_tuple[0].data
+            else:
+                out = out_tuple[0]
+        elif len(out_tuple) == 2:
+            if isinstance(out_tuple[0], type(self)):
+                out1 = out_tuple[0].data
+            else:
+                out1 = out_tuple[0]
+            if isinstance(out_tuple[1], type(self)):
+                out2 = out_tuple[1].data
+            else:
+                out2 = out_tuple[1]
+
+        # --- Process `inputs` --- #
+
+        # Pull out the data container of the inputs if necessary
+        inputs = [inp.data if isinstance(inp, type(self)) else inp
+                  for inp in inputs]
+
+        # Determine native ufunc vs. Numpy ufunc
+        if any(isinstance(o, np.ndarray) for o in out_tuple):
+            native_ufunc = None
+            use_native = False
+        elif not all(isinstance(inp, type(self.data)) or np.isscalar(inp)
+                     for inp in inputs):
+            native_ufunc = None
+            use_native = False
+        else:
+            native_ufunc = getattr(cupy, ufunc.__name__, None)
+
+            # Manually implemented cases
+            if (
+                    (ufunc in (np.add, np.multiply) and
+                     method in ('reduce', 'accumulate')) or
+                    (ufunc in (np.minimum, np.maximum) and
+                     method == 'reduce') or
+                    (ufunc == np.add and method == 'at')
+                    ):
+                use_native = native_ufunc is not None  # should be True
+            else:
+                use_native = (native_ufunc is not None and
+                              hasattr(native_ufunc, method))
+
+        force_native = kwargs.pop('force_native', False)
+        if force_native and not use_native:
+            raise ValueError(
+                'no native function available to evaluate `{}.{}` with '
+                'inputs {!r}, kwargs {!r} and `out` {!r}'
+                ''.format(ufunc.__name__, method, inputs, kwargs, out_tuple))
+
+        if use_native:
+            # TODO: remove when upstream issue is fixed
+            # For native ufuncs, we turn non-scalar inputs into cupy arrays
+            # since cupy ufuncs do not accept array-like input. See
+            # https://github.com/cupy/cupy/issues/594 and
+            # https://github.com/odlgroup/odl/issues/1248
+            for i in range(len(inputs)):
+                if not (isinstance(inputs[i], cupy.ndarray) or
+                        np.isscalar(inputs[i]) or
+                        inputs[i] is None):
+                    inputs[i] = cupy.array(inputs[i])
+            assert native_ufunc is not None
+
+        elif not use_native and method != 'at':
+            # TODO: remove when upstream issue is fixed
+            # For non-native ufuncs, we need ot cast our tensors
+            # and Cupy arrays to Numpy arrays explicitly, since `__array__`
+            # and friends are not implemented. See
+            # https://github.com/cupy/cupy/issues/589 and
+            # https://github.com/odlgroup/odl/issues/1248
+            # We must exclude 'at' since it's in-place, thus we're not allowed
+            # to change the input.
+            for i in range(len(inputs)):
+                if isinstance(inputs[i], cupy.ndarray):
+                    inputs[i] = cupy.asnumpy(inputs[i])
+                elif isinstance(inputs[i], CupyTensor):
+                    inputs[i] = cupy.asnumpy(inputs[i].data)
+
+        # --- For later --- #
+
+        # Wrap the result in an appropriate space, propagating weighting
+        # if possible
+        def space_element(res, use_weighting=True):
+            if use_weighting and is_floating_dtype(res.dtype):
+                if res.shape == self.shape:
+                    weighting = self.space.weighting
+                else:
+                    # Don't propagate weighting if shape changes
+                    # (but keep the exponent)
+                    weighting = CupyTensorSpaceConstWeighting(
+                        1.0, self.space.exponent)
+
+                spc_kwargs = {'weighting': weighting}
+            else:
+                # No `exponent` or `weighting` applicable
+                spc_kwargs = {}
+
+            res_space = type(self.space)(
+                res.shape, res.dtype, self.device, **spc_kwargs)
+            return res_space.element(res)
+
+        # --- Evaluate ufunc --- #
+
+        if method == '__call__':
+            if ufunc.nout == 1:
+                if use_native:
+                    kwargs['out'] = out  # No tuple packing for cupy
+                    res = native_ufunc(*inputs, **kwargs)
+                else:
+                    # We need to explicitly cast `out` to Numpy array since
+                    # the upstream `np.asarray` won't work. See
+                    # https://github.com/cupy/cupy/issues/589 and
+                    # https://github.com/odlgroup/odl/issues/1248
+                    if isinstance(out, cupy.ndarray):
+                        out_npy = cupy.asnumpy(out)
+                    else:
+                        out_npy = out
+
+                    kwargs['out'] = (out_npy,)
+                    res = super(CupyTensor, self).__array_ufunc__(
+                        ufunc, '__call__', *inputs, **kwargs)
+
+                    if isinstance(out, cupy.ndarray):
+                        out[:] = cupy.asarray(res)
+
+                if out is None:
+                    result = space_element(res, use_weighting=True)
+                else:
+                    result = out_tuple[0]
+
+                return result
+
+            elif ufunc.nout == 2:
+                if use_native:
+                    # cupy ufuncs with 2 outputs don't support writing to
+                    # `out`
+                    res1, res2 = native_ufunc(*inputs)
+                    if out1 is not None:
+                        out1[:] = res1
+                    if out2 is not None:
+                        out2[:] = res2
+                else:
+                    # See case nout == 1 for comments
+                    if isinstance(out1, cupy.ndarray):
+                        out1_npy = cupy.asnumpy(out1)
+                    else:
+                        out1_npy = out1
+                    if isinstance(out2, cupy.ndarray):
+                        out2_npy = cupy.asnumpy(out2)
+                    else:
+                        out2_npy = out2
+
+                    kwargs['out'] = (out1_npy, out2_npy)
+                    res1, res2 = super(CupyTensor, self).__array_ufunc__(
+                        ufunc, '__call__', *inputs, **kwargs)
+
+                    if isinstance(out1, cupy.ndarray):
+                        out1[:] = cupy.asarray(res1)
+                    if isinstance(out2, cupy.ndarray):
+                        out2[:] = cupy.asarray(res2)
+
+                # Don't use exponents or weightings since we don't know
+                # how to map them to the spaces
+                if out1 is None:
+                    result1 = space_element(res1, use_weighting=False)
+                else:
+                    result1 = out_tuple[0]
+
+                if out2 is None:
+                    result2 = space_element(res2, use_weighting=False)
+                else:
+                    result2 = out_tuple[1]
+
+                return result1, result2
+
+            else:
+                raise NotImplementedError('nout = {} not supported'
+                                          ''.format(ufunc.nout))
+
+        # Special case 1
+        elif (use_native and
+              (ufunc in (np.add, np.multiply) and
+               method in ('reduce', 'accumulate')
+               ) or
+              (ufunc in (np.minimum, np.maximum) and
+               method == 'reduce')
+              ):
+            # These cases are implemented but not available as methods
+            # of cupy.ufunc. We map the implementation by hand.
+            if ufunc == np.add:
+                function = cupy.sum if method == 'reduce' else cupy.cumsum
+            elif ufunc == np.multiply:
+                function = cupy.prod if method == 'reduce' else cupy.cumprod
+            elif ufunc == np.minimum and method == 'reduce':
+                function = cupy.min
+            elif ufunc == np.maximum and method == 'reduce':
+                function = cupy.max
+            else:
+                raise RuntimeError('no native {}.{}'
+                                   ''.format(ufunc.__name__, method))
+
+            # Make 0 the default for `axis` as the ufunc methods. The default
+            # is to reduce/accumulate over all axes.
+            axis = kwargs.pop('axis', 0)
+            kwargs['axis'] = axis
+            kwargs['out'] = out
+            res = function(*inputs, **kwargs)
+
+            # Shortcut for scalar return value
+            if getattr(res, 'shape', ()) == ():
+                # Happens for `reduce` with all axes
+                return _python_scalar(res)
+
+            if out is None:
+                result = space_element(res, use_weighting=True)
+            else:
+                result = out_tuple[0]
+
+            return result
+
+        # Special case 2
+        elif use_native and ufunc == np.add and method == 'at':
+            cupy.scatter_add(*inputs, **kwargs)
+            return
+
+        # Separate handling of 'at' since input is unchanged
+        elif method == 'at':
+            native_method = getattr(native_ufunc, 'at', None)
+            use_native = (use_native and native_method is not None)
+
+            def eval_at_via_npy(*inputs, **kwargs):
+                """Use Numpy to evaluate ufunc.at, converting the inputs."""
+                import ctypes
+
+                if force_native:
+                    raise ValueError(
+                        'no native function available to evaluate `{}.at` '
+                        'with inputs {!r} and kwargs {!r}'
+                        ''.format(ufunc.__name__, inputs, kwargs))
+                new_inputs = [cupy.asnumpy(inputs[0]), inputs[1]]
+                if ufunc.nin == 2:
+                    new_inputs.append(cupy.asnumpy(inputs[2]))
+
+                # Shouldn't exist, but let upstream code raise
+                new_inputs.extend(inputs[3:])
+
+                super(CupyTensor, self).__array_ufunc__(
+                    ufunc, method, *new_inputs, **kwargs)
+                # Workaround for assignment cupy_arr[:] = npy_arr not
+                # working. See
+                # https://github.com/cupy/cupy/issues/593 and
+                # https://github.com/odlgroup/odl/issues/1248
+                inputs[0].data.copy_from_host(
+                    new_inputs[0].ctypes.data_as(ctypes.c_void_p),
+                    new_inputs[0].nbytes)
+
+            if use_native:
+                # Native method could exist but raise `NotImplementedError`
+                # or return `NotImplemented`. We fall back to Numpy also in
+                # that situation.
+                try:
+                    res = native_method(*inputs, **kwargs)
+                except NotImplementedError:
+                    eval_at_via_npy(*inputs, **kwargs)
+                else:
+                    if res is NotImplemented:
+                        eval_at_via_npy(*inputs, **kwargs)
+            else:
+                eval_at_via_npy(*inputs, **kwargs)
+
+            return
+
+        else:  # method != '__call__'
+            kwargs['out'] = (out,)
+            native_method = getattr(native_ufunc, method, None)
+            use_native = (use_native and native_method is not None)
+
+            if use_native:
+                # A native method could exist but raise `NotImplementedError`
+                # or return `NotImplemented`. We fall back to Numpy also in
+                # that situation.
+                try:
+                    res = native_method(*inputs, **kwargs)
+                except NotImplementedError:
+                    res = super(CupyTensor, self).__array_ufunc__(
+                        ufunc, method, *inputs, **kwargs)
+                else:
+                    if res is NotImplemented:
+                        res = super(CupyTensor, self).__array_ufunc__(
+                            ufunc, method, *inputs, **kwargs)
+
+            else:
+                # See case `method == '__call__', nout == 1` for details
+                if isinstance(out, cupy.ndarray):
+                    out_npy = cupy.asnumpy(out)
+                else:
+                    out_npy = out
+
+                kwargs['out'] = (out_npy,)
+                res = super(CupyTensor, self).__array_ufunc__(
+                    ufunc, method, *inputs, **kwargs)
+
+                if isinstance(out, cupy.ndarray):
+                    out[:] = cupy.asarray(res)
+
+            # Shortcut for scalar or no return value
+            if getattr(res, 'shape', ()) == ():
+                # Occurs for `reduce` with all axes
+                return _python_scalar(res)
+
+            if out is None:
+                result = space_element(res, use_weighting=True)
+            else:
+                result = out_tuple[0]
+
+            return result
+
+    @property
+    def real(self):
+        """Real part of this tensor.
+
+        Returns
+        -------
+        real : `CupyTensor` view with real dtype
+            The real part of this tensor as an element of an `rn` space.
+        """
+        if self.space.is_real:
+            return self
+        else:
+            return self.space.real_space.element(self.data.real)
+
+    @real.setter
+    def real(self, newreal):
+        """Setter for the real part.
+
+        This method is invoked by ``tensor.real = other``.
+
+        Parameters
+        ----------
+        newreal : `array-like` or scalar
+            The new real part for this tensor.
+        """
+        # Assignment with array-likes does not work directly, see
+        # https://github.com/cupy/cupy/issues/593
+        if np.isscalar(newreal):
+            self.data.real = newreal
+        else:
+            # Avoid compile errors for complex right-hand sides
+            self.data.real = cupy.asarray(newreal).real
+
+    @property
+    def imag(self):
+        """Imaginary part of this tensor.
+
+        Returns
+        -------
+        imag : `CupyTensor`
+            The imaginary part of this tensor as an element of an `rn` space.
+        """
+        if self.space.is_real:
+            return self.space.zero()
+        else:
+            return self.space.real_space.element(self.data.imag)
+
+    @imag.setter
+    def imag(self, newimag):
+        """Setter for the imaginary part.
+
+        This method is invoked by ``tensor.imag = other``.
+
+        Parameters
+        ----------
+        newimag : `array-like` or scalar
+            The new imaginary part for this tensor.
+        """
+        # Assignment with array-likes does not work directly, see
+        # https://github.com/cupy/cupy/issues/593
+        if np.isscalar(newimag):
+            self.data.imag = newimag
+        else:
+            # Avoid compile errors for complex right-hand sides
+            self.data.imag = cupy.asarray(newimag).real
+
+    def conj(self, out=None):
+        """Complex conjugate of this tensor.
+
+        Parameters
+        ----------
+        out : `CupyTensor`, optional
+            Tensor to which the complex conjugate is written.
+            Must be an element of this tensor's space.
+
+        Returns
+        -------
+        out : `CupyTensor`
+            The complex conjugate tensor. If ``out`` was provided,
+            the returned object is a reference to it.
+        """
+        if out is None:
+            if self.space.is_real:
+                return self.copy()
+            else:
+                return self.space.element(self.data.conj())
+        else:
+            if self.space.is_real:
+                out.assign(self)
+            else:
+                # In-place not available as it seems
+                out[:] = self.data.conj()
+            return out
+
+    def __ipow__(self, other):
+        """Return ``self **= other``."""
+        try:
+            if other == int(other):
+                return super(CupyTensor, self).__ipow__(other)
+        except TypeError:
+            pass
+
+        self.ufuncs.power(other, out=self)
+        return self
+
+
+# --- Weightings --- #
+
+
+def _weighting(weights, exponent):
+    """Return a weighting whose type is inferred from the arguments."""
+    if exponent is None:
+        exponent = 2.0
+
+    if np.isscalar(weights):
+        weighting = CupyTensorSpaceConstWeighting(weights, exponent=exponent)
+    else:
+        # TODO: sequence of 1D array-likes, see
+        # https://github.com/odlgroup/odl/pull/1238
+        weights = cupy.array(weights, copy=False)
+        weighting = CupyTensorSpaceArrayWeighting(weights, exponent=exponent)
+    return weighting
+
+
+# Kernels for space functions
+#
+# T = generic type
+# R = real floating point type, usually for norm or dist output
+# W = real type for weights, can be floating point or integer
+#
+# Note: the kernels with an output type that does not also occur as an input
+# type must be called with output argument since the output type cannot be
+# inferred. The ouptut array must have shape `()` for full reduction.
+
+
+if CUPY_AVAILABLE:
+    dot = cupy.ReductionKernel(in_params='T x, T y',
+                               out_params='T res',
+                               map_expr='x * y',
+                               reduce_expr='a + b',
+                               post_map_expr='res = a',
+                               identity='0',
+                               name='dot')
+
+    dotw = cupy.ReductionKernel(in_params='T x, T y, W w',
+                                out_params='T res',
+                                map_expr='x * y * w',
+                                reduce_expr='a + b',
+                                post_map_expr='res = a',
+                                identity='0',
+                                name='dotw')
+
+    nrm0 = cupy.ReductionKernel(in_params='T x',
+                                out_params='R res',
+                                map_expr='x != 0',
+                                reduce_expr='a + b',
+                                post_map_expr='res = a',
+                                identity='0',
+                                name='nrm0')
+
+    nrm1 = cupy.ReductionKernel(in_params='T x',
+                                out_params='R res',
+                                map_expr='abs(x)',
+                                reduce_expr='a + b',
+                                post_map_expr='res = a',
+                                identity='0',
+                                name='nrm1')
+
+    nrm1w = cupy.ReductionKernel(in_params='T x, W w',
+                                 out_params='R res',
+                                 map_expr='abs(x) * w',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = a',
+                                 identity='0',
+                                 name='nrm1w')
+
+    nrm2 = cupy.ReductionKernel(in_params='T x',
+                                out_params='R res',
+                                map_expr='abs(x * x)',
+                                reduce_expr='a + b',
+                                post_map_expr='res = sqrt(a)',
+                                identity='0',
+                                name='nrm2')
+
+    nrm2w = cupy.ReductionKernel(in_params='T x, W w',
+                                 out_params='R res',
+                                 map_expr='abs(x * x) * w',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = sqrt(a)',
+                                 identity='0',
+                                 name='nrm2w')
+
+    nrminf = cupy.ReductionKernel(in_params='T x',
+                                  out_params='R res',
+                                  map_expr='abs(x)',
+                                  reduce_expr='a > b ? a : b',
+                                  post_map_expr='res = a',
+                                  identity='0',
+                                  name='nrminf')
+
+    nrmneginf = cupy.ReductionKernel(in_params='T x',
+                                     out_params='R res',
+                                     map_expr='abs(x)',
+                                     reduce_expr='a > b ? b : a',
+                                     post_map_expr='res = a',
+                                     identity='0',
+                                     name='nrmneginf')
+
+    nrmp = cupy.ReductionKernel(in_params='T x, R p',
+                                out_params='R res',
+                                map_expr='pow(abs(x), p)',
+                                reduce_expr='a + b',
+                                post_map_expr='res = pow(a, 1 / p)',
+                                identity='0',
+                                name='nrmp')
+
+    nrmpw = cupy.ReductionKernel(in_params='T x, R p, W w',
+                                 out_params='R res',
+                                 map_expr='pow(abs(x), p) * w',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = pow(a, 1 / p)',
+                                 identity='0',
+                                 name='nrmpw')
+
+    dist0 = cupy.ReductionKernel(in_params='T x, T y',
+                                 out_params='R res',
+                                 map_expr='x != y',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = a',
+                                 identity='0',
+                                 name='dist0')
+
+    dist1 = cupy.ReductionKernel(in_params='T x, T y',
+                                 out_params='R res',
+                                 map_expr='abs(x - y)',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = a',
+                                 identity='0',
+                                 name='dist1')
+
+    dist1w = cupy.ReductionKernel(in_params='T x, T y, W w',
+                                  out_params='R res',
+                                  map_expr='abs(x - y) * w',
+                                  reduce_expr='a + b',
+                                  post_map_expr='res = a',
+                                  identity='0',
+                                  name='dist1w')
+
+    dist2 = cupy.ReductionKernel(in_params='T x, T y',
+                                 out_params='R res',
+                                 map_expr='abs((x - y) * (x - y))',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = sqrt(a)',
+                                 identity='0',
+                                 name='dist2')
+
+    dist2w = cupy.ReductionKernel(in_params='T x, T y, W w',
+                                  out_params='R res',
+                                  map_expr='abs((x - y) * (x - y)) * w',
+                                  reduce_expr='a + b',
+                                  post_map_expr='res = sqrt(a)',
+                                  identity='0',
+                                  name='dist2w')
+
+    distinf = cupy.ReductionKernel(in_params='T x, T y',
+                                   out_params='R res',
+                                   map_expr='abs(x - y)',
+                                   reduce_expr='a > b ? a : b',
+                                   post_map_expr='res = a',
+                                   identity='0',
+                                   name='distinf')
+
+    distneginf = cupy.ReductionKernel(in_params='T x, T y',
+                                      out_params='R res',
+                                      map_expr='abs(x - y)',
+                                      reduce_expr='a > b ? b : a',
+                                      post_map_expr='res = a',
+                                      identity='0',
+                                      name='distneginf')
+
+    distp = cupy.ReductionKernel(in_params='T x, T y, R p',
+                                 out_params='R res',
+                                 map_expr='pow(abs(x - y), p)',
+                                 reduce_expr='a + b',
+                                 post_map_expr='res = pow(a, 1 / p)',
+                                 identity='0',
+                                 name='distp')
+
+    distpw = cupy.ReductionKernel(in_params='T x, T y, R p, W w',
+                                  out_params='R res',
+                                  map_expr='pow(abs(x - y), p) * w',
+                                  reduce_expr='a + b',
+                                  post_map_expr='res = pow(a, 1 / p)',
+                                  identity='0',
+                                  name='distpw')
+
+
+class CupyTensorSpaceArrayWeighting(ArrayWeighting):
+
+    """Array weighting for `CupyTensorSpace`.
+
+    See `ArrayWeighting` for further details.
+    """
+
+    def __init__(self, array, exponent=2.0):
+        """Initialize a new instance.
+
+        Parameters
+        ----------
+        array : `array-like`, one-dim.
+            Weighting array of the inner product, norm and distance,
+            must have real data type and should only have positive entries.
+        exponent : positive float
+            Exponent of the norm. For values other than 2.0, the inner
+            product is not defined.
+        """
+        if isinstance(array, CupyTensor):
+            array = array.data
+        elif not isinstance(array, cupy.ndarray):
+            array = cupy.array(array, copy=False)
+
+        if not is_real_dtype(array.dtype):
+            raise ValueError('`array.dtype` must be real, got {}'
+                             ''.format(dtype_str(array.dtype)))
+
+        super(CupyTensorSpaceArrayWeighting, self).__init__(
+            array, impl='cupy', exponent=exponent)
+
+    def inner(self, x1, x2):
+        """Return the weighted inner product of two tensors.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Tensors whose inner product is calculated.
+
+        Returns
+        -------
+        inner : float or complex
+            The inner product of the two provided tensors.
+        """
+        if self.exponent != 2.0:
+            raise NotImplementedError('no inner product defined for '
+                                      'exponent != 2 (got {})'
+                                      ''.format(self.exponent))
+        else:
+            # complex(cupy_complex_scalar) not implemented, see
+            # https://github.com/cupy/cupy/issues/782
+            return x1.space.field.element(
+                cupy.asnumpy(dotw(x2.data.conj(), x1.data, self.array)))
+
+    def norm(self, x):
+        """Return the weighted norm of a tensor.
+
+        Parameters
+        ----------
+        x : `CupyTensor`
+            Tensor whose norm is calculated.
+
+        Returns
+        -------
+        norm : float
+            The norm of the provided tensor.
+        """
+        # Define scalar output array
+        if is_floating_dtype(x.dtype):
+            out_dtype = real_dtype(x.dtype)
+        else:
+            out_dtype = float
+        out = cupy.empty((), dtype=out_dtype)
+
+        # Run reduction kernel (returns the output)
+        if self.exponent == 0:
+            return float(nrm0(x.data, out))
+        elif self.exponent == 1:
+            return float(nrm1w(x.data, self.array, out))
+        elif self.exponent == 2:
+            return float(nrm2w(x.data, self.array, out))
+        elif self.exponent == float('inf'):
+            return float(nrminf(x.data, out))
+        elif self.exponent == -float('inf'):
+            return float(nrmneginf(x.data, out))
+        else:
+            return float(nrmpw(x.data, self.exponent, self.array, out))
+
+    def dist(self, x1, x2):
+        """Return the weighted distance of two tensors.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Tensors whose mutual distance is calculated.
+
+        Returns
+        -------
+        dist : float
+            The distance between the provided tensors.
+        """
+        # Define scalar output array
+        if is_floating_dtype(x1.dtype):
+            out_dtype = real_dtype(x1.dtype)
+        else:
+            out_dtype = float
+        out = cupy.empty((), dtype=out_dtype)
+
+        # Run reduction kernel (returns the output)
+        if self.exponent == 0:
+            return float(dist0(x1.data, x2.data, out))
+        elif self.exponent == 1:
+            return float(dist1w(x1.data, x2.data, self.array, out))
+        elif self.exponent == 2:
+            return float(dist2w(x1.data, x2.data, self.array, out))
+        elif self.exponent == float('inf'):
+            return float(distinf(x1.data, x2.data, out))
+        elif self.exponent == -float('inf'):
+            return float(distneginf(x1.data, x2.data, out))
+        else:
+            return float(distpw(x1.data, x2.data, self.exponent, self.array,
+                                out))
+
+    def __hash__(self):
+        """Return ``hash(self)``."""
+        # CuPy array has no `tobytes`
+        return hash((super(ArrayWeighting, self).__hash__(),
+                     cupy.asnumpy(self.array).tobytes()))
+
+    # TODO: remove repr_part and __repr__ when cupy.ndarray.__array__
+    # is implemented. See
+    # https://github.com/cupy/cupy/issues/589
+    @property
+    def repr_part(self):
+        """String usable in a space's ``__repr__`` method."""
+        maxsize_full_print = 2 * np.get_printoptions()['edgeitems']
+        arr_str = array_str(cupy.asnumpy(self.array),
+                            nprint=maxsize_full_print)
+        optargs = [('weighting', arr_str, ''),
+                   ('exponent', self.exponent, 2.0)]
+        return signature_string([], optargs, sep=',\n',
+                                mod=[[], ['!s', ':.4']])
+
+    def __repr__(self):
+        """Return ``repr(self)``."""
+        maxsize_full_print = 2 * np.get_printoptions()['edgeitems']
+        arr_str = array_str(cupy.asnumpy(self.array),
+                            nprint=maxsize_full_print)
+        posargs = [arr_str]
+        optargs = [('exponent', self.exponent, 2.0)]
+        inner_str = signature_string(posargs, optargs, sep=',\n',
+                                     mod=['!s', ':.4'])
+        return '{}(\n{}\n)'.format(self.__class__.__name__, indent(inner_str))
+
+
+class CupyTensorSpaceConstWeighting(ConstWeighting):
+
+    """Constant weighting for `CupyTensorSpace`.
+
+    See `ConstWeighting` for further details.
+    """
+
+    def __init__(self, constant, exponent=2.0):
+        """Initialize a new instance.
+
+        Parameters
+        ----------
+        constant : positive float
+            Weighting constant of the inner product.
+        exponent : positive float
+            Exponent of the norm. For values other than 2.0, the inner
+            product is not defined.
+        """
+        super(CupyTensorSpaceConstWeighting, self).__init__(
+            constant, impl='cupy', exponent=exponent)
+
+    def inner(self, x1, x2):
+        """Return the weighted inner product of two tensors.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Tensors whose inner product is calculated.
+
+        Returns
+        -------
+        inner : float or complex
+            The inner product of the two provided tensors.
+        """
+        if self.exponent != 2.0:
+            raise NotImplementedError('no inner product defined for '
+                                      'exponent != 2 (got {})'
+                                      ''.format(self.exponent))
+        else:
+            if np.issubsctype(x1.dtype, np.complexfloating):
+                dot = cupy.vdot(x2.data, x1.data)
+            else:
+                # Ravels in C order, no other supported currently
+                # TODO: ravel in most efficient ordering when available
+                dot = cupy.dot(x2.data.ravel(), x1.data.ravel())
+
+            # complex(cupy_complex_scalar) not implemented, see
+            # https://github.com/cupy/cupy/issues/782
+            return x1.space.field.element(self.const * cupy.asnumpy(dot))
+
+    def norm(self, x):
+        """Return the constant-weighted norm of a tensor.
+
+        Parameters
+        ----------
+        x1 : `CupyTensor`
+            Tensor whose norm is calculated.
+
+        Returns
+        -------
+        norm : float
+            The norm of the tensor.
+        """
+        # Define scalar output array
+        if is_floating_dtype(x.dtype):
+            out_dtype = real_dtype(x.dtype)
+        else:
+            out_dtype = float
+        out = cupy.empty((), dtype=out_dtype)
+
+        # Run reduction kernel (returns the output)
+        if self.exponent == 0:
+            return float(nrm0(x.data, out))
+        elif self.exponent == 1:
+            return float(self.const * nrm1(x.data, out))
+        elif self.exponent == 2:
+            # We try to use cuBLAS nrm2
+            try:
+                incx = _get_flat_inc(x.data)
+            except ValueError:
+                use_cublas = False
+            else:
+                use_cublas = True
+
+            if use_cublas:
+                try:
+                    _cublas_nrm2 = _cublas_func('nrm2', x.dtype)
+                except (ValueError, AttributeError):
+                    pass
+                else:
+                    norm = _cublas_nrm2(
+                        x.data.data.device.cublas_handle, x.size,
+                        x.data.data.ptr, incx)
+                    return float(np.sqrt(self.const) * norm)
+
+            # Cannot use cuBLAS, fall back to custom kernel
+            return float(np.sqrt(self.const) * nrm2(x.data, out))
+        elif self.exponent == float('inf'):
+            return float(nrminf(x.data, out))
+        elif self.exponent == -float('inf'):
+            return float(nrmneginf(x.data, out))
+        else:
+            return float(self.const ** (1 / self.exponent) *
+                         nrmp(x.data, self.exponent, out))
+
+    def dist(self, x1, x2):
+        """Return the weighted distance between two tensors.
+
+        Parameters
+        ----------
+        x1, x2 : `CupyTensor`
+            Tensors whose mutual distance is calculated.
+
+        Returns
+        -------
+        dist : float
+            The distance between the tensors.
+        """
+        # Define scalar output array
+        if is_floating_dtype(x1.dtype):
+            out_dtype = real_dtype(x1.dtype)
+        else:
+            out_dtype = float
+        out = cupy.empty((), dtype=out_dtype)
+
+        # Run reduction kernel (returns the output)
+        if self.exponent == 0:
+            return float(dist0(x1.data, x2.data, out))
+        elif self.exponent == 1:
+            return float(self.const * dist1(x1.data, x2.data, out))
+        elif self.exponent == 2:
+            # cuBLAS nrm2(x1 - x2) would probably be faster, but would
+            # require a copy, so we don't do it
+            return float(np.sqrt(self.const) * dist2(x1.data, x2.data, out))
+        elif self.exponent == float('inf'):
+            return float(distinf(x1.data, x2.data, out))
+        elif self.exponent == -float('inf'):
+            return float(distneginf(x1.data, x2.data, out))
+        else:
+            return float(self.const ** (1 / self.exponent) *
+                         distp(x1.data, x2.data, self.exponent, out))
+
+
+class CupyTensorSpaceCustomInner(CustomInner):
+
+    """Class for handling custom inner products in `CupyTensorSpace`."""
+
+    def __init__(self, inner):
+        """Initialize a new instance.
+
+        Parameters
+        ----------
+        inner : callable
+            The inner product implementation. It must accept two
+            `CupyTensor` arguments, return an element from their space's
+            field (real or complex number) and satisfy the following
+            conditions for all vectors ``x, y, z`` and scalars ``s``:
+
+            - ``<x, y> = conj(<y, x>)``
+            - ``<s*x + y, z> = s * <x, z> + <y, z>``
+            - ``<x, x> = 0``  if and only if  ``x = 0``
+        """
+        super(CupyTensorSpaceCustomInner, self).__init__(inner, impl='cupy')
+
+
+class CupyTensorSpaceCustomNorm(CustomNorm):
+
+    """Class for handling a user-specified norm in `CupyTensorSpace`.
+
+    Note that this removes ``inner``.
+    """
+
+    def __init__(self, norm):
+        """Initialize a new instance.
+
+        Parameters
+        ----------
+        norm : callable
+            The norm implementation. It must accept an `CupyTensor`
+            argument, return a float and satisfy the following
+            conditions for all vectors ``x, y`` and scalars ``s``:
+
+            - ``||x|| >= 0``
+            - ``||x|| = 0``  if and only if  ``x = 0``
+            - ``||s * x|| = |s| * ||x||``
+            - ``||x + y|| <= ||x|| + ||y||``
+        """
+        super(CupyTensorSpaceCustomNorm, self).__init__(norm, impl='cupy')
+
+
+class CupyTensorSpaceCustomDist(CustomDist):
+
+    """Class for handling a user-specified distance in `CupyTensorSpace`.
+
+    Note that this removes ``inner`` and ``norm``.
+    """
+
+    def __init__(self, dist):
+        """Initialize a new instance.
+
+        Parameters
+        ----------
+        dist : callable
+            The distance function defining a metric on `CupyTensorSpace`.
+            It must accept two `CupyTensor` arguments, return a float and
+            fulfill the following mathematical conditions for any three
+            vectors ``x, y, z``:
+
+            - ``dist(x, y) >= 0``
+            - ``dist(x, y) = 0``  if and only if  ``x = y``
+            - ``dist(x, y) = dist(y, x)``
+            - ``dist(x, y) <= dist(x, z) + dist(z, y)``
+        """
+        super(CupyTensorSpaceCustomDist, self).__init__(dist, impl='cupy')
+
+
+if __name__ == '__main__':
+    from odl.util.testutils import run_doctests
+    run_doctests()
diff --git a/odl/space/entry_points.py b/odl/space/entry_points.py
index 1c610d4b9f5..af28ed9a97a 100644
--- a/odl/space/entry_points.py
+++ b/odl/space/entry_points.py
@@ -23,12 +23,15 @@
 from __future__ import print_function, division, absolute_import
 
 from odl.space.npy_tensors import NumpyTensorSpace
+from odl.space.cupy_tensors import CUPY_AVAILABLE, CupyTensorSpace
 
 # We don't expose anything to odl.space
 __all__ = ()
 
 IS_INITIALIZED = False
 TENSOR_SPACE_IMPLS = {'numpy': NumpyTensorSpace}
+if CUPY_AVAILABLE:
+    TENSOR_SPACE_IMPLS['cupy'] = CupyTensorSpace
 
 
 def _initialize_if_needed():
diff --git a/odl/space/npy_tensors.py b/odl/space/npy_tensors.py
index af0c91c156a..c2e9158a59a 100644
--- a/odl/space/npy_tensors.py
+++ b/odl/space/npy_tensors.py
@@ -23,7 +23,7 @@
     CustomInner, CustomNorm, CustomDist)
 from odl.util import (
     dtype_str, signature_string, is_real_dtype, is_numeric_dtype,
-    writable_array, is_floating_dtype)
+    writable_array, is_floating_dtype, none_context)
 
 
 __all__ = ('NumpyTensorSpace',)
@@ -73,19 +73,19 @@ class NumpyTensorSpace(TensorSpace):
     .. _Wikipedia article on tensors: https://en.wikipedia.org/wiki/Tensor
     """
 
-    def __init__(self, shape, dtype=None, **kwargs):
+    def __init__(self, shape, dtype='float64', **kwargs):
         """Initialize a new instance.
 
         Parameters
         ----------
         shape : positive int or sequence of positive ints
-            Number of entries per axis for elements in this space. A
-            single integer results in a space with rank 1, i.e., 1 axis.
+            Number of entries per axis for elements in this space.
+            A single integer results in a space with rank 1, i.e., 1 axis.
         dtype :
             Data type of each element. Can be provided in any
             way the `numpy.dtype` function understands, e.g.
-            as built-in type or as a string. For ``None``,
-            the `default_dtype` of this space (``float64``) is used.
+            as built-in type or as a string.
+            See `available_dtypes` for the list of supported scalar data types.
         exponent : positive float, optional
             Exponent of the norm. For values other than 2.0, no
             inner product is defined.
@@ -220,8 +220,7 @@ def __init__(self, shape, dtype=None, **kwargs):
         """
         super(NumpyTensorSpace, self).__init__(shape, dtype)
         if self.dtype.char not in self.available_dtypes():
-            raise ValueError('`dtype` {!r} not supported'
-                             ''.format(dtype_str(dtype)))
+            raise ValueError('`dtype` {!r} not supported'.format(dtype))
 
         dist = kwargs.pop('dist', None)
         norm = kwargs.pop('norm', None)
@@ -503,9 +502,10 @@ def default_dtype(field=None):
         dtype : `numpy.dtype`
             Numpy data type specifier. The returned defaults are:
 
-                ``RealNumbers()`` : ``np.dtype('float64')``
+            - ``RealNumbers()`` : ``np.dtype('float64')``
+            - ``ComplexNumbers()`` : ``np.dtype('complex128')``
 
-                ``ComplexNumbers()`` : ``np.dtype('complex128')``
+            These choices correspond to the defaults of the NumPy library.
         """
         if field is None or field == RealNumbers():
             return np.dtype('float64')
@@ -864,24 +864,24 @@ def data(self):
         """The `numpy.ndarray` representing the data of ``self``."""
         return self.__data
 
-    def asarray(self, out=None):
-        """Extract the data of this array as a ``numpy.ndarray``.
+    def asarray(self, out=None, impl='numpy'):
+        """Extract the data of this array as an ndarray.
 
-        This method is invoked when calling `numpy.asarray` on this
-        tensor.
+        This method is invoked when calling `numpy.asarray` on this tensor.
 
         Parameters
         ----------
-        out : `numpy.ndarray`, optional
-            Array in which the result should be written in-place.
-            Has to be contiguous and of the correct dtype.
+        out : ndarray, optional
+            Array into which the result should be written. Must be contiguous
+            and of the correct dtype.
+        impl : str, optional
+            Array backend for the output, used when ``out`` is not given.
 
         Returns
         -------
-        asarray : `numpy.ndarray`
-            Numpy array with the same data type as ``self``. If
-            ``out`` was given, the returned object is a reference
-            to it.
+        asarray : ndarray
+            Array with the same data type as ``self``. If ``out`` was given,
+            the returned object is a reference to it.
 
         Examples
         --------
@@ -902,10 +902,46 @@ def asarray(self, out=None):
         array([[ 1.,  1.,  1.],
                [ 1.,  1.,  1.]])
         """
+        # Import cupy if it exists (else None)
+        from odl.space.cupy_tensors import cupy, CUPY_AVAILABLE
+        import ctypes
+
+        impl, impl_in = str(impl).lower(), impl
         if out is None:
-            return self.data
+            if impl == 'numpy':
+                return self.data
+            elif impl == 'cupy':
+                if CUPY_AVAILABLE:
+                    return cupy.array(self.data)
+                else:
+                    raise ValueError("`impl` 'cupy' not available")
+            else:
+                raise ValueError('`impl` {!r} not understood'.format(impl_in))
+
         else:
-            out[:] = self.data
+            if not (out.flags.c_contiguous or out.flags.f_contiguous):
+                raise ValueError('`out` must be contiguous')
+            if out.shape != self.shape:
+                raise ValueError('`out` must have shape {}, got shape {}'
+                                 ''.format(self.shape, out.shape))
+            if out.dtype != self.dtype:
+                raise ValueError('`out` must have dtype {}, got dtype {}'
+                                 ''.format(self.dtype, out.dtype))
+
+            if CUPY_AVAILABLE and isinstance(out, cupy.ndarray):
+                # Use efficient copy by ensuring contiguous memory
+                if self.data.flags.contiguous:
+                    self_contig_arr = self.data
+                else:
+                    self_contig_arr = np.ascontiguousarray(self.data)
+
+                out.data.copy_from_host(
+                    self_contig_arr.ctypes.data_as(ctypes.c_void_p),
+                    self.size * self.itemsize)
+
+            else:
+                out[:] = self.data
+
             return out
 
     def astype(self, dtype):
@@ -928,7 +964,13 @@ def astype(self, dtype):
 
     @property
     def data_ptr(self):
-        """A raw pointer to the data container of ``self``.
+        """Memory address of the data container as 64-bit integer.
+
+        Returns
+        -------
+        data_ptr : int
+            The data pointer is technically of type ``uintptr_t`` and gives the
+            index in bytes of the first element of the data storage.
 
         Examples
         --------
@@ -1109,6 +1151,14 @@ def __getitem__(self, indices):
                 weighting = self.space.weighting
             else:
                 weighting = None
+
+            if isinstance(weighting, NumpyTensorSpaceArrayWeighting):
+                weighting = weighting.array[indices]
+            elif isinstance(weighting, NumpyTensorSpaceConstWeighting):
+                # Axes were removed, cannot infer new constant
+                if arr.ndim != self.ndim:
+                    weighting = None
+
             space = type(self.space)(
                 arr.shape, dtype=self.dtype, exponent=self.space.exponent,
                 weighting=weighting)
@@ -1125,11 +1175,6 @@ def __setitem__(self, indices, values):
         values : scalar, array-like or `NumpyTensor`
             The value(s) that are to be assigned.
 
-            If ``index`` is an integer, ``value`` must be a scalar.
-
-            If ``index`` is a slice or a sequence of slices, ``value``
-            must be broadcastable to the shape of the slice.
-
         Examples
         --------
         For 1d spaces, entries can be set with scalars or sequences of
@@ -1656,26 +1701,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
 
         # --- Evaluate ufunc --- #
 
-        # Trivial context used to create a single code path for the ufunc
-        # evaluation. For `None` output parameter(s), this is used instead of
-        # `writable_array`.
-        class CtxNone(object):
-            """Trivial context manager class.
-
-            When used as ::
-
-                with CtxNone() as obj:
-                    # do stuff with `obj`
-
-            the returned ``obj`` is ``None``.
-            """
-            __enter__ = __exit__ = lambda *_: None
-
         if method == '__call__':
             if ufunc.nout == 1:
                 # Make context for output (trivial one returns `None`)
                 if out is None:
-                    out_ctx = CtxNone()
+                    out_ctx = none_context()
                 else:
                     out_ctx = writable_array(out, **array_kwargs)
 
@@ -1703,11 +1733,11 @@ class CtxNone(object):
                 if out1 is not None:
                     out1_ctx = writable_array(out1, **array_kwargs)
                 else:
-                    out1_ctx = CtxNone()
+                    out1_ctx = none_context()
                 if out2 is not None:
                     out2_ctx = writable_array(out2, **array_kwargs)
                 else:
-                    out2_ctx = CtxNone()
+                    out2_ctx = none_context()
 
                 # Evaluate ufunc
                 with out1_ctx as out1_arr, out2_ctx as out2_arr:
@@ -1733,7 +1763,7 @@ class CtxNone(object):
         else:  # method != '__call__'
             # Make context for output (trivial one returns `None`)
             if out is None:
-                out_ctx = CtxNone()
+                out_ctx = none_context()
             else:
                 out_ctx = writable_array(out, **array_kwargs)
 
@@ -2018,7 +2048,6 @@ def _pnorm_diagweight(x, p, w):
     # BLAS dot or nrm2
     xp = np.abs(x.data.ravel(order))
     if p == float('inf'):
-        xp *= w.ravel(order)
         return np.max(xp)
     else:
         xp = np.power(xp, p, out=xp)
@@ -2273,7 +2302,7 @@ def norm(self, x):
         if self.exponent == 2.0:
             return float(np.sqrt(self.const) * _norm_default(x))
         elif self.exponent == float('inf'):
-            return float(self.const * _pnorm_default(x, self.exponent))
+            return float(_pnorm_default(x, self.exponent))
         else:
             return float((self.const ** (1 / self.exponent) *
                           _pnorm_default(x, self.exponent)))
@@ -2294,7 +2323,7 @@ def dist(self, x1, x2):
         if self.exponent == 2.0:
             return float(np.sqrt(self.const) * _norm_default(x1 - x2))
         elif self.exponent == float('inf'):
-            return float(self.const * _pnorm_default(x1 - x2, self.exponent))
+            return float(_pnorm_default(x1 - x2, self.exponent))
         else:
             return float((self.const ** (1 / self.exponent) *
                           _pnorm_default(x1 - x2, self.exponent)))
diff --git a/odl/space/weighting.py b/odl/space/weighting.py
index 8fcabbf50d8..8e051c9e169 100644
--- a/odl/space/weighting.py
+++ b/odl/space/weighting.py
@@ -500,7 +500,7 @@ def array(self):
 
     def is_valid(self):
         """Return True if the array is a valid weight, i.e. positive."""
-        return np.all(np.greater(self.array, 0))
+        return (self.array > 0).all()
 
     def __eq__(self, other):
         """Return ``self == other``.
@@ -547,9 +547,9 @@ def equiv(self, other):
         elif isinstance(other, MatrixWeighting):
             return other.equiv(self)
         elif isinstance(other, ConstWeighting):
-            return np.array_equiv(self.array, other.const)
+            return (self.array == other.const).all()
         else:
-            return np.array_equal(self.array, other.array)
+            return (self.array == other.array).all()
 
     @property
     def repr_part(self):
diff --git a/odl/test/discr/diff_ops_test.py b/odl/test/discr/diff_ops_test.py
index 5b37889d499..ff8a759afd2 100644
--- a/odl/test/discr/diff_ops_test.py
+++ b/odl/test/discr/diff_ops_test.py
@@ -28,7 +28,7 @@
                                      'order0', 'order1', 'order2'])
 
 
-@pytest.fixture(scope="module", params=[1, 2, 3], ids=['1d', '2d', '3d'])
+@pytest.fixture(scope="module", params=[1, 2, 3], ids=[' 1d ', ' 2d ', ' 3d '])
 def space(request, odl_tspace_impl):
     impl = odl_tspace_impl
     ndim = request.param
@@ -430,6 +430,12 @@ def test_divergence(space, method, padding):
     assert rhs != 0
     assert lhs == pytest.approx(rhs, rel=dtype_tol(space.dtype))
 
+    # Higher dimensional arrays
+    for ndim in range(1, 6):
+        # DiscreteLpElement
+        lin_size = 3
+        space = odl.uniform_discr([0.] * ndim, [1.] * ndim, [lin_size] * ndim)
+
 
 # --- Laplacian --- #
 
diff --git a/odl/test/discr/discr_ops_test.py b/odl/test/discr/discr_ops_test.py
index 061cccbdfdb..bf8c8ca958d 100644
--- a/odl/test/discr/discr_ops_test.py
+++ b/odl/test/discr/discr_ops_test.py
@@ -9,8 +9,8 @@
 """Unit tests for `discr_ops`."""
 
 from __future__ import division
-import pytest
 import numpy as np
+import pytest
 
 import odl
 from odl.discr.discr_ops import _SUPPORTED_RESIZE_PAD_MODES
diff --git a/odl/test/discr/lp_discr_test.py b/odl/test/discr/lp_discr_test.py
index f829d28cf9b..280ccb3aa6c 100644
--- a/odl/test/discr/lp_discr_test.py
+++ b/odl/test/discr/lp_discr_test.py
@@ -16,8 +16,9 @@
 from odl.space.base_tensors import TensorSpace
 from odl.space.npy_tensors import NumpyTensor
 from odl.space.weighting import ConstWeighting
+from odl.util import array_module, as_numpy
 from odl.util.testutils import (
-    all_equal, all_almost_equal, noise_elements, simple_fixture)
+    all_equal, all_almost_equal, noise_elements, simple_fixture, xfail_if)
 
 
 USE_ARRAY_UFUNCS_INTERFACE = (parse_version(np.__version__) >=
@@ -107,7 +108,8 @@ def test_empty():
 # --- uniform_discr --- #
 
 
-def test_factory_dtypes(odl_tspace_impl):
+def test_uniform_discr_dtypes(odl_tspace_impl):
+    """Test basic properties of spaces created by uniform_discr."""
     impl = odl_tspace_impl
     real_float_dtypes = [np.float32, np.float64]
     nonfloat_dtypes = [np.int8, np.int16, np.int32, np.int64,
@@ -198,40 +200,39 @@ def test_uniform_discr_init_complex(odl_tspace_impl):
 # --- DiscreteLp methods --- #
 
 
-def test_discretelp_element():
+def test_discretelp_element(tspace_impl):
     """Test creation and membership of DiscreteLp elements."""
     # Creation from scratch
     # 1D
-    discr = odl.uniform_discr(0, 1, 3)
+    discr = odl.uniform_discr(0, 1, 3, impl=tspace_impl)
     weight = 1.0 if exponent == float('inf') else discr.cell_volume
-    tspace = odl.rn(3, weighting=weight)
+    tspace = odl.rn(3, weighting=weight, impl=tspace_impl)
     elem = discr.element()
     assert elem in discr
     assert elem.tensor in tspace
 
     # 2D
-    discr = odl.uniform_discr([0, 0], [1, 1], (3, 3))
+    discr = odl.uniform_discr([0, 0], [1, 1], (3, 3), impl=tspace_impl)
     weight = 1.0 if exponent == float('inf') else discr.cell_volume
-    tspace = odl.rn((3, 3), weighting=weight)
+    tspace = odl.rn((3, 3), weighting=weight, impl=tspace_impl)
     elem = discr.element()
     assert elem in discr
     assert elem.tensor in tspace
 
 
-def test_discretelp_element_from_array():
+def test_discretelp_element_from_array(tspace_impl):
     """Test creation of DiscreteLp elements from arrays."""
     # 1D
-    discr = odl.uniform_discr(0, 1, 3)
+    discr = odl.uniform_discr(0, 1, 3, impl=tspace_impl)
     elem = discr.element([1, 2, 3])
     assert np.array_equal(elem.tensor, [1, 2, 3])
-
     assert isinstance(elem, DiscreteLpElement)
-    assert isinstance(elem.tensor, NumpyTensor)
     assert all_equal(elem.tensor, [1, 2, 3])
 
 
-def test_element_from_array_2d(odl_elem_order):
+def test_element_from_array_2d(odl_tspace_impl, odl_elem_order):
     """Test element in 2d with different orderings."""
+    impl = odl_tspace_impl
     order = odl_elem_order
     discr = odl.uniform_discr([0, 0], [1, 1], [2, 2])
     elem = discr.element([[1, 2],
@@ -258,9 +259,9 @@ def test_element_from_array_2d(odl_elem_order):
                        [4]])  # wrong shape
 
 
-def test_element_from_function_1d():
+def test_element_from_function_1d(tspace_impl):
     """Test creation of DiscreteLp elements from functions in 1 dimension."""
-    space = odl.uniform_discr(-1, 1, 4)
+    space = odl.uniform_discr(-1, 1, 4, impl=tspace_impl)
     points = space.points().squeeze()
 
     # Without parameter
@@ -307,9 +308,9 @@ def f(x, **kwargs):
     assert all_equal(elem_lam, points)
 
 
-def test_element_from_function_2d():
+def test_element_from_function_2d(tspace_impl):
     """Test creation of DiscreteLp elements from functions in 2 dimensions."""
-    space = odl.uniform_discr([-1, -1], [1, 1], (2, 3))
+    space = odl.uniform_discr([-1, -1], [1, 1], (2, 3), impl=tspace_impl)
     points = space.points()
 
     # Without parameter
@@ -366,9 +367,9 @@ def f(x, **kwargs):
     assert all_equal(elem_lam, true_elem)
 
 
-def test_discretelp_zero_one():
+def test_discretelp_zero_one(tspace_impl):
     """Test the zero and one element creators of DiscreteLp."""
-    discr = odl.uniform_discr(0, 1, 3)
+    discr = odl.uniform_discr(0, 1, 3, impl=tspace_impl)
 
     zero = discr.zero()
     assert zero in discr
@@ -742,12 +743,15 @@ def test_cell_volume():
     assert elem.cell_volume == 0.5
 
 
-def test_astype():
-
-    rdiscr = odl.uniform_discr([0, 0], [1, 1], [2, 2], dtype='float64')
-    cdiscr = odl.uniform_discr([0, 0], [1, 1], [2, 2], dtype='complex128')
-    rdiscr_s = odl.uniform_discr([0, 0], [1, 1], [2, 2], dtype='float32')
-    cdiscr_s = odl.uniform_discr([0, 0], [1, 1], [2, 2], dtype='complex64')
+def test_astype(tspace_impl):
+    rdiscr = odl.uniform_discr([0, 0], [1, 1], [2, 2],
+                               dtype='float64', impl=tspace_impl)
+    cdiscr = odl.uniform_discr([0, 0], [1, 1], [2, 2],
+                               dtype='complex128', impl=tspace_impl)
+    rdiscr_s = odl.uniform_discr([0, 0], [1, 1], [2, 2],
+                                 dtype='float32', impl=tspace_impl)
+    cdiscr_s = odl.uniform_discr([0, 0], [1, 1], [2, 2],
+                                 dtype='complex64', impl=tspace_impl)
 
     # Real
     assert rdiscr.astype('float32') == rdiscr_s
@@ -781,174 +785,202 @@ def testodl_ufuncs(odl_tspace_impl, odl_ufunc):
     space = odl.uniform_discr([0, 0], [1, 1], (2, 3), impl=impl)
     name = odl_ufunc
 
-    # Get the ufunc from numpy as reference
-    npy_ufunc = getattr(np, name)
-    nin = npy_ufunc.nin
-    nout = npy_ufunc.nout
-    if (np.issubsctype(space.dtype, np.floating) and
-            name in ['bitwise_and',
-                     'bitwise_or',
-                     'bitwise_xor',
-                     'invert',
-                     'left_shift',
-                     'right_shift']):
-        # Skip integer only methods if floating point type
-        return
+    # Get the ufunc from numpy as reference, plus some additional info
+    ufunc_npy = getattr(np, name)
+    nin = ufunc_npy.nin
+    nout = ufunc_npy.nout
+
+    # Disable Numpy warnings for the time being
+    npy_err_orig = np.geterr()
+    np.seterr(all='ignore')
+
+    def _check_result_type(result, expected_type):
+        if nout == 1:
+            assert isinstance(result, space.element_type)
+        elif nout > 1:
+            for i in range(nout):
+                assert isinstance(result[i], space.element_type)
+        else:
+            assert False
+
+    def _check_result_is_out(result, out_seq):
+        if nout == 1:
+            assert result is out_seq[0]
+        elif nout > 1:
+            for i in range(nout):
+                assert result[i] is out_seq[i]
+        else:
+            assert False
+
+    # See https://github.com/cupy/cupy/issues/794
+    cupy_ufuncs_broken_complex = [
+        'expm1', 'floor_divide', 'fmin', 'fmax',
+        'greater', 'greater_equal', 'less', 'less_equal', 'log1p', 'log2',
+        'logical_and', 'logical_or', 'logical_not', 'logical_xor', 'minimum',
+        'maximum', 'rint', 'sign', 'square']
+    if (space.impl == 'cupy' and
+            space.dtype.kind == 'c' and
+            ufunc in cupy_ufuncs_broken_complex):
+        pytest.xfail('ufunc {} broken for complex input in cupy'.format(ufunc))
 
     # Create some data
     arrays, elements = noise_elements(space, nin + nout)
-    in_arrays = arrays[:nin]
-    out_arrays = arrays[nin:]
+    # Arrays of the space's own data storage type
+    in_arrays_own = arrays[:nin]
+    in_arrays_npy = [as_numpy(arr) for arr in arrays[:nin]]
     data_elem = elements[0]
-    out_elems = elements[nin:]
-
-    if nout == 1:
-        out_arr_kwargs = {'out': out_arrays[0]}
-        out_elem_kwargs = {'out': out_elems[0]}
-    elif nout > 1:
-        out_arr_kwargs = {'out': out_arrays[:nout]}
-        out_elem_kwargs = {'out': out_elems[:nout]}
 
     # Get function to call, using both interfaces:
-    # - vec.ufunc(other_args)
-    # - np.ufunc(vec, other_args)
-    elem_fun_old = getattr(data_elem.ufuncs, name)
-    in_elems_old = elements[1:nin]
-    elem_fun_new = npy_ufunc
-    in_elems_new = elements[:nin]
-
-    # Out-of-place
-    with np.errstate(all='ignore'):  # avoid pytest warnings
-        npy_result = npy_ufunc(*in_arrays)
-        odl_result_old = elem_fun_old(*in_elems_old)
-        assert all_almost_equal(npy_result, odl_result_old)
-        odl_result_new = elem_fun_new(*in_elems_new)
-        assert all_almost_equal(npy_result, odl_result_new)
-
-    # Test type of output
+    # - vec.ufunc(*other_args)
+    # - np.ufunc(vec, *other_args)
+    ufunc_method = getattr(data_elem.ufuncs, name)
+    in_elems_method = elements[1:nin]
+    in_elems_npy = elements[:nin]
+
+    # If Numpy fails, mark the test as xfail (same below)
+    try:
+        result_npy = ufunc_npy(*in_arrays_npy)
+    except TypeError:
+        pytest.xfail('numpy ufunc not valid for inputs')
+
+    # Out-of-place -- in = elements -- ufunc = method or numpy
+    result = ufunc_method(*in_elems_method)
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, space.element_type)
+
+    # Get element(s) in the right space for in-place later
     if nout == 1:
-        assert isinstance(odl_result_old, space.element_type)
-        assert isinstance(odl_result_new, space.element_type)
-    elif nout > 1:
-        for i in range(nout):
-            assert isinstance(odl_result_old[i], space.element_type)
-            assert isinstance(odl_result_new[i], space.element_type)
-
-    # In-place with ODL objects as `out`
-    with np.errstate(all='ignore'):  # avoid pytest warnings
-        npy_result = npy_ufunc(*in_arrays, **out_arr_kwargs)
-        odl_result_old = elem_fun_old(*in_elems_old, **out_elem_kwargs)
-        assert all_almost_equal(npy_result, odl_result_old)
-        if USE_ARRAY_UFUNCS_INTERFACE:
-            # In-place will not work with Numpy < 1.13
-            odl_result_new = elem_fun_new(*in_elems_new, **out_elem_kwargs)
-            assert all_almost_equal(npy_result, odl_result_new)
-
-    # Check that returned stuff refers to given out
+        out_elems = [result.space.element()]
+    elif nout == 2:
+        out_elems = [res.space.element() for res in result]
+    else:
+        assert False
+
+    result = ufunc_npy(*in_elems_npy)
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, space.element_type)
+
+    # Out-of-place -- in = numpy or own arrays -- ufunc = method
+    result = ufunc_method(*in_arrays_npy[1:])
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, space.element_type)
+
+    result = ufunc_method(*in_arrays_own[1:])
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, space.element_type)
+
+    # In-place -- in = elements -- out = elements -- ufunc = method or numpy
     if nout == 1:
-        assert odl_result_old is out_elems[0]
-        if USE_ARRAY_UFUNCS_INTERFACE:
-            assert odl_result_new is out_elems[0]
-    elif nout > 1:
-        for i in range(nout):
-            assert odl_result_old[i] is out_elems[i]
-            if USE_ARRAY_UFUNCS_INTERFACE:
-                assert odl_result_new[i] is out_elems[i]
-
-    # In-place with Numpy array as `out` for new interface
-    if USE_ARRAY_UFUNCS_INTERFACE:
-        out_arrays_new = tuple(np.empty_like(arr) for arr in out_arrays)
-        if nout == 1:
-            out_arr_kwargs_new = {'out': out_arrays_new[0]}
-        elif nout > 1:
-            out_arr_kwargs_new = {'out': out_arrays_new[:nout]}
+        kwargs_out = {'out': out_elems[0]}
+    elif nout == 2:
+        kwargs_out = {'out1': out_elems[0], 'out2': out_elems[1]}
 
-        with np.errstate(all='ignore'):  # avoid pytest warnings
-            odl_result_arr_new = elem_fun_new(*in_elems_new,
-                                              **out_arr_kwargs_new)
-        assert all_almost_equal(npy_result, odl_result_arr_new)
+    result = ufunc_method(*in_elems_method, **kwargs_out)
+    assert all_almost_equal(result_npy, result)
+    _check_result_is_out(result, out_elems[:nout])
+    if USE_ARRAY_UFUNCS_INTERFACE:
+        # Custom objects not allowed as `out` for numpy < 1.13
 
         if nout == 1:
-            assert odl_result_arr_new is out_arrays_new[0]
-        elif nout > 1:
-            for i in range(nout):
-                assert odl_result_arr_new[i] is out_arrays_new[i]
+            kwargs_out = {'out': out_elems[0]}
+        elif nout == 2:
+            kwargs_out = {'out': (out_elems[0], out_elems[1])}
+
+        result = ufunc_npy(*in_elems_npy, **kwargs_out)
+        assert all_almost_equal(result_npy, result)
+        _check_result_is_out(result, out_elems[:nout])
 
-    # In-place with data container (tensor) as `out` for new interface
+    # In-place -- in = elements -- out = numpy or own arrays -- ufunc = numpy
+    # This case is only supported with the new interface
     if USE_ARRAY_UFUNCS_INTERFACE:
-        out_tensors_new = tuple(space.tspace.element(np.empty_like(arr))
-                                for arr in out_arrays)
+        # Fresh arrays for output
+        out_arrays_npy = [np.empty_like(as_numpy(arr))
+                          for arr in arrays[nin:]]
+        out_arrays_own = [array_module(space.impl).empty_like(arr)
+                          for arr in arrays[nin:]]
         if nout == 1:
-            out_tens_kwargs_new = {'out': out_tensors_new[0]}
+            kwargs_npy = {'out': out_arrays_npy[0]}
+            kwargs_own = {'out': out_arrays_own[0]}
         elif nout > 1:
-            out_tens_kwargs_new = {'out': out_tensors_new[:nout]}
+            kwargs_npy = {'out': out_arrays_npy[:nout]}
+            kwargs_own = {'out': out_arrays_own[:nout]}
 
-        with np.errstate(all='ignore'):  # avoid pytest warnings
-            odl_result_tens_new = elem_fun_new(*in_elems_new,
-                                               **out_tens_kwargs_new)
-        assert all_almost_equal(npy_result, odl_result_tens_new)
+        try:
+            result_out_npy = ufunc_npy(*in_elems_npy, **kwargs_npy)
+        except TypeError:
+            pytest.xfail('numpy ufunc not valid for inputs')
 
-        if nout == 1:
-            assert odl_result_tens_new is out_tensors_new[0]
-        elif nout > 1:
-            for i in range(nout):
-                assert odl_result_tens_new[i] is out_tensors_new[i]
+        result_out_own = ufunc_npy(*in_elems_npy, **kwargs_own)
+        assert all_almost_equal(result_out_npy, result_npy)
+        assert all_almost_equal(result_out_own, result_npy)
+        _check_result_is_out(result_out_npy, out_arrays_npy)
+        _check_result_is_out(result_out_own, out_arrays_own)
 
     if USE_ARRAY_UFUNCS_INTERFACE:
         # Check `ufunc.at`
         indices = [[0, 0, 1],
                    [0, 1, 2]]
 
-        mod_array = in_arrays[0].copy()
-        mod_elem = in_elems_new[0].copy()
-        if nout > 1:
-            return  # currently not supported by Numpy
+        mod_array = in_arrays_npy[0].copy()
+        mod_elem = in_elems_npy[0].copy()
         if nin == 1:
-            with np.errstate(all='ignore'):  # avoid pytest warnings
-                npy_result = npy_ufunc.at(mod_array, indices)
-                odl_result = npy_ufunc.at(mod_elem, indices)
+            try:
+                result_npy = ufunc_npy.at(mod_array, indices)
+            except TypeError:
+                pytest.xfail('numpy ufunc.at not valid for inputs')
+
+            result = ufunc_npy.at(mod_elem, indices)
+
         elif nin == 2:
-            other_array = in_arrays[1][indices]
-            other_elem = in_elems_new[1][indices]
-            with np.errstate(all='ignore'):  # avoid pytest warnings
-                npy_result = npy_ufunc.at(mod_array, indices, other_array)
-                odl_result = npy_ufunc.at(mod_elem, indices, other_elem)
+            other_array = in_arrays_npy[1][indices]
+            other_elem = in_elems_npy[1][indices]
+            try:
+                result_npy = ufunc_npy.at(mod_array, indices, other_array)
+            except TypeError:
+                pytest.xfail('numpy ufunc.at not valid for inputs')
+
+            result = ufunc_npy.at(mod_elem, indices, other_elem)
 
-        assert all_almost_equal(odl_result, npy_result)
+        assert all_almost_equal(result, result_npy)
 
     # Check `ufunc.reduce`
     if nin == 2 and nout == 1 and USE_ARRAY_UFUNCS_INTERFACE:
-        in_array = in_arrays[0]
-        in_elem = in_elems_new[0]
+        in_array = in_arrays_npy[0]
+        in_elem = in_elems_npy[0]
 
         # We only test along one axis since some binary ufuncs are not
         # re-orderable, in which case Numpy raises a ValueError
-        with np.errstate(all='ignore'):  # avoid pytest warnings
-            npy_result = npy_ufunc.reduce(in_array)
-            odl_result = npy_ufunc.reduce(in_elem)
-            assert all_almost_equal(odl_result, npy_result)
-            # In-place using `out` (with ODL vector and array)
-            out_elem = odl_result.space.element()
-            out_array = np.empty(odl_result.shape,
-                                 dtype=odl_result.dtype)
-            npy_ufunc.reduce(in_elem, out=out_elem)
-            npy_ufunc.reduce(in_elem, out=out_array)
-            assert all_almost_equal(out_elem, odl_result)
-            assert all_almost_equal(out_array, odl_result)
-            # Using a specific dtype
-            try:
-                npy_result = npy_ufunc.reduce(in_array, dtype=complex)
-            except TypeError:
-                # Numpy finds no matching loop, bail out
-                return
-            else:
-                odl_result = npy_ufunc.reduce(in_elem, dtype=complex)
-                assert odl_result.dtype == npy_result.dtype
-                assert all_almost_equal(odl_result, npy_result)
+
+        try:
+            result_npy = ufunc_npy.reduce(in_array)
+        except TypeError:
+            pytest.xfail('numpy ufunc.reduce not valid for inputs')
+
+        # Out-of-place -- in = element
+        result = ufunc_npy.reduce(in_elem)
+        assert all_almost_equal(result, result_npy)
+        # keepdims not allowed
+        with pytest.raises(ValueError):
+            ufunc_npy.reduce(in_elem, keepdims=True)
+
+        # Using a specific dtype
+        try:
+            result_npy = ufunc_npy.reduce(in_array, dtype=complex)
+        except TypeError:
+            pytest.xfail('numpy ufunc.reduce not valid for complex dtype')
+
+        with xfail_if(space.impl == 'cupy',
+                      reason='complex reduce is broken in cupy'):
+            result = ufunc_npy.reduce(in_elem, dtype=complex)
+            assert result.dtype == result_npy.dtype
+            assert all_almost_equal(result, result_npy)
 
     # Other ufunc method use the same interface, to we don't perform
     # extra tests for them.
 
+    # Reset Numpy err handling
+    np.seterr(**npy_err_orig)
+
 
 def test_ufunc_corner_cases(odl_tspace_impl):
     """Check if some corner cases are handled correctly."""
@@ -972,10 +1004,12 @@ def test_ufunc_corner_cases(odl_tspace_impl):
     assert res.space == space
 
     # Check usage of `order` argument
-    for order in ('C', 'F'):
-        res = x.__array_ufunc__(np.sin, '__call__', x, order=order)
-        assert all_almost_equal(res, np.sin(x.asarray()))
-        assert res.tensor.data.flags[order + '_CONTIGUOUS']
+    with xfail_if(tspace_impl == 'cupy',
+                  reason='cupy does not accept `order` in ufuncs'):
+        for order in ('C', 'F'):
+            res = x.__array_ufunc__(np.sin, '__call__', x, order=order)
+            assert all_almost_equal(res, np.sin(x.asarray()))
+            assert res.tensor.data.flags[order + '_CONTIGUOUS']
 
     # Check usage of `dtype` argument
     res = x.__array_ufunc__(np.sin, '__call__', x, dtype=complex)
@@ -1125,16 +1159,20 @@ def test_real_imag(odl_tspace_impl, odl_elem_order):
 
             # With [:] assignment
             x = cdiscr.zero()
-            x.real[:] = assigntype([[2, 3],
-                                    [4, 5]])
-            assert all_equal(x.real, [[2, 3],
-                                      [4, 5]])
+            with xfail_if(tspace_impl == 'cupy',
+                          reason='cupy x.real[:] does not mutate x'):
+                x.real[:] = assigntype([[2, 3],
+                                        [4, 5]])
+                assert all_equal(x.real, [[2, 3],
+                                          [4, 5]])
 
             x = cdiscr.zero()
-            x.imag[:] = assigntype([[2, 3],
-                                    [4, 5]])
-            assert all_equal(x.imag, [[2, 3],
-                                      [4, 5]])
+            with xfail_if(tspace_impl == 'cupy',
+                          reason='cupy x.imag[:] does not mutate x'):
+                x.imag[:] = assigntype([[2, 3],
+                                        [4, 5]])
+                assert all_equal(x.imag, [[2, 3],
+                                          [4, 5]])
 
         # Setting with scalars
         x = cdiscr.zero()
@@ -1163,7 +1201,8 @@ def test_reduction(odl_tspace_impl, odl_reduction):
 
     # Create some data
     x_arr, x = noise_elements(space, 1)
-    assert reduction(x_arr) == pytest.approx(getattr(x.ufuncs, name)())
+    assert (reduction(as_numpy(x_arr)) ==
+            pytest.approx(getattr(x.ufuncs, name)()))
 
 
 def test_power(odl_tspace_impl, power):
@@ -1171,9 +1210,9 @@ def test_power(odl_tspace_impl, power):
     space = odl.uniform_discr([0, 0], [1, 1], [2, 2], impl=impl)
 
     x_arr, x = noise_elements(space, 1)
-    x_pos_arr = np.abs(x_arr)
+    x_pos_arr = array_module(tspace_impl).abs(x_arr)
     x_neg_arr = -x_pos_arr
-    x_pos = np.abs(x)
+    x_pos = x.ufuncs.absolute()
     x_neg = -x_pos
 
     if int(power) != power:
@@ -1182,8 +1221,8 @@ def test_power(odl_tspace_impl, power):
             y += 0.1
 
     with np.errstate(invalid='ignore'):
-        true_pos_pow = np.power(x_pos_arr, power)
-        true_neg_pow = np.power(x_neg_arr, power)
+        true_pos_pow = array_module(tspace_impl).power(x_pos_arr, power)
+        true_neg_pow = array_module(tspace_impl).power(x_neg_arr, power)
 
     if int(power) != power and impl == 'cuda':
         with pytest.raises(ValueError):
diff --git a/odl/test/largescale/trafos/fourier_slow_test.py b/odl/test/largescale/trafos/fourier_slow_test.py
index 9738b96e8a8..7ab5f78eecb 100644
--- a/odl/test/largescale/trafos/fourier_slow_test.py
+++ b/odl/test/largescale/trafos/fourier_slow_test.py
@@ -79,7 +79,7 @@ def charfun_freq_ball(x):
     ball_dom_ft = ft(ball_dom)
     ball_ran_ift = ft.adjoint(ball_ran)
     assert (ball_dom.inner(ball_ran_ift) ==
-            pytest.approx(ball_ran.inner(ball_dom_ft), ndigits=1))
+            pytest.approx(ball_ran.inner(ball_dom_ft), rel=0.1))
 
 
 if __name__ == '__main__':
diff --git a/odl/test/operator/oputils_test.py b/odl/test/operator/oputils_test.py
index 54b0d63f0d5..e628681f407 100644
--- a/odl/test/operator/oputils_test.py
+++ b/odl/test/operator/oputils_test.py
@@ -78,6 +78,31 @@ def test_matrix_representation_product_to_product():
     matrix representation will be ``(2, 3, 2, 3)``.
     """
     n = 3
+    rn = odl.rn(n)
+    A = np.random.rand(n, n)
+    Aop = odl.MatrixOperator(A)
+
+    m = 2
+    rm = odl.rn(m)
+    B = np.random.rand(m, m)
+    Bop = odl.MatrixOperator(B)
+
+    ran_and_dom = ProductSpace(rn, rm)
+
+    AB_matrix = np.vstack([np.hstack([A, np.zeros((n, m))]),
+                           np.hstack([np.zeros((m, n)), B])])
+    ABop = ProductSpaceOperator([[Aop, 0],
+                                 [0, Bop]],
+                                ran_and_dom, ran_and_dom)
+    matrix_repr = matrix_representation(ABop)
+
+    assert all_almost_equal(AB_matrix, matrix_repr)
+
+
+def test_matrix_representation_product_to_product_two():
+    # Verify that the matrix representation function returns the correct matrix
+    n = 3
+    rn = odl.rn(n)
     A = np.random.rand(n, n)
     Aop = odl.MatrixOperator(A)
 
diff --git a/odl/test/set/domain_test.py b/odl/test/set/domain_test.py
index caa75590776..e0bd3453e6e 100644
--- a/odl/test/set/domain_test.py
+++ b/odl/test/set/domain_test.py
@@ -53,6 +53,9 @@ def test_min_pt():
     set_ = IntervalProd([1], [2])
     assert set_.min_pt == 1
 
+    set_ = IntervalProd(1, 2)
+    assert set_.min_pt == 1
+
     set_ = IntervalProd([1, 2, 3], [5, 6, 7])
     assert all_equal(set_.min_pt, [1, 2, 3])
 
diff --git a/odl/test/solvers/functional/functional_test.py b/odl/test/solvers/functional/functional_test.py
index ca79f02bce6..0db26dc84af 100644
--- a/odl/test/solvers/functional/functional_test.py
+++ b/odl/test/solvers/functional/functional_test.py
@@ -355,12 +355,9 @@ def test_functional_sum(space):
                             func1.gradient(x) + func2.gradient(x),
                             ndigits)
 
-    assert (
-        func_sum.derivative(x)(p) ==
-        pytest.approx(
-            func1.gradient(x).inner(p) + func2.gradient(x).inner(p),
-            rel=rtol)
-    )
+    assert (func_sum.derivative(x)(p) ==
+            pytest.approx(func1.gradient(x).inner(p) +
+                          func2.gradient(x).inner(p), rel=rtol))
 
     # Verify that proximal raises
     with pytest.raises(NotImplementedError):
@@ -396,10 +393,8 @@ def test_functional_plus_scalar(space):
     assert all_almost_equal(func_scalar_sum.gradient(x), func.gradient(x),
                             ndigits)
 
-    assert (
-        func_scalar_sum.derivative(x)(p) ==
-        pytest.approx(func.gradient(x).inner(p), rel=rtol)
-    )
+    assert (func_scalar_sum.derivative(x)(p) ==
+            pytest.approx(func.gradient(x).inner(p), rel=rtol))
 
     # Test proximal operator
     sigma = 1.2
@@ -407,11 +402,9 @@ def test_functional_plus_scalar(space):
                             func.proximal(sigma)(x),
                             ndigits)
 
-    # Test convex conjugate
-    assert (
-        func_scalar_sum.convex_conj(x) ==
-        pytest.approx(func.convex_conj(x) - scalar, rel=rtol)
-    )
+    # Test convex conjugate functional
+    assert (func_scalar_sum.convex_conj(x) ==
+            pytest.approx(func.convex_conj(x) - scalar, rel=rtol))
 
     assert all_almost_equal(func_scalar_sum.convex_conj.gradient(x),
                             func.convex_conj.gradient(x),
@@ -472,11 +465,10 @@ def test_translation_of_functional(space):
         second_translation)
 
     # Evaluation
-    assert (
-        double_translated_functional(x) ==
-        pytest.approx(test_functional(x - translation - second_translation),
-                      rel=rtol)
-    )
+    assert (double_translated_functional(x) ==
+            pytest.approx(
+                test_functional(x - translation - second_translation),
+                rel=rtol))
 
 
 def test_translation_proximal_stepsizes():
diff --git a/odl/test/space/pspace_test.py b/odl/test/space/pspace_test.py
index ad7acd1ad51..d6bce1858c4 100644
--- a/odl/test/space/pspace_test.py
+++ b/odl/test/space/pspace_test.py
@@ -160,7 +160,7 @@ def test_is_power_space():
 
 
 def test_mixed_space():
-    """Verify that a mixed productspace is handled properly."""
+    """Verify that a mixed product space is handled properly."""
     r2_1 = odl.rn(2, dtype='float64')
     r2_2 = odl.rn(2, dtype='float32')
     pspace = odl.ProductSpace(r2_1, r2_2)
@@ -250,24 +250,31 @@ def test_metric():
     HxH = odl.ProductSpace(H, H, exponent=1.0)
     w1 = HxH.element([v11, v12])
     w2 = HxH.element([v21, v22])
-    assert (HxH.dist(w1, w2) ==
-            pytest.approx(H.dist(v11, v21) + H.dist(v12, v22)))
+
+    dist = HxH.dist(w1, w2)
+    expected_dist = np.linalg.norm([H.dist(v11, v21), H.dist(v12, v22)],
+                                   ord=1.0)
+    assert dist == pytest.approx(expected_dist)
 
     # 2-norm
     HxH = odl.ProductSpace(H, H, exponent=2.0)
     w1 = HxH.element([v11, v12])
     w2 = HxH.element([v21, v22])
-    assert (
-        HxH.dist(w1, w2) ==
-        pytest.approx((H.dist(v11, v21) ** 2 + H.dist(v12, v22) ** 2) ** 0.5)
-    )
+
+    dist = HxH.dist(w1, w2)
+    expected_dist = np.linalg.norm([H.dist(v11, v21), H.dist(v12, v22)],
+                                   ord=2.0)
+    assert dist == pytest.approx(expected_dist)
 
     # inf norm
     HxH = odl.ProductSpace(H, H, exponent=float('inf'))
     w1 = HxH.element([v11, v12])
     w2 = HxH.element([v21, v22])
-    assert (HxH.dist(w1, w2) ==
-            pytest.approx(max(H.dist(v11, v21), H.dist(v12, v22))))
+
+    dist = HxH.dist(w1, w2)
+    expected_dist = np.linalg.norm([H.dist(v11, v21), H.dist(v12, v22)],
+                                   ord='inf')
+    assert dist == pytest.approx(expected_dist)
 
 
 def test_norm():
@@ -278,18 +285,28 @@ def test_norm():
     # 1-norm
     HxH = odl.ProductSpace(H, H, exponent=1.0)
     w = HxH.element([v1, v2])
-    assert HxH.norm(w) == pytest.approx(H.norm(v1) + H.norm(v2))
+
+    norm = HxH.norm(w)
+    expected_norm = sum([H.norm(v1), H.norm(v2)])
+    assert norm == pytest.approx(expected_norm)
 
     # 2-norm
     HxH = odl.ProductSpace(H, H, exponent=2.0)
     w = HxH.element([v1, v2])
-    assert (HxH.norm(w) ==
-            pytest.approx((H.norm(v1) ** 2 + H.norm(v2) ** 2) ** (1 / 2.0)))
+
+    norm = HxH.norm(w)
+    expected_norm = np.sqrt(
+        sum(n ** 2 for n in (H.norm(v1), H.norm(v2)))
+    )
+    assert norm == pytest.approx(expected_norm)
 
     # inf norm
     HxH = odl.ProductSpace(H, H, exponent=float('inf'))
     w = HxH.element([v1, v2])
-    assert HxH.norm(w) == pytest.approx(max(H.norm(v1), H.norm(v2)))
+
+    norm = HxH.norm(w)
+    expected_norm = max([H.norm(v1), H.norm(v2)])
+    assert norm == pytest.approx(expected_norm)
 
 
 def test_inner():
@@ -303,7 +320,10 @@ def test_inner():
     HxH = odl.ProductSpace(H, H)
     v = HxH.element([v1, v2])
     u = HxH.element([u1, u2])
-    assert HxH.inner(v, u) == pytest.approx(H.inner(v1, u1) + H.inner(v2, u2))
+
+    inner = HxH.inner(v, u)
+    expected_inner = sum([H.inner(v1, u1), H.inner(v2, u2)])
+    assert inner == pytest.approx(expected_inner)
 
 
 def test_vector_weighting(exponent):
@@ -837,8 +857,7 @@ def test_element_setitem_broadcast():
 
 
 def test_unary_ops():
-    # Verify that the unary operators (`+x` and `-x`) work as expected
-
+    """Verify that the unary operators ``+x`` and ``-x`` work as expected."""
     space = odl.rn(3)
     pspace = odl.ProductSpace(space, 2)
 
@@ -945,6 +964,23 @@ def test_ufuncs():
     assert w is z
     assert all_almost_equal(z, [[5], [7, 9]])
 
+    # Broadcasting
+    pow_space = odl.rn(4) ** (2, 3)  # corresponds to rn((2, 3, 4))
+    x = pow_space.one()
+    x_arr = np.ones((2, 3, 4))
+    y = x.ufuncs.add([1, 2, 3, 4])  # bcast along axes 0 and 1
+    y_arr = np.add(x_arr, [1, 2, 3, 4])
+    assert all_almost_equal(y, y_arr)
+    y = x.ufuncs.add(np.ones((3, 4)))  # bcast along axis 0
+    y_arr = np.add(x_arr, np.ones((3, 4)))
+    assert all_almost_equal(y, y_arr)
+    y = x.ufuncs.add(np.ones((2, 1, 4)))  # bcast along axis 1
+    y_arr = np.add(x_arr, np.ones((2, 1, 4)))
+    assert all_almost_equal(y, y_arr)
+    y = x.ufuncs.add((odl.rn(4) ** 3).one())  # bcast along axis 0
+    y_arr = np.add(x_arr, np.ones((3, 4)))
+    assert all_almost_equal(y, y_arr)
+
 
 def test_reductions():
     H = odl.ProductSpace(odl.rn(1), odl.rn(2))
diff --git a/odl/test/space/tensors_test.py b/odl/test/space/tensors_test.py
index 1316f850c6f..2fde9b84b19 100644
--- a/odl/test/space/tensors_test.py
+++ b/odl/test/space/tensors_test.py
@@ -17,14 +17,22 @@
 
 import odl
 from odl.set.space import LinearSpaceTypeError
+from odl.space.entry_points import tensor_space_impl
 from odl.space.npy_tensors import (
-    NumpyTensor, NumpyTensorSpace,
+    NumpyTensorSpace,
     NumpyTensorSpaceConstWeighting, NumpyTensorSpaceArrayWeighting,
     NumpyTensorSpaceCustomInner, NumpyTensorSpaceCustomNorm,
     NumpyTensorSpaceCustomDist)
+from odl.space.cupy_tensors import (
+    CupyTensorSpace,
+    CupyTensorSpaceConstWeighting, CupyTensorSpaceArrayWeighting,
+    CupyTensorSpaceCustomInner, CupyTensorSpaceCustomNorm,
+    CupyTensorSpaceCustomDist,
+    CUPY_AVAILABLE, cupy)
+from odl.util import array_module, array_cls, as_numpy
 from odl.util.testutils import (
-    all_almost_equal, all_equal, simple_fixture,
-    noise_array, noise_element, noise_elements)
+    all_almost_equal, all_equal, simple_fixture, skip_if_no_cupy,
+    noise_array, noise_element, noise_elements, xfail_if)
 from odl.util.ufuncs import UFUNCS
 
 
@@ -35,28 +43,22 @@
                               parse_version('1.13'))
 
 
-# Functions to return arrays and classes corresponding to impls. Extend
+# Functions to return arrays, classes etc. corresponding to impls. Extend
 # when a new impl is available.
 
-def _pos_array(space):
-    """Create an array with positive real entries in ``space``."""
-    return np.abs(noise_array(space)) + 0.1
-
-
-def _array_cls(impl):
-    """Return the array class for given impl."""
-    if impl == 'numpy':
-        return np.ndarray
+def _data_ptr(array):
+    """Return the memory address of the given array (depending on impl)."""
+    if isinstance(array, np.ndarray):
+        return array.ctypes.data
+    elif isinstance(array, cupy.ndarray):
+        return array.data.ptr
     else:
         assert False
 
 
-def _odl_tensor_cls(impl):
-    """Return the ODL tensor class for given impl."""
-    if impl == 'numpy':
-        return NumpyTensor
-    else:
-        assert False
+def _pos_array(space):
+    """Create an array with positive real entries for ``space``."""
+    return array_module(space.impl).asarray(abs(noise_array(space)) + 0.1)
 
 
 def _weighting_cls(impl, kind):
@@ -74,6 +76,21 @@ def _weighting_cls(impl, kind):
             return NumpyTensorSpaceCustomDist
         else:
             assert False
+
+    elif impl == 'cupy':
+        if kind == 'array':
+            return CupyTensorSpaceArrayWeighting
+        elif kind == 'const':
+            return CupyTensorSpaceConstWeighting
+        elif kind == 'inner':
+            return CupyTensorSpaceCustomInner
+        elif kind == 'norm':
+            return CupyTensorSpaceCustomNorm
+        elif kind == 'dist':
+            return CupyTensorSpaceCustomDist
+        else:
+            assert False
+
     else:
         assert False
 
@@ -102,9 +119,14 @@ def weight(request):
 
 @pytest.fixture(scope='module')
 def tspace(odl_floating_dtype, odl_tspace_impl):
-    impl = odl_tspace_impl
-    dtype = odl_floating_dtype
-    return odl.tensor_space(shape=(3, 4), dtype=dtype, impl=impl)
+    available_dtypes = tensor_space_impl(odl_tspace_impl).available_dtypes()
+    if odl_floating_dtype not in available_dtypes:
+        pytest.skip('dtype {} not supported by impl {!r}'
+                    ''.format(odl_floating_dtype, odl_tspace_impl))
+    else:
+        return odl.tensor_space(shape=(3, 4),
+                                dtype=odl_floating_dtype,
+                                impl=odl_tspace_impl)
 
 
 # --- Space classes --- #
@@ -121,6 +143,9 @@ def test_init_npy_tspace():
     NumpyTensorSpace((3, 4), dtype=complex, exponent=float('inf'))
     NumpyTensorSpace((3, 4), dtype='S1')
 
+    with pytest.raises(ValueError):
+        NumpyTensorSpace((3, 4), dtype=object)
+
     # Alternative constructor
     odl.tensor_space((3, 4))
     odl.tensor_space((3, 4), dtype=int)
@@ -160,24 +185,78 @@ def test_init_npy_tspace():
     odl.rn((3, 4), weighting=weight_arr)
 
 
+def test_init_cupy_tspace():
+    """Test initialization patterns and options for ``CupyTensorSpace``."""
+    if not CUPY_AVAILABLE:
+        pytest.skip('cupy backend not available')
+
+    # Basic class constructor
+    CupyTensorSpace((3, 4))
+    CupyTensorSpace((3, 4), dtype=int)
+    CupyTensorSpace((3, 4), dtype=float)
+    CupyTensorSpace((3, 4), dtype=complex)
+    CupyTensorSpace((3, 4), dtype=complex, exponent=1.0)
+    CupyTensorSpace((3, 4), dtype=complex, exponent=float('inf'))
+
+    with pytest.raises(ValueError):
+        CupyTensorSpace((3, 4), dtype='S1')
+    with pytest.raises(ValueError):
+        CupyTensorSpace((3, 4), dtype=object)
+
+    # Alternative constructor
+    odl.tensor_space((3, 4), impl='cupy')
+    odl.tensor_space((3, 4), dtype=int, impl='cupy')
+    odl.tensor_space((3, 4), exponent=1.0, impl='cupy')
+
+    # Constructors for real spaces
+    odl.rn((3, 4), impl='cupy')
+    odl.rn((3, 4), dtype='float32', impl='cupy')
+    odl.rn(3, impl='cupy')
+    odl.rn(3, dtype='float32', impl='cupy')
+
+    # Works only for real data types
+    with pytest.raises(ValueError):
+        odl.rn((3, 4), complex, impl='cupy')
+    with pytest.raises(ValueError):
+        odl.rn(3, int, impl='cupy')
+    with pytest.raises(ValueError):
+        odl.rn(3, 'S1', impl='cupy')
+
+    # Constructors for complex spaces
+    odl.cn((3, 4), impl='cupy')
+    odl.cn((3, 4), dtype='complex64', impl='cupy')
+    odl.cn(3, impl='cupy')
+    odl.cn(3, dtype='complex64', impl='cupy')
+
+    # Works only for complex data types
+    with pytest.raises(ValueError):
+        odl.cn((3, 4), float, impl='cupy')
+    with pytest.raises(ValueError):
+        odl.cn(3, 'S1', impl='cupy')
+
+    # Init with weights or custom space functions
+    weight_const = 1.5
+    weight_arr = _pos_array(odl.rn((3, 4), float))
+
+    odl.rn((3, 4), weighting=weight_const, impl='cupy')
+    odl.rn((3, 4), weighting=weight_arr, impl='cupy')
+
+
 def test_init_tspace_weighting(weight, exponent, odl_tspace_impl):
     """Test if weightings during init give the correct weighting classes."""
     impl = odl_tspace_impl
-    space = odl.tensor_space((3, 4), weighting=weight, exponent=exponent,
-                             impl=impl)
+    if impl == 'cupy' and isinstance(weight, np.ndarray):
+        # Need cast before using in space creation since
+        # ArrayWeighting.__eq__ uses `arr1 is arr2` to check arrays
+        weight = cupy.asarray(weight)
 
-    if impl == 'numpy':
-        if isinstance(weight, np.ndarray):
-            weighting_cls = _weighting_cls(impl, 'array')
-        else:
-            weighting_cls = _weighting_cls(impl, 'const')
+    if isinstance(weight, array_cls(impl)):
+        weighting_cls = _weighting_cls(impl, 'array')
     else:
-        assert False
+        weighting_cls = _weighting_cls(impl, 'const')
 
     weighting = weighting_cls(weight, exponent)
 
-    assert space.weighting == weighting
-
     # Using a weighting instance
     space = odl.tensor_space((3, 4), weighting=weighting, exponent=exponent,
                              impl=impl)
@@ -193,8 +272,8 @@ def test_init_tspace_weighting(weight, exponent, odl_tspace_impl):
             bad_dtype = np.ones((3, 4), dtype=complex)
             odl.tensor_space((3, 4), weighting=bad_dtype)
 
-        with pytest.raises(TypeError):
-            odl.tensor_space((3, 4), weighting=1j)  # float() conversion
+    with pytest.raises(TypeError):
+        odl.tensor_space((3, 4), weighting=1j)  # float() conversion
 
 
 def test_properties(odl_tspace_impl):
@@ -211,6 +290,9 @@ def test_properties(odl_tspace_impl):
     assert x.itemsize == 4
     assert x.nbytes == 4 * 3 * 4
 
+    if impl == 'cupy':
+        assert x.device == space.device == cupy.cuda.get_device_id()
+
 
 def test_size(odl_tspace_impl):
     """Test that size handles corner cases appropriately."""
@@ -249,8 +331,9 @@ def test_element(tspace, odl_elem_order):
     else:
         assert elem.data.flags[order + '_CONTIGUOUS']
 
-    # From Numpy array (C order)
-    arr_c = np.random.rand(*tspace.shape).astype(tspace.dtype)
+    # From array (C order)
+    arr_c = array_module(tspace.impl).asarray(
+        np.ascontiguousarray(noise_array(tspace)))
     elem = tspace.element(arr_c, order=order)
     assert all_equal(elem, arr_c)
     assert elem.shape == elem.data.shape
@@ -258,12 +341,15 @@ def test_element(tspace, odl_elem_order):
     if order is None or order == 'C':
         # None or same order should not lead to copy
         assert np.may_share_memory(elem.data, arr_c)
+    if order is not None:
+        assert _data_ptr(elem.data) == _data_ptr(arr_c)
     if order is not None:
         # Contiguousness in explicitly provided order should be guaranteed
         assert elem.data.flags[order + '_CONTIGUOUS']
 
-    # From Numpy array (F order)
-    arr_f = np.asfortranarray(arr_c)
+    # From array (F order)
+    arr_f = array_module(tspace.impl).asarray(
+        np.asfortranarray(noise_array(tspace)))
     elem = tspace.element(arr_f, order=order)
     assert all_equal(elem, arr_f)
     assert elem.shape == elem.data.shape
@@ -271,29 +357,38 @@ def test_element(tspace, odl_elem_order):
     if order is None or order == 'F':
         # None or same order should not lead to copy
         assert np.may_share_memory(elem.data, arr_f)
+    if order is not None:
+        assert _data_ptr(elem.data) == _data_ptr(arr_f)
     if order is not None:
         # Contiguousness in explicitly provided order should be guaranteed
         assert elem.data.flags[order + '_CONTIGUOUS']
 
+    # From Numpy array
+    arr = np.random.rand(*tspace.shape).astype(tspace.dtype)
+    elem = tspace.element(arr, order=order)
+    assert all_equal(elem, arr)
+    assert elem.shape == elem.data.shape
+    assert elem.dtype == tspace.dtype == elem.data.dtype
+
     # From pointer
-    arr_c_ptr = arr_c.ctypes.data
-    elem = tspace.element(data_ptr=arr_c_ptr, order='C')
-    assert all_equal(elem, arr_c)
-    assert np.may_share_memory(elem.data, arr_c)
-    arr_f_ptr = arr_f.ctypes.data
-    elem = tspace.element(data_ptr=arr_f_ptr, order='F')
-    assert all_equal(elem, arr_f)
-    assert np.may_share_memory(elem.data, arr_f)
+    if tspace.impl == 'numpy':
+        arr_c_ptr = arr_c.ctypes.data
+        elem = tspace.element(data_ptr=arr_c_ptr, order='C')
+        assert all_equal(elem, arr_c)
+        assert np.may_share_memory(elem.data, arr_c)
+        arr_f_ptr = arr_f.ctypes.data
+        elem = tspace.element(data_ptr=arr_f_ptr, order='F')
+        assert all_equal(elem, arr_f)
+        assert np.may_share_memory(elem.data, arr_f)
 
-    # Check errors
-    with pytest.raises(ValueError):
-        tspace.element(order='A')  # only 'C' or 'F' valid
+        with pytest.raises(ValueError):
+            tspace.element(data_ptr=arr_c_ptr)  # need order argument
 
-    with pytest.raises(ValueError):
-        tspace.element(data_ptr=arr_c_ptr)  # need order argument
+        with pytest.raises(TypeError):
+            tspace.element(arr_c, arr_c_ptr)  # forbidden to give both
 
-    with pytest.raises(TypeError):
-        tspace.element(arr_c, arr_c_ptr)  # forbidden to give both
+    with pytest.raises(ValueError):
+        tspace.element(order='A')  # only 'C', 'F' or None valid
 
 
 def test_equals_space(odl_tspace_impl):
@@ -490,7 +585,7 @@ def test_multiply(tspace):
 
 def test_multiply_exceptions(tspace):
     """Test if multiply raises correctly for bad input."""
-    other_space = odl.rn((4, 3))
+    other_space = odl.rn((4, 3), impl=tspace.impl)
 
     other_x = other_space.zero()
     x, y = tspace.zero(), tspace.zero()
@@ -508,8 +603,9 @@ def test_multiply_exceptions(tspace):
 def test_power(tspace):
     """Test ``**`` against direct array exponentiation."""
     [x_arr, y_arr], [x, y] = noise_elements(tspace, n=2)
-    y_pos = tspace.element(np.abs(y) + 0.1)
-    y_pos_arr = np.abs(y_arr) + 0.1
+    y_pos = tspace.element(y.ufuncs.absolute() + 0.1)
+    y_pos_arr = array_module(tspace.impl).abs(y_arr) + 0.1
+    y_pos_arr = y_pos_arr.astype(tspace.dtype)
 
     # Testing standard positive integer power out-of-place and in-place
     assert all_almost_equal(x ** 2, x_arr ** 2)
@@ -631,16 +727,16 @@ def test_inner(tspace):
     """Test the inner method against numpy.vdot."""
     xd = noise_element(tspace)
     yd = noise_element(tspace)
-
-    # TODO: add weighting
     correct_inner = np.vdot(yd, xd)
-    assert tspace.inner(xd, yd) == pytest.approx(correct_inner)
-    assert xd.inner(yd) == pytest.approx(correct_inner)
+
+    # Allow some error for single and half precision
+    assert tspace.inner(xd, yd) == pytest.approx(correct_inner, rel=1e-2)
+    assert xd.inner(yd) == pytest.approx(correct_inner, rel=1e-2)
 
 
 def test_inner_exceptions(tspace):
     """Test if inner raises correctly for bad input."""
-    other_space = odl.rn((4, 3))
+    other_space = odl.rn((4, 3), impl=tspace.impl)
     other_x = other_space.zero()
     x = tspace.zero()
 
@@ -654,27 +750,33 @@ def test_inner_exceptions(tspace):
 def test_norm(tspace):
     """Test the norm method against numpy.linalg.norm."""
     xarr, x = noise_elements(tspace)
+    correct_norm = np.linalg.norm(as_numpy(xarr.ravel()))
 
-    correct_norm = np.linalg.norm(xarr.ravel())
-    assert tspace.norm(x) == pytest.approx(correct_norm)
-    assert x.norm() == pytest.approx(correct_norm)
+    # Allow some error for single and half precision
+    assert tspace.norm(x) == pytest.approx(correct_norm, rel=1e-2)
+    assert x.norm() == pytest.approx(correct_norm, rel=1e-2)
 
 
 def test_norm_exceptions(tspace):
     """Test if norm raises correctly for bad input."""
-    other_space = odl.rn((4, 3))
+    other_space = odl.rn((4, 3), impl=tspace.impl)
     other_x = other_space.zero()
 
     with pytest.raises(LinearSpaceTypeError):
         tspace.norm(other_x)
 
 
-def test_pnorm(exponent):
+def test_pnorm(exponent, odl_tspace_impl):
     """Test the norm method with p!=2 against numpy.linalg.norm."""
-    for tspace in (odl.rn((3, 4), exponent=exponent),
-                   odl.cn((3, 4), exponent=exponent)):
+    impl = odl_tspace_impl
+    spaces = [odl.rn((3, 4), exponent=exponent, impl=impl)]
+    cls = odl.space.entry_points.tensor_space_impl(impl)
+    if complex in cls.available_dtypes():
+        spaces.append(odl.cn((3, 4), exponent=exponent, impl=impl))
+
+    for tspace in spaces:
         xarr, x = noise_elements(tspace)
-        correct_norm = np.linalg.norm(xarr.ravel(), ord=exponent)
+        correct_norm = np.linalg.norm(as_numpy(xarr.ravel()), ord=exponent)
 
         assert tspace.norm(x) == pytest.approx(correct_norm)
         assert x.norm() == pytest.approx(correct_norm)
@@ -683,15 +785,16 @@ def test_pnorm(exponent):
 def test_dist(tspace):
     """Test the dist method against numpy.linalg.norm of the difference."""
     [xarr, yarr], [x, y] = noise_elements(tspace, n=2)
+    correct_dist = np.linalg.norm(as_numpy((xarr - yarr).ravel()))
 
-    correct_dist = np.linalg.norm((xarr - yarr).ravel())
-    assert tspace.dist(x, y) == pytest.approx(correct_dist)
+    # Allow some error for single and half precision
+    assert tspace.dist(x, y) == pytest.approx(correct_dist, rel=1e-2)
     assert x.dist(y) == pytest.approx(correct_dist)
 
 
 def test_dist_exceptions(tspace):
     """Test if dist raises correctly for bad input."""
-    other_space = odl.rn((4, 3))
+    other_space = odl.rn((4, 3), impl=tspace.impl)
     other_x = other_space.zero()
     x = tspace.zero()
 
@@ -709,10 +812,12 @@ def test_pdist(odl_tspace_impl, exponent):
     cls = odl.space.entry_points.tensor_space_impl(impl)
     if complex in cls.available_dtypes():
         spaces.append(odl.cn((3, 4), exponent=exponent, impl=impl))
+
     for space in spaces:
         [xarr, yarr], [x, y] = noise_elements(space, n=2)
 
-        correct_dist = np.linalg.norm((xarr - yarr).ravel(), ord=exponent)
+        correct_dist = np.linalg.norm(as_numpy((xarr - yarr).ravel()),
+                                      ord=exponent)
         assert space.dist(x, y) == pytest.approx(correct_dist)
         assert x.dist(y) == pytest.approx(correct_dist)
 
@@ -728,7 +833,7 @@ def test_element_getitem(odl_tspace_impl, getitem_indices):
     sliced_shape = x_arr_sliced.shape
     x_sliced = x[getitem_indices]
 
-    if np.isscalar(x_arr_sliced):
+    if np.isscalar(x_sliced):
         assert x_arr_sliced == x_sliced
     else:
         assert x_sliced.shape == sliced_shape
@@ -739,13 +844,12 @@ def test_element_getitem(odl_tspace_impl, getitem_indices):
         assert sliced_spc.shape == sliced_shape
         assert sliced_spc.dtype == space.dtype
         assert sliced_spc.exponent == space.exponent
-        assert sliced_spc.weighting == space.weighting
 
         # Check that we have a view that manipulates the original array
         # (or not, depending on indexing style)
         x_arr_sliced[:] = 0
         x_sliced[:] = 0
-        assert all_equal(x_arr, x)
+        assert all_equal(x, x_arr)
 
 
 def test_element_setitem(odl_tspace_impl, setitem_indices):
@@ -764,13 +868,14 @@ def test_element_setitem(odl_tspace_impl, setitem_indices):
     assert all_equal(x, x_arr)
 
     # Setting values with arrays
-    rhs_arr = np.ones(sliced_shape)
+    rhs_arr = array_module(impl).ones(sliced_shape)
     x_arr[setitem_indices] = rhs_arr
     x[setitem_indices] = rhs_arr
     assert all_equal(x, x_arr)
 
-    # Using a list of lists
+    # Setting values with a list of lists
     rhs_list = (-np.ones(sliced_shape)).tolist()
+    x_arr = as_numpy(x_arr)
     x_arr[setitem_indices] = rhs_list
     x[setitem_indices] = rhs_list
     assert all_equal(x, x_arr)
@@ -782,6 +887,7 @@ def test_element_getitem_bool_array(odl_tspace_impl):
     space = odl.tensor_space((2, 3, 4), dtype='float32', exponent=1,
                              weighting=2, impl=impl)
     bool_space = odl.tensor_space((2, 3, 4), dtype=bool)
+
     x_arr, x = noise_elements(space)
     cond_arr, cond = noise_elements(bool_space)
 
@@ -794,7 +900,6 @@ def test_element_getitem_bool_array(odl_tspace_impl):
     assert sliced_spc.shape == x_arr_sliced.shape
     assert sliced_spc.dtype == space.dtype
     assert sliced_spc.exponent == space.exponent
-    assert sliced_spc.weighting == space.weighting
 
 
 def test_element_setitem_bool_array(odl_tspace_impl):
@@ -827,6 +932,76 @@ def test_element_setitem_bool_array(odl_tspace_impl):
     assert all_equal(x, x_arr)
 
 
+@skip_if_no_cupy
+def test_asarray_numpy_to_cupy(odl_floating_dtype):
+    """Test x.asarray with numpy x and cupy impl and out."""
+    dtype = odl_floating_dtype
+    space = odl.tensor_space((2, 3), dtype=dtype)
+
+    with xfail_if(dtype in ('float128', 'complex256'),
+                  reason='quad precision types not available in cupy'):
+        # Make new array, contiguous
+        x = space.one()
+        x_cpy = x.asarray(impl='cupy')
+        assert isinstance(x_cpy, cupy.ndarray)
+        assert all_equal(x_cpy, x)
+
+        # Write to existing, contiguous
+        out_cpy = cupy.empty((2, 3), dtype=dtype)
+        x_cpy = x.asarray(out=out_cpy)
+        assert x_cpy is out_cpy
+        assert all_equal(out_cpy, x)
+
+        # Make new array, discontiguous
+        arr = np.arange(12).astype(dtype).reshape((2, 6))[:, ::2]
+        x = space.element(arr)
+        assert not (x.data.flags.c_contiguous or x.data.flags.f_contiguous)
+        x_cpy = x.asarray(impl='cupy')
+        assert isinstance(x_cpy, cupy.ndarray)
+        assert all_equal(x_cpy, x)
+
+        # Write to existing, contiguous
+        out_cpy = cupy.empty((2, 3), dtype=dtype)
+        x_cpy = x.asarray(out=out_cpy)
+        assert x_cpy is out_cpy
+        assert all_equal(out_cpy, x)
+
+
+@skip_if_no_cupy
+def test_asarray_cupy_to_numpy(odl_floating_dtype):
+    """Test x.asarray with cupy x and numpy impl and out."""
+    dtype = odl_floating_dtype
+    with xfail_if(dtype in ('float128', 'complex256'),
+                  reason='quad precision types not available in cupy'):
+        space = odl.tensor_space((2, 3), dtype=dtype, impl='cupy')
+
+        # Make new array, contiguous
+        x = space.one()
+        x_npy = x.asarray(impl='numpy')
+        assert isinstance(x_npy, np.ndarray)
+        assert all_equal(x_npy, x)
+
+        # Write to existing, contiguous
+        out_npy = np.empty((2, 3), dtype=dtype)
+        x_npy = x.asarray(out=out_npy)
+        assert x_npy is out_npy
+        assert all_equal(out_npy, x)
+
+        # Make new array, discontiguous
+        arr = cupy.arange(12).astype(dtype).reshape((2, 6))[:, ::2]
+        x = space.element(arr)
+        assert not (x.data.flags.c_contiguous or x.data.flags.f_contiguous)
+        x_npy = x.asarray(impl='numpy')
+        assert isinstance(x_npy, np.ndarray)
+        assert all_equal(x_npy, x)
+
+        # Write to existing, contiguous
+        out_npy = np.empty((2, 3), dtype=dtype)
+        x_npy = x.asarray(out=out_npy)
+        assert x_npy is out_npy
+        assert all_equal(out_npy, x)
+
+
 def test_transpose(odl_tspace_impl):
     """Test the .T property of tensors against plain inner product."""
     impl = odl_tspace_impl
@@ -907,7 +1082,9 @@ def test_conversion_to_scalar(odl_tspace_impl):
     """Test conversion of size-1 vectors/tensors to scalars."""
     impl = odl_tspace_impl
     space = odl.rn(1, impl=impl)
+
     # Size 1 real space
+    space = odl.rn(1, impl=impl)
     value = 1.5
     element = space.element(value)
 
@@ -988,7 +1165,7 @@ def test_array_wrap_method(odl_tspace_impl):
     space = odl.tensor_space((3, 4), dtype='float32', exponent=1, weighting=2,
                              impl=impl)
     x_arr, x = noise_elements(space)
-    y_arr = np.sin(x_arr)
+    y_arr = array_module(impl).sin(x_arr)
     y = np.sin(x)  # Should yield again an ODL tensor
 
     assert all_equal(y, y_arr)
@@ -1008,7 +1185,7 @@ def test_conj(tspace):
     assert all_equal(y, xarr.conj())
 
 
-# --- Weightings (Numpy) --- #
+# --- Weightings --- #
 
 
 def test_array_weighting_init(odl_tspace_impl, exponent):
@@ -1022,8 +1199,8 @@ def test_array_weighting_init(odl_tspace_impl, exponent):
     weighting_arr = weighting_cls(weight_arr, exponent=exponent)
     weighting_elem = weighting_cls(weight_elem, exponent=exponent)
 
-    assert isinstance(weighting_arr.array, _array_cls(impl))
-    assert isinstance(weighting_elem.array, _array_cls(impl))
+    assert isinstance(weighting_arr.array, array_cls(impl))
+    assert isinstance(weighting_elem.array, array_cls(impl))
 
 
 def test_array_weighting_array_is_valid(odl_tspace_impl):
@@ -1039,7 +1216,7 @@ def test_array_weighting_array_is_valid(odl_tspace_impl):
 
     # Invalid
     weight_arr[0] = 0
-    weighting_arr = NumpyTensorSpaceArrayWeighting(weight_arr)
+    weighting_arr = weighting_cls(weight_arr)
     assert not weighting_arr.is_valid()
 
 
@@ -1110,54 +1287,53 @@ def test_array_weighting_inner(tspace):
     [xarr, yarr], [x, y] = noise_elements(tspace, 2)
 
     weight_arr = _pos_array(tspace)
-    weighting = NumpyTensorSpaceArrayWeighting(weight_arr)
+    weighting_cls = _weighting_cls(tspace.impl, 'array')
+    weighting = weighting_cls(weight_arr)
+    true_inner = np.vdot(as_numpy(yarr), as_numpy(xarr * weight_arr))
 
-    true_inner = np.vdot(yarr, xarr * weight_arr)
-    assert weighting.inner(x, y) == pytest.approx(true_inner)
+    # Allow some error for single and half precision
+    assert weighting.inner(x, y) == pytest.approx(true_inner, rel=1e-2)
 
     # Exponent != 2 -> no inner product, should raise
     with pytest.raises(NotImplementedError):
-        NumpyTensorSpaceArrayWeighting(weight_arr, exponent=1.0).inner(x, y)
+        weighting_cls(weight_arr, exponent=1.0).inner(x, y)
 
 
 def test_array_weighting_norm(tspace, exponent):
     """Test norm in a weighted space."""
-    rtol = np.sqrt(np.finfo(tspace.dtype).resolution)
     xarr, x = noise_elements(tspace)
 
     weight_arr = _pos_array(tspace)
-    weighting = NumpyTensorSpaceArrayWeighting(weight_arr, exponent=exponent)
-
+    weighting_cls = _weighting_cls(tspace.impl, 'array')
+    weighting = weighting_cls(weight_arr, exponent=exponent)
     if exponent == float('inf'):
-        true_norm = np.linalg.norm(
-            (weight_arr * xarr).ravel(),
-            ord=float('inf'))
+        true_norm = np.linalg.norm(as_numpy(xarr.ravel()), ord=float('inf'))
     else:
         true_norm = np.linalg.norm(
-            (weight_arr ** (1 / exponent) * xarr).ravel(),
+            as_numpy((weight_arr ** (1 / exponent) * xarr).ravel()),
             ord=exponent)
 
-    assert weighting.norm(x) == pytest.approx(true_norm, rel=rtol)
+    # Allow some error for single and half precision
+    assert weighting.norm(x) == pytest.approx(true_norm, rel=1e-2)
 
 
 def test_array_weighting_dist(tspace, exponent):
     """Test dist product in a weighted space."""
-    rtol = np.sqrt(np.finfo(tspace.dtype).resolution)
     [xarr, yarr], [x, y] = noise_elements(tspace, n=2)
 
     weight_arr = _pos_array(tspace)
-    weighting = NumpyTensorSpaceArrayWeighting(weight_arr, exponent=exponent)
-
+    weighting_cls = _weighting_cls(tspace.impl, 'array')
+    weighting = weighting_cls(weight_arr, exponent=exponent)
     if exponent == float('inf'):
-        true_dist = np.linalg.norm(
-            (weight_arr * (xarr - yarr)).ravel(),
-            ord=float('inf'))
+        true_dist = np.linalg.norm(as_numpy((xarr - yarr).ravel()),
+                                   ord=float('inf'))
     else:
         true_dist = np.linalg.norm(
-            (weight_arr ** (1 / exponent) * (xarr - yarr)).ravel(),
+            as_numpy((weight_arr ** (1 / exponent) * (xarr - yarr)).ravel()),
             ord=exponent)
 
-    assert weighting.dist(x, y) == pytest.approx(true_dist, rel=rtol)
+    # Allow some error for single and half precision
+    assert weighting.dist(x, y) == pytest.approx(true_dist, rel=1e-2)
 
 
 def test_const_weighting_init(odl_tspace_impl, exponent):
@@ -1221,15 +1397,17 @@ def test_const_weighting_inner(tspace):
     [xarr, yarr], [x, y] = noise_elements(tspace, 2)
 
     constant = 1.5
-    true_result_const = constant * np.vdot(yarr, xarr)
+    weighting_cls = _weighting_cls(tspace.impl, 'const')
+    weighting = weighting_cls(constant)
+    true_inner = constant * np.vdot(as_numpy(yarr), as_numpy(xarr))
 
-    w_const = NumpyTensorSpaceConstWeighting(constant)
-    assert w_const.inner(x, y) == pytest.approx(true_result_const)
+    # Allow some error for single and half precision
+    assert weighting.inner(x, y) == pytest.approx(true_inner, rel=1e-2)
 
     # Exponent != 2 -> no inner
-    w_const = NumpyTensorSpaceConstWeighting(constant, exponent=1)
+    weighing = weighting_cls(constant, exponent=1)
     with pytest.raises(NotImplementedError):
-        w_const.inner(x, y)
+        weighing.inner(x, y)
 
 
 def test_const_weighting_norm(tspace, exponent):
@@ -1237,14 +1415,16 @@ def test_const_weighting_norm(tspace, exponent):
     xarr, x = noise_elements(tspace)
 
     constant = 1.5
+    weighting_cls = _weighting_cls(tspace.impl, 'const')
+    weighting = weighting_cls(constant, exponent=exponent)
     if exponent == float('inf'):
-        factor = constant
+        factor = 1.0
     else:
         factor = constant ** (1 / exponent)
-    true_norm = factor * np.linalg.norm(xarr.ravel(), ord=exponent)
+    true_norm = factor * np.linalg.norm(as_numpy(xarr.ravel()), ord=exponent)
 
-    w_const = NumpyTensorSpaceConstWeighting(constant, exponent=exponent)
-    assert w_const.norm(x) == pytest.approx(true_norm)
+    # Allow some error for single and half precision
+    assert weighting.norm(x) == pytest.approx(true_norm, rel=1e-2)
 
 
 def test_const_weighting_dist(tspace, exponent):
@@ -1252,74 +1432,79 @@ def test_const_weighting_dist(tspace, exponent):
     [xarr, yarr], [x, y] = noise_elements(tspace, 2)
 
     constant = 1.5
+    weighting_cls = _weighting_cls(tspace.impl, 'const')
+    weighting = weighting_cls(constant, exponent=exponent)
     if exponent == float('inf'):
-        factor = constant
+        factor = 1.0
     else:
         factor = constant ** (1 / exponent)
-    true_dist = factor * np.linalg.norm((xarr - yarr).ravel(), ord=exponent)
+    true_dist = factor * np.linalg.norm(as_numpy((xarr - yarr).ravel()),
+                                        ord=exponent)
 
-    w_const = NumpyTensorSpaceConstWeighting(constant, exponent=exponent)
-    assert w_const.dist(x, y) == pytest.approx(true_dist)
+    # Allow some error for single and half precision
+    assert weighting.dist(x, y) == pytest.approx(true_dist, rel=1e-2)
 
 
 def test_custom_inner(tspace):
     """Test weighting with a custom inner product."""
-    rtol = np.sqrt(np.finfo(tspace.dtype).resolution)
-
     [xarr, yarr], [x, y] = noise_elements(tspace, 2)
 
     def inner(x, y):
-        return np.vdot(y, x)
+        return np.vdot(as_numpy(y.data), as_numpy(x.data))
 
-    w = NumpyTensorSpaceCustomInner(inner)
-    w_same = NumpyTensorSpaceCustomInner(inner)
-    w_other = NumpyTensorSpaceCustomInner(np.dot)
+    weighting_cls = _weighting_cls(tspace.impl, 'inner')
+    w = weighting_cls(inner)
+    w_same = weighting_cls(inner)
+    w_other = weighting_cls(array_module(tspace.impl).dot)
 
     assert w == w
     assert w == w_same
     assert w != w_other
 
-    true_inner = inner(xarr, yarr)
-    assert w.inner(x, y) == pytest.approx(true_inner)
-
-    true_norm = np.linalg.norm(xarr.ravel())
-    assert w.norm(x) == pytest.approx(true_norm)
+    true_inner = np.vdot(as_numpy(yarr), as_numpy(xarr))
+    true_norm = np.linalg.norm(as_numpy(xarr.ravel()))
+    true_dist = np.linalg.norm(as_numpy((xarr - yarr).ravel()))
 
-    true_dist = np.linalg.norm((xarr - yarr).ravel())
-    assert w.dist(x, y) == pytest.approx(true_dist, rel=rtol)
+    # Allow some error for single and half precision
+    assert w.inner(x, y) == pytest.approx(true_inner, rel=1e-2)
+    assert w.norm(x) == pytest.approx(true_norm, rel=1e-2)
+    assert w.dist(x, y) == pytest.approx(true_dist, rel=1e-2)
 
     with pytest.raises(TypeError):
-        NumpyTensorSpaceCustomInner(1)
+        weighting_cls(1)
 
 
 def test_custom_norm(tspace):
     """Test weighting with a custom norm."""
     [xarr, yarr], [x, y] = noise_elements(tspace, 2)
 
-    norm = np.linalg.norm
+    def norm(x):
+        return np.linalg.norm(as_numpy(x.data).ravel())
 
     def other_norm(x):
-        return np.linalg.norm(x, ord=1)
+        return np.linalg.norm(as_numpy(x.data).ravel(), ord=1)
 
-    w = NumpyTensorSpaceCustomNorm(norm)
-    w_same = NumpyTensorSpaceCustomNorm(norm)
-    w_other = NumpyTensorSpaceCustomNorm(other_norm)
+    weighting_cls = _weighting_cls(tspace.impl, 'norm')
+    w = weighting_cls(norm)
+    w_same = weighting_cls(norm)
+    w_other = weighting_cls(other_norm)
 
     assert w == w
     assert w == w_same
     assert w != w_other
 
-    with pytest.raises(NotImplementedError):
-        w.inner(x, y)
+    true_norm = np.linalg.norm(as_numpy(xarr.ravel()))
+    true_dist = np.linalg.norm(as_numpy((xarr - yarr).ravel()))
 
-    true_norm = np.linalg.norm(xarr.ravel())
-    assert w.norm(x) == pytest.approx(true_norm)
+    # Allow some error for single and half precision
+    assert w.norm(x) == pytest.approx(true_norm, rel=1e-2)
+    assert w.dist(x, y) == pytest.approx(true_dist, rel=1e-2)
 
-    true_dist = np.linalg.norm((xarr - yarr).ravel())
-    assert w.dist(x, y) == pytest.approx(true_dist)
+    with pytest.raises(NotImplementedError):
+        w.inner(x, y)
 
     with pytest.raises(TypeError):
-        NumpyTensorSpaceCustomNorm(1)
+        weighting_cls(1)
 
 
 def test_custom_dist(tspace):
@@ -1327,30 +1512,33 @@ def test_custom_dist(tspace):
     [xarr, yarr], [x, y] = noise_elements(tspace, 2)
 
     def dist(x, y):
-        return np.linalg.norm(x - y)
+        return np.linalg.norm(as_numpy((x - y).data).ravel())
 
     def other_dist(x, y):
-        return np.linalg.norm(x - y, ord=1)
+        return np.linalg.norm(as_numpy((x - y).data).ravel(), ord=1)
 
-    w = NumpyTensorSpaceCustomDist(dist)
-    w_same = NumpyTensorSpaceCustomDist(dist)
-    w_other = NumpyTensorSpaceCustomDist(other_dist)
+    weighting_cls = _weighting_cls(tspace.impl, 'dist')
+    w = weighting_cls(dist)
+    w_same = weighting_cls(dist)
+    w_other = weighting_cls(other_dist)
 
     assert w == w
     assert w == w_same
     assert w != w_other
 
+    true_dist = np.linalg.norm(as_numpy((xarr - yarr).ravel()))
+
+    # Allow some error for single and half precision
+    assert w.dist(x, y) == pytest.approx(true_dist, rel=1e-2)
+
     with pytest.raises(NotImplementedError):
         w.inner(x, y)
 
     with pytest.raises(NotImplementedError):
         w.norm(x)
 
-    true_dist = np.linalg.norm((xarr - yarr).ravel())
-    assert w.dist(x, y) == pytest.approx(true_dist)
-
     with pytest.raises(TypeError):
-        NumpyTensorSpaceCustomDist(1)
+        weighting_cls(1)
 
 
 # --- Ufuncs & Reductions --- #
@@ -1361,166 +1549,242 @@ def testodl_ufuncs(tspace, odl_ufunc):
     name = odl_ufunc
 
     # Get the ufunc from numpy as reference, plus some additional info
-    npy_ufunc = getattr(np, name)
-    nin = npy_ufunc.nin
-    nout = npy_ufunc.nout
-
-    if (np.issubsctype(tspace.dtype, np.floating) or
-            np.issubsctype(tspace.dtype, np.complexfloating) and
-            name in ['bitwise_and',
-                     'bitwise_or',
-                     'bitwise_xor',
-                     'invert',
-                     'left_shift',
-                     'right_shift']):
-        # Skip integer only methods for floating point data types
-        return
-
-    if (np.issubsctype(tspace.dtype, np.complexfloating) and
-            name in ['remainder',
-                     'trunc',
-                     'signbit',
-                     'invert',
-                     'left_shift',
-                     'right_shift',
-                     'rad2deg',
-                     'deg2rad',
-                     'copysign',
-                     'mod',
-                     'modf',
-                     'fmod',
-                     'logaddexp2',
-                     'logaddexp',
-                     'hypot',
-                     'arctan2',
-                     'floor',
-                     'ceil']):
-        # Skip real-only methods for complex data types
-        return
+    ufunc_npy = getattr(np, name)
+    nin = ufunc_npy.nin
+    nout = ufunc_npy.nout
+
+    # Disable Numpy warnings for the time being
+    npy_err_orig = np.geterr()
+    np.seterr(all='ignore')
+
+    def _check_result_type(result, expected_type):
+        if nout == 1:
+            assert isinstance(result, tspace.element_type)
+        elif nout > 1:
+            for i in range(nout):
+                assert isinstance(result[i], tspace.element_type)
+        else:
+            assert False
+
+    def _check_result_is_out(result, out_seq):
+        if nout == 1:
+            assert result is out_seq[0]
+        elif nout > 1:
+            for i in range(nout):
+                assert result[i] is out_seq[i]
+        else:
+            assert False
+
+    # See https://github.com/cupy/cupy/issues/794
+    cupy_ufuncs_broken_complex = [
+        'expm1', 'floor_divide', 'fmin', 'fmax',
+        'greater', 'greater_equal', 'less', 'less_equal', 'log1p', 'log2',
+        'logical_and', 'logical_or', 'logical_not', 'logical_xor', 'minimum',
+        'maximum', 'rint', 'sign', 'square']
+    if (tspace.impl == 'cupy' and
+            tspace.dtype.kind == 'c' and
+            ufunc in cupy_ufuncs_broken_complex):
+        pytest.xfail('ufunc {} broken for complex input in cupy'.format(name))
 
     # Create some data
     arrays, elements = noise_elements(tspace, nin + nout)
-    in_arrays = arrays[:nin]
-    out_arrays = arrays[nin:]
+    # Arrays of the space's own data storage type
+    in_arrays_own = arrays[:nin]
+    in_arrays_npy = [as_numpy(arr) for arr in in_arrays_own]
+    out_arrays_own = arrays[:nin]
+    out_arrays_npy = [as_numpy(arr) for arr in out_arrays_own]
     data_elem = elements[0]
 
     out_elems = elements[nin:]
     if nout == 1:
-        out_arr_kwargs = {'out': out_arrays[0]}
+        out_arr_own_kwargs = {'out': out_arrays_own[0]}
+        out_arr_npy_kwargs = {'out': out_arrays_npy[0]}
         out_elem_kwargs = {'out': out_elems[0]}
     elif nout > 1:
-        out_arr_kwargs = {'out': out_arrays[:nout]}
+        out_arr_own_kwargs = {'out': out_arrays_own[:nout]}
+        out_arr_npy_kwargs = {'out': out_arrays_npy[:nout]}
         out_elem_kwargs = {'out': out_elems[:nout]}
 
     # Get function to call, using both interfaces:
-    # - vec.ufunc(other_args)
-    # - np.ufunc(vec, other_args)
-    elem_fun_old = getattr(data_elem.ufuncs, name)
-    in_elems_old = elements[1:nin]
-    elem_fun_new = npy_ufunc
-    in_elems_new = elements[:nin]
-
-    # Out-of-place
-    npy_result = npy_ufunc(*in_arrays)
-    odl_result_old = elem_fun_old(*in_elems_old)
-    assert all_almost_equal(npy_result, odl_result_old)
-    odl_result_new = elem_fun_new(*in_elems_new)
-    assert all_almost_equal(npy_result, odl_result_new)
-
-    # Test type of output
+    # - vec.ufunc(*other_args)
+    # - np.ufunc(vec, *other_args)
+    ufunc_method = getattr(data_elem.ufuncs, name)
+    in_elems_method = elements[1:nin]
+    in_elems_npy = elements[:nin]
+
+    # If Numpy fails, mark the test as xfail (same below)
+    try:
+        result_npy = ufunc_npy(*in_arrays_npy)
+    except TypeError:
+        pytest.xfail('numpy ufunc not valid for inputs')
+
+    # Out-of-place -- in = elements -- ufunc = method or numpy
+    result = ufunc_method(*in_elems_method)
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, tspace.element_type)
+
+    # Get element(s) in the right space for in-place later
     if nout == 1:
-        assert isinstance(odl_result_old, tspace.element_type)
-        assert isinstance(odl_result_new, tspace.element_type)
-    elif nout > 1:
-        for i in range(nout):
-            assert isinstance(odl_result_old[i], tspace.element_type)
-            assert isinstance(odl_result_new[i], tspace.element_type)
-
-    # In-place with ODL objects as `out`
-    npy_result = npy_ufunc(*in_arrays, **out_arr_kwargs)
-    odl_result_old = elem_fun_old(*in_elems_old, **out_elem_kwargs)
-    assert all_almost_equal(npy_result, odl_result_old)
+        out_elems = [result.space.element()]
+    elif nout == 2:
+        out_elems = [res.space.element() for res in result]
+    else:
+        assert False
+
+    result = ufunc_npy(*in_elems_npy)
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, tspace.element_type)
+
+    # Out-of-place -- in = numpy or own arrays -- ufunc = method
+    result = ufunc_method(*in_arrays_npy[1:])
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, tspace.element_type)
+
+    result = ufunc_method(*in_arrays_own[1:])
+    assert all_almost_equal(result_npy, result)
+    _check_result_type(result, tspace.element_type)
+
+    # In-place -- in = elements -- out = elements -- ufunc = method or numpy
+    result = ufunc_method(*in_elems_method, **out_elem_kwargs)
+    assert all_almost_equal(result_npy, result)
+    _check_result_is_out(result, out_elems[:nout])
+
     if USE_ARRAY_UFUNCS_INTERFACE:
-        # In-place will not work with Numpy < 1.13
-        odl_result_new = elem_fun_new(*in_elems_new, **out_elem_kwargs)
-        assert all_almost_equal(npy_result, odl_result_new)
+        # Custom objects not allowed as `out` for numpy < 1.13
+        result = ufunc_npy(*in_elems_npy, **out_arr_npy_kwargs)
 
-    # Check that returned stuff refers to given out
-    if nout == 1:
-        assert odl_result_old is out_elems[0]
-        if USE_ARRAY_UFUNCS_INTERFACE:
-            assert odl_result_new is out_elems[0]
-    elif nout > 1:
-        for i in range(nout):
-            assert odl_result_old[i] is out_elems[i]
-            if USE_ARRAY_UFUNCS_INTERFACE:
-                assert odl_result_new[i] is out_elems[i]
+        assert all_almost_equal(result_npy, result)
+        _check_result_is_out(result, out_elems[:nout])
 
-    # In-place with Numpy array as `out` for new interface
+    # In-place -- in = elements -- out = numpy or own arrays -- ufunc = numpy
+    # This case is only supported with the new interface
     if USE_ARRAY_UFUNCS_INTERFACE:
-        out_arrays_new = [np.empty_like(arr) for arr in out_arrays]
+        # Fresh arrays for output
+        out_arrays_npy = [np.empty_like(as_numpy(arr))
+                          for arr in arrays[nin:]]
+        out_arrays_own = [array_module(tspace.impl).empty_like(arr)
+                          for arr in arrays[nin:]]
         if nout == 1:
-            out_elem_kwargs_new = {'out': out_arrays_new[0]}
+            kwargs_npy = {'out': out_arrays_npy[0]}
+            kwargs_own = {'out': out_arrays_own[0]}
         elif nout > 1:
-            out_elem_kwargs_new = {'out': out_arrays_new[:nout]}
+            kwargs_npy = {'out': out_arrays_npy[:nout]}
+            kwargs_own = {'out': out_arrays_own[:nout]}
 
-        odl_result_elem_new = elem_fun_new(*in_elems_new,
-                                           **out_elem_kwargs_new)
-        assert all_almost_equal(npy_result, odl_result_elem_new)
+        try:
+            result_out_npy = ufunc_npy(*in_elems_npy, **kwargs_npy)
+        except TypeError:
+            pytest.xfail('numpy ufunc not valid for inputs')
 
-        if nout == 1:
-            assert odl_result_elem_new is out_arrays_new[0]
-        elif nout > 1:
-            for i in range(nout):
-                assert odl_result_elem_new[i] is out_arrays_new[i]
+        result_out_own = ufunc_npy(*in_elems_npy, **kwargs_own)
+        assert all_almost_equal(result_out_npy, result_npy)
+        assert all_almost_equal(result_out_own, result_npy)
+        _check_result_is_out(result_out_npy, out_arrays_npy)
+        _check_result_is_out(result_out_own, out_arrays_own)
 
     if USE_ARRAY_UFUNCS_INTERFACE:
         # Check `ufunc.at`
         indices = [[0, 0, 1],
                    [0, 1, 2]]
 
-        mod_array = in_arrays[0].copy()
-        mod_elem = in_elems_new[0].copy()
+        mod_array = in_arrays_npy[0].copy()
+        mod_elem = in_elems_npy[0].copy()
         if nin == 1:
-            npy_result = npy_ufunc.at(mod_array, indices)
-            odl_result = npy_ufunc.at(mod_elem, indices)
+            try:
+                result_npy = ufunc_npy.at(mod_array, indices)
+            except TypeError:
+                pytest.xfail('numpy ufunc.at not valid for inputs')
+
+            result = ufunc_npy.at(mod_elem, indices)
+
         elif nin == 2:
-            other_array = in_arrays[1][indices]
-            other_elem = in_elems_new[1][indices]
-            npy_result = npy_ufunc.at(mod_array, indices, other_array)
-            odl_result = npy_ufunc.at(mod_elem, indices, other_elem)
+            other_array = in_arrays_npy[1][indices]
+            other_elem = in_elems_npy[1][indices]
+            try:
+                result_npy = ufunc_npy.at(mod_array, indices, other_array)
+            except TypeError:
+                pytest.xfail('numpy ufunc.at not valid for inputs')
 
-        assert all_almost_equal(odl_result, npy_result)
+            result = ufunc_npy.at(mod_elem, indices, other_elem)
+
+        assert all_almost_equal(result, result_npy)
 
     # Check `ufunc.reduce`
     if nin == 2 and nout == 1 and USE_ARRAY_UFUNCS_INTERFACE:
-        in_array = in_arrays[0]
-        in_elem = in_elems_new[0]
+        in_array = in_arrays_npy[0]
+        in_elem = in_elems_npy[0]
 
         # We only test along one axis since some binary ufuncs are not
         # re-orderable, in which case Numpy raises a ValueError
-        npy_result = npy_ufunc.reduce(in_array)
-        odl_result = npy_ufunc.reduce(in_elem)
-        assert all_almost_equal(odl_result, npy_result)
-        odl_result_keepdims = npy_ufunc.reduce(in_elem, keepdims=True)
-        assert odl_result_keepdims.shape == (1,) + in_elem.shape[1:]
-        # In-place using `out` (with ODL vector and array)
-        out_elem = odl_result_keepdims.space.element()
-        out_array = np.empty(odl_result_keepdims.shape,
-                             dtype=odl_result_keepdims.dtype)
-        npy_ufunc.reduce(in_elem, out=out_elem, keepdims=True)
-        npy_ufunc.reduce(in_elem, out=out_array, keepdims=True)
-        assert all_almost_equal(out_elem, odl_result_keepdims)
-        assert all_almost_equal(out_array, odl_result_keepdims)
+
+        try:
+            result_npy = ufunc_npy.reduce(in_array)
+        except TypeError:
+            pytest.xfail('numpy ufunc.reduce not valid for inputs')
+
+        # Out-of-place -- in = element
+        result = ufunc_npy.reduce(in_elem)
+        assert all_almost_equal(result, result_npy)
+        result_keepdims = ufunc_npy.reduce(in_elem, keepdims=True)
+        assert result_keepdims.shape == (1,) + in_elem.shape[1:]
+
+        # In-place -- in = element -- out = element or numpy array or own array
+        out_elem = result_keepdims.space.element()
+        ufunc_npy.reduce(in_elem, out=out_elem, keepdims=True)
+        assert all_almost_equal(out_elem, result_keepdims)
+        out_array_npy = np.empty(result_keepdims.shape,
+                                 dtype=result_keepdims.dtype)
+        ufunc_npy.reduce(in_elem, out=out_array_npy, keepdims=True)
+        assert all_almost_equal(out_array_npy, result_keepdims)
+        out_array_own = array_module(tspace.impl).empty(
+            result_keepdims.shape, dtype=result_keepdims.dtype)
+        ufunc_npy.reduce(in_elem, out=out_array_own, keepdims=True)
+        assert all_almost_equal(out_array_own, result_keepdims)
+
         # Using a specific dtype
-        npy_result = npy_ufunc.reduce(in_array, dtype=complex)
-        odl_result = npy_ufunc.reduce(in_elem, dtype=complex)
-        assert odl_result.dtype == npy_result.dtype
-        assert all_almost_equal(odl_result, npy_result)
+        try:
+            result_npy = ufunc_npy.reduce(in_array, dtype=complex)
+        except TypeError:
+            pytest.xfail('numpy ufunc.reduce not valid for complex dtype')
+
+        with xfail_if(tspace.impl == 'cupy'):
+            result = ufunc_npy.reduce(in_elem, dtype=complex)
+            assert result.dtype == result_npy.dtype
+            assert all_almost_equal(result, result_npy)
 
     # Other ufunc method use the same interface, to we don't perform
     # extra tests for them.
 
+    # Reset Numpy err handling
+    np.seterr(**npy_err_orig)
+
+
+@skip_if_no_cupy
+def test_ufunc_cupy_force_native():
+    """Test the ``force_native`` flag for cupy based ufuncs."""
+    if not USE_ARRAY_UFUNCS_INTERFACE:
+        pytest.skip('`force_native` option only used in __array_ufuncs__')
+
+    space = odl.rn((3, 4), impl='cupy')
+
+    # Make sure we call native code for supported ufuncs
+    for ufunc in [np.sin, np.absolute, np.add, np.remainder, np.fmod]:
+        nin, nout = ufunc.nin, ufunc.nout
+        _, in_elems = noise_elements(space, n=2)
+        out_arrays, out_elems = noise_elements(space, n=2)
+        ufunc(*in_elems[:nin], out=out_elems[:nout], force_native=True)
+        ufunc(*in_elems[:nin], out=out_arrays[:nout], force_native=True)
+
+    # These have explicit native implementations
+    for ufunc in [np.add, np.multiply]:
+        for method in ['reduce', 'accumulate']:
+            in_elem = noise_element(space)
+            getattr(ufunc, method)(in_elem, force_native=True)
+
+    for ufunc in [np.minimum, np.maximum]:
+        in_elem = noise_element(space)
+        ufunc.reduce(in_elem, force_native=True)
+
 
 def test_ufunc_corner_cases(odl_tspace_impl):
     """Check if some corner cases are handled correctly."""
@@ -1545,11 +1809,17 @@ def test_ufunc_corner_cases(odl_tspace_impl):
     # Check that the result space is the same
     assert res.space == space
 
-    # Check usage of `order` argument
+    # Check usage of `order` argument (not available in cupy)
     for order in ('C', 'F'):
-        res = x.__array_ufunc__(np.sin, '__call__', x, order=order)
-        assert all_almost_equal(res, np.sin(x.asarray()))
-        assert res.data.flags[order + '_CONTIGUOUS']
+        if impl == 'numpy':
+            res = x.__array_ufunc__(np.sin, '__call__', x, order=order)
+            assert all_almost_equal(res, np.sin(x.asarray()))
+            assert res.data.flags[order + '_CONTIGUOUS']
+        elif impl == 'cupy':
+            with pytest.xfail(reason='cupy does not accept `order` in ufuncs'):
+                res = x.__array_ufunc__(np.sin, '__call__', x, order=order)
+                assert all_almost_equal(res, np.sin(x.asarray()))
+                assert res.data.flags[order + '_CONTIGUOUS']
 
     # Check usage of `dtype` argument
     res = x.__array_ufunc__(np.sin, '__call__', x, dtype='float32')
@@ -1590,7 +1860,8 @@ def test_ufunc_corner_cases(odl_tspace_impl):
     res = x.__array_ufunc__(np.add, 'accumulate', x)
     assert all_almost_equal(res, np.add.accumulate(x.asarray()))
     assert res.space == space
-    arr = np.empty_like(x)
+
+    arr = array_module(impl).empty_like(x)
     res = x.__array_ufunc__(np.add, 'accumulate', x, out=(arr,))
     assert all_almost_equal(arr, np.add.accumulate(x.asarray()))
     assert res is arr
@@ -1611,7 +1882,7 @@ def test_ufunc_corner_cases(odl_tspace_impl):
     assert all_almost_equal(res, np.add.reduce(x.asarray()))
 
     # With `out` argument and `axis`
-    out_ax0 = np.empty(3)
+    out_ax0 = array_module(impl).empty(3)
     res = x.__array_ufunc__(np.add, 'reduce', x, axis=0, out=(out_ax0,))
     assert all_almost_equal(out_ax0, np.add.reduce(x.asarray(), axis=0))
     assert res is out_ax0
@@ -1651,51 +1922,60 @@ def testodl_reduction(tspace, odl_reduction):
     # Should be equal theoretically, but summation order, other stuff, ...,
     # hence we use approx
 
+    if (tspace.impl == 'cupy' and
+            name in ('min', 'max') and
+            tspace.dtype.kind == 'c'):
+        pytest.xfail('Cupy does not accept complex input to `min` and `max`')
+
     # Full reduction, produces scalar
-    result_npy = npy_reduction(x_arr)
+    result_npy = npy_reduction(as_numpy(x_arr))
     result = x_reduction()
     assert result == pytest.approx(result_npy)
     result = x_reduction(axis=(0, 1))
     assert result == pytest.approx(result_npy)
 
     # Reduction along axes, produces element in reduced space
-    result_npy = npy_reduction(x_arr, axis=0)
+    result_npy = npy_reduction(as_numpy(x_arr), axis=0)
     result = x_reduction(axis=0)
-    assert isinstance(result, NumpyTensor)
+    assert isinstance(result, tspace.element_type)
     assert result.shape == result_npy.shape
     assert result.dtype == x.dtype
-    assert np.allclose(result, result_npy)
+    assert all_almost_equal(result, result_npy)
     # Check reduced space properties
-    assert isinstance(result.space, NumpyTensorSpace)
+    assert type(result.space) is type(tspace)
     assert result.space.exponent == x.space.exponent
     assert result.space.weighting == x.space.weighting  # holds true here
     # Evaluate in-place
     out = result.space.element()
     x_reduction(axis=0, out=out)
-    assert np.allclose(out, result_npy)
+    assert all_almost_equal(out, result_npy)
 
     # Use keepdims parameter
-    result_npy = npy_reduction(x_arr, axis=1, keepdims=True)
+    result_npy = npy_reduction(as_numpy(x_arr), axis=1, keepdims=True)
     result = x_reduction(axis=1, keepdims=True)
     assert result.shape == result_npy.shape
-    assert np.allclose(result, result_npy)
+    assert all_almost_equal(result, result_npy)
     # Evaluate in-place
     out = result.space.element()
     x_reduction(axis=1, keepdims=True, out=out)
-    assert np.allclose(out, result_npy)
+    assert all_almost_equal(out, result_npy)
 
     # Use dtype parameter
     # These reductions have a `dtype` parameter
     if name in ('cumprod', 'cumsum', 'mean', 'prod', 'std', 'sum',
                 'trace', 'var'):
-        result_npy = npy_reduction(x_arr, axis=1, dtype='complex64')
-        result = x_reduction(axis=1, dtype='complex64')
-        assert result.dtype == np.dtype('complex64')
-        assert np.allclose(result, result_npy)
-        # Evaluate in-place
-        out = result.space.element()
-        x_reduction(axis=1, dtype='complex64', out=out)
-        assert np.allclose(out, result_npy)
+        with xfail_if(tspace.impl == 'cupy' and tspace.dtype == 'float16',
+                      reason='complex reduction fails for float16 in cupy'):
+            # See https://github.com/cupy/cupy/issues/795
+            result_npy = npy_reduction(as_numpy(x_arr), axis=1,
+                                       dtype='complex64')
+            result = x_reduction(axis=1, dtype='complex64')
+            assert result.dtype == np.dtype('complex64')
+            assert all_almost_equal(result, result_npy)
+            # Evaluate in-place
+            out = result.space.element()
+            x_reduction(axis=1, dtype='complex64', out=out)
+            assert all_almost_equal(out, result_npy)
 
 
 def test_ufunc_reduction_docs_notempty(odl_tspace_impl):
diff --git a/odl/test/util/numerics_test.py b/odl/test/util/numerics_test.py
index a4a3fec702a..1fb8269464b 100644
--- a/odl/test/util/numerics_test.py
+++ b/odl/test/util/numerics_test.py
@@ -355,9 +355,9 @@ def test_resize_array_adj(resize_setup, odl_floating_dtype):
     resized_adj = resize_array(other_arr, array.shape, offset, pad_mode,
                                pad_const, direction='adjoint')
 
-    assert (np.vdot(resized.ravel(), other_arr.ravel()) ==
-            pytest.approx(np.vdot(array.ravel(), resized_adj.ravel()),
-                          rel=dtype_tol(dtype)))
+    dot = np.vdot(resized.ravel(), other_arr.ravel())
+    expected_dot = np.vdot(array.ravel(), resized_adj.ravel())
+    assert dot == pytest.approx(expected_dot)
 
 
 def test_resize_array_corner_cases(odl_scalar_dtype, padding):
diff --git a/odl/ufunc_ops/ufunc_ops.py b/odl/ufunc_ops/ufunc_ops.py
index af88b1f408a..b2b541cd455 100644
--- a/odl/ufunc_ops/ufunc_ops.py
+++ b/odl/ufunc_ops/ufunc_ops.py
@@ -209,10 +209,10 @@ def derivative(self, point):
     return derivative
 
 
-def ufunc_class_factory(name, nargin, nargout, docstring):
+def ufunc_class_factory(name, nin, nout, docstring):
     """Create a Ufunc `Operator` from a given specification."""
 
-    assert 0 <= nargin <= 2
+    assert 0 <= nin <= 2
 
     def __init__(self, space):
         """Initialize an instance.
@@ -225,21 +225,23 @@ def __init__(self, space):
         if not isinstance(space, LinearSpace):
             raise TypeError('`space` {!r} not a `LinearSpace`'.format(space))
 
-        if nargin == 1:
+        if nin == 1:
             domain = space0 = space
-            dtypes = [space.dtype]
-        elif nargin == len(space) == 2 and isinstance(space, ProductSpace):
+            dtypes_in = [space.dtype]
+        elif nin == 2:
+            if not (isinstance(space, ProductSpace) and len(space) == 2):
+                raise TypeError('`space` must be a `ProductSpace` of length '
+                                '{} for ufunc {!r}, got {!r}'
+                                ''.format(nin, name, space))
             domain = space
             space0 = space[0]
-            dtypes = [space[0].dtype, space[1].dtype]
+            dtypes_in = [space[0].dtype, space[1].dtype]
         else:
-            domain = ProductSpace(space, nargin)
-            space0 = space
-            dtypes = [space.dtype, space.dtype]
+            raise RuntimeError('bad `nin` {}'.format(nin))
 
-        dts_out = dtypes_out(name, dtypes)
+        dts_out = dtypes_out(name, dtypes_in)
 
-        if nargout == 1:
+        if nout == 1:
             range = space0.astype(dts_out[0])
         else:
             range = ProductSpace(space0.astype(dts_out[0]),
@@ -250,15 +252,15 @@ def __init__(self, space):
 
     def _call(self, x, out=None):
         """Return ``self(x)``."""
-        # TODO: use `__array_ufunc__` when implemented on `ProductSpace`,
+        # TODO: use `__array_ufunc__` when implemented on product spaces,
         # or try both
         if out is None:
-            if nargin == 1:
+            if nin == 1:
                 return getattr(x.ufuncs, name)()
             else:
                 return getattr(x[0].ufuncs, name)(*x[1:])
         else:
-            if nargin == 1:
+            if nin == 1:
                 return getattr(x.ufuncs, name)(out=out)
             else:
                 return getattr(x[0].ufuncs, name)(*x[1:], out=out)
@@ -274,21 +276,22 @@ def __repr__(self):
         dtype = float
 
     space = tensor_space(3, dtype=dtype)
-    if nargin == 1:
+    if nin == 1:
         vec = space.element([-1, 1, 2])
         arg = '{}'.format(vec)
         with np.errstate(all='ignore'):
             result = getattr(vec.ufuncs, name)()
     else:
         vec = space.element([-1, 1, 2])
-        vec2 = space.element([3, 4, 5])
+        vec2 = [3, 4, 5]
         arg = '[{}, {}]'.format(vec, vec2)
         with np.errstate(all='ignore'):
             result = getattr(vec.ufuncs, name)(vec2)
 
-    if nargout == 2:
-        result_space = ProductSpace(vec.space, 2)
-        result = repr(result_space.element(result))
+    if nout == 2:
+        result_space = ProductSpace(result[0].space, result[1].space)
+        with np.errstate(all='ignore'):
+            result = repr(result_space.element(result))
 
     examples_docstring = RAW_EXAMPLES_DOCSTRING.format(space=space, name=name,
                                                        arg=arg, result=result)
@@ -305,10 +308,10 @@ def __repr__(self):
     return type(full_name, (Operator,), attributes)
 
 
-def ufunc_functional_factory(name, nargin, nargout, docstring):
+def ufunc_functional_factory(name, nin, nout, docstring):
     """Create a ufunc `Functional` from a given specification."""
 
-    assert 0 <= nargin <= 2
+    assert 0 <= nin <= 2
 
     def __init__(self, field):
         """Initialize an instance.
@@ -330,7 +333,7 @@ def __init__(self, field):
 
     def _call(self, x):
         """Return ``self(x)``."""
-        if nargin == 1:
+        if nin == 1:
             return getattr(np, name)(x)
         else:
             return getattr(np, name)(*x)
@@ -341,10 +344,11 @@ def __repr__(self):
 
     # Create example (also functions as doctest)
 
-    if nargin != 1:
+    # TODO: remove this restriction
+    if nin != 1:
         raise NotImplementedError('Currently not suppored')
 
-    if nargout != 1:
+    if nout != 1:
         raise NotImplementedError('Currently not suppored')
 
     space = RealNumbers()
@@ -371,7 +375,7 @@ def __repr__(self):
 RAW_UFUNC_FACTORY_DOCSTRING = """{docstring}
 Notes
 -----
-This creates a `Operator`/`Functional` that applies a ufunc pointwise.
+This creates an `Operator`or `Functional` that applies a ufunc pointwise.
 
 Examples
 --------
@@ -380,21 +384,28 @@ def __repr__(self):
 """
 
 RAW_UFUNC_FACTORY_FUNCTIONAL_DOCSTRING = """
-Create functional with domain/range as real numbers:
+Create functional with real numbers as domain/range:
 
 >>> func = odl.ufunc_ops.{name}()
 """
 
 RAW_UFUNC_FACTORY_OPERATOR_DOCSTRING = """
-Create operator that acts pointwise on a `TensorSpace`
+Create operator that acts pointwise on a `TensorSpace`:
 
 >>> space = odl.rn(3)
 >>> op = odl.ufunc_ops.{name}(space)
 """
 
+# Avoid tons of warnings on the console when testing
+npy_err_old = np.geterr()
+np.seterr(all='ignore')
 
 # Create an operator for each ufunc
-for name, nargin, nargout, docstring in UFUNCS:
+for name, nin, nout, docstring in UFUNCS:
+    if nin == 2:
+        # Currently not supported
+        continue
+
     def indirection(name, docstring):
         # Indirection is needed since name should be saved but is changed
         # in the loop.
@@ -410,17 +421,17 @@ def ufunc_factory(domain=RealNumbers()):
                 raise ValueError('ufunc not available for {}'.format(domain))
         return ufunc_factory
 
-    globals()[name + '_op'] = ufunc_class_factory(name, nargin,
-                                                  nargout, docstring)
-    if not _is_integer_only_ufunc(name):
+    globals()[name + '_op'] = ufunc_class_factory(name, nin,
+                                                  nout, docstring)
+    if _is_integer_only_ufunc(name):
+        operator_example = ""
+    else:
         operator_example = RAW_UFUNC_FACTORY_OPERATOR_DOCSTRING.format(
             name=name)
-    else:
-        operator_example = ""
 
-    if not _is_integer_only_ufunc(name) and nargin == 1 and nargout == 1:
+    if not _is_integer_only_ufunc(name) and nin == 1 and nout == 1:
         globals()[name + '_func'] = ufunc_functional_factory(
-            name, nargin, nargout, docstring)
+            name, nin, nout, docstring)
         functional_example = RAW_UFUNC_FACTORY_FUNCTIONAL_DOCSTRING.format(
             name=name)
     else:
@@ -436,6 +447,7 @@ def ufunc_factory(domain=RealNumbers()):
     globals()[name] = ufunc_factory
     __all__ += (name,)
 
+np.seterr(**npy_err_old)
 
 if __name__ == '__main__':
     from odl.util.testutils import run_doctests
diff --git a/odl/util/pytest_plugins.py b/odl/util/pytest_plugins.py
index dbac0fef1a5..4054b21c71a 100644
--- a/odl/util/pytest_plugins.py
+++ b/odl/util/pytest_plugins.py
@@ -14,6 +14,7 @@
 import os
 
 import odl
+from odl.space.cupy_tensors import CUPY_AVAILABLE
 from odl.space.entry_points import tensor_space_impl_names
 from odl.trafos.backends import PYFFTW_AVAILABLE, PYWT_AVAILABLE
 from odl.util.testutils import simple_fixture
@@ -80,6 +81,10 @@ def find_example_dirs():
     collect_ignore.append(
         os.path.join(odl_root, 'odl', 'trafos', 'wavelet.py'))
 
+if not CUPY_AVAILABLE:
+    collect_ignore.append(
+        os.path.join(odl_root, 'odl', 'space', 'cupy_tensors.py'))
+
 
 # Remove duplicates
 collect_ignore = list(set(collect_ignore))
diff --git a/odl/util/testutils.py b/odl/util/testutils.py
index a0f54f76452..c0ae330ed4b 100644
--- a/odl/util/testutils.py
+++ b/odl/util/testutils.py
@@ -17,8 +17,7 @@
 import warnings
 from time import time
 
-from odl.util.utility import run_from_ipython, is_string
-
+from odl.util.utility import run_from_ipython, is_string, none_context, asarray
 
 __all__ = (
     'all_equal', 'all_almost_equal', 'dtype_ndigits', 'dtype_tol',
@@ -28,7 +27,6 @@
     'ProgressRange', 'test', 'run_doctests', 'test_file'
 )
 
-
 def _ndigits(a, b, default=None):
     """Return number of expected correct digits comparing ``a`` and ``b``.
 
@@ -91,6 +89,19 @@ def dtype_tol(dtype, default=None):
 
 def all_equal(iter1, iter2):
     """Return ``True`` if all elements in ``a`` and ``b`` are equal."""
+    # Transfer cupy arrays to CPU for faster comparison
+    from odl.space.cupy_tensors import CUPY_AVAILABLE, CupyTensor, cupy
+    if CUPY_AVAILABLE:
+        if isinstance(iter1, CupyTensor):
+            iter1 = iter1.asarray()
+        elif isinstance(iter1, cupy.ndarray):
+            iter1 = cupy.asnumpy(iter1)
+
+        if isinstance(iter2, CupyTensor):
+            iter2 = iter2.asarray()
+        elif isinstance(iter2, cupy.ndarray):
+            iter2 = cupy.asnumpy(iter2)
+
     # Direct comparison for scalars, tuples or lists
     try:
         if iter1 == iter2:
@@ -137,10 +148,23 @@ def all_almost_equal_array(v1, v2, ndigits):
 
 def all_almost_equal(iter1, iter2, ndigits=None):
     """Return ``True`` if all elements in ``a`` and ``b`` are almost equal."""
+    # Transfer cupy arrays to CPU for faster comparison
+    from odl.space.cupy_tensors import CUPY_AVAILABLE, CupyTensor, cupy
+    if CUPY_AVAILABLE:
+        if isinstance(iter1, CupyTensor):
+            iter1 = iter1.asarray()
+        elif isinstance(iter1, cupy.ndarray):
+            iter1 = cupy.asnumpy(iter1)
+
+        if isinstance(iter2, CupyTensor):
+            iter2 = iter2.asarray()
+        elif isinstance(iter2, cupy.ndarray):
+            iter2 = cupy.asnumpy(iter2)
+
     try:
         if iter1 is iter2 or iter1 == iter2:
             return True
-    except ValueError:
+    except (ValueError, TypeError):
         pass
 
     if iter1 is None and iter2 is None:
@@ -182,6 +206,23 @@ def is_subdict(subdict, dictionary):
     return all(item in dictionary.items() for item in subdict.items())
 
 
+def xfail_if(condition, reason=''):
+    """Return a ``pytest.xfail`` object if ``condition`` is ``True``.
+
+    Examples
+    --------
+    Create test that is expected to fail if ``condition`` is false, e.g.
+
+    >>> condition = False
+    >>> with xfail_if(condition, reason='only works without condition'):
+    ...     assert not condition
+    """
+    if condition:
+        return pytest.xfail(reason)
+    else:
+        return none_context()
+
+
 try:
     # Try catch in case user does not have pytest
     import pytest
@@ -191,6 +232,7 @@ def _pass(function):
         return function
 
     never_skip = _pass
+    skip_if_no_cupy = _pass
     skip_if_no_stir = _pass
     skip_if_no_pywavelets = _pass
     skip_if_no_pyfftw = _pass
@@ -208,6 +250,11 @@ def _pass(function):
         reason='STIR not available'
     )
 
+    skip_if_no_cupy = pytest.mark.skipif(
+        "not odl.space.cupy_tensors.CUPY_AVAILABLE",
+        reason='CuPy not available'
+    )
+
     skip_if_no_pywavelets = pytest.mark.skipif(
         "not odl.trafos.PYWT_AVAILABLE",
         reason='PyWavelets not available'
@@ -305,7 +352,7 @@ def noise_array(space):
 
     Returns
     -------
-    noise_array : `numpy.ndarray` element
+    noise_array : `numpy.ndarray`
         Array with white noise such that ``space.element``'s can be created
         from it.
 
@@ -323,9 +370,21 @@ def noise_array(space):
     odl.set.space.LinearSpace.examples : Examples of elements
         typical to the space.
     """
-    from odl.space import ProductSpace
+    from odl.space.pspace import ProductSpace
+
     if isinstance(space, ProductSpace):
-        return np.array([noise_array(si) for si in space])
+        arr_list = [noise_array(spc_i) for spc_i in space]
+        if space.is_power_space:
+            arr = np.empty((len(arr_list),) + arr_list[0].shape,
+                           dtype=space[0].dtype)
+        else:
+            arr = np.empty((len(arr_list),) + arr_list[0].shape,
+                           dtype=object)
+        for i in range(len(arr)):
+            arr[i] = arr_list[i]
+
+        return arr
+
     else:
         if space.dtype == bool:
             # TODO(kohr-h): use `randint(..., dtype=bool)` from Numpy 1.11 on
@@ -435,13 +494,21 @@ def noise_elements(space, n=1):
     noise_array
     noise_element
     """
-    arrs = tuple(noise_array(space) for _ in range(n))
+    from odl.space.pspace import ProductSpace
+
+    if isinstance(space, ProductSpace) and not space.is_power_space:
+        raise ValueError('`space` cannot be a non-power product space')
+
+    if isinstance(space, ProductSpace):
+        impl = space[0].impl
+    else:
+        impl = space.impl
 
-    # Make space elements from arrays
-    elems = tuple(space.element(arr.copy()) for arr in arrs)
+    arrs = tuple(asarray(noise_array(space), impl=impl) for _ in range(n))
+    elems = tuple(space.element(arr) for arr in arrs)
 
     if n == 1:
-        return tuple(arrs + elems)
+        return arrs + elems
     else:
         return arrs, elems
 
@@ -557,23 +624,23 @@ class ProgressBar(object):
     Usage:
 
     >>> progress = ProgressBar('Reading data', 10)
-    \rReading data: [                              ] Starting
+    Reading data: [                              ] Starting
     >>> progress.update(4) #halfway, zero indexing
-    \rReading data: [###############               ] 50.0%
+    Reading data: [###############               ] 50.0%
 
     Multi-indices, from slowest to fastest:
 
     >>> progress = ProgressBar('Reading data', 10, 10)
-    \rReading data: [                              ] Starting
+    Reading data: [                              ] Starting
     >>> progress.update(9, 8)
-    \rReading data: [############################# ] 99.0%
+    Reading data: [############################# ] 99.0%
 
     Supports simply calling update, which moves the counter forward:
 
     >>> progress = ProgressBar('Reading data', 10, 10)
-    \rReading data: [                              ] Starting
+    Reading data: [                              ] Starting
     >>> progress.update()
-    \rReading data: [                              ]  1.0%
+    Reading data: [                              ]  1.0%
     """
 
     def __init__(self, text='progress', *njobs):
@@ -687,12 +754,14 @@ def run_doctests(skip_if=False, **kwargs):
         Extra keyword arguments passed on to the ``doctest.testmod``
         function.
     """
-    from doctest import testmod, NORMALIZE_WHITESPACE, SKIP
+    from doctest import (
+        testmod, NORMALIZE_WHITESPACE, SKIP, IGNORE_EXCEPTION_DETAIL)
     from pkg_resources import parse_version
     import odl
     import numpy as np
 
-    optionflags = kwargs.pop('optionflags', NORMALIZE_WHITESPACE)
+    optionflags = kwargs.pop('optionflags',
+                             NORMALIZE_WHITESPACE | IGNORE_EXCEPTION_DETAIL)
     if skip_if:
         optionflags |= SKIP
 
diff --git a/odl/util/ufuncs.py b/odl/util/ufuncs.py
index caa6f1fbb0d..2a6b50b92eb 100644
--- a/odl/util/ufuncs.py
+++ b/odl/util/ufuncs.py
@@ -32,25 +32,35 @@
 __all__ = ('TensorSpaceUfuncs', 'ProductSpaceUfuncs')
 
 
-# Some are ignored since they don't cooperate with dtypes, needs fix
-RAW_UFUNCS = ['absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
-              'arctan', 'arctan2', 'arctanh', 'bitwise_and', 'bitwise_or',
-              'bitwise_xor', 'ceil', 'conj', 'copysign', 'cos', 'cosh',
-              'deg2rad', 'divide', 'equal', 'exp', 'exp2', 'expm1', 'floor',
-              'floor_divide', 'fmax', 'fmin', 'fmod', 'greater',
-              'greater_equal', 'hypot', 'invert', 'isfinite', 'isinf', 'isnan',
-              'left_shift', 'less', 'less_equal', 'log', 'log10', 'log1p',
-              'log2', 'logaddexp', 'logaddexp2', 'logical_and', 'logical_not',
-              'logical_or', 'logical_xor', 'maximum', 'minimum', 'mod', 'modf',
-              'multiply', 'negative', 'not_equal', 'power',
-              'rad2deg', 'reciprocal', 'remainder', 'right_shift', 'rint',
-              'sign', 'signbit', 'sin', 'sinh', 'sqrt', 'square', 'subtract',
-              'tan', 'tanh', 'true_divide', 'trunc']
-# ,'isreal', 'iscomplex', 'ldexp', 'frexp'
+_npy_maj, _npy_min = [int(n) for n in np.__version__.split('.')[:2]]
+
+# Supported by Numpy 1.9 and higher
+UFUNC_NAMES = [
+    'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan',
+    'arctan2', 'arctanh', 'bitwise_and', 'bitwise_or', 'bitwise_xor', 'ceil',
+    'conj', 'conjugate', 'copysign', 'cos', 'cosh', 'deg2rad', 'degrees',
+    'divide', 'equal', 'exp', 'exp2', 'expm1', 'fabs', 'floor', 'floor_divide',
+    'fmax', 'fmin', 'fmod', 'frexp', 'greater', 'greater_equal', 'hypot',
+    'invert', 'isfinite', 'isinf', 'isnan', 'left_shift', 'less',
+    'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp', 'logaddexp2',
+    'logical_and', 'logical_not', 'logical_or', 'logical_xor', 'maximum',
+    'minimum', 'mod', 'modf', 'multiply', 'negative', 'nextafter',
+    'not_equal', 'power', 'rad2deg', 'radians', 'reciprocal', 'remainder',
+    'right_shift', 'rint', 'sign', 'signbit', 'sin', 'sinh', 'sqrt',
+    'square', 'spacing', 'subtract', 'tan', 'tanh', 'true_divide', 'trunc']
+
+if (_npy_maj, _npy_min) >= (1, 10):
+    UFUNC_NAMES.extend(['abs', 'cbrt', 'bitwise_not'])
+
+if (_npy_maj, _npy_min) >= (1, 12):
+    UFUNC_NAMES.extend(['float_power'])
+
+if (_npy_maj, _npy_min) >= (1, 13):
+    UFUNC_NAMES.extend(['divmod', 'heaviside', 'positive'])
 
 # Add some standardized information
 UFUNCS = []
-for name in RAW_UFUNCS:
+for name in UFUNC_NAMES:
     ufunc = getattr(np, name)
     n_in, n_out = ufunc.nin, ufunc.nout
     descr = ufunc.__doc__.splitlines()[2]
@@ -86,12 +96,9 @@ def wrapper(self, out=None, **kwargs):
                     ufunc, '__call__', self.elem, out=out, **kwargs)
 
         elif n_out == 2:
-            def wrapper(self, out=None, **kwargs):
-                if out is None:
-                    out = (None, None)
-
+            def wrapper(self, out1=None, out2=None, **kwargs):
                 return self.elem.__array_ufunc__(
-                    ufunc, '__call__', self.elem, out=out, **kwargs)
+                    ufunc, '__call__', self.elem, out=(out1, out2), **kwargs)
 
         else:
             raise NotImplementedError
@@ -99,8 +106,18 @@ def wrapper(self, out=None, **kwargs):
     elif n_in == 2:
         if n_out == 1:
             def wrapper(self, x2, out=None, **kwargs):
+                if out is None or isinstance(out, (type(self.elem),
+                                                   type(self.elem.data))):
+                    out = (out,)
+
                 return self.elem.__array_ufunc__(
-                    ufunc, '__call__', self.elem, x2, out=(out,), **kwargs)
+                    ufunc, '__call__', self.elem, x2, out=out, **kwargs)
+
+        elif n_out == 2:
+            def wrapper(self, x2, out1=None, out2=None, **kwargs):
+                return self.elem.__array_ufunc__(
+                    ufunc, '__call__', self.elem, x2, out=(out1, out2),
+                    **kwargs)
 
         else:
             raise NotImplementedError
@@ -137,6 +154,18 @@ def sum(self, axis=None, dtype=None, out=None, keepdims=False):
             np.add, 'reduce', self.elem,
             axis=axis, dtype=dtype, out=(out,), keepdims=keepdims)
 
+    def cumsum(self, axis=None, dtype=None, out=None):
+        """Return the cumulative sum of ``self``.
+
+        See Also
+        --------
+        numpy.cumsum
+        cumprod
+        """
+        return self.elem.__array_ufunc__(
+            np.add, 'accumulate', self.elem,
+            axis=axis, dtype=dtype, out=(out,))
+
     def prod(self, axis=None, dtype=None, out=None, keepdims=False):
         """Return the product of ``self``.
 
@@ -149,6 +178,18 @@ def prod(self, axis=None, dtype=None, out=None, keepdims=False):
             np.multiply, 'reduce', self.elem,
             axis=axis, dtype=dtype, out=(out,), keepdims=keepdims)
 
+    def cumprod(self, axis=None, dtype=None, out=None):
+        """Return the cumulative product of ``self``.
+
+        See Also
+        --------
+        numpy.cumprod
+        cumsum
+        """
+        return self.elem.__array_ufunc__(
+            np.multiply, 'accumulate', self.elem,
+            axis=axis, dtype=dtype, out=(out,))
+
     def min(self, axis=None, dtype=None, out=None, keepdims=False):
         """Return the minimum of ``self``.
 
@@ -183,28 +224,176 @@ def max(self, axis=None, dtype=None, out=None, keepdims=False):
 # --- Wrappers for `ProductSpaceElement` --- #
 
 
+def _add_leading_dims(x2, pspace):
+    """Add leading dimensions to an input of a binary product space ufunc.
+
+    Parameters
+    ----------
+    x2
+        Input to a binary ufunc.
+    pspace : `ProductSpace`
+        Spac on which the ufunc is evaluated.
+
+    Returns
+    -------
+    x2_extra_dims
+        Variant of ``x2`` with extra leading dimensions appropriate for
+        ``pspace``.
+
+    Examples
+    --------
+    Product space elements, arrays and nested sequences can be rewrapped
+    such that they have leading dimensions of size 1 for broadcasting:
+
+    >>> pspace = odl.rn(2) ** (3, 4)
+    >>> reshaped = _add_leading_dims([0, 0], pspace)
+    >>> np.shape([0, 0])
+    (2,)
+    >>> reshaped
+    [[[0, 0]]]
+    >>> np.shape(reshaped)
+    (1, 1, 2)
+    >>> reshaped = _add_leading_dims(np.zeros(2), pspace)
+    >>> reshaped
+    array([[[ 0.,  0.]]])
+    >>> reshaped.shape
+    (1, 1, 2)
+    >>> reshaped = _add_leading_dims(odl.rn(2).zero(), pspace)
+    >>> reshaped  # returning array for simplicity
+    array([[[ 0.,  0.]]])
+    >>> reshaped.shape
+    (1, 1, 2)
+    >>> reshaped = _add_leading_dims((odl.rn(2) ** 4).zero(), pspace)
+    >>> reshaped  # Broadcasting along outer dimension of size 3
+    ProductSpace(ProductSpace(rn(2), 4), 1).element([
+        [
+            [ 0.,  0.],
+            [ 0.,  0.],
+            [ 0.,  0.],
+            [ 0.,  0.]
+        ]
+    ])
+    """
+    from odl.space.pspace import ProductSpace, ProductSpaceElement
+    from odl.space.base_tensors import Tensor
+
+    # Workaround for `shape` not using the base space shape of a
+    # power space
+    # TODO: remove when fixed, see
+    # https://github.com/odlgroup/odl/pull/1152
+    num_levels_pspace = 0
+    tmp_pspace = pspace
+    while True:
+        if isinstance(tmp_pspace, ProductSpace):
+            tmp_pspace = tmp_pspace[0]
+            num_levels_pspace += 1
+        else:
+            base_ndim_pspace = len(getattr(tmp_pspace, 'shape', ()))
+            break
+
+    pspace_ndim = num_levels_pspace + base_ndim_pspace
+
+    if isinstance(x2, ProductSpaceElement):
+        # Workaround for `shape` not using the base space shape of a
+        # power space element
+        # TODO: remove when fixed, see
+        # https://github.com/odlgroup/odl/pull/1152
+        num_levels_x2 = 0
+        tmp_x2 = x2
+        while True:
+            if isinstance(tmp_x2, ProductSpaceElement):
+                tmp_x2 = tmp_x2[0]
+                num_levels_x2 += 1
+            else:
+                base_ndim_x2 = len(getattr(tmp_x2, 'shape', ()))
+                break
+
+        x2_ndim = num_levels_x2 + base_ndim_x2
+
+        # TODO: replace by `pspace.ndim` and `x2.ndim` when fixed, see
+        # https://github.com/odlgroup/odl/pull/1152
+        for _ in range(pspace_ndim - x2_ndim):
+            x2 = (x2.space ** 1).element([x2])
+
+        return x2
+
+    if isinstance(x2, Tensor):
+        # Work with array directly
+        x2 = x2.data
+
+    if hasattr(x2, 'shape'):
+        # Some type of array
+        x2_ndim = len(x2.shape)
+
+        slc = (None,) * (pspace_ndim - x2_ndim) + (slice(None),) * x2_ndim
+        x2 = x2[slc]
+
+        # Downstream code will raise in case of bad shape
+        return x2
+
+    else:
+        # Array-like
+        x2_ndim = np.ndim(x2)
+
+        for _ in range(pspace_ndim - x2_ndim):
+            x2 = [x2]
+
+        # Downstream code will raise in case of bad shape
+        return x2
+
+
 def wrap_ufunc_productspace(name, n_in, n_out, doc):
     """Return ufunc wrapper for `ProductSpaceUfuncs`."""
     if n_in == 1:
         if n_out == 1:
             def wrapper(self, out=None, **kwargs):
+                from odl.space.pspace import ProductSpace, ProductSpaceElement
                 if out is None:
-                    result = [getattr(x.ufuncs, name)(**kwargs)
-                              for x in self.elem]
-                    return self.elem.space.element(result)
+                    out_seq = [None] * len(self.elem.space)
                 else:
-                    for x, out_x in zip(self.elem, out):
-                        getattr(x.ufuncs, name)(out=out_x, **kwargs)
-                    return out
+                    assert isinstance(out, ProductSpaceElement)
+                    out_seq = out
+
+                res = []
+                for xi, out_i in zip(self.elem, out_seq):
+                    ufunc = getattr(xi.ufuncs, name)
+                    r = ufunc(out=out_i, **kwargs)
+                    res.append(r)
+
+                if out is None:
+                    out_space = ProductSpace(*[r.space for r in res])
+                    out = out_space.element(res)
+
+                return out
 
         elif n_out == 2:
             def wrapper(self, out1=None, out2=None, **kwargs):
+                from odl.space.pspace import ProductSpace, ProductSpaceElement
                 if out1 is None:
-                    out1 = self.elem.space.element()
+                    out1_seq = [None] * len(self.elem.space)
+                else:
+                    assert isinstance(out1, ProductSpaceElement)
+                    out1_seq = out1
                 if out2 is None:
-                    out2 = self.elem.space.element()
-                for x, out1_x, out2_x in zip(self.elem, out1, out2):
-                    getattr(x.ufuncs, name)(out1=out1_x, out2=out2_x, **kwargs)
+                    out2_seq = [None] * len(self.elem.space)
+                else:
+                    assert isinstance(out2, ProductSpaceElement)
+                    out2_seq = out2
+
+                res1, res2 = [], []
+                for xi, out1_i, out2_i in zip(self.elem, out1_seq, out2_seq):
+                    ufunc = getattr(xi.ufuncs, name)
+                    r1, r2 = ufunc(out1=out1_i, out2=out2_i, **kwargs)
+                    res1.append(r1)
+                    res2.append(r2)
+
+                if out1 is None:
+                    out_space_1 = ProductSpace(*[r.space for r in res1])
+                    out1 = out_space_1.element(res1)
+                if out2 is None:
+                    out_space_2 = ProductSpace(*[r.space for r in res2])
+                    out2 = out_space_2.element(res2)
+
                 return out1, out2
 
         else:
@@ -213,24 +402,64 @@ def wrapper(self, out1=None, out2=None, **kwargs):
     elif n_in == 2:
         if n_out == 1:
             def wrapper(self, x2, out=None, **kwargs):
-                if x2 in self.elem.space:
-                    if out is None:
-                        result = [getattr(x.ufuncs, name)(x2p, **kwargs)
-                                  for x, x2p in zip(self.elem, x2)]
-                        return self.elem.space.element(result)
-                    else:
-                        for x, x2p, outp in zip(self.elem, x2, out):
-                            getattr(x.ufuncs, name)(x2p, out=outp, **kwargs)
-                        return out
+                from odl.space.pspace import ProductSpace, ProductSpaceElement
+                if out is None:
+                    out_seq = [None] * len(self.elem.space)
                 else:
-                    if out is None:
-                        result = [getattr(x.ufuncs, name)(x2, **kwargs)
-                                  for x in self.elem]
-                        return self.elem.space.element(result)
-                    else:
-                        for x, outp in zip(self.elem, out):
-                            getattr(x.ufuncs, name)(x2, out=outp, **kwargs)
-                        return out
+                    assert isinstance(out, ProductSpaceElement)
+                    out_seq = out
+
+                # Implement broadcasting
+                x2 = _add_leading_dims(x2, self.elem.space)
+                if len(x2) == 1 and len(self.elem.space) != 1:
+                    x2 = [x2[0]] * len(self.elem.space)
+
+                res = []
+                for x1_i, x2_i, out_i in zip(self.elem, x2, out_seq):
+                    ufunc = getattr(x1_i.ufuncs, name)
+                    r = ufunc(x2_i, out=out_i, **kwargs)
+                    res.append(r)
+
+                if out is None:
+                    out_space = ProductSpace(*[r.space for r in res])
+                    out = out_space.element(res)
+
+                return out
+
+        elif n_out == 2:
+            def wrapper(self, x2, out1=None, out2=None, **kwargs):
+                from odl.space.pspace import ProductSpace, ProductSpaceElement
+                if out1 is None:
+                    out1_seq = [None] * len(self.elem.space)
+                else:
+                    assert isinstance(out1, ProductSpaceElement)
+                    out1_seq = out1
+                if out2 is None:
+                    out2_seq = [None] * len(self.elem.space)
+                else:
+                    out2_seq = out2
+
+                # Implement broadcasting
+                x2 = _add_leading_dims(x2, self.elem.space)
+                if len(x2) == 1 and len(self.elem.space) != 1:
+                    x2 = [x2[0]] * len(self.elem.space)
+
+                res1, res2 = [], []
+                for x1_i, x2_i, out1_i, out2_i in zip(self.elem, x2,
+                                                      out1_seq, out2_seq):
+                    ufunc = getattr(x1_i.ufuncs, name)
+                    r1, r2 = ufunc(x2_i, out1=out1_i, out2=out2_i, **kwargs)
+                    res1.append(r1)
+                    res2.append(r2)
+
+                if out1 is None:
+                    out_space_1 = ProductSpace(*[r.space for r in res1])
+                    out1 = out_space_1.element(res1)
+                if out2 is None:
+                    out_space_2 = ProductSpace(*[r.space for r in res2])
+                    out2 = out_space_2.element(res2)
+
+                return out1, out2
 
         else:
             raise NotImplementedError
diff --git a/odl/util/utility.py b/odl/util/utility.py
index 828ee0ed5d1..e5a0422de69 100644
--- a/odl/util/utility.py
+++ b/odl/util/utility.py
@@ -21,16 +21,35 @@
 import numpy as np
 
 __all__ = (
-    'array_str', 'dtype_str', 'dtype_repr', 'npy_printoptions',
-    'signature_string', 'signature_string_parts', 'repr_string',
-    'indent', 'dedent', 'attribute_repr_string', 'method_repr_string',
-    'is_numeric_dtype', 'is_int_dtype', 'is_floating_dtype', 'is_real_dtype',
-    'is_real_floating_dtype', 'is_complex_floating_dtype',
-    'real_dtype', 'complex_dtype', 'is_string', 'nd_iterator', 'conj_exponent',
-    'writable_array', 'run_from_ipython', 'NumpyRandomSeed',
-    'cache_arguments', 'unique',
-    'REPR_PRECISION')
-
+    'array_cls',
+    'array_module',
+    'array_str',
+    'as_numpy',
+    'asarray',
+    'cache_arguments',
+    'complex_dtype',
+    'conj_exponent',
+    'dtype_repr',
+    'dtype_str',
+    'indent',
+    'is_complex_floating_dtype',
+    'is_floating_dtype',
+    'is_int_dtype',
+    'is_numeric_dtype',
+    'is_real_dtype',
+    'is_real_floating_dtype',
+    'is_string',
+    'nd_iterator',
+    'none_context',
+    'npy_printoptions',
+    'NumpyRandomSeed',
+    'real_dtype',
+    'REPR_PRECISION',
+    'run_from_ipython',
+    'signature_string',
+    'unique',
+    'writable_array'
+    )
 
 REPR_PRECISION = 4  # For printing scalars and array entries
 TYPE_MAP_R2C = {np.dtype(dtype): np.result_type(dtype, 1j)
@@ -438,7 +457,7 @@ def real_dtype(dtype, default=None):
     """
     dtype, dtype_in = np.dtype(dtype), dtype
 
-    if is_real_floating_dtype(dtype):
+    if is_real_dtype(dtype):
         return dtype
 
     try:
@@ -648,18 +667,94 @@ def ip_wrapper(x, out, **kwargs):
     return decorator
 
 
+def asarray(obj, dtype=None, impl='numpy'):
+    """Convert ``obj`` to an ``impl`` type array as fast as possible.
+
+    Parameters
+    ----------
+    obj : array_like
+        Object to be converted to an array.
+    dtype : data-type, optional
+        Desired data type of the array.
+    impl : str, optional
+        Array backend used to create the array.
+
+    Returns
+    -------
+    array : ndarray
+        Array with data type ``dtype`` created from ``obj`` using ``impl``
+        as backend.
+
+    Examples
+    --------
+    >>> a = asarray([1, 2])
+    >>> a
+    array([1, 2])
+    >>> isinstance(a, np.ndarray)
+    True
+    >>> a = asarray(odl.rn(3).one())
+    >>> a
+    array([ 1.,  1.,  1.])
+    >>> isinstance(a, np.ndarray)
+    True
+
+    Notes
+    -----
+    For ODL tensors, there are specific implementation of the conversion
+    to different types of arrays that are not necessarily invoked by
+    the ``asarray`` methods of array libraries.
+    While ``numpy.asarray`` uses the ``__array__`` method to "ask" the
+    object for an array, there is no such dedicated interface for other
+    implementations. As a result, an intermediate Numpy array is typically
+    created, which, for instance, for ``impl='cupy'`` leads to transfers
+    GPU->CPU->GPU.
+    This function bypasses this detour and uses the optimized
+    ``Tensor.asarray()`` method.
+    """
+    from odl.space.cupy_tensors import cupy, CUPY_AVAILABLE
+    impl, impl_in = str(impl).lower(), impl
+
+    if impl == 'numpy':
+        if CUPY_AVAILABLE and isinstance(obj, cupy.ndarray):
+            # __array__ of cupy.ndarray does not return a Numpy array, see
+            # https://github.com/cupy/cupy/issues/589
+            obj = cupy.asnumpy(obj)
+        return np.asarray(obj, dtype=dtype)
+
+    elif impl == 'cupy':
+        if CUPY_AVAILABLE:
+            try:
+                arr = obj.asarray(impl='cupy')
+            except AttributeError:
+                arr = cupy.asarray(obj, dtype=dtype)
+            else:
+                arr = arr.astype(dtype, copy=False)
+
+            return arr
+
+        else:
+            raise ValueError("`impl` 'cupy' not available")
+
+    else:
+        raise ValueError('`impl` {!r} not understood'.format(impl_in))
+
+
 class writable_array(object):
+
     """Context manager that casts obj to a `numpy.array` and saves changes."""
 
-    def __init__(self, obj, **kwargs):
-        """initialize a new instance.
+    def __init__(self, obj, impl='numpy', **kwargs):
+        """Initialize a new instance.
 
         Parameters
         ----------
         obj : `array-like`
             Object that should be made available as writable array.
             It must be valid as input to `numpy.asarray` and needs to
-            support the syntax ``obj[:] = arr``.
+            support assignment ``obj[:] = arr``.
+        impl : str, optional
+            Array backend for the exposed ndarray.
+
         kwargs :
             Keyword arguments that should be passed to `numpy.asarray`.
 
@@ -702,31 +797,97 @@ def __init__(self, obj, **kwargs):
         """
         self.obj = obj
         self.kwargs = kwargs
+        self.impl = impl
         self.arr = None
 
     def __enter__(self):
-        """called by ``with writable_array(obj):``.
+        """Called by ``with writable_array(obj):``.
 
         Returns
         -------
-        arr : `numpy.ndarray`
-            Array representing ``self.obj``, created by calling
-            ``numpy.asarray``. Any changes to ``arr`` will be passed through
-            to ``self.obj`` after the context manager exits.
+        arr : ndarray
+            Array representing ``self.obj``, created by calling ``asarray``
+            corresponding to the chosen ``impl``, e.g., ``numpy.asarray``.
+            Any changes to ``arr`` will be passed through to ``self.obj``
+            when the context manager exits.
         """
-        self.arr = np.asarray(self.obj, **self.kwargs)
+        self.arr = array_module(self.impl).asarray(self.obj, **self.kwargs)
         return self.arr
 
-    def __exit__(self, type, value, traceback):
-        """called when ``with writable_array(obj):`` ends.
+    def __exit__(self, *args, **kwargs):
+        """Called when ``with writable_array(obj):`` ends.
 
         Saves any changes to ``self.arr`` to ``self.obj``, also "frees"
         self.arr in case the manager is used multiple times.
 
         Extra arguments are ignored, any exceptions are passed through.
         """
-        self.obj[:] = self.arr
-        self.arr = None
+        # Some extra care for Numpy and Cupy arrays
+        from odl.space.npy_tensors import NumpyTensor
+        from odl.space.cupy_tensors import cupy
+
+        obj_is_npy = isinstance(self.obj, (NumpyTensor, np.ndarray))
+        arr_is_cupy = isinstance(self.arr,
+                                 getattr(cupy, 'ndarray', type(None)))
+        if obj_is_npy and arr_is_cupy:
+            arr = cupy.asnumpy(self.arr)
+        else:
+            arr = self.arr
+
+        self.obj[:] = arr
+        self.arr = None  # gc
+
+
+class none_context(object):
+    """Trivial context manager class.
+
+    When used as ::
+
+        with none_context(*args, **kwargs) as obj:
+            # do stuff with `obj`
+
+    the returned ``obj`` is ``None``.
+    """
+    def __init__(self, *args, **kwargs):
+        # Ignore all arguments
+        pass
+
+    def __enter__(self):
+        return None
+
+    def __exit__(self, type, value, traceback):
+        return None
+
+
+def array_module(impl):
+    """Return the array module for ``impl``."""
+    from odl.space.cupy_tensors import cupy
+    if impl == 'numpy':
+        return np
+    elif impl == 'cupy':
+        return cupy
+    else:
+        raise ValueError('`impl` {!r} not understood'.format(impl))
+
+
+def array_cls(impl):
+    """Return the array class for given ``impl``."""
+    return array_module(impl).ndarray
+
+
+def as_numpy(array):
+    """Return a ``numpy.ndarray`` from the given array.
+
+    This is intended for casting from other array implementations to
+    Numpy, not for ODL tensors.
+    """
+    from odl.space.cupy_tensors import cupy
+    if isinstance(array, np.ndarray):
+        return array
+    elif isinstance(array, cupy.ndarray):
+        return cupy.asnumpy(array)
+    else:
+        raise TypeError('type {} not understood'.format(type(array)))
 
 
 def signature_string(posargs, optargs, sep=', ', mod='!r'):