diff --git a/.gitignore b/.gitignore index 5cd5605dc86..875d4781162 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ htmlcov/ .idea/ !ultima/ultima.cpp ultima/ultima +.cache/ diff --git a/clpy/_version.py b/clpy/_version.py index 38da91ee5a4..04188a16d9f 100644 --- a/clpy/_version.py +++ b/clpy/_version.py @@ -1 +1 @@ -__version__ = '2.1.0.1' +__version__ = '2.2.0' diff --git a/clpy/backend/cuda/compiler.py b/clpy/backend/cuda/compiler.py index f8915715239..26f1f725600 100644 --- a/clpy/backend/cuda/compiler.py +++ b/clpy/backend/cuda/compiler.py @@ -195,7 +195,7 @@ def dump(self, f): linum_fmt = '{{:0{}d}} '.format(digits) f.write('NVRTC compilation error: {}\n'.format(self)) f.write('-----\n') - f.write('Name: {}\n'.format(' '.join(self.name))) + f.write('Name: {}\n'.format(self.name)) f.write('Options: {}\n'.format(' '.join(self.options))) f.write('CUDA source:\n') for i, line in enumerate(lines): diff --git a/clpy/backend/cuda/device.pxd b/clpy/backend/cuda/device.pxd index a48a148755d..d32c8630549 100644 --- a/clpy/backend/cuda/device.pxd +++ b/clpy/backend/cuda/device.pxd @@ -1,5 +1,7 @@ cpdef int get_device_id() except * -cpdef get_cublas_handle() +cpdef size_t get_cublas_handle() except * +cpdef size_t get_cusolver_handle() except * +cpdef size_t get_cusparse_handle() except * cdef class Device: cdef: diff --git a/clpy/backend/cuda/device.pyx b/clpy/backend/cuda/device.pyx index f414265b7c3..1369e049a88 100644 --- a/clpy/backend/cuda/device.pyx +++ b/clpy/backend/cuda/device.pyx @@ -24,24 +24,27 @@ cdef dict _cusolver_handles = {} cdef dict _cusparse_handles = {} -cpdef get_cublas_handle(): +cpdef size_t get_cublas_handle() except *: dev_id = get_device_id() - if dev_id in _cublas_handles: - return _cublas_handles[dev_id] + ret = _cublas_handles.get(dev_id, None) + if ret is not None: + return ret return Device().cublas_handle -cpdef get_cusolver_handle(): +cpdef size_t get_cusolver_handle() except *: dev_id = get_device_id() - if dev_id in _cusolver_handles: - return _cusolver_handles[dev_id] + ret = _cusolver_handles.get(dev_id, None) + if ret is not None: + return ret return Device().cusolver_handle -cpdef get_cusparse_handle(): +cpdef size_t get_cusparse_handle() except *: dev_id = get_device_id() - if dev_id in _cusparse_handles: - return _cusparse_handles[dev_id] + ret = _cusparse_handles.get(dev_id, None) + if ret is not None: + return ret return Device().cusparse_handle diff --git a/clpy/core/core.pxd b/clpy/core/core.pxd index 732c93eefe3..3226210da8b 100644 --- a/clpy/core/core.pxd +++ b/clpy/core/core.pxd @@ -71,6 +71,9 @@ cdef class Indexer: readonly Py_ssize_t size readonly tuple shape + +cpdef ndarray ascontiguousarray(ndarray a, dtype=*) + cdef class CArray: cdef: readonly tuple stride diff --git a/clpy/core/core.pyx b/clpy/core/core.pyx index a0b4189c5c5..43fc82ce53a 100644 --- a/clpy/core/core.pyx +++ b/clpy/core/core.pyx @@ -1888,29 +1888,18 @@ include "reduction.pxi" cdef _id = 'out0 = in0' -_elementwise_copy = create_ufunc( +elementwise_copy = create_ufunc( 'clpy_copy', ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', 'q->q', 'Q->Q', ('e->e', _id), 'f->f', 'd->d', 'F->F', 'D->D'), - 'out0 = (out0_type)(in0)') + 'out0 = out0_type(in0)', default_casting='unsafe') # complex numbers requires out0 = complex(in0) - -def elementwise_copy(*args, **kwargs): - kwargs['casting'] = 'unsafe' - return _elementwise_copy(*args, **kwargs) - - -_elementwise_copy_where = create_ufunc( +elementwise_copy_where = create_ufunc( 'clpy_copy_where', 
('??->?', 'b?->b', 'B?->B', 'h?->h', 'H?->H', 'i?->i', 'I?->I', 'l?->l', - 'L?->L', 'q?->q', 'Q?->Q', 'f?->f', 'd?->d', 'F?->F', 'D?->D'), - 'if (in1) out0 = in0') - - -def elementwise_copy_where(*args, **kwargs): - kwargs['casting'] = 'unsafe' - return _elementwise_copy_where(*args, **kwargs) + 'L?->L', 'q?->q', 'Q?->Q', 'e?->e', 'f?->f', 'd?->d', 'F?->F', 'D?->D'), + 'if (in1) out0 = in0', default_casting='unsafe') cdef _divmod_int = string.Template(''' @@ -3465,6 +3454,11 @@ cpdef ndarray matmul(ndarray a, ndarray b, ndarray out=None): cdef Py_ssize_t i, n, m, ka, kb cdef Py_ssize_t batchCount + orig_a_shape = a.shape + orig_b_shape = b.shape + if len(orig_a_shape) == 0 or len(orig_b_shape) == 0: + raise ValueError('Scalar operands are not allowed, use \'*\' instead') + ret_dtype = numpy.result_type(a.dtype, b.dtype) dtype = numpy.find_common_type((ret_dtype, 'f'), ()) @@ -3550,9 +3544,16 @@ cpdef ndarray matmul(ndarray a, ndarray b, ndarray out=None): *la, ka, n = a.shape *lb, m, kb = b.shape - assert ka == kb + if ka != kb: + raise ValueError( + 'shapes ({}) and ({}) not aligned'.format( + ','.join([str(_) for _ in orig_a_shape]), + ','.join([str(_) for _ in orig_b_shape]))) for la_, lb_ in zip(la, lb): - assert la_ == lb_ or la_ == 1 or lb_ == 1 + if not (la_ == lb_ or la_ == 1 or lb_ == 1): + raise ValueError( + 'operands could not be broadcast together with ' + 'remapped shapes') batchCount = 1 # batchCount = numpy.prod(la) for i in la: diff --git a/clpy/core/elementwise.pxi b/clpy/core/elementwise.pxi index 746606db431..d34153cb841 100644 --- a/clpy/core/elementwise.pxi +++ b/clpy/core/elementwise.pxi @@ -567,9 +567,11 @@ cdef class ElementwiseKernel: Args: args: Arguments of the kernel. - size (int): Range size of the indices. If specified, the variable - ``n`` is set to this value. Otherwise, the result of - broadcasting is used to determine the value of ``n``. + size (int): Range size of the indices. By default, the range size + is automatically determined from the result of broadcasting. + This parameter must be specified if and only if all ndarrays + are `raw` and the range size cannot be determined + automatically. Returns: Arrays are returned according to the ``out_params`` argument of the @@ -761,7 +763,9 @@ class ufunc(object): nargs (int): Number of all arguments. """ - def __init__(self, name, nin, nout, ops, preamble='', doc=''): + + def __init__(self, name, nin, nout, ops, preamble='', doc='', + default_casting=None): # TODO(tomoya.sakai): raw array may be possible for ufunc self.name = name self.nin = nin @@ -770,6 +774,10 @@ class ufunc(object): self._ops = ops self._preamble = preamble self.__doc__ = doc + if default_casting is None: + self._default_casting = 'same_kind' + else: + self._default_casting = default_casting _in_params = tuple( ParameterInfo('T in%d' % i, True) for i in range(nin)) @@ -799,9 +807,7 @@ class ufunc(object): return types def __call__(self, *args, **kwargs): - """__call__(*args, **kwargs) - - Applies the universal function to arguments elementwise. + """Applies the universal function to arguments elementwise. Args: args: Input arguments. 
Each of them can be a :class:`clpy.ndarray` @@ -821,7 +827,7 @@ class ufunc(object): out = kwargs.pop('out', None) dtype = kwargs.pop('dtype', None) # Note default behavior of casting is 'same_kind' on numpy>=1.10 - casting = kwargs.pop('casting', 'same_kind') + casting = kwargs.pop('casting', self._default_casting) if dtype is not None: dtype = numpy.dtype(dtype).type if kwargs: @@ -879,7 +885,8 @@ class ufunc(object): return ret -cpdef create_ufunc(name, ops, routine=None, preamble='', doc=''): +cpdef create_ufunc(name, ops, routine=None, preamble='', doc='', + default_casting=None): _ops = [] for t in ops: if not isinstance(t, tuple): @@ -897,4 +904,6 @@ cpdef create_ufunc(name, ops, routine=None, preamble='', doc=''): out_types = tuple([numpy.dtype(t).type for t in out_types]) _ops.append((in_types, out_types, rt)) - return ufunc(name, len(_ops[0][0]), len(_ops[0][1]), _ops, preamble, doc) + ret = ufunc(name, len(_ops[0][0]), len(_ops[0][1]), _ops, preamble, doc, + default_casting=default_casting) + return ret diff --git a/clpy/core/fusion.py b/clpy/core/fusion.py index 73b1c4a2c9b..76756993754 100644 --- a/clpy/core/fusion.py +++ b/clpy/core/fusion.py @@ -734,7 +734,7 @@ def _create_ufunc(clpy_ufunc, numpy_ufunc): clip = ufunc(core._clip, math.misc.clip, numpy.clip) -copy = ufunc(core._elementwise_copy, +copy = ufunc(core.elementwise_copy, creation.from_data.copy, numpy.copy) bitwise_and = _create_ufunc(core.bitwise_and, numpy.bitwise_and) diff --git a/clpy/core/reduction.pxi b/clpy/core/reduction.pxi index e0c61b48f22..ddaf4f67472 100644 --- a/clpy/core/reduction.pxi +++ b/clpy/core/reduction.pxi @@ -398,9 +398,7 @@ class ReductionKernel(object): self.preamble = preamble def __call__(self, *args, **kwargs): - """__call__(*args, **kwargs) - - Compiles and invokes the reduction kernel. + """Compiles and invokes the reduction kernel. The compilation runs only if the kernel is not cached. 
Note that the kernels with different argument dtypes, ndims, or axis are not diff --git a/clpy/cudnn.py b/clpy/cudnn.py deleted file mode 100644 index db91e726f03..00000000000 --- a/clpy/cudnn.py +++ /dev/null @@ -1,338 +0,0 @@ -import atexit -import threading - -import numpy -import six - -import clpy -from clpy.core import internal -from clpy import cuda -from clpy.cuda import cudnn - - -_cudnn_version = cudnn.getVersion() -_thread_local = threading.local() - -_handles = {} - - -def get_handle(): - dev = cuda.get_device_id() - if dev in _handles: - return _handles[dev] - handle = cudnn.create() - _handles[dev] = handle - return handle - - -@atexit.register -def reset_handles(): - global _handles - handles = _handles - _handles = {} - - for handle in six.itervalues(handles): - cudnn.destroy(handle) - - -def _get_nd_tensor_cache(): - if not hasattr(_thread_local, 'cudnn_nd_tensor_cache'): - _thread_local.cudnn_nd_tensor_cache = {} - return _thread_local.cudnn_nd_tensor_cache - - -class Descriptor(object): - - def __init__(self, descriptor, destroyer): - self.value = descriptor - self.destroy = destroyer - - def __del__(self): - if self.value: - self.destroy(self.value) - self.value = None - - -def get_data_type(dtype): - if dtype.type == numpy.float32: - return cudnn.CUDNN_DATA_FLOAT - elif dtype.type == numpy.float64: - return cudnn.CUDNN_DATA_DOUBLE - elif dtype.type == numpy.float16: - return cudnn.CUDNN_DATA_HALF - else: - raise TypeError('Dtype {} is not supported in cuDNN'.format(dtype)) - - -def _to_ctypes_array(tup, dtype=numpy.intc): - return numpy.array(tup, dtype=dtype).ctypes - - -def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): - desc = Descriptor(cudnn.createTensorDescriptor(), - cudnn.destroyTensorDescriptor) - if not arr.flags.c_contiguous: - raise ValueError('clpy.cudnn supports c-contiguous arrays only') - data_type = get_data_type(arr.dtype) - if arr.ndim == 4: - cudnn.setTensor4dDescriptor(desc.value, format, data_type, *arr.shape) - else: - strides = [s // arr.itemsize for s in arr.strides] - c_shape = _to_ctypes_array(arr.shape) - c_strides = _to_ctypes_array(strides) - cudnn.setTensorNdDescriptor(desc.value, data_type, arr.ndim, - c_shape.data, c_strides.data) - return desc - - -def create_uninitialized_tensor_descriptor(): - """Create uninitialized tensor descriptor. - - Create a cudnnCreateTensorDescriptor_t that is not yet initialized. - This is used by the batch normalization functions. 
- """ - desc = Descriptor(cudnn.createTensorDescriptor(), - cudnn.destroyTensorDescriptor) - return desc - - -def create_tensor_nd_descriptor(arr): - desc = Descriptor(cudnn.createTensorDescriptor(), - cudnn.destroyTensorDescriptor) - if not arr.flags.c_contiguous: - raise ValueError('clpy.cudnn supports c-contiguous arrays only') - data_type = get_data_type(arr.dtype) - shape = arr.shape - key = (data_type, shape) - cache = _get_nd_tensor_cache() - if key in cache: - return cache[key] - - # numpy's stride is defined in bytes, but cudnn's stride is defined in - # size of element - strides = [s // arr.itemsize for s in arr.strides] - - c_shape = _to_ctypes_array(shape) - c_strides = _to_ctypes_array(strides) - cudnn.setTensorNdDescriptor(desc.value, data_type, - arr.ndim, c_shape.data, c_strides.data) - cache = _get_nd_tensor_cache() - cache[key] = desc - return desc - - -def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): - desc = Descriptor(cudnn.createFilterDescriptor(), - cudnn.destroyFilterDescriptor) - data_type = get_data_type(arr.dtype) - if arr.ndim == 4: - cudnn.setFilter4dDescriptor_v4(desc.value, data_type, format, - *arr.shape) - else: - c_shape = _to_ctypes_array(arr.shape) - cudnn.setFilterNdDescriptor_v4(desc.value, data_type, format, - arr.ndim, c_shape.data) - return desc - - -def create_convolution_descriptor(pad, stride, dtype, - mode=cudnn.CUDNN_CROSS_CORRELATION, - dilation=(1, 1), - use_tensor_core=False, - group=1): - desc = Descriptor(cudnn.createConvolutionDescriptor(), - cudnn.destroyConvolutionDescriptor) - ndim = len(pad) - if ndim != len(stride): - raise ValueError('pad and stride must be of same length') - - if ndim == 2: - if _cudnn_version < 6000: - if dilation[0] != 1 or dilation[1] != 1: - raise ValueError('dilation must be one when cudnn < 6.0') - if _cudnn_version >= 5000: - compute_type = get_data_type(dtype) - # TODO(takagi) Temporarily use computing precision of FP32 for - # storing precision of FP16. - if dtype == numpy.float16: - compute_type = cudnn.CUDNN_DATA_FLOAT - cudnn.setConvolution2dDescriptor_v5( - desc.value, pad[0], pad[1], stride[0], stride[1], - dilation[0], dilation[1], mode, compute_type) - - if _cudnn_version >= 7000: - if use_tensor_core: - math_type = cudnn.CUDNN_TENSOR_OP_MATH - cudnn.setConvolutionMathType(desc.value, math_type) - if group > 1: - cudnn.setConvolutionGroupCount(desc.value, group) - else: - cudnn.setConvolution2dDescriptor_v4( - desc.value, pad[0], pad[1], stride[0], stride[1], 1, 1, mode) - else: - c_pad = _to_ctypes_array(pad) - c_stride = _to_ctypes_array(stride) - c_dilation = _to_ctypes_array((1,) * ndim) - compute_type = get_data_type(dtype) - # TODO(takagi) Temporarily use computing precision of FP32 for - # storing precision of FP16. 
- if dtype == numpy.float16: - compute_type = cudnn.CUDNN_DATA_FLOAT - cudnn.setConvolutionNdDescriptor_v3( - desc.value, ndim, c_pad.data, c_stride.data, c_dilation.data, - mode, compute_type) - - return desc - - -def create_pooling_descriptor(ksize, stride, pad, mode): - desc = Descriptor(cudnn.createPoolingDescriptor(), - cudnn.destroyPoolingDescriptor) - ndim = len(ksize) - if ndim != len(stride) or ndim != len(pad): - raise ValueError('ksize, stride, and pad must be of same length') - - if ndim == 2: - cudnn.setPooling2dDescriptor_v4( - desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ksize[0], - ksize[1], pad[0], pad[1], stride[0], stride[1]) - else: - c_ksize = _to_ctypes_array(ksize) - c_pad = _to_ctypes_array(pad) - c_stride = _to_ctypes_array(stride) - cudnn.setPoolingNdDescriptor_v4( - desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ndim, - c_ksize.data, c_pad.data, c_stride.data) - - return desc - - -def _as4darray(arr): - if arr.ndim == 0: - return arr.reshape(1, 1, 1, 1) - else: - return arr.reshape(arr.shape[0], -1, 1, 1) - - -def activation_forward(x, mode): - x = clpy.ascontiguousarray(x) - y = clpy.empty_like(x) - - dtype = 'd' if x.dtype == 'd' else 'f' - one = numpy.array(1, dtype=dtype).ctypes - zero = numpy.array(0, dtype=dtype).ctypes - handle = get_handle() - x_mat = _as4darray(x) - desc = create_tensor_descriptor(x_mat) - act_desc = Descriptor(cudnn.createActivationDescriptor(), - cudnn.destroyActivationDescriptor) - cudnn.setActivationDescriptor( - act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) - cudnn.activationForward_v4( - handle, act_desc.value, one.data, desc.value, x_mat.data.ptr, - zero.data, desc.value, y.data.ptr) - return y - - -def activation_backward(x, y, gy, mode): - x = clpy.ascontiguousarray(x) - gy = clpy.ascontiguousarray(gy) - - gx = clpy.empty_like(x) - dtype = 'd' if x.dtype == 'd' else 'f' - one = numpy.array(1, dtype=dtype).ctypes - zero = numpy.array(0, dtype=dtype).ctypes - handle = get_handle() - y_mat = _as4darray(y) - desc = create_tensor_descriptor(y_mat) - act_desc = Descriptor(cudnn.createActivationDescriptor(), - cudnn.destroyActivationDescriptor) - cudnn.setActivationDescriptor( - act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) - cudnn.activationBackward_v4( - handle, act_desc.value, one.data, desc.value, y.data.ptr, - desc.value, gy.data.ptr, desc.value, x.data.ptr, - zero.data, desc.value, gx.data.ptr) - return gx - - -def create_dropout_descriptor( - handle, dropout, states, state_size_in_bytes, seed): - desc = Descriptor(cudnn.createDropoutDescriptor(), - cudnn.destroyDropoutDescriptor) - cudnn.setDropoutDescriptor(desc.value, handle, dropout, - states, state_size_in_bytes, seed) - return desc - - -def set_dropout_descriptor(desc, handle, dropout): - # When the fourth argument is NULL, random state is not updated. 
- cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) - - -def create_rnn_descriptor(hidden_size, num_layers, dropout_desc, - input_mode, direction, mode, data_type): - desc = Descriptor(cudnn.createRNNDescriptor(), - cudnn.destroyRNNDescriptor) - if _cudnn_version >= 7000: - _handle = get_handle() - _algo = cudnn.CUDNN_RNN_ALGO_STANDARD - cudnn.setRNNDescriptor_v6( - _handle, desc.value, hidden_size, num_layers, dropout_desc.value, - input_mode, direction, mode, _algo, data_type) - else: - cudnn.setRNNDescriptor_v5( - desc.value, hidden_size, num_layers, dropout_desc.value, - input_mode, direction, mode, data_type) - return desc - - -def get_rnn_lin_layer_matrix_params( - handle, rnn_desc, layer, x_desc, w_desc, w, lin_layer_id): - mat_desc = Descriptor(cudnn.createFilterDescriptor(), - cudnn.destroyFilterDescriptor) - ptr = numpy.array(0, dtype=numpy.intp) - cudnn.getRNNLinLayerMatrixParams( - handle, rnn_desc.value, layer, x_desc.value, w_desc.value, w.data.ptr, - lin_layer_id, mat_desc.value, ptr.ctypes.data) - offset = (ptr - w.data.ptr) // 4 - _, _, _, dim = cudnn.getFilterNdDescriptor(mat_desc.value, 3) - size = internal.prod(dim) - mat = w[offset: offset + size] - return mat - - -def get_rnn_lin_layer_bias_params( - handle, rnn_desc, layer, x_desc, w_desc, w, lin_layer_id): - bias_desc = Descriptor(cudnn.createFilterDescriptor(), - cudnn.destroyFilterDescriptor) - ptr = numpy.array(0, dtype=numpy.intp) - cudnn.getRNNLinLayerBiasParams( - handle, rnn_desc.value, layer, x_desc.value, w_desc.value, w.data.ptr, - lin_layer_id, bias_desc.value, ptr.ctypes.data) - offset = (ptr - w.data.ptr) // 4 - _, _, _, dim = cudnn.getFilterNdDescriptor(bias_desc.value, 3) - size = internal.prod(dim) - bias = w[offset: offset + size] - return bias - - -def create_dropout_states(handle): - state_size = cudnn.dropoutGetStatesSize(handle) - return clpy.empty((state_size,), dtype='b') - - -def create_spatial_transformer_descriptor(sampler_type, dtype, nb_dims, dim_A): - desc = Descriptor(cudnn.createSpatialTransformerDescriptor(), - cudnn.destroySpatialTransformerDescriptor) - data_type = get_data_type(dtype) - - cudnn.setSpatialTransformerDescriptor( - desc.value, sampler_type, data_type, nb_dims, dim_A) - return desc - - -def add_tensor(handle, alpha, biasDesc, biasData, beta, srcDestDesc, - srcDestData): - cudnn.addTensor_v3(handle, alpha, biasDesc, - biasData, beta, srcDestDesc, srcDestData) diff --git a/clpy/manipulation/join.py b/clpy/manipulation/join.py index bd5c469adc2..bedc998fce3 100644 --- a/clpy/manipulation/join.py +++ b/clpy/manipulation/join.py @@ -120,10 +120,12 @@ def stack(tup, axis=0): .. seealso:: :func:`numpy.stack` """ + # TODO(okuta) Remove this if exampd_dims is updated for x in tup: - if not (-x.ndim <= axis < x.ndim): + if not (-x.ndim - 1 <= axis <= x.ndim): raise core.core._AxisError( - 'axis {} out of bounds [0, {})'.format(axis, x.ndim)) + 'axis {} out of bounds [{}, {}]'.format( + axis, -x.ndim - 1, x.ndim)) return concatenate([clpy.expand_dims(x, axis) for x in tup], axis) diff --git a/clpy/random/distributions.py b/clpy/random/distributions.py index 88f026872b6..1df2302cc7d 100644 --- a/clpy/random/distributions.py +++ b/clpy/random/distributions.py @@ -39,12 +39,12 @@ def gumbel(loc=0.0, scale=1.0, size=None, dtype=float): Returns: clpy.ndarray: Samples drawn from the Gumbel destribution. - .. seealso:: :func:`numpy.random.gumbel` + .. 
seealso:: + :func:`cupy.RandomState.gumbel` + :func:`numpy.random.gumbel` """ - rs = uniform(size=size, dtype=dtype) - # We use `1 - x` as input of `log` method to prevent overflow. - # It obeys numpy implementation. - return _gumbel_kernel(rs, loc, scale, rs) + rs = generator.get_random_state() + return rs.gumbel(loc, scale, size, dtype) def lognormal(mean=0.0, sigma=1.0, size=None, dtype=float): diff --git a/clpy/random/generator.py b/clpy/random/generator.py index 34e350c3c41..9f407393968 100644 --- a/clpy/random/generator.py +++ b/clpy/random/generator.py @@ -14,6 +14,20 @@ from clpy import core +_gumbel_kernel = None + + +def _get_gumbel_kernel(): + global _gumbel_kernel + if _gumbel_kernel is None: + _gumbel_kernel = core.ElementwiseKernel( + 'T x, T loc, T scale', 'T y', + 'y = loc - log(-log(1 - x)) * scale', + 'gumbel_kernel' + ) + return _gumbel_kernel + + class RandomState(object): """Portable container of a pseudo-random number generator. @@ -315,12 +329,12 @@ def choice(self, a, size=None, replace=True, p=None): if p is not None: p = clpy.broadcast_to(p, (size, a_size)) index = clpy.argmax(clpy.log(p) + - clpy.random.gumbel(size=(size, a_size)), + self.gumbel(size=(size, a_size)), axis=1) if not isinstance(shape, six.integer_types): index = clpy.reshape(index, shape) else: - index = clpy.random.randint(0, a_size, size=shape) + index = self.randint(0, a_size, size=shape) # Align the dtype with NumPy index = index.astype(clpy.int64, copy=False) @@ -350,6 +364,51 @@ def shuffle(self, a): clrand.generate(self._generator, sample) a[:] = a[clpy.argsort(sample)] + def gumbel(self, loc=0.0, scale=1.0, size=None, dtype=float): + """Returns an array of samples drawn from a Gumbel distribution. + + .. seealso:: + :func:`clpy.random.gumbel` for full documentation, + :meth:`numpy.random.RandomState.gumbel` + """ + x = self.uniform(size=size, dtype=dtype) + # We use `1 - x` as input of `log` method to prevent overflow. + # It obeys numpy implementation. + _get_gumbel_kernel()(x, loc, scale, x) + return x + + def randint(self, low, high=None, size=None, dtype='l'): + """Returns a scalar or an array of integer values over ``[low, high)``. + + .. seealso:: + :func:`clpy.random.randint` for full documentation, + :meth:`numpy.random.RandomState.randint` + """ + if high is None: + lo = 0 + hi = low + else: + lo = low + hi = high + + if lo >= hi: + raise ValueError('low >= high') + if lo < clpy.iinfo(dtype).min: + raise ValueError( + 'low is out of bounds for {}'.format(clpy.dtype(dtype).name)) + if hi > clpy.iinfo(dtype).max + 1: + raise ValueError( + 'high is out of bounds for {}'.format(clpy.dtype(dtype).name)) + + diff = hi - lo - 1 + if diff > clpy.iinfo(clpy.int32).max - clpy.iinfo(clpy.int32).min + 1: + raise NotImplementedError( + 'Sampling from a range whose extent is larger than int32 ' + 'range is currently not supported') + x = self.interval(diff, size).astype(dtype, copy=False) + clpy.add(x, lo, out=x) + return x + def seed(seed=None): """Resets the state of the random number generator with a seed. 
@@ -368,7 +427,7 @@ def seed(seed=None): get_random_state().seed(seed) -# CuPy specific functions +# ClPy specific functions _random_states = {} diff --git a/clpy/random/sample.py b/clpy/random/sample.py index be49ba40f12..455594ab277 100644 --- a/clpy/random/sample.py +++ b/clpy/random/sample.py @@ -1,6 +1,5 @@ import six -import clpy from clpy import core from clpy.creation import basic from clpy.random import distributions @@ -81,31 +80,8 @@ def randint(low, high=None, size=None, dtype='l'): If size is integer, it is the 1D-array of length ``size`` element. Otherwise, it is the array whose shape specified by ``size``. """ - if high is None: - lo = 0 - hi = low - else: - lo = low - hi = high - - if lo >= hi: - raise ValueError('low >= high') - if lo < clpy.iinfo(dtype).min: - raise ValueError( - 'low is out of bounds for {}'.format(clpy.dtype(dtype).name)) - if hi > clpy.iinfo(dtype).max + 1: - raise ValueError( - 'high is out of bounds for {}'.format(clpy.dtype(dtype).name)) - - diff = hi - lo - 1 - if diff > clpy.iinfo(clpy.int32).max - clpy.iinfo(clpy.int32).min + 1: - raise NotImplementedError( - 'Sampling from a range whose extent is larger than int32 range is ' - 'currently not supported') rs = generator.get_random_state() - x = rs.interval(diff, size).astype(dtype, copy=False) - clpy.add(x, lo, out=x) - return x + return rs.randint(low, high, size, dtype) def random_integers(low, high=None, size=None): diff --git a/clpy/testing/helper.py b/clpy/testing/helper.py index 5680c812c35..f4157374e09 100644 --- a/clpy/testing/helper.py +++ b/clpy/testing/helper.py @@ -47,7 +47,11 @@ def _check_clpy_numpy_error(self, clpy_error, clpy_tb, numpy_error, self.fail('Only numpy raises error\n\n' + numpy_tb) elif numpy_error is None: self.fail('Only clpy raises error\n\n' + clpy_tb) - elif type(clpy_error) is not type(numpy_error): + elif not isinstance(clpy_error, type(numpy_error)): + # CuPy errors should be at least as explicit as the NumPy errors, i.e. + # allow CuPy errors to derive from NumPy errors but not the opposite. + # This ensures that try/except blocks that catch NumPy errors also + # catch CuPy errors. msg = '''Different types of errors occurred clpy @@ -56,7 +60,8 @@ def _check_clpy_numpy_error(self, clpy_error, clpy_tb, numpy_error, %s ''' % (clpy_tb, numpy_tb) self.fail(msg) - elif not isinstance(clpy_error, accept_error): + elif not (isinstance(clpy_error, accept_error) and + isinstance(numpy_error, accept_error)): msg = '''Both clpy and numpy raise exceptions clpy @@ -406,7 +411,7 @@ def numpy_clpy_equal(name='xp', sp_name=None): ``scipy.sparse`` or ``clpy.sparse`` module. If ``None``, no argument is given for the modules. - Decorated test fixture is required throw same errors + Decorated test fixture is required to return the same results even if ``xp`` is ``numpy`` or ``clpy``. """ def decorator(impl): @@ -441,6 +446,12 @@ def numpy_clpy_raises(name='xp', sp_name=None, accept_error=Exception): sp_name(str or None): Argument name whose value is either ``scipy.sparse`` or ``clpy.sparse`` module. If ``None``, no argument is given for the modules. + accept_error(bool, Exception or tuple of Exception): Specify + acceptable errors. When both NumPy test and CuPy test raises the + same type of errors, and the type of the errors is specified with + this argument, the errors are ignored and not raised. + If it is ``True`` all error types are acceptable. + If it is ``False`` no error is acceptable. 
Decorated test fixture is required throw same errors even if ``xp`` is ``numpy`` or ``clpy``. diff --git a/clpy/testing/random.py b/clpy/testing/random.py index 52a6bfbbcc6..908992fba4e 100644 --- a/clpy/testing/random.py +++ b/clpy/testing/random.py @@ -88,7 +88,7 @@ def _teardown_random(): def generate_seed(): assert _nest_count > 0, 'random is not set up' - return numpy.random.randint(0xffffffff, dtype=numpy.int64) + return numpy.random.randint(0x7fffffff) def fix_random(): diff --git a/clpy_setup_build.py b/clpy_setup_build.py index 182a982a3cc..a5a3b788c7f 100644 --- a/clpy_setup_build.py +++ b/clpy_setup_build.py @@ -126,6 +126,70 @@ def launch_headercvt(): ], 'check_method': build.check_opencl_version, }, + # { + # 'name': 'cudnn', + # 'file': [ + # 'cupy.cuda.cudnn', + # 'cupy.cudnn', + # ], + # 'include': [ + # 'cudnn.h', + # ], + # 'libraries': [ + # 'cudnn', + # ], + # 'check_method': build.check_cudnn_version, + # }, + # { + # 'name': 'nccl', + # 'file': [ + # 'cupy.cuda.nccl', + # ], + # 'include': [ + # 'nccl.h', + # ], + # 'libraries': [ + # 'nccl', + # ], + # 'check_method': build.check_nccl_version, + # }, + # { + # 'name': 'cusolver', + # 'file': [ + # 'cupy.cuda.cusolver', + # ], + # 'include': [ + # 'cusolverDn.h', + # ], + # 'libraries': [ + # 'cusolver', + # ], + # 'check_method': build.check_cusolver_version, + # }, + # { + # # The value of the key 'file' is a list that contains extension names + # # or tuples of an extension name and a list of other souces files + # # required to build the extension such as .cpp files and .cu files. + # # + # # | (, a list of ) + # # + # # The extension name is also interpreted as the name of the Cython + # # source file required to build the extension with appending '.pyx' + # # file extension. + # 'name': 'thrust', + # 'file': [ + # ('cupy.cuda.thrust', ['cupy/cuda/cupy_thrust.cu']), + # ], + # 'include': [ + # 'thrust/device_ptr.h', + # 'thrust/sequence.h', + # 'thrust/sort.h', + # ], + # 'libraries': [ + # 'cudart', + # ], + # 'check_method': build.check_cuda_version, + # } ] @@ -288,6 +352,9 @@ def cythonize(extensions, arg_options): directive_keys = ('linetrace', 'profile') directives = {key: arg_options[key] for key in directive_keys} + # Embed signatures for Sphinx documentation. 
+ directives['embedsignature'] = True + cythonize_option_keys = ('annotate',) cythonize_options = {key: arg_options[key] for key in cythonize_option_keys} diff --git a/cupy/cudnn.pyx b/cupy/cudnn.pyx new file mode 100644 index 00000000000..baa630a6a1c --- /dev/null +++ b/cupy/cudnn.pyx @@ -0,0 +1,389 @@ +from libcpp cimport vector + +import atexit +import threading + +import numpy + +from cupy.core cimport core +from cupy.cuda cimport cudnn +from cupy.cuda cimport device +from cupy.cuda cimport memory + +import cupy +from cupy.core import internal +from cupy.cuda import cudnn as py_cudnn + + +cdef int _cudnn_version = cudnn.getVersion() +cdef _thread_local = threading.local() + +cdef vector.vector[size_t] _handles + + +cpdef size_t get_handle() except *: + cdef int dev + dev = device.get_device_id() + if _handles.size() <= dev: + _handles.resize(dev + 1, 0) + ret = _handles[dev] + if ret != 0: + return ret + ret = cudnn.create() + _handles[dev] = ret + return ret + + +@atexit.register +def reset_handles(): + for handle in _handles: + if handle: + cudnn.destroy(handle) + _handles.clear() + + +cpdef dict _get_nd_tensor_cache(): + if not hasattr(_thread_local, 'cudnn_nd_tensor_cache'): + _thread_local.cudnn_nd_tensor_cache = {} + return _thread_local.cudnn_nd_tensor_cache + + +class Descriptor(object): + + def __init__(self, descriptor, destroyer): + self.value = descriptor + self.destroy = destroyer + + def __del__(self): + if self.value: + self.destroy(self.value) + self.value = None + + +cpdef get_data_type(dtype): + t = dtype.type + if t is numpy.float32: + return cudnn.CUDNN_DATA_FLOAT + elif t is numpy.float64: + return cudnn.CUDNN_DATA_DOUBLE + elif t is numpy.float16: + return cudnn.CUDNN_DATA_HALF + else: + raise TypeError('Dtype {} is not supported in cuDNN'.format(dtype)) + + +cpdef _create_tensor_nd_descriptor( + size_t desc, core.ndarray arr, int data_type): + cdef vector.vector[int] c_shape, c_strides + cdef Py_ssize_t itemsize, s + itemsize = arr.itemsize + for s in arr._strides: + c_strides.push_back(s // itemsize) + for s in arr._shape: + c_shape.push_back(s) + cudnn.setTensorNdDescriptor( + desc, data_type, arr.ndim, &c_shape[0], &c_strides[0]) + + +cpdef _create_tensor_descriptor(size_t desc, core.ndarray arr, int format): + if not arr.flags.c_contiguous: + raise ValueError('cupy.cudnn supports c-contiguous arrays only') + data_type = get_data_type(arr.dtype) + if arr._shape.size() == 4: + n, c, h, w = arr.shape + cudnn.setTensor4dDescriptor(desc, format, data_type, n, c, h, w) + else: + _create_tensor_nd_descriptor(desc, arr, data_type) + + +cpdef _create_filter_descriptor( + size_t desc, core.ndarray arr, int format=cudnn.CUDNN_TENSOR_NCHW): + cdef vector.vector[int] c_shape + cdef Py_ssize_t s + data_type = get_data_type(arr.dtype) + if arr._shape.size() == 4: + n, c, h, w = arr.shape + cudnn.setFilter4dDescriptor_v4( + desc, data_type, format, n, c, h, w) + else: + for s in arr._shape: + c_shape.push_back(s) + cudnn.setFilterNdDescriptor_v4( + desc, data_type, format, arr.ndim, &c_shape[0]) + + +cpdef _create_convolution_descriptor( + desc, pad, stride, dtype, mode, dilation, int group, + bint use_tensor_core): + cdef int d0, d1, p0, p1, s0, s1 + cdef vector.vector[int] c_pad, c_stride, c_dilation + ndim = len(pad) + if ndim != len(stride): + raise ValueError('pad and stride must be of same length') + + compute_type = get_data_type(dtype) + # TODO(takagi) Temporarily use computing precision of FP32 for + # storing precision of FP16. 
+ if compute_type == cudnn.CUDNN_DATA_HALF: + compute_type = cudnn.CUDNN_DATA_FLOAT + + if ndim != 2: + c_pad = pad + c_stride = stride + c_dilation.assign(ndim, 1) + cudnn.setConvolutionNdDescriptor_v3( + desc, ndim, &c_pad[0], &c_stride[0], + &c_dilation[0], mode, compute_type) + return + + d0, d1 = dilation + p0, p1 = pad + s0, s1 = stride + if _cudnn_version < 6000 and (d0 != 1 or d1 != 1): + raise ValueError('dilation must be one when cudnn < 6.0') + if _cudnn_version >= 5000: + cudnn.setConvolution2dDescriptor_v5( + desc, p0, p1, s0, s1, d0, d1, mode, compute_type) + if _cudnn_version >= 7000 and use_tensor_core: + math_type = cudnn.CUDNN_TENSOR_OP_MATH + cudnn.setConvolutionMathType(desc, math_type) + if group > 1: + cudnn.setConvolutionGroupCount(desc.value, group) + else: + cudnn.setConvolution2dDescriptor_v4(desc, p0, p1, s0, s1, 1, 1, mode) + + +def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): + desc = Descriptor(cudnn.createTensorDescriptor(), + py_cudnn.destroyTensorDescriptor) + _create_tensor_descriptor(desc.value, arr, format) + return desc + + +def create_uninitialized_tensor_descriptor(): + """Create uninitialized tensor descriptor. + + Create a cudnnCreateTensorDescriptor_t that is not yet initialized. + This is used by the batch normalization functions. + """ + return Descriptor(cudnn.createTensorDescriptor(), + py_cudnn.destroyTensorDescriptor) + + +def create_tensor_nd_descriptor(core.ndarray arr): + cdef dict cache + if not arr.flags.c_contiguous: + raise ValueError('cupy.cudnn supports c-contiguous arrays only') + data_type = get_data_type(arr.dtype) + shape = arr.shape + key = (data_type, shape) + cache = _get_nd_tensor_cache() + if key in cache: + return cache[key] + + # numpy's stride is defined in bytes, but cudnn's stride is defined in + # size of element + desc = Descriptor(cudnn.createTensorDescriptor(), + py_cudnn.destroyTensorDescriptor) + _create_tensor_nd_descriptor(desc.value, arr, data_type) + cache[key] = desc + return desc + + +def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): + desc = Descriptor(cudnn.createFilterDescriptor(), + py_cudnn.destroyFilterDescriptor) + _create_filter_descriptor(desc.value, arr, format) + return desc + + +def create_convolution_descriptor(pad, stride, dtype, + mode=cudnn.CUDNN_CROSS_CORRELATION, + dilation=(1, 1), + use_tensor_core=False, + group=1): + desc = Descriptor(cudnn.createConvolutionDescriptor(), + py_cudnn.destroyConvolutionDescriptor) + _create_convolution_descriptor( + desc.value, pad, stride, dtype, mode, dilation, group, use_tensor_core) + return desc + + +def create_pooling_descriptor(ksize, stride, pad, mode): + cdef vector.vector[int] c_ksize, c_pad, c_stride + ndim = len(ksize) + if ndim != len(stride) or ndim != len(pad): + raise ValueError('ksize, stride, and pad must be of same length') + desc = Descriptor(cudnn.createPoolingDescriptor(), + py_cudnn.destroyPoolingDescriptor) + if ndim == 2: + cudnn.setPooling2dDescriptor_v4( + desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ksize[0], + ksize[1], pad[0], pad[1], stride[0], stride[1]) + else: + c_ksize = ksize + c_pad = pad + c_stride = stride + cudnn.setPoolingNdDescriptor_v4( + desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ndim, + &c_ksize[0], &c_pad[0], &c_stride[0]) + + return desc + + +cpdef core.ndarray _as4darray(core.ndarray arr): + if arr.ndim == 0: + return arr.reshape(1, 1, 1, 1) + return arr.reshape(arr.shape[0], -1, 1, 1) + + +def activation_forward(core.ndarray x, int mode): + cdef float float_zero = 
0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + + x = core.ascontiguousarray(x) + y = cupy.empty_like(x) + x = _as4darray(x) + + handle = get_handle() + desc = cudnn.createTensorDescriptor() + act_desc = cudnn.createActivationDescriptor() + try: + _create_tensor_descriptor(desc, x, cudnn.CUDNN_TENSOR_NCHW) + cudnn.setActivationDescriptor( + act_desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) + cudnn.activationForward_v4( + handle, act_desc, one, desc, x.data.ptr, + zero, desc, y.data.ptr) + finally: + cudnn.destroyActivationDescriptor(act_desc) + cudnn.destroyTensorDescriptor(desc) + return y + + +def activation_backward(core.ndarray x, core.ndarray y, core.ndarray gy, + int mode): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + + gx = cupy.empty_like(x) + x = core.ascontiguousarray(x) + gy = core.ascontiguousarray(gy) + y_mat = _as4darray(y) + + handle = get_handle() + desc = cudnn.createTensorDescriptor() + act_desc = cudnn.createActivationDescriptor() + try: + _create_tensor_descriptor(desc, y_mat, cudnn.CUDNN_TENSOR_NCHW) + cudnn.setActivationDescriptor( + act_desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) + cudnn.activationBackward_v4( + handle, act_desc, one, desc, y.data.ptr, + desc, gy.data.ptr, desc, x.data.ptr, + zero, desc, gx.data.ptr) + finally: + cudnn.destroyActivationDescriptor(act_desc) + cudnn.destroyTensorDescriptor(desc) + return gx + + +def create_dropout_descriptor( + handle, dropout, states, state_size_in_bytes, seed): + desc = Descriptor(cudnn.createDropoutDescriptor(), + py_cudnn.destroyDropoutDescriptor) + cudnn.setDropoutDescriptor(desc.value, handle, dropout, + states, state_size_in_bytes, seed) + return desc + + +def set_dropout_descriptor(desc, handle, dropout): + # When the fourth argument is NULL, random state is not updated. 
+ cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) + + +def create_rnn_descriptor(hidden_size, num_layers, dropout_desc, + input_mode, direction, mode, data_type): + desc = Descriptor(cudnn.createRNNDescriptor(), + py_cudnn.destroyRNNDescriptor) + if _cudnn_version >= 7000: + _handle = get_handle() + _algo = cudnn.CUDNN_RNN_ALGO_STANDARD + cudnn.setRNNDescriptor_v6( + _handle, desc.value, hidden_size, num_layers, dropout_desc.value, + input_mode, direction, mode, _algo, data_type) + else: + cudnn.setRNNDescriptor_v5( + desc.value, hidden_size, num_layers, dropout_desc.value, + input_mode, direction, mode, data_type) + return desc + + +def get_rnn_lin_layer_matrix_params( + handle, rnn_desc, layer, x_desc, w_desc, core.ndarray w, lin_layer_id): + cdef size_t ptr = 0 + w_data_ptr = w.data.ptr + mat_desc = cudnn.createFilterDescriptor() + try: + cudnn.getRNNLinLayerMatrixParams( + handle, rnn_desc.value, layer, x_desc.value, w_desc.value, + w.data.ptr, lin_layer_id, mat_desc, &ptr) + _, _, _, dim = cudnn.getFilterNdDescriptor(mat_desc, 3) + finally: + cudnn.destroyFilterDescriptor(mat_desc) + offset = (ptr - w.data.ptr) // 4 + size = internal.prod(dim) + mat = w[offset: offset + size] + return mat + + +def get_rnn_lin_layer_bias_params( + handle, rnn_desc, layer, x_desc, w_desc, core.ndarray w, lin_layer_id): + cdef size_t ptr = 0 + bias_desc = cudnn.createFilterDescriptor() + try: + cudnn.getRNNLinLayerBiasParams( + handle, rnn_desc.value, layer, x_desc.value, w_desc.value, + w.data.ptr, lin_layer_id, bias_desc, &ptr) + _, _, _, dim = cudnn.getFilterNdDescriptor(bias_desc, 3) + finally: + cudnn.destroyFilterDescriptor(bias_desc) + offset = (ptr - w.data.ptr) // 4 + size = internal.prod(dim) + bias = w[offset: offset + size] + return bias + + +def create_dropout_states(handle): + state_size = cudnn.dropoutGetStatesSize(handle) + return cupy.empty((state_size,), dtype='b') + + +def create_spatial_transformer_descriptor(sampler_type, dtype, nb_dims, dim_A): + desc = Descriptor(cudnn.createSpatialTransformerDescriptor(), + py_cudnn.destroySpatialTransformerDescriptor) + data_type = get_data_type(dtype) + + cudnn.setSpatialTransformerDescriptor( + desc.value, sampler_type, data_type, nb_dims, dim_A) + return desc + + +def add_tensor(handle, alpha, biasDesc, biasData, beta, srcDestDesc, + srcDestData): + cudnn.addTensor_v3(handle, alpha, biasDesc, + biasData, beta, srcDestDesc, srcDestData) diff --git a/docker/python2/Dockerfile b/docker/python2/Dockerfile index 3d7ca934793..d844f54ef11 100644 --- a/docker/python2/Dockerfile +++ b/docker/python2/Dockerfile @@ -6,4 +6,4 @@ RUN apt-get update -y && \ python-pip && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -RUN pip install cupy==2.1.0.1 +RUN pip install cupy==2.2.0 diff --git a/docker/python3/Dockerfile b/docker/python3/Dockerfile index 170ec5e1167..da0b905f193 100644 --- a/docker/python3/Dockerfile +++ b/docker/python3/Dockerfile @@ -6,4 +6,4 @@ RUN apt-get update -y && \ python3-pip && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -RUN pip3 install cupy==2.1.0.1 +RUN pip3 install cupy==2.2.0 diff --git a/docs/source/install.rst b/docs/source/install.rst index 850069722cc..69d5085e19e 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -23,9 +23,8 @@ The following versions of Python can be used: 2.7.6+, 3.4.3+, 3.5.1+, and 3.6.0+ .. note:: We are testing CuPy automatically with Jenkins, where all the above *recommended* environments are tested. 
- We cannot guarantee that CuPy works on other environments including Windows and macOS (especially with CUDA support), even if CuPy looks running correctly. + We cannot guarantee that CuPy works on other environments including Windows and macOS, even if CuPy looks running correctly. -CuPy is supported on Python 2.7.6+, 3.4.3+, 3.5.1+, 3.6.0+. CuPy uses C++ compiler such as g++. You need to install it before installing CuPy. This is typical installation method for each platform:: @@ -54,17 +53,17 @@ The latest version of each package will automatically be installed if missing. * `NumPy `_ 1.9, 1.10, 1.11, 1.12, 1.13 * `Six `_ 1.9+ -CUDA support +In addition, you need to install `CUDA `_. +The following versions of CUDA can be used: 7.0, 7.5, 8.0 and 9.0. -* `CUDA `_ 7.0, 7.5, 8.0, 9.0 +Optional Libraries +~~~~~~~~~~~~~~~~~~ -cuDNN support +The following libraries are optional dependencies. +CuPy will enable these features only if they are installed. * `cuDNN `_ v4, v5, v5.1, v6, v7 - -NCCL support - -* `nccl `_ v1.3+ +* `NCCL `_ v1.3+ Install CuPy ------------ diff --git a/docs/source/reference/environment.rst b/docs/source/reference/environment.rst index d2c804eddcb..7907b090e80 100644 --- a/docs/source/reference/environment.rst +++ b/docs/source/reference/environment.rst @@ -6,7 +6,7 @@ Here are the environment variables CuPy uses. +------------------------------------+----------------------------------------------------+ | ``CUPY_CACHE_DIR`` | Path to the directory to store kernel cache. | -| | ``$(HOME)/.cupy.kernel_cache`` is used by default. | +| | ``${HOME}/.cupy/kernel_cache`` is used by default. | | | See :ref:`overview` for details. | +------------------------------------+----------------------------------------------------+ | ``CUPY_CACHE_SAVE_CUDA_SOURCE`` | If set to 1, CUDA source file will be saved along | diff --git a/tests/clpy_tests/manipulation_tests/test_join.py b/tests/clpy_tests/manipulation_tests/test_join.py index 982efad66ce..3db5fed9f70 100644 --- a/tests/clpy_tests/manipulation_tests/test_join.py +++ b/tests/clpy_tests/manipulation_tests/test_join.py @@ -173,10 +173,28 @@ def test_stack_value(self): @testing.with_requires('numpy>=1.10') @testing.numpy_clpy_array_equal() - def test_stack_with_axis(self, xp): + def test_stack_with_axis1(self, xp): a = testing.shaped_arange((2, 3), xp) return xp.stack((a, a), axis=1) + @testing.with_requires('numpy>=1.10') + @testing.numpy_clpy_array_equal() + def test_stack_with_axis2(self, xp): + a = testing.shaped_arange((2, 3), xp) + return xp.stack((a, a), axis=2) + + @testing.with_requires('numpy>=1.10') + @testing.numpy_clpy_raises() + def test_stack_with_axis_over(self, xp): + a = testing.shaped_arange((2, 3), xp) + try: + return xp.stack((a, a), axis=3) + except IndexError: + # For 'numpy<=1.12', catch both IndexError from NumPy and + # IndexOrValueError from CuPy. For 'numpy>=1.13', simply do not + # catch the AxisError. 
+ raise IndexError() + def test_stack_with_axis_value(self): a = testing.shaped_arange((2, 3), clpy) s = clpy.stack((a, a), axis=1) diff --git a/tests/clpy_tests/math_tests/test_matmul.py b/tests/clpy_tests/math_tests/test_matmul.py index 71d0f6deaa0..c80925bcbda 100644 --- a/tests/clpy_tests/math_tests/test_matmul.py +++ b/tests/clpy_tests/math_tests/test_matmul.py @@ -104,6 +104,30 @@ def test_clpy_matmul(self, xp, dtype1, dtype2): if ((dtype1, dtype2) in self.skip_dtypes or (dtype2, dtype1) in self.skip_dtypes): return xp.array([]) - x1 = testing.shaped_arange(self.shape_pair[0], xp, dtype1) - x2 = testing.shaped_arange(self.shape_pair[1], xp, dtype2) + shape1, shape2 = self.shape_pair + x1 = testing.shaped_arange(shape1, xp, dtype1) + x2 = testing.shaped_arange(shape2, xp, dtype2) return xp.matmul(x1, x2) + + +@testing.parameterize( + *testing.product({ + 'shape_pair': [ + ((5, 3, 1), (3, 1, 4)), + ((3, 2, 3), (3, 2, 4)), + ((3, 2), ()), + ((), (3, 2)), + ((), ()), + ((3, 2), (1,)), + ], + })) +@testing.gpu +class TestMatmulInvalidShape(unittest.TestCase): + + @testing.with_requires('numpy>=1.10') + @testing.numpy_clpy_raises(accept_error=ValueError) + def test_invalid_shape(self, xp): + shape1, shape2 = self.shape_pair + x1 = testing.shaped_arange(shape1, xp, numpy.float32) + x2 = testing.shaped_arange(shape2, xp, numpy.float32) + xp.matmul(x1, x2) diff --git a/tests/clpy_tests/random_tests/test_generator.py b/tests/clpy_tests/random_tests/test_generator.py index b171efb2565..357a5cc3497 100644 --- a/tests/clpy_tests/random_tests/test_generator.py +++ b/tests/clpy_tests/random_tests/test_generator.py @@ -1,5 +1,3 @@ -import mock -import operator import os import threading import unittest @@ -8,268 +6,308 @@ import six import clpy -from clpy import backend -import clpy.backend.opencl.random as clrand from clpy import core +from clpy import backend from clpy.random import generator from clpy import testing from clpy.testing import condition from clpy.testing import hypothesis -class FunctionSwitcher(object): +class RandomGeneratorTestCase(unittest.TestCase): - def __init__(self, f): - self.tmp = f - self.func_name = f.__name__ + target_method = None - def __enter__(self): - setattr(clrand, self.func_name, mock.Mock()) + def setUp(self): + self.rs = generator.RandomState(seed=testing.generate_seed()) - def __exit__(self, *_): - setattr(clrand, self.func_name, self.tmp) + def _get_generator_func(self, *args, **kwargs): + assert isinstance(self.target_method, str), ( + 'generate_method must be overridden') + f = getattr(self.rs, self.target_method) + return lambda: f(*args, **kwargs) + + def _generate_check_repro(self, func, seed=0): + # Sample a random array while checking reproducibility + self.rs.seed(seed) + x = func() + self.rs.seed(seed) + y = func() + testing.assert_array_equal( + x, y, + 'Randomly generated arrays with the same seed did not match') + return x + + def generate(self, *args, **kwargs): + # Pick one sample from generator. + # Reproducibility is checked by repeating seed-and-sample cycle twice. + func = self._get_generator_func(*args, **kwargs) + return self._generate_check_repro(func, seed=0) + + def generate_many(self, *args, **kwargs): + # Pick many samples from generator. + # Reproducibility is checked only for the first sample, + # because it's very slow to set seed every time. 
+ _count = kwargs.pop('_count', None) + assert _count is not None, '_count is required' + func = self._get_generator_func(*args, **kwargs) + + if _count == 0: + return [] + + vals = [self._generate_check_repro(func, seed=0)] + for i in range(1, _count): + vals.append(func()) + return vals @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu class TestRandomState(unittest.TestCase): - _multiprocess_can_split_ = True - args = (0.0, 1.0) - size = None - def setUp(self): self.rs = generator.RandomState(seed=testing.generate_seed()) - """ - def check_lognormal(self, clrand_func, dtype): - shape = core.get_size(self.size) - exp_size = six.moves.reduce(operator.mul, shape, 1) - if exp_size % 2 == 1: - exp_size += 1 - - clrand_func.return_value = clpy.zeros(exp_size, dtype=dtype) - out = self.rs.lognormal(self.args[0], self.args[1], self.size, dtype) - gen, _, mean, sigma = clrand_func.call_args[0] - self.assertIs(gen, self.rs._generator) - self.assertEqual(size, exp_size) - self.assertIs(mean, self.args[0]) - self.assertIs(sigma, self.args[1]) - self.assertEqual(out.shape, shape) - - def test_lognormal_float(self): - with FunctionSwitcher(clrand.generateLogNormalDouble): - self.check_lognormal(clrand.generateLogNormalDouble, float) - - def test_lognormal_float32(self): - with FunctionSwitcher(clrand.generateLogNormal): - self.check_lognormal(clrand.generateLogNormal, numpy.float32) - - def test_lognormal_float64(self): - with FunctionSwitcher(clrand.generateLogNormalDouble): - self.check_lognormal(clrand.generateLogNormalDouble, numpy.float64) - """ - - def check_normal(self, clrand_func, dtype): - shape = core.get_size(self.size) - exp_size = six.moves.reduce(operator.mul, shape, 1) - if exp_size % 2 == 1: - exp_size += 1 - - clrand_func.return_value = clpy.zeros(exp_size, dtype=dtype) - out = self.rs.normal(self.args[0], self.args[1], self.size, dtype) - gen, _, loc, scale = clrand_func.call_args[0] - self.assertIs(gen, self.rs._generator) - self.assertIs(loc, self.args[0]) - self.assertIs(scale, self.args[1]) - self.assertEqual(out.shape, shape) - - def test_normal_float32(self): - with FunctionSwitcher(clrand.generateNormal): - self.check_normal(clrand.generateNormal, numpy.float32) - - def test_normal_float64(self): - with FunctionSwitcher(clrand.generateNormalDouble): - self.check_normal(clrand.generateNormalDouble, numpy.float64) - - def check_random_sample(self, clrand_func, dtype): - out = self.rs.random_sample(self.size, dtype) - clrand_func.assert_called_once_with(self.rs._generator, out) + def check_seed(self, seed): + rs = self.rs - def test_random_sample_float32(self): - with FunctionSwitcher(clrand.generateUniform): - self.check_random_sample(clrand.generateUniform, numpy.float32) + rs.seed(seed) + xs1 = [rs.uniform() for _ in range(100)] - def test_random_sample_float64(self): - with FunctionSwitcher(clrand.generateUniformDouble): - self.check_random_sample( - clrand.generateUniformDouble, numpy.float64) + rs.seed(seed) + xs2 = [rs.uniform() for _ in range(100)] - def check_seed(self, clrand_func, seed): - self.rs.seed(seed) - call_args_list = clrand_func.call_args_list - self.assertEqual(1, len(call_args_list)) - call_args = call_args_list[0][0] - self.assertEqual(2, len(call_args)) - self.assertIs(self.rs._generator, call_args[0]) - self.assertEqual(numpy.uint64, call_args[1].dtype) + rs.seed(seed) + rs.seed(None) + xs3 = [rs.uniform() for _ in range(100)] - def test_seed_none(self): - with FunctionSwitcher(clrand.setPseudoRandomGeneratorSeed): - 
self.check_seed(clrand.setPseudoRandomGeneratorSeed, None) + # Random state must be reproducible + assert xs1 == xs2 + # Random state must be initialized randomly with seed=None + assert xs1 != xs3 @testing.for_int_dtypes() def test_seed_not_none(self, dtype): - with FunctionSwitcher(clrand.setPseudoRandomGeneratorSeed): - self.check_seed(clrand.setPseudoRandomGeneratorSeed, dtype(0)) + self.check_seed(dtype(0)) @testing.for_dtypes([numpy.complex_]) def test_seed_invalid_type_complex(self, dtype): with self.assertRaises(TypeError): - with FunctionSwitcher(clrand.setPseudoRandomGeneratorSeed): - self.check_seed(clrand.setPseudoRandomGeneratorSeed, dtype(0)) + self.rs.seed(dtype(0)) @testing.for_float_dtypes() def test_seed_invalid_type_float(self, dtype): with self.assertRaises(TypeError): - with FunctionSwitcher(clrand.setPseudoRandomGeneratorSeed): - self.check_seed(clrand.setPseudoRandomGeneratorSeed, dtype(0)) + self.rs.seed(dtype(0)) @testing.gpu -class TestRandomState2(TestRandomState): +@testing.parameterize(*[ + {'args': (0.0, 1.0), 'size': None}, + {'args': (10.0, 20.0), 'size': None}, + {'args': (0.0, 1.0), 'size': 10}, + {'args': (0.0, 1.0), 'size': (1, 2, 3)}, + {'args': (0.0, 1.0), 'size': 3}, + {'args': (0.0, 1.0), 'size': (3, 3)}, + {'args': (0.0, 1.0), 'size': ()}, +]) +@testing.fix_random() +class TestLogNormal(RandomGeneratorTestCase): - args = (10.0, 20.0) - size = None + target_method = 'lognormal' + def check_lognormal(self, dtype): + vals = self.generate_many( + self.args[0], self.args[1], self.size, dtype, _count=10) -@testing.gpu -class TestRandomState3(TestRandomState): + shape = core.get_size(self.size) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == dtype + assert val.shape == shape + assert (0 <= val).all() + # TODO(niboshi): Distribution test - args = (0.0, 1.0) - size = 10 + def test_lognormal_float(self): + self.check_lognormal(float) + + def test_lognormal_float32(self): + self.check_lognormal(numpy.float32) + + def test_lognormal_float64(self): + self.check_lognormal(numpy.float64) @testing.gpu -class TestRandomState4(TestRandomState): +@testing.parameterize(*[ + {'args': (0.0, 1.0), 'size': None}, + {'args': (10.0, 20.0), 'size': None}, + {'args': (0.0, 1.0), 'size': 10}, + {'args': (0.0, 1.0), 'size': (1, 2, 3)}, + {'args': (0.0, 1.0), 'size': 3}, + {'args': (0.0, 1.0), 'size': (3, 3)}, + {'args': (0.0, 1.0), 'size': ()}, +]) +@testing.fix_random() +class TestNormal(RandomGeneratorTestCase): - args = (0.0, 1.0) - size = (1, 2, 3) + target_method = 'normal' + def check_normal(self, dtype): + vals = self.generate_many( + self.args[0], self.args[1], self.size, dtype, _count=10) -@testing.gpu -class TestRandomState6(TestRandomState): + shape = core.get_size(self.size) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == dtype + assert val.shape == shape + # TODO(niboshi): Distribution test - args = (0.0, 1.0) - size = 3 + def test_normal_float32(self): + self.check_normal(numpy.float32) + + def test_normal_float64(self): + self.check_normal(numpy.float64) @testing.gpu -class TestRandomState7(TestRandomState): +@testing.parameterize(*[ + {'size': None}, + {'size': 10}, + {'size': (1, 2, 3)}, + {'size': 3}, + {'size': ()}, +]) +@testing.fix_random() +class TestRandomSample(unittest.TestCase): - args = (0.0, 1.0) - size = (3, 3) + def setUp(self): + self.rs = generator.RandomState(seed=testing.generate_seed()) + def check_random_sample(self, dtype): + vals = [self.rs.random_sample(self.size, dtype) for _ 
in range(10)] -@testing.gpu -class TestRandomState8(TestRandomState): + shape = core.get_size(self.size) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == dtype + assert val.shape == shape + assert (0 <= val).all() + assert (val < 1).all() + # TODO(niboshi): Distribution test - args = (0.0, 1.0) - size = () + def test_random_sample_float32(self): + self.check_random_sample(numpy.float32) + + def test_random_sample_float64(self): + self.check_random_sample(numpy.float64) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu class TestRandAndRandN(unittest.TestCase): def setUp(self): self.rs = generator.RandomState(seed=testing.generate_seed()) - def test_rand(self): - self.rs.random_sample = mock.Mock() - self.rs.rand(1, 2, 3, dtype=numpy.float32) - self.rs.random_sample.assert_called_once_with( - size=(1, 2, 3), dtype=numpy.float32) - def test_rand_invalid_argument(self): with self.assertRaises(TypeError): self.rs.rand(1, 2, 3, unnecessary='unnecessary_argument') - def test_randn(self): - self.rs.normal = mock.Mock() - self.rs.randn(1, 2, 3, dtype=numpy.float32) - self.rs.normal.assert_called_once_with( - size=(1, 2, 3), dtype=numpy.float32) - def test_randn_invalid_argument(self): with self.assertRaises(TypeError): self.rs.randn(1, 2, 3, unnecessary='unnecessary_argument') @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu -class TestInterval(unittest.TestCase): +class TestInterval(RandomGeneratorTestCase): - def setUp(self): - self.rs = clpy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'interval' def test_zero(self): - numpy.testing.assert_array_equal( - self.rs.interval(0, (2, 3)).get(), numpy.zeros((2, 3))) + shape = (2, 3) + vals = self.generate_many(0, shape, _count=10) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == shape + assert (val == 0).all() def test_shape_zero(self): - v = self.rs.interval(10, None) - self.assertEqual(v.dtype, numpy.int32) - self.assertEqual(v.shape, ()) + mx = 10 + vals = self.generate_many(mx, None, _count=10) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == () + assert (0 <= val).all() + assert (val <= mx).all() + # TODO(niboshi): Distribution test def test_shape_one_dim(self): - v = self.rs.interval(10, 10) - self.assertEqual(v.dtype, numpy.int32) - self.assertEqual(v.shape, (10,)) + mx = 10 + size = 20 + vals = self.generate_many(mx, size, _count=10) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == (size,) + assert (0 <= val).all() + assert (val <= mx).all() + # TODO(niboshi): Distribution test def test_shape_multi_dim(self): - v = self.rs.interval(10, (1, 2)) - self.assertEqual(v.dtype, numpy.int32) - self.assertEqual(v.shape, (1, 2)) + mx = 10 + shape = (1, 2) + vals = self.generate_many(mx, shape, _count=10) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == shape + assert (0 <= val).all() + assert (val <= mx).all() + # TODO(niboshi): Distribution test def test_int32_range(self): - v = self.rs.interval(0x00000000, 2) - self.assertEqual(v.dtype, numpy.int32) + v = self.generate(0x00000000, 2) + assert v.dtype == numpy.int32 - v = self.rs.interval(0x7fffffff, 2) - self.assertEqual(v.dtype, numpy.int32) + v = self.generate(0x7fffffff, 2) + assert v.dtype == numpy.int32 def 
test_uint32_range(self): - v = self.rs.interval(0x80000000, 2) - self.assertEqual(v.dtype, numpy.uint32) + v = self.generate(0x80000000, 2) + assert v.dtype == numpy.uint32 - v = self.rs.interval(0xffffffff, 2) - self.assertEqual(v.dtype, numpy.uint32) + v = self.generate(0xffffffff, 2) + assert v.dtype == numpy.uint32 - @condition.repeat(3, 10) def test_bound_1(self): - vals = [self.rs.interval(10, (2, 3)).get() for _ in range(10)] + vals = self.generate_many(10, (2, 3), _count=10) for val in vals: - self.assertEqual(val.shape, (2, 3)) - self.assertEqual(min(_.min() for _ in vals), 0) - self.assertEqual(max(_.max() for _ in vals), 10) + assert isinstance(val, clpy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == (2, 3) + assert (0 <= val).all() + assert (val <= 10).all() - @condition.repeat(3, 10) def test_bound_2(self): - vals = [self.rs.interval(2, None).get() for _ in range(10)] - self.assertEqual(min(vals), 0) - self.assertEqual(max(vals), 2) + vals = self.generate_many(2, None, _count=20) + for val in vals: + assert isinstance(val, clpy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == () + assert (0 <= val).all() + assert (val <= 2).all() @condition.repeat(3, 10) def test_goodness_of_fit(self): mx = 5 trial = 100 - vals = [self.rs.interval(mx, None).get() - for _ in six.moves.xrange(trial)] + vals = self.generate_many(mx, None, _count=trial) + vals = [val.get() for val in vals] counts = numpy.histogram(vals, bins=numpy.arange(mx + 2))[0] expected = numpy.array([float(trial) / (mx + 1)] * (mx + 1)) self.assertTrue(hypothesis.chi_square_test(counts, expected)) @@ -277,7 +315,7 @@ def test_goodness_of_fit(self): @condition.repeat(3) def test_goodness_of_fit_2(self): mx = 5 - vals = self.rs.interval(mx, (5, 5)).get() + vals = self.generate(mx, (5, 5)).get() counts = numpy.histogram(vals, bins=numpy.arange(mx + 2))[0] expected = numpy.array([float(vals.size) / (mx + 1)] * (mx + 1)) self.assertTrue(hypothesis.chi_square_test(counts, expected)) @@ -292,16 +330,13 @@ def test_goodness_of_fit_2(self): {'a': numpy.array([0.0, 1.0, 2.0]), 'size': 2, 'p': [0.3, 0.3, 0.4]}, ) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu -class TestChoice1(unittest.TestCase): +class TestChoice1(RandomGeneratorTestCase): - def setUp(self): - self.rs = clpy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'choice' def test_dtype_shape(self): - v = self.rs.choice(a=self.a, size=self.size, p=self.p) + v = self.generate(a=self.a, size=self.size, p=self.p) if isinstance(self.size, six.integer_types): expected_shape = (self.size,) else: @@ -315,29 +350,27 @@ def test_dtype_shape(self): @condition.repeat(3, 10) def test_bound(self): - vals = [self.rs.choice(a=self.a, size=self.size, p=self.p).get() - for _ in range(20)] + vals = self.generate_many( + a=self.a, size=self.size, p=self.p, _count=20) + vals = [val.get() for val in vals] size_ = self.size if isinstance(self.size, tuple) else (self.size,) for val in vals: self.assertEqual(val.shape, size_) - self.assertEqual(min(_.min() for _ in vals), 0) - self.assertEqual(max(_.max() for _ in vals), 2) + self.assertEqual(min(val.min() for val in vals), 0) + self.assertEqual(max(val.max() for val in vals), 2) @testing.parameterize( {'a': [0, 1, 2], 'size': 2, 'p': [0.3, 0.3, 0.4]}, ) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu -class TestChoice2(unittest.TestCase): +class TestChoice2(RandomGeneratorTestCase): - def setUp(self): - self.rs = 
clpy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'choice' def test_dtype_shape(self): - v = self.rs.choice(a=self.a, size=self.size, p=self.p) + v = self.generate(a=self.a, size=self.size, p=self.p) if isinstance(self.size, six.integer_types): expected_shape = (self.size,) else: @@ -351,36 +384,34 @@ def test_dtype_shape(self): @condition.repeat(3, 10) def test_bound(self): - vals = [self.rs.choice(a=self.a, size=self.size, p=self.p).get() - for _ in range(20)] + vals = self.generate_many( + a=self.a, size=self.size, p=self.p, _count=20) + vals = [val.get() for val in vals] size_ = self.size if isinstance(self.size, tuple) else (self.size,) for val in vals: self.assertEqual(val.shape, size_) - self.assertEqual(min(_.min() for _ in vals), 0) - self.assertEqual(max(_.max() for _ in vals), 2) + self.assertEqual(min(val.min() for val in vals), 0) + self.assertEqual(max(val.max() for val in vals), 2) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu -class TestChoiceChi(unittest.TestCase): +class TestChoiceChi(RandomGeneratorTestCase): - def setUp(self): - self.rs = clpy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'choice' @condition.repeat(3, 10) def test_goodness_of_fit(self): trial = 100 - vals = [self.rs.choice(3, 1, True, [0.3, 0.3, 0.4]).get() - for _ in six.moves.xrange(trial)] + vals = self.generate_many(3, 1, True, [0.3, 0.3, 0.4], _count=trial) + vals = [val.get() for val in vals] counts = numpy.histogram(vals, bins=numpy.arange(4))[0] expected = numpy.array([30, 30, 40]) self.assertTrue(hypothesis.chi_square_test(counts, expected)) @condition.repeat(3, 10) def test_goodness_of_fit_2(self): - vals = self.rs.choice(3, (5, 20), True, [0.3, 0.3, 0.4]).get() + vals = self.generate(3, (5, 20), True, [0.3, 0.3, 0.4]).get() counts = numpy.histogram(vals, bins=numpy.arange(4))[0] expected = numpy.array([30, 30, 40]) self.assertTrue(hypothesis.chi_square_test(counts, expected)) @@ -413,7 +444,6 @@ def test_choice_multinomial(self, xp, dtype): {'a': 3, 'size': 1, 'p': [0.1, 0.1, 0.7]}, ) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu class TestChoiceFailure(unittest.TestCase): @@ -432,16 +462,13 @@ def test_choice_invalid_value(self): {'a': numpy.array([0.0, 2.0, 4.0]), 'size': 2}, ) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu -class TestChoiceReplaceFalse(unittest.TestCase): +class TestChoiceReplaceFalse(RandomGeneratorTestCase): - def setUp(self): - self.rs = clpy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'choice' def test_dtype_shape(self): - v = self.rs.choice(a=self.a, size=self.size, replace=False) + v = self.generate(a=self.a, size=self.size, replace=False) if isinstance(self.size, six.integer_types): expected_shape = (self.size,) else: @@ -455,7 +482,7 @@ def test_dtype_shape(self): @condition.repeat(3, 10) def test_bound(self): - val = self.rs.choice(a=self.a, size=self.size, replace=False).get() + val = self.generate(a=self.a, size=self.size, replace=False).get() size = self.size if isinstance(self.size, tuple) else (self.size,) self.assertEqual(val.shape, size) self.assertTrue((0 <= val).all()) @@ -463,12 +490,53 @@ def test_bound(self): val = numpy.asarray(val) self.assertEqual(numpy.unique(val).size, val.size) - def test_reproduce(self): - rs1 = clpy.random.RandomState(1) - v1 = rs1.choice(a=self.a, size=self.size, replace=False) - rs2 = clpy.random.RandomState(1) - v2 = 
rs2.choice(a=self.a, size=self.size, replace=False) - self.assertTrue((v1 == v2).all()) + +@testing.gpu +@testing.fix_random() +class TestGumbel(RandomGeneratorTestCase): + # TODO(niboshi): + # Test soundness of distribution. + # Currently only reproducibility is checked. + + target_method = 'gumbel' + + def test_gumbel_1(self): + self.generate() + + def test_gumbel_2(self): + self.generate(0.0, 1.0, size=(3, 2)) + + +@testing.gpu +@testing.fix_random() +class TestRandint(RandomGeneratorTestCase): + # TODO(niboshi): + # Test soundness of distribution. + # Currently only reproducibility is checked. + + target_method = 'randint' + + def test_randint_1(self): + self.generate(3) + + def test_randint_2(self): + self.generate(3, 4, size=(3, 2)) + + +@testing.gpu +@testing.fix_random() +class TestUniform(RandomGeneratorTestCase): + # TODO(niboshi): + # Test soundness of distribution. + # Currently only reproducibility is checked. + + target_method = 'uniform' + + def test_uniform_1(self): + self.generate() + + def test_uniform_2(self): + self.generate(-4.2, 2.4, size=(3, 2)) @testing.parameterize( @@ -476,7 +544,6 @@ def test_reproduce(self): {'a': [1, 2, 3], 'size': 5}, ) @testing.fix_random() -@testing.with_requires('numpy>=1.11.0') @testing.gpu class TestChoiceReplaceFalseFailure(unittest.TestCase): @@ -575,7 +642,7 @@ def test_get_random_state_no_clpy_no_chainer_seed(self): self._check_different(rvs0, rvs1) - def test_get_random_state_no_cupy_with_chainer_seed(self): + def test_get_random_state_no_clpy_with_chainer_seed(self): rvs0 = self._get_rvs(generator.RandomState(5)) os.environ.pop('CLPY_SEED', None) @@ -584,7 +651,7 @@ def test_get_random_state_no_cupy_with_chainer_seed(self): self._check_same(rvs0, rvs1) - def test_get_random_state_with_cupy_no_chainer_seed(self): + def test_get_random_state_with_clpy_no_chainer_seed(self): rvs0 = self._get_rvs(generator.RandomState(6)) os.environ['CLPY_SEED'] = '6' @@ -593,7 +660,7 @@ def test_get_random_state_with_cupy_no_chainer_seed(self): self._check_same(rvs0, rvs1) - def test_get_random_state_with_cupy_with_chainer_seed(self): + def test_get_random_state_with_clpy_with_chainer_seed(self): rvs0 = self._get_rvs(generator.RandomState(7)) os.environ['CLPY_SEED'] = '7' diff --git a/tests/clpy_tests/sorting_tests/test_search.py b/tests/clpy_tests/sorting_tests/test_search.py index d8bedec4a95..cd258c5f530 100644 --- a/tests/clpy_tests/sorting_tests/test_search.py +++ b/tests/clpy_tests/sorting_tests/test_search.py @@ -58,6 +58,12 @@ def test_argmax_axis2(self, xp, dtype): a = testing.shaped_random((2, 3, 4), xp, dtype) return a.argmax(axis=2) + @testing.for_all_dtypes(no_complex=True) + @testing.numpy_clpy_allclose() + def test_argmax_tie(self, xp, dtype): + a = xp.array([0, 5, 2, 3, 4, 5], dtype) + return a.argmax() + @testing.for_all_dtypes(no_complex=True) @testing.numpy_clpy_raises(accept_error=ValueError) def test_argmax_zero_size(self, xp, dtype): @@ -124,6 +130,12 @@ def test_argmin_axis2(self, xp, dtype): a = testing.shaped_random((2, 3, 4), xp, dtype) return a.argmin(axis=2) + @testing.for_all_dtypes(no_complex=True) + @testing.numpy_clpy_allclose() + def test_argmin_tie(self, xp, dtype): + a = xp.array([0, 1, 2, 3, 0, 5], dtype) + return a.argmin() + @testing.for_all_dtypes(no_complex=True) @testing.numpy_clpy_raises(accept_error=ValueError) def test_argmin_zero_size(self, xp, dtype): diff --git a/tests/clpy_tests/test_cudnn.py b/tests/clpy_tests/test_cudnn.py new file mode 100644 index 00000000000..4fbc22793fa --- /dev/null +++ 
b/tests/clpy_tests/test_cudnn.py @@ -0,0 +1,37 @@ +import unittest + +import numpy + +import cupy +try: + import cupy.cuda.cudnn as libcudnn + cudnn_enabled = True + modes = [ + libcudnn.CUDNN_ACTIVATION_SIGMOID, + libcudnn.CUDNN_ACTIVATION_RELU, + libcudnn.CUDNN_ACTIVATION_TANH, + ] + import cupy.cudnn +except ImportError: + cudnn_enabled = False + modes = [] +from cupy import testing + + +@testing.parameterize(*testing.product({ + 'dtype': [numpy.float32, numpy.float64], + 'mode': modes, +})) +@unittest.skipUnless(cudnn_enabled, 'cuDNN is not available') +class TestCudnnActivation(unittest.TestCase): + + def setUp(self): + self.x = testing.shaped_arange((3, 4), cupy, self.dtype) + self.y = testing.shaped_arange((3, 4), cupy, self.dtype) + self.g = testing.shaped_arange((3, 4), cupy, self.dtype) + + def test_activation_forward(self): + cupy.cudnn.activation_forward(self.x, self.mode) + + def test_activation_backward(self): + cupy.cudnn.activation_backward(self.x, self.y, self.g, self.mode) diff --git a/tests/clpy_tests/testing_tests/test_helper.py b/tests/clpy_tests/testing_tests/test_helper.py index 1f7e0b6ff66..02c3908a259 100644 --- a/tests/clpy_tests/testing_tests/test_helper.py +++ b/tests/clpy_tests/testing_tests/test_helper.py @@ -32,56 +32,122 @@ def test_unsigned_only(self): class TestCheckCupyNumpyError(unittest.TestCase): + tbs = { + clpy: 'xxxx', + numpy: 'yyyy' + } + def test_both_success(self): + @testing.helper.numpy_clpy_raises() + def dummy_both_success(self, xp): + pass + with self.assertRaises(AssertionError): - helper._check_clpy_numpy_error(self, None, None, None, None) + dummy_both_success(self) def test_clpy_error(self): - clpy_error = Exception() - clpy_tb = 'xxxx' - with six.assertRaisesRegex(self, AssertionError, clpy_tb): - helper._check_clpy_numpy_error(self, clpy_error, clpy_tb, - None, None) + @testing.helper.numpy_clpy_raises() + def dummy_clpy_error(self, xp): + if xp is clpy: + raise Exception(self.tbs.get(clpy)) + + with six.assertRaisesRegex(self, AssertionError, self.tbs.get(clpy)): + dummy_clpy_error(self) def test_numpy_error(self): - numpy_error = Exception() - numpy_tb = 'yyyy' - with six.assertRaisesRegex(self, AssertionError, numpy_tb): - helper._check_clpy_numpy_error(self, None, None, - numpy_error, numpy_tb) + @testing.helper.numpy_clpy_raises() + def dummy_numpy_error(self, xp): + if xp is numpy: + raise Exception(self.tbs.get(numpy)) + + with six.assertRaisesRegex(self, AssertionError, self.tbs.get(numpy)): + dummy_numpy_error(self) def test_clpy_numpy_different_error(self): - clpy_error = TypeError() - clpy_tb = 'xxxx' - numpy_error = ValueError() - numpy_tb = 'yyyy' + @testing.helper.numpy_clpy_raises() + def dummy_clpy_numpy_different_error(self, xp): + if xp is clpy: + raise TypeError(self.tbs.get(clpy)) + elif xp is numpy: + raise ValueError(self.tbs.get(numpy)) + # Use re.S mode to ignore new line characters - pattern = re.compile(clpy_tb + '.*' + numpy_tb, re.S) + pattern = re.compile( + self.tbs.get(clpy) + '.*' + self.tbs.get(numpy), re.S) with six.assertRaisesRegex(self, AssertionError, pattern): - helper._check_clpy_numpy_error(self, clpy_error, clpy_tb, - numpy_error, numpy_tb) + dummy_clpy_numpy_different_error(self) + + def test_clpy_derived_error(self): + @testing.helper.numpy_clpy_raises() + def dummy_clpy_derived_error(self, xp): + if xp is clpy: + raise ValueError(self.tbs.get(clpy)) + elif xp is numpy: + raise Exception(self.tbs.get(numpy)) + + dummy_clpy_derived_error(self) # Assert no exceptions + + def 
test_numpy_derived_error(self): + @testing.helper.numpy_clpy_raises() + def dummy_numpy_derived_error(self, xp): + if xp is clpy: + raise Exception(self.tbs.get(clpy)) + elif xp is numpy: + raise IndexError(self.tbs.get(numpy)) + + # NumPy errors may not derive from CuPy errors, i.e. CuPy errors should + # be at least as explicit as the NumPy error + pattern = re.compile( + self.tbs.get(clpy) + '.*' + self.tbs.get(numpy), re.S) + with six.assertRaisesRegex(self, AssertionError, pattern): + dummy_numpy_derived_error(self) def test_same_error(self): - clpy_error = Exception() - clpy_tb = 'xxxx' - numpy_error = Exception() - numpy_tb = 'yyyy' - # Nothing happens - helper._check_clpy_numpy_error(self, clpy_error, clpy_tb, - numpy_error, numpy_tb, - accept_error=Exception) + @testing.helper.numpy_clpy_raises(accept_error=Exception) + def dummy_same_error(self, xp): + raise Exception(self.tbs.get(xp)) + + dummy_same_error(self) + + def test_clpy_derived_unaccept_error(self): + @testing.helper.numpy_clpy_raises(accept_error=ValueError) + def dummy_clpy_derived_unaccept_error(self, xp): + if xp is clpy: + raise IndexError(self.tbs.get(clpy)) + elif xp is numpy: + raise Exception(self.tbs.get(numpy)) + + # Neither `IndexError` nor `Exception` is derived from `ValueError`, + # therefore expect an error + pattern = re.compile( + self.tbs.get(clpy) + '.*' + self.tbs.get(numpy), re.S) + with six.assertRaisesRegex(self, AssertionError, pattern): + dummy_clpy_derived_unaccept_error(self) + + def test_numpy_derived_unaccept_error(self): + @testing.helper.numpy_clpy_raises(accept_error=ValueError) + def dummy_numpy_derived_unaccept_error(self, xp): + if xp is clpy: + raise Exception(self.tbs.get(clpy)) + elif xp is numpy: + raise ValueError(self.tbs.get(numpy)) + + # `Exception` is not derived from `ValueError`, therefore expect an + # error + pattern = re.compile( + self.tbs.get(clpy) + '.*' + self.tbs.get(numpy), re.S) + with six.assertRaisesRegex(self, AssertionError, pattern): + dummy_numpy_derived_unaccept_error(self) def test_forbidden_error(self): - clpy_error = Exception() - clpy_tb = 'xxxx' - numpy_error = Exception() - numpy_tb = 'yyyy' - # Use re.S mode to ignore new line characters - pattern = re.compile(clpy_tb + '.*' + numpy_tb, re.S) + @testing.helper.numpy_clpy_raises(accept_error=False) + def dummy_forbidden_error(self, xp): + raise Exception(self.tbs.get(xp)) + + pattern = re.compile( + self.tbs.get(clpy) + '.*' + self.tbs.get(numpy), re.S) with six.assertRaisesRegex(self, AssertionError, pattern): - helper._check_clpy_numpy_error( - self, clpy_error, clpy_tb, - numpy_error, numpy_tb, accept_error=False) + dummy_forbidden_error(self) class NumPyCuPyDecoratorBase(object):