From c95dbecb8e261ab60ee0d758ee4260a89fddb662 Mon Sep 17 00:00:00 2001 From: niboshi Date: Tue, 21 Nov 2017 15:14:37 +0900 Subject: [PATCH 01/21] Merge pull request #767 from kmaehashi/fix-dependency-docs improve dependency description of docs --- docs/source/install.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index 850069722cc..69d5085e19e 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -23,9 +23,8 @@ The following versions of Python can be used: 2.7.6+, 3.4.3+, 3.5.1+, and 3.6.0+ .. note:: We are testing CuPy automatically with Jenkins, where all the above *recommended* environments are tested. - We cannot guarantee that CuPy works on other environments including Windows and macOS (especially with CUDA support), even if CuPy looks running correctly. + We cannot guarantee that CuPy works on other environments, including Windows and macOS, even if CuPy appears to be running correctly. -CuPy is supported on Python 2.7.6+, 3.4.3+, 3.5.1+, 3.6.0+. CuPy uses a C++ compiler such as g++. You need to install it before installing CuPy. This is the typical installation method for each platform:: @@ -54,17 +53,17 @@ The latest version of each package will automatically be installed if missing. * `NumPy `_ 1.9, 1.10, 1.11, 1.12, 1.13 * `Six `_ 1.9+ -CUDA support +In addition, you need to install `CUDA `_. +The following versions of CUDA can be used: 7.0, 7.5, 8.0, and 9.0. -* `CUDA `_ 7.0, 7.5, 8.0, 9.0 +Optional Libraries +~~~~~~~~~~~~~~~~~~ -cuDNN support +The following libraries are optional dependencies. +CuPy will enable the corresponding features only if they are installed. * `cuDNN `_ v4, v5, v5.1, v6, v7 - -NCCL support - -* `nccl `_ v1.3+ +* `NCCL `_ v1.3+ Install CuPy ------------ From c7581572766ee2cf0b01f7caac18b03947296ef5 Mon Sep 17 00:00:00 2001 From: niboshi Date: Tue, 21 Nov 2017 15:17:20 +0900 Subject: [PATCH 02/21] Merge pull request #769 from kmaehashi/fix-typo-cache fix typo in environment variable reference --- docs/source/reference/environment.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/environment.rst b/docs/source/reference/environment.rst index d2c804eddcb..7907b090e80 100644 --- a/docs/source/reference/environment.rst +++ b/docs/source/reference/environment.rst @@ -6,7 +6,7 @@ Here are the environment variables CuPy uses. +------------------------------------+----------------------------------------------------+ | ``CUPY_CACHE_DIR`` | Path to the directory to store kernel cache. | -| | ``$(HOME)/.cupy.kernel_cache`` is used by default. | +| | ``${HOME}/.cupy/kernel_cache`` is used by default. | | | See :ref:`overview` for details.
| +------------------------------------+----------------------------------------------------+ | ``CUPY_CACHE_SAVE_CUDA_SOURCE`` | If set to 1, CUDA source file will be saved along | From 4f9c59efe2bd234a58d3ad6aaaf52fe490354b66 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 10 Nov 2017 13:26:03 +0900 Subject: [PATCH 03/21] Merge pull request #512 from okuta/improve-cudnn Remove cuDNN overhead --- cupy/core/core.pxd | 3 + cupy/cuda/device.pxd | 4 +- cupy/cuda/device.pyx | 21 +- cupy/cudnn.py | 338 ---------------------------- cupy/cudnn.pyx | 389 +++++++++++++++++++++++++++++++++ cupy_setup_build.py | 1 + tests/cupy_tests/test_cudnn.py | 25 +-- 7 files changed, 413 insertions(+), 368 deletions(-) delete mode 100644 cupy/cudnn.py create mode 100644 cupy/cudnn.pyx diff --git a/cupy/core/core.pxd b/cupy/core/core.pxd index aff8f0cc000..82c257cb6fc 100644 --- a/cupy/core/core.pxd +++ b/cupy/core/core.pxd @@ -72,3 +72,6 @@ cdef class Indexer: readonly tuple shape cdef CPointer get_pointer(self) + + +cpdef ndarray ascontiguousarray(ndarray a, dtype=*) diff --git a/cupy/cuda/device.pxd b/cupy/cuda/device.pxd index a48a148755d..d32c8630549 100644 --- a/cupy/cuda/device.pxd +++ b/cupy/cuda/device.pxd @@ -1,5 +1,7 @@ cpdef int get_device_id() except * -cpdef get_cublas_handle() +cpdef size_t get_cublas_handle() except * +cpdef size_t get_cusolver_handle() except * +cpdef size_t get_cusparse_handle() except * cdef class Device: cdef: diff --git a/cupy/cuda/device.pyx b/cupy/cuda/device.pyx index f414265b7c3..1369e049a88 100644 --- a/cupy/cuda/device.pyx +++ b/cupy/cuda/device.pyx @@ -24,24 +24,27 @@ cdef dict _cusolver_handles = {} cdef dict _cusparse_handles = {} -cpdef get_cublas_handle(): +cpdef size_t get_cublas_handle() except *: dev_id = get_device_id() - if dev_id in _cublas_handles: - return _cublas_handles[dev_id] + ret = _cublas_handles.get(dev_id, None) + if ret is not None: + return ret return Device().cublas_handle -cpdef get_cusolver_handle(): +cpdef size_t get_cusolver_handle() except *: dev_id = get_device_id() - if dev_id in _cusolver_handles: - return _cusolver_handles[dev_id] + ret = _cusolver_handles.get(dev_id, None) + if ret is not None: + return ret return Device().cusolver_handle -cpdef get_cusparse_handle(): +cpdef size_t get_cusparse_handle() except *: dev_id = get_device_id() - if dev_id in _cusparse_handles: - return _cusparse_handles[dev_id] + ret = _cusparse_handles.get(dev_id, None) + if ret is not None: + return ret return Device().cusparse_handle diff --git a/cupy/cudnn.py b/cupy/cudnn.py deleted file mode 100644 index 60a1580a226..00000000000 --- a/cupy/cudnn.py +++ /dev/null @@ -1,338 +0,0 @@ -import atexit -import threading - -import numpy -import six - -import cupy -from cupy.core import internal -from cupy import cuda -from cupy.cuda import cudnn - - -_cudnn_version = cudnn.getVersion() -_thread_local = threading.local() - -_handles = {} - - -def get_handle(): - dev = cuda.get_device_id() - if dev in _handles: - return _handles[dev] - handle = cudnn.create() - _handles[dev] = handle - return handle - - -@atexit.register -def reset_handles(): - global _handles - handles = _handles - _handles = {} - - for handle in six.itervalues(handles): - cudnn.destroy(handle) - - -def _get_nd_tensor_cache(): - if not hasattr(_thread_local, 'cudnn_nd_tensor_cache'): - _thread_local.cudnn_nd_tensor_cache = {} - return _thread_local.cudnn_nd_tensor_cache - - -class Descriptor(object): - - def __init__(self, descriptor, destroyer): - self.value = descriptor - self.destroy = 
destroyer - - def __del__(self): - if self.value: - self.destroy(self.value) - self.value = None - - -def get_data_type(dtype): - if dtype.type == numpy.float32: - return cudnn.CUDNN_DATA_FLOAT - elif dtype.type == numpy.float64: - return cudnn.CUDNN_DATA_DOUBLE - elif dtype.type == numpy.float16: - return cudnn.CUDNN_DATA_HALF - else: - raise TypeError('Dtype {} is not supported in cuDNN'.format(dtype)) - - -def _to_ctypes_array(tup, dtype=numpy.intc): - return numpy.array(tup, dtype=dtype).ctypes - - -def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): - desc = Descriptor(cudnn.createTensorDescriptor(), - cudnn.destroyTensorDescriptor) - if not arr.flags.c_contiguous: - raise ValueError('cupy.cudnn supports c-contiguous arrays only') - data_type = get_data_type(arr.dtype) - if arr.ndim == 4: - cudnn.setTensor4dDescriptor(desc.value, format, data_type, *arr.shape) - else: - strides = [s // arr.itemsize for s in arr.strides] - c_shape = _to_ctypes_array(arr.shape) - c_strides = _to_ctypes_array(strides) - cudnn.setTensorNdDescriptor(desc.value, data_type, arr.ndim, - c_shape.data, c_strides.data) - return desc - - -def create_uninitialized_tensor_descriptor(): - """Create uninitialized tensor descriptor. - - Create a cudnnCreateTensorDescriptor_t that is not yet initialized. - This is used by the batch normalization functions. - """ - desc = Descriptor(cudnn.createTensorDescriptor(), - cudnn.destroyTensorDescriptor) - return desc - - -def create_tensor_nd_descriptor(arr): - desc = Descriptor(cudnn.createTensorDescriptor(), - cudnn.destroyTensorDescriptor) - if not arr.flags.c_contiguous: - raise ValueError('cupy.cudnn supports c-contiguous arrays only') - data_type = get_data_type(arr.dtype) - shape = arr.shape - key = (data_type, shape) - cache = _get_nd_tensor_cache() - if key in cache: - return cache[key] - - # numpy's stride is defined in bytes, but cudnn's stride is defined in - # size of element - strides = [s // arr.itemsize for s in arr.strides] - - c_shape = _to_ctypes_array(shape) - c_strides = _to_ctypes_array(strides) - cudnn.setTensorNdDescriptor(desc.value, data_type, - arr.ndim, c_shape.data, c_strides.data) - cache = _get_nd_tensor_cache() - cache[key] = desc - return desc - - -def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): - desc = Descriptor(cudnn.createFilterDescriptor(), - cudnn.destroyFilterDescriptor) - data_type = get_data_type(arr.dtype) - if arr.ndim == 4: - cudnn.setFilter4dDescriptor_v4(desc.value, data_type, format, - *arr.shape) - else: - c_shape = _to_ctypes_array(arr.shape) - cudnn.setFilterNdDescriptor_v4(desc.value, data_type, format, - arr.ndim, c_shape.data) - return desc - - -def create_convolution_descriptor(pad, stride, dtype, - mode=cudnn.CUDNN_CROSS_CORRELATION, - dilation=(1, 1), - use_tensor_core=False, - group=1): - desc = Descriptor(cudnn.createConvolutionDescriptor(), - cudnn.destroyConvolutionDescriptor) - ndim = len(pad) - if ndim != len(stride): - raise ValueError('pad and stride must be of same length') - - if ndim == 2: - if _cudnn_version < 6000: - if dilation[0] != 1 or dilation[1] != 1: - raise ValueError('dilation must be one when cudnn < 6.0') - if _cudnn_version >= 5000: - compute_type = get_data_type(dtype) - # TODO(takagi) Temporarily use computing precision of FP32 for - # storing precision of FP16. 
- if dtype == numpy.float16: - compute_type = cudnn.CUDNN_DATA_FLOAT - cudnn.setConvolution2dDescriptor_v5( - desc.value, pad[0], pad[1], stride[0], stride[1], - dilation[0], dilation[1], mode, compute_type) - - if _cudnn_version >= 7000: - if use_tensor_core: - math_type = cudnn.CUDNN_TENSOR_OP_MATH - cudnn.setConvolutionMathType(desc.value, math_type) - if group > 1: - cudnn.setConvolutionGroupCount(desc.value, group) - else: - cudnn.setConvolution2dDescriptor_v4( - desc.value, pad[0], pad[1], stride[0], stride[1], 1, 1, mode) - else: - c_pad = _to_ctypes_array(pad) - c_stride = _to_ctypes_array(stride) - c_dilation = _to_ctypes_array((1,) * ndim) - compute_type = get_data_type(dtype) - # TODO(takagi) Temporarily use computing precision of FP32 for - # storing precision of FP16. - if dtype == numpy.float16: - compute_type = cudnn.CUDNN_DATA_FLOAT - cudnn.setConvolutionNdDescriptor_v3( - desc.value, ndim, c_pad.data, c_stride.data, c_dilation.data, - mode, compute_type) - - return desc - - -def create_pooling_descriptor(ksize, stride, pad, mode): - desc = Descriptor(cudnn.createPoolingDescriptor(), - cudnn.destroyPoolingDescriptor) - ndim = len(ksize) - if ndim != len(stride) or ndim != len(pad): - raise ValueError('ksize, stride, and pad must be of same length') - - if ndim == 2: - cudnn.setPooling2dDescriptor_v4( - desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ksize[0], - ksize[1], pad[0], pad[1], stride[0], stride[1]) - else: - c_ksize = _to_ctypes_array(ksize) - c_pad = _to_ctypes_array(pad) - c_stride = _to_ctypes_array(stride) - cudnn.setPoolingNdDescriptor_v4( - desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ndim, - c_ksize.data, c_pad.data, c_stride.data) - - return desc - - -def _as4darray(arr): - if arr.ndim == 0: - return arr.reshape(1, 1, 1, 1) - else: - return arr.reshape(arr.shape[0], -1, 1, 1) - - -def activation_forward(x, mode): - x = cupy.ascontiguousarray(x) - y = cupy.empty_like(x) - - dtype = 'd' if x.dtype == 'd' else 'f' - one = numpy.array(1, dtype=dtype).ctypes - zero = numpy.array(0, dtype=dtype).ctypes - handle = get_handle() - x_mat = _as4darray(x) - desc = create_tensor_descriptor(x_mat) - act_desc = Descriptor(cudnn.createActivationDescriptor(), - cudnn.destroyActivationDescriptor) - cudnn.setActivationDescriptor( - act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) - cudnn.activationForward_v4( - handle, act_desc.value, one.data, desc.value, x_mat.data.ptr, - zero.data, desc.value, y.data.ptr) - return y - - -def activation_backward(x, y, gy, mode): - x = cupy.ascontiguousarray(x) - gy = cupy.ascontiguousarray(gy) - - gx = cupy.empty_like(x) - dtype = 'd' if x.dtype == 'd' else 'f' - one = numpy.array(1, dtype=dtype).ctypes - zero = numpy.array(0, dtype=dtype).ctypes - handle = get_handle() - y_mat = _as4darray(y) - desc = create_tensor_descriptor(y_mat) - act_desc = Descriptor(cudnn.createActivationDescriptor(), - cudnn.destroyActivationDescriptor) - cudnn.setActivationDescriptor( - act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) - cudnn.activationBackward_v4( - handle, act_desc.value, one.data, desc.value, y.data.ptr, - desc.value, gy.data.ptr, desc.value, x.data.ptr, - zero.data, desc.value, gx.data.ptr) - return gx - - -def create_dropout_descriptor( - handle, dropout, states, state_size_in_bytes, seed): - desc = Descriptor(cudnn.createDropoutDescriptor(), - cudnn.destroyDropoutDescriptor) - cudnn.setDropoutDescriptor(desc.value, handle, dropout, - states, state_size_in_bytes, seed) - return desc - - -def 
set_dropout_descriptor(desc, handle, dropout): - # When the fourth argument is NULL, random state is not updated. - cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) - - -def create_rnn_descriptor(hidden_size, num_layers, dropout_desc, - input_mode, direction, mode, data_type): - desc = Descriptor(cudnn.createRNNDescriptor(), - cudnn.destroyRNNDescriptor) - if _cudnn_version >= 7000: - _handle = get_handle() - _algo = cudnn.CUDNN_RNN_ALGO_STANDARD - cudnn.setRNNDescriptor_v6( - _handle, desc.value, hidden_size, num_layers, dropout_desc.value, - input_mode, direction, mode, _algo, data_type) - else: - cudnn.setRNNDescriptor_v5( - desc.value, hidden_size, num_layers, dropout_desc.value, - input_mode, direction, mode, data_type) - return desc - - -def get_rnn_lin_layer_matrix_params( - handle, rnn_desc, layer, x_desc, w_desc, w, lin_layer_id): - mat_desc = Descriptor(cudnn.createFilterDescriptor(), - cudnn.destroyFilterDescriptor) - ptr = numpy.array(0, dtype=numpy.intp) - cudnn.getRNNLinLayerMatrixParams( - handle, rnn_desc.value, layer, x_desc.value, w_desc.value, w.data.ptr, - lin_layer_id, mat_desc.value, ptr.ctypes.data) - offset = (ptr - w.data.ptr) // 4 - _, _, _, dim = cudnn.getFilterNdDescriptor(mat_desc.value, 3) - size = internal.prod(dim) - mat = w[offset: offset + size] - return mat - - -def get_rnn_lin_layer_bias_params( - handle, rnn_desc, layer, x_desc, w_desc, w, lin_layer_id): - bias_desc = Descriptor(cudnn.createFilterDescriptor(), - cudnn.destroyFilterDescriptor) - ptr = numpy.array(0, dtype=numpy.intp) - cudnn.getRNNLinLayerBiasParams( - handle, rnn_desc.value, layer, x_desc.value, w_desc.value, w.data.ptr, - lin_layer_id, bias_desc.value, ptr.ctypes.data) - offset = (ptr - w.data.ptr) // 4 - _, _, _, dim = cudnn.getFilterNdDescriptor(bias_desc.value, 3) - size = internal.prod(dim) - bias = w[offset: offset + size] - return bias - - -def create_dropout_states(handle): - state_size = cudnn.dropoutGetStatesSize(handle) - return cupy.empty((state_size,), dtype='b') - - -def create_spatial_transformer_descriptor(sampler_type, dtype, nb_dims, dim_A): - desc = Descriptor(cudnn.createSpatialTransformerDescriptor(), - cudnn.destroySpatialTransformerDescriptor) - data_type = get_data_type(dtype) - - cudnn.setSpatialTransformerDescriptor( - desc.value, sampler_type, data_type, nb_dims, dim_A) - return desc - - -def add_tensor(handle, alpha, biasDesc, biasData, beta, srcDestDesc, - srcDestData): - cudnn.addTensor_v3(handle, alpha, biasDesc, - biasData, beta, srcDestDesc, srcDestData) diff --git a/cupy/cudnn.pyx b/cupy/cudnn.pyx new file mode 100644 index 00000000000..baa630a6a1c --- /dev/null +++ b/cupy/cudnn.pyx @@ -0,0 +1,389 @@ +from libcpp cimport vector + +import atexit +import threading + +import numpy + +from cupy.core cimport core +from cupy.cuda cimport cudnn +from cupy.cuda cimport device +from cupy.cuda cimport memory + +import cupy +from cupy.core import internal +from cupy.cuda import cudnn as py_cudnn + + +cdef int _cudnn_version = cudnn.getVersion() +cdef _thread_local = threading.local() + +cdef vector.vector[size_t] _handles + + +cpdef size_t get_handle() except *: + cdef int dev + dev = device.get_device_id() + if _handles.size() <= dev: + _handles.resize(dev + 1, 0) + ret = _handles[dev] + if ret != 0: + return ret + ret = cudnn.create() + _handles[dev] = ret + return ret + + +@atexit.register +def reset_handles(): + for handle in _handles: + if handle: + cudnn.destroy(handle) + _handles.clear() + + +cpdef dict _get_nd_tensor_cache(): + if not 
hasattr(_thread_local, 'cudnn_nd_tensor_cache'): + _thread_local.cudnn_nd_tensor_cache = {} + return _thread_local.cudnn_nd_tensor_cache + + +class Descriptor(object): + + def __init__(self, descriptor, destroyer): + self.value = descriptor + self.destroy = destroyer + + def __del__(self): + if self.value: + self.destroy(self.value) + self.value = None + + +cpdef get_data_type(dtype): + t = dtype.type + if t is numpy.float32: + return cudnn.CUDNN_DATA_FLOAT + elif t is numpy.float64: + return cudnn.CUDNN_DATA_DOUBLE + elif t is numpy.float16: + return cudnn.CUDNN_DATA_HALF + else: + raise TypeError('Dtype {} is not supported in cuDNN'.format(dtype)) + + +cpdef _create_tensor_nd_descriptor( + size_t desc, core.ndarray arr, int data_type): + cdef vector.vector[int] c_shape, c_strides + cdef Py_ssize_t itemsize, s + itemsize = arr.itemsize + for s in arr._strides: + c_strides.push_back(s // itemsize) + for s in arr._shape: + c_shape.push_back(s) + cudnn.setTensorNdDescriptor( + desc, data_type, arr.ndim, &c_shape[0], &c_strides[0]) + + +cpdef _create_tensor_descriptor(size_t desc, core.ndarray arr, int format): + if not arr.flags.c_contiguous: + raise ValueError('cupy.cudnn supports c-contiguous arrays only') + data_type = get_data_type(arr.dtype) + if arr._shape.size() == 4: + n, c, h, w = arr.shape + cudnn.setTensor4dDescriptor(desc, format, data_type, n, c, h, w) + else: + _create_tensor_nd_descriptor(desc, arr, data_type) + + +cpdef _create_filter_descriptor( + size_t desc, core.ndarray arr, int format=cudnn.CUDNN_TENSOR_NCHW): + cdef vector.vector[int] c_shape + cdef Py_ssize_t s + data_type = get_data_type(arr.dtype) + if arr._shape.size() == 4: + n, c, h, w = arr.shape + cudnn.setFilter4dDescriptor_v4( + desc, data_type, format, n, c, h, w) + else: + for s in arr._shape: + c_shape.push_back(s) + cudnn.setFilterNdDescriptor_v4( + desc, data_type, format, arr.ndim, &c_shape[0]) + + +cpdef _create_convolution_descriptor( + desc, pad, stride, dtype, mode, dilation, int group, + bint use_tensor_core): + cdef int d0, d1, p0, p1, s0, s1 + cdef vector.vector[int] c_pad, c_stride, c_dilation + ndim = len(pad) + if ndim != len(stride): + raise ValueError('pad and stride must be of same length') + + compute_type = get_data_type(dtype) + # TODO(takagi) Temporarily use computing precision of FP32 for + # storing precision of FP16. + if compute_type == cudnn.CUDNN_DATA_HALF: + compute_type = cudnn.CUDNN_DATA_FLOAT + + if ndim != 2: + c_pad = pad + c_stride = stride + c_dilation.assign(ndim, 1) + cudnn.setConvolutionNdDescriptor_v3( + desc, ndim, &c_pad[0], &c_stride[0], + &c_dilation[0], mode, compute_type) + return + + d0, d1 = dilation + p0, p1 = pad + s0, s1 = stride + if _cudnn_version < 6000 and (d0 != 1 or d1 != 1): + raise ValueError('dilation must be one when cudnn < 6.0') + if _cudnn_version >= 5000: + cudnn.setConvolution2dDescriptor_v5( + desc, p0, p1, s0, s1, d0, d1, mode, compute_type) + if _cudnn_version >= 7000 and use_tensor_core: + math_type = cudnn.CUDNN_TENSOR_OP_MATH + cudnn.setConvolutionMathType(desc, math_type) + if group > 1: + cudnn.setConvolutionGroupCount(desc, group) + else: + cudnn.setConvolution2dDescriptor_v4(desc, p0, p1, s0, s1, 1, 1, mode) + + +def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): + desc = Descriptor(cudnn.createTensorDescriptor(), + py_cudnn.destroyTensorDescriptor) + _create_tensor_descriptor(desc.value, arr, format) + return desc + + +def create_uninitialized_tensor_descriptor(): + """Create uninitialized tensor descriptor.
+ + Create a cudnnCreateTensorDescriptor_t that is not yet initialized. + This is used by the batch normalization functions. + """ + return Descriptor(cudnn.createTensorDescriptor(), + py_cudnn.destroyTensorDescriptor) + + +def create_tensor_nd_descriptor(core.ndarray arr): + cdef dict cache + if not arr.flags.c_contiguous: + raise ValueError('cupy.cudnn supports c-contiguous arrays only') + data_type = get_data_type(arr.dtype) + shape = arr.shape + key = (data_type, shape) + cache = _get_nd_tensor_cache() + if key in cache: + return cache[key] + + # numpy's stride is defined in bytes, but cudnn's stride is defined in + # size of element + desc = Descriptor(cudnn.createTensorDescriptor(), + py_cudnn.destroyTensorDescriptor) + _create_tensor_nd_descriptor(desc.value, arr, data_type) + cache[key] = desc + return desc + + +def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): + desc = Descriptor(cudnn.createFilterDescriptor(), + py_cudnn.destroyFilterDescriptor) + _create_filter_descriptor(desc.value, arr, format) + return desc + + +def create_convolution_descriptor(pad, stride, dtype, + mode=cudnn.CUDNN_CROSS_CORRELATION, + dilation=(1, 1), + use_tensor_core=False, + group=1): + desc = Descriptor(cudnn.createConvolutionDescriptor(), + py_cudnn.destroyConvolutionDescriptor) + _create_convolution_descriptor( + desc.value, pad, stride, dtype, mode, dilation, group, use_tensor_core) + return desc + + +def create_pooling_descriptor(ksize, stride, pad, mode): + cdef vector.vector[int] c_ksize, c_pad, c_stride + ndim = len(ksize) + if ndim != len(stride) or ndim != len(pad): + raise ValueError('ksize, stride, and pad must be of same length') + desc = Descriptor(cudnn.createPoolingDescriptor(), + py_cudnn.destroyPoolingDescriptor) + if ndim == 2: + cudnn.setPooling2dDescriptor_v4( + desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ksize[0], + ksize[1], pad[0], pad[1], stride[0], stride[1]) + else: + c_ksize = ksize + c_pad = pad + c_stride = stride + cudnn.setPoolingNdDescriptor_v4( + desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ndim, + &c_ksize[0], &c_pad[0], &c_stride[0]) + + return desc + + +cpdef core.ndarray _as4darray(core.ndarray arr): + if arr.ndim == 0: + return arr.reshape(1, 1, 1, 1) + return arr.reshape(arr.shape[0], -1, 1, 1) + + +def activation_forward(core.ndarray x, int mode): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + + x = core.ascontiguousarray(x) + y = cupy.empty_like(x) + x = _as4darray(x) + + handle = get_handle() + desc = cudnn.createTensorDescriptor() + act_desc = cudnn.createActivationDescriptor() + try: + _create_tensor_descriptor(desc, x, cudnn.CUDNN_TENSOR_NCHW) + cudnn.setActivationDescriptor( + act_desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) + cudnn.activationForward_v4( + handle, act_desc, one, desc, x.data.ptr, + zero, desc, y.data.ptr) + finally: + cudnn.destroyActivationDescriptor(act_desc) + cudnn.destroyTensorDescriptor(desc) + return y + + +def activation_backward(core.ndarray x, core.ndarray y, core.ndarray gy, + int mode): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + + gx = cupy.empty_like(x) + x = core.ascontiguousarray(x) + gy = core.ascontiguousarray(gy) + y_mat = _as4darray(y) + 
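+ # desc and act_desc below are raw cuDNN handles; the try/finally block + # ensures both are destroyed even when a cuDNN call raises.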
+ handle = get_handle() + desc = cudnn.createTensorDescriptor() + act_desc = cudnn.createActivationDescriptor() + try: + _create_tensor_descriptor(desc, y_mat, cudnn.CUDNN_TENSOR_NCHW) + cudnn.setActivationDescriptor( + act_desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0) + cudnn.activationBackward_v4( + handle, act_desc, one, desc, y.data.ptr, + desc, gy.data.ptr, desc, x.data.ptr, + zero, desc, gx.data.ptr) + finally: + cudnn.destroyActivationDescriptor(act_desc) + cudnn.destroyTensorDescriptor(desc) + return gx + + +def create_dropout_descriptor( + handle, dropout, states, state_size_in_bytes, seed): + desc = Descriptor(cudnn.createDropoutDescriptor(), + py_cudnn.destroyDropoutDescriptor) + cudnn.setDropoutDescriptor(desc.value, handle, dropout, + states, state_size_in_bytes, seed) + return desc + + +def set_dropout_descriptor(desc, handle, dropout): + # When the fourth argument is NULL, random state is not updated. + cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) + + +def create_rnn_descriptor(hidden_size, num_layers, dropout_desc, + input_mode, direction, mode, data_type): + desc = Descriptor(cudnn.createRNNDescriptor(), + py_cudnn.destroyRNNDescriptor) + if _cudnn_version >= 7000: + _handle = get_handle() + _algo = cudnn.CUDNN_RNN_ALGO_STANDARD + cudnn.setRNNDescriptor_v6( + _handle, desc.value, hidden_size, num_layers, dropout_desc.value, + input_mode, direction, mode, _algo, data_type) + else: + cudnn.setRNNDescriptor_v5( + desc.value, hidden_size, num_layers, dropout_desc.value, + input_mode, direction, mode, data_type) + return desc + + +def get_rnn_lin_layer_matrix_params( + handle, rnn_desc, layer, x_desc, w_desc, core.ndarray w, lin_layer_id): + cdef size_t ptr = 0 + w_data_ptr = w.data.ptr + mat_desc = cudnn.createFilterDescriptor() + try: + cudnn.getRNNLinLayerMatrixParams( + handle, rnn_desc.value, layer, x_desc.value, w_desc.value, + w.data.ptr, lin_layer_id, mat_desc, &ptr) + _, _, _, dim = cudnn.getFilterNdDescriptor(mat_desc, 3) + finally: + cudnn.destroyFilterDescriptor(mat_desc) + offset = (ptr - w.data.ptr) // 4 + size = internal.prod(dim) + mat = w[offset: offset + size] + return mat + + +def get_rnn_lin_layer_bias_params( + handle, rnn_desc, layer, x_desc, w_desc, core.ndarray w, lin_layer_id): + cdef size_t ptr = 0 + bias_desc = cudnn.createFilterDescriptor() + try: + cudnn.getRNNLinLayerBiasParams( + handle, rnn_desc.value, layer, x_desc.value, w_desc.value, + w.data.ptr, lin_layer_id, bias_desc, &ptr) + _, _, _, dim = cudnn.getFilterNdDescriptor(bias_desc, 3) + finally: + cudnn.destroyFilterDescriptor(bias_desc) + offset = (ptr - w.data.ptr) // 4 + size = internal.prod(dim) + bias = w[offset: offset + size] + return bias + + +def create_dropout_states(handle): + state_size = cudnn.dropoutGetStatesSize(handle) + return cupy.empty((state_size,), dtype='b') + + +def create_spatial_transformer_descriptor(sampler_type, dtype, nb_dims, dim_A): + desc = Descriptor(cudnn.createSpatialTransformerDescriptor(), + py_cudnn.destroySpatialTransformerDescriptor) + data_type = get_data_type(dtype) + + cudnn.setSpatialTransformerDescriptor( + desc.value, sampler_type, data_type, nb_dims, dim_A) + return desc + + +def add_tensor(handle, alpha, biasDesc, biasData, beta, srcDestDesc, + srcDestData): + cudnn.addTensor_v3(handle, alpha, biasDesc, + biasData, beta, srcDestDesc, srcDestData) diff --git a/cupy_setup_build.py b/cupy_setup_build.py index 1ab47d59c9d..3026e19bb40 100644 --- a/cupy_setup_build.py +++ b/cupy_setup_build.py @@ -69,6 +69,7 @@ 'name': 'cudnn', 
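+ # The new cupy.cudnn Cython module is listed here so that it is built + # only when the cuDNN headers under 'include' are found.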
'file': [ 'cupy.cuda.cudnn', + 'cupy.cudnn', ], 'include': [ 'cudnn.h', diff --git a/tests/cupy_tests/test_cudnn.py b/tests/cupy_tests/test_cudnn.py index ce3a2fa9c97..4fbc22793fa 100644 --- a/tests/cupy_tests/test_cudnn.py +++ b/tests/cupy_tests/test_cudnn.py @@ -1,6 +1,5 @@ import unittest -import mock import numpy import cupy @@ -23,9 +22,7 @@ 'dtype': [numpy.float32, numpy.float64], 'mode': modes, })) -@unittest.skipUnless( - cudnn_enabled and libcudnn.getVersion() >= 3000, - 'cuDNN >= 3.0 is supported') +@unittest.skipUnless(cudnn_enabled, 'cuDNN is not available') class TestCudnnActivation(unittest.TestCase): def setUp(self): @@ -33,20 +30,8 @@ def setUp(self): self.y = testing.shaped_arange((3, 4), cupy, self.dtype) self.g = testing.shaped_arange((3, 4), cupy, self.dtype) - def test_activation_forward_version(self): - if libcudnn.getVersion() >= 4000: - patch = 'cupy.cuda.cudnn.activationForward_v4' - else: - patch = 'cupy.cuda.cudnn.activationForward_v3' - with mock.patch(patch) as func: - cupy.cudnn.activation_forward(self.x, self.mode) - self.assertEqual(func.called, True) + def test_activation_forward(self): + cupy.cudnn.activation_forward(self.x, self.mode) - def test_activation_backward_version(self): - if libcudnn.getVersion() >= 4000: - patch = 'cupy.cuda.cudnn.activationBackward_v4' - else: - patch = 'cupy.cuda.cudnn.activationBackward_v3' - with mock.patch(patch) as func: - cupy.cudnn.activation_backward(self.x, self.y, self.g, self.mode) - self.assertEqual(func.called, True) + def test_activation_backward(self): + cupy.cudnn.activation_backward(self.x, self.y, self.g, self.mode) From 58dca59cdf4d94e7518f7e4a4050fd5d3f1649aa Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 24 Nov 2017 15:46:48 +0900 Subject: [PATCH 04/21] Merge pull request #774 from kmaehashi/test-argminmax-tie add test for argmin/argmax tie --- tests/cupy_tests/sorting_tests/test_search.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/cupy_tests/sorting_tests/test_search.py b/tests/cupy_tests/sorting_tests/test_search.py index 6b4b67021ba..f45ee88bad0 100644 --- a/tests/cupy_tests/sorting_tests/test_search.py +++ b/tests/cupy_tests/sorting_tests/test_search.py @@ -58,6 +58,12 @@ def test_argmax_axis2(self, xp, dtype): a = testing.shaped_random((2, 3, 4), xp, dtype) return a.argmax(axis=2) + @testing.for_all_dtypes(no_complex=True) + @testing.numpy_cupy_allclose() + def test_argmax_tie(self, xp, dtype): + a = xp.array([0, 5, 2, 3, 4, 5], dtype) + return a.argmax() + @testing.for_all_dtypes(no_complex=True) @testing.numpy_cupy_raises(accept_error=ValueError) def test_argmax_zero_size(self, xp, dtype): @@ -124,6 +130,12 @@ def test_argmin_axis2(self, xp, dtype): a = testing.shaped_random((2, 3, 4), xp, dtype) return a.argmin(axis=2) + @testing.for_all_dtypes(no_complex=True) + @testing.numpy_cupy_allclose() + def test_argmin_tie(self, xp, dtype): + a = xp.array([0, 1, 2, 3, 0, 5], dtype) + return a.argmin() + @testing.for_all_dtypes(no_complex=True) @testing.numpy_cupy_raises(accept_error=ValueError) def test_argmin_zero_size(self, xp, dtype): From 873479bdd7def0578d801236b3a2e37cbc9b2531 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 24 Nov 2017 17:16:10 +0900 Subject: [PATCH 05/21] Merge pull request #777 from kmaehashi/fix-dump-filename fix to correctly display filename of CUDA dump --- cupy/cuda/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cupy/cuda/compiler.py b/cupy/cuda/compiler.py index 352ea92fbed..58b5ee69369 100644 --- 
a/cupy/cuda/compiler.py +++ b/cupy/cuda/compiler.py @@ -196,7 +196,7 @@ def dump(self, f): linum_fmt = '{{:0{}d}} '.format(digits) f.write('NVRTC compilation error: {}\n'.format(self)) f.write('-----\n') - f.write('Name: {}\n'.format(' '.join(self.name))) + f.write('Name: {}\n'.format(self.name)) f.write('Options: {}\n'.format(' '.join(self.options))) f.write('CUDA source:\n') for i, line in enumerate(lines): From 601b1dd5810b64d9e597a2602df4b5b39341b4b3 Mon Sep 17 00:00:00 2001 From: Yuya Unno Date: Tue, 28 Nov 2017 00:37:51 +0900 Subject: [PATCH 06/21] Merge pull request #787 from hvy/fix-doc-missing-arg Add docs for missing argument --- cupy/testing/helper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cupy/testing/helper.py b/cupy/testing/helper.py index a1c399d4986..915a13f9531 100644 --- a/cupy/testing/helper.py +++ b/cupy/testing/helper.py @@ -441,6 +441,12 @@ def numpy_cupy_raises(name='xp', sp_name=None, accept_error=Exception): sp_name(str or None): Argument name whose value is either ``scipy.sparse`` or ``cupy.sparse`` module. If ``None``, no argument is given for the modules. + accept_error(bool, Exception or tuple of Exception): Specify + acceptable errors. When both the NumPy test and the CuPy test raise the + same type of error, and the type of the errors is specified with + this argument, the errors are ignored and not raised. + If it is ``True``, all error types are acceptable. + If it is ``False``, no error is acceptable. Decorated test fixture is required to throw the same errors even if ``xp`` is ``numpy`` or ``cupy``. From 29f55e41f94d9c764d3ea6e84918b231089770d6 Mon Sep 17 00:00:00 2001 From: Yuya Unno Date: Wed, 29 Nov 2017 21:01:15 +0900 Subject: [PATCH 07/21] Merge pull request #749 from okuta/fix-stack Fix stack function bug --- cupy/manipulation/join.py | 6 ++++-- .../manipulation_tests/test_join.py | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/cupy/manipulation/join.py b/cupy/manipulation/join.py index e07d308afe1..a89a4f8a519 100644 --- a/cupy/manipulation/join.py +++ b/cupy/manipulation/join.py @@ -120,10 +120,12 @@ def stack(tup, axis=0): .. seealso:: :func:`numpy.stack` """ + # TODO(okuta) Remove this if expand_dims is updated for x in tup: - if not (-x.ndim <= axis < x.ndim): + if not (-x.ndim - 1 <= axis <= x.ndim): raise core.core._AxisError( - 'axis {} out of bounds [0, {})'.format(axis, x.ndim)) + 'axis {} out of bounds [{}, {}]'.format( + axis, -x.ndim - 1, x.ndim)) return concatenate([cupy.expand_dims(x, axis) for x in tup], axis) diff --git a/tests/cupy_tests/manipulation_tests/test_join.py b/tests/cupy_tests/manipulation_tests/test_join.py index 2b483cfab29..c6d2789d9c8 100644 --- a/tests/cupy_tests/manipulation_tests/test_join.py +++ b/tests/cupy_tests/manipulation_tests/test_join.py @@ -173,10 +173,27 @@ def test_stack_value(self): @testing.with_requires('numpy>=1.10') @testing.numpy_cupy_array_equal() - def test_stack_with_axis(self, xp): + def test_stack_with_axis1(self, xp): a = testing.shaped_arange((2, 3), xp) return xp.stack((a, a), axis=1) + @testing.with_requires('numpy>=1.10') + @testing.numpy_cupy_array_equal() + def test_stack_with_axis2(self, xp): + a = testing.shaped_arange((2, 3), xp) + return xp.stack((a, a), axis=2) + + @testing.numpy_cupy_raises() + def test_stack_with_axis_over(self, xp): + a = testing.shaped_arange((2, 3), xp) + try: + return xp.stack((a, a), axis=3) + except IndexError: + # For 'numpy<=1.12', catch both IndexError from NumPy and + # IndexOrValueError from CuPy.
For 'numpy>=1.13', simply do not + # catch the AxisError. + raise IndexError() + def test_stack_with_axis_value(self): a = testing.shaped_arange((2, 3), cupy) s = cupy.stack((a, a), axis=1) From ac93d73ae9b660cf931a1b7eb3fa4989c4e2b7be Mon Sep 17 00:00:00 2001 From: Ryosuke Okuta Date: Thu, 30 Nov 2017 09:52:16 +0900 Subject: [PATCH 08/21] Merge pull request #797 from kmaehashi/fix-gitignore add pytest cache directory to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cad37334183..e27f1bf9cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ cupy.egg-info/ dist/ htmlcov/ .idea/ +.cache/ From 494c59b52fb07d1865cddd72988b3e23df36376e Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 1 Dec 2017 03:11:24 +0900 Subject: [PATCH 09/21] Merge pull request #803 from kmaehashi/fix-stack-test fix test to pass with numpy 1.9 --- tests/cupy_tests/manipulation_tests/test_join.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cupy_tests/manipulation_tests/test_join.py b/tests/cupy_tests/manipulation_tests/test_join.py index c6d2789d9c8..b875867bc2f 100644 --- a/tests/cupy_tests/manipulation_tests/test_join.py +++ b/tests/cupy_tests/manipulation_tests/test_join.py @@ -183,6 +183,7 @@ def test_stack_with_axis2(self, xp): a = testing.shaped_arange((2, 3), xp) return xp.stack((a, a), axis=2) + @testing.with_requires('numpy>=1.10') @testing.numpy_cupy_raises() def test_stack_with_axis_over(self, xp): a = testing.shaped_arange((2, 3), xp) From 624f6a87f2fe0a878f35cf7041d4118fec1e4638 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 1 Dec 2017 13:24:56 +0900 Subject: [PATCH 10/21] Merge pull request #808 from kmaehashi/embed-signature Expose signatures to Reference --- cupy/core/elementwise.pxi | 4 +--- cupy/core/reduction.pxi | 4 +--- cupy_setup_build.py | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cupy/core/elementwise.pxi b/cupy/core/elementwise.pxi index e0b280da7c8..a1be278b8c7 100644 --- a/cupy/core/elementwise.pxi +++ b/cupy/core/elementwise.pxi @@ -742,9 +742,7 @@ class ufunc(object): return types def __call__(self, *args, **kwargs): - """__call__(*args, **kwargs) - - Applies the universal function to arguments elementwise. + """Applies the universal function to arguments elementwise. Args: args: Input arguments. Each of them can be a :class:`cupy.ndarray` diff --git a/cupy/core/reduction.pxi b/cupy/core/reduction.pxi index d0f92318807..02123b7109c 100644 --- a/cupy/core/reduction.pxi +++ b/cupy/core/reduction.pxi @@ -355,9 +355,7 @@ class ReductionKernel(object): self.preamble = preamble def __call__(self, *args, **kwargs): - """__call__(*args, **kwargs) - - Compiles and invokes the reduction kernel. + """Compiles and invokes the reduction kernel. The compilation runs only if the kernel is not cached. Note that the kernels with different argument dtypes, ndims, or axis are not diff --git a/cupy_setup_build.py b/cupy_setup_build.py index 1ab47d59c9d..9186e0a461d 100644 --- a/cupy_setup_build.py +++ b/cupy_setup_build.py @@ -341,6 +341,9 @@ def cythonize(extensions, arg_options): directive_keys = ('linetrace', 'profile') directives = {key: arg_options[key] for key in directive_keys} + # Embed signatures for Sphinx documentation. 
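+ # The embedsignature directive makes Cython prepend each function's + # signature to its docstring, which Sphinx then renders in the reference.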
+ directives['embedsignature'] = True + cythonize_option_keys = ('annotate',) cythonize_options = {key: arg_options[key] for key in cythonize_option_keys} From 8538889720960a974607bd846d12ff193da7d668 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 1 Dec 2017 13:27:28 +0900 Subject: [PATCH 11/21] Merge pull request #786 from kmaehashi/fix-elementwise-docs fix ElementwiseKernel.__call__ docs --- cupy/core/elementwise.pxi | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cupy/core/elementwise.pxi b/cupy/core/elementwise.pxi index e0b280da7c8..bfd59c437d2 100644 --- a/cupy/core/elementwise.pxi +++ b/cupy/core/elementwise.pxi @@ -523,9 +523,11 @@ cdef class ElementwiseKernel: Args: args: Arguments of the kernel. - size (int): Range size of the indices. If specified, the variable - ``n`` is set to this value. Otherwise, the result of - broadcasting is used to determine the value of ``n``. + size (int): Range size of the indices. By default, the range size + is automatically determined from the result of broadcasting. + This parameter must be specified if and only if all ndarrays + are `raw` and the range size cannot be determined + automatically. Returns: Arrays are returned according to the ``out_params`` argument of the From adf72f7c5797d02281d1a2a5dd211c87371ce3b9 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 1 Dec 2017 15:59:39 +0900 Subject: [PATCH 12/21] Merge pull request #788 from hvy/allow-derived-errors Allow derived errors to pass equality tests --- cupy/testing/helper.py | 9 +- tests/cupy_tests/testing_tests/test_helper.py | 136 +++++++++++++----- 2 files changed, 108 insertions(+), 37 deletions(-) diff --git a/cupy/testing/helper.py b/cupy/testing/helper.py index 915a13f9531..80b6854e952 100644 --- a/cupy/testing/helper.py +++ b/cupy/testing/helper.py @@ -47,7 +47,11 @@ def _check_cupy_numpy_error(self, cupy_error, cupy_tb, numpy_error, self.fail('Only numpy raises error\n\n' + numpy_tb) elif numpy_error is None: self.fail('Only cupy raises error\n\n' + cupy_tb) - elif type(cupy_error) is not type(numpy_error): + elif not isinstance(cupy_error, type(numpy_error)): + # CuPy errors should be at least as explicit as the NumPy errors, i.e. + # allow CuPy errors to derive from NumPy errors but not the opposite. + # This ensures that try/except blocks that catch NumPy errors also + # catch CuPy errors. 
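+ # For example, a CuPy exception deriving from MemoryError is accepted + # when the NumPy side raises MemoryError, but not the other way around.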
msg = '''Different types of errors occurred cupy @@ -56,7 +60,8 @@ def _check_cupy_numpy_error(self, cupy_error, cupy_tb, numpy_error, %s ''' % (cupy_tb, numpy_tb) self.fail(msg) - elif not isinstance(cupy_error, accept_error): + elif not (isinstance(cupy_error, accept_error) and + isinstance(numpy_error, accept_error)): msg = '''Both cupy and numpy raise exceptions cupy diff --git a/tests/cupy_tests/testing_tests/test_helper.py b/tests/cupy_tests/testing_tests/test_helper.py index 1cc3b4fa3b5..b9cd7938402 100644 --- a/tests/cupy_tests/testing_tests/test_helper.py +++ b/tests/cupy_tests/testing_tests/test_helper.py @@ -32,56 +32,122 @@ def test_unsigned_only(self): class TestCheckCupyNumpyError(unittest.TestCase): + tbs = { + cupy: 'xxxx', + numpy: 'yyyy' + } + def test_both_success(self): + @testing.helper.numpy_cupy_raises() + def dummy_both_success(self, xp): + pass + with self.assertRaises(AssertionError): - helper._check_cupy_numpy_error(self, None, None, None, None) + dummy_both_success(self) def test_cupy_error(self): - cupy_error = Exception() - cupy_tb = 'xxxx' - with six.assertRaisesRegex(self, AssertionError, cupy_tb): - helper._check_cupy_numpy_error(self, cupy_error, cupy_tb, - None, None) + @testing.helper.numpy_cupy_raises() + def dummy_cupy_error(self, xp): + if xp is cupy: + raise Exception(self.tbs.get(cupy)) + + with six.assertRaisesRegex(self, AssertionError, self.tbs.get(cupy)): + dummy_cupy_error(self) def test_numpy_error(self): - numpy_error = Exception() - numpy_tb = 'yyyy' - with six.assertRaisesRegex(self, AssertionError, numpy_tb): - helper._check_cupy_numpy_error(self, None, None, - numpy_error, numpy_tb) + @testing.helper.numpy_cupy_raises() + def dummy_numpy_error(self, xp): + if xp is numpy: + raise Exception(self.tbs.get(numpy)) + + with six.assertRaisesRegex(self, AssertionError, self.tbs.get(numpy)): + dummy_numpy_error(self) def test_cupy_numpy_different_error(self): - cupy_error = TypeError() - cupy_tb = 'xxxx' - numpy_error = ValueError() - numpy_tb = 'yyyy' + @testing.helper.numpy_cupy_raises() + def dummy_cupy_numpy_different_error(self, xp): + if xp is cupy: + raise TypeError(self.tbs.get(cupy)) + elif xp is numpy: + raise ValueError(self.tbs.get(numpy)) + # Use re.S mode to ignore new line characters - pattern = re.compile(cupy_tb + '.*' + numpy_tb, re.S) + pattern = re.compile( + self.tbs.get(cupy) + '.*' + self.tbs.get(numpy), re.S) with six.assertRaisesRegex(self, AssertionError, pattern): - helper._check_cupy_numpy_error(self, cupy_error, cupy_tb, - numpy_error, numpy_tb) + dummy_cupy_numpy_different_error(self) + + def test_cupy_derived_error(self): + @testing.helper.numpy_cupy_raises() + def dummy_cupy_derived_error(self, xp): + if xp is cupy: + raise ValueError(self.tbs.get(cupy)) + elif xp is numpy: + raise Exception(self.tbs.get(numpy)) + + dummy_cupy_derived_error(self) # Assert no exceptions + + def test_numpy_derived_error(self): + @testing.helper.numpy_cupy_raises() + def dummy_numpy_derived_error(self, xp): + if xp is cupy: + raise Exception(self.tbs.get(cupy)) + elif xp is numpy: + raise IndexError(self.tbs.get(numpy)) + + # NumPy errors may not derive from CuPy errors, i.e. 
CuPy errors should + # be at least as explicit as the NumPy error + pattern = re.compile( + self.tbs.get(cupy) + '.*' + self.tbs.get(numpy), re.S) + with six.assertRaisesRegex(self, AssertionError, pattern): + dummy_numpy_derived_error(self) def test_same_error(self): - cupy_error = Exception() - cupy_tb = 'xxxx' - numpy_error = Exception() - numpy_tb = 'yyyy' - # Nothing happens - helper._check_cupy_numpy_error(self, cupy_error, cupy_tb, - numpy_error, numpy_tb, - accept_error=Exception) + @testing.helper.numpy_cupy_raises(accept_error=Exception) + def dummy_same_error(self, xp): + raise Exception(self.tbs.get(xp)) + + dummy_same_error(self) + + def test_cupy_derived_unaccept_error(self): + @testing.helper.numpy_cupy_raises(accept_error=ValueError) + def dummy_cupy_derived_unaccept_error(self, xp): + if xp is cupy: + raise IndexError(self.tbs.get(cupy)) + elif xp is numpy: + raise Exception(self.tbs.get(numpy)) + + # Neither `IndexError` nor `Exception` is derived from `ValueError`, + # therefore expect an error + pattern = re.compile( + self.tbs.get(cupy) + '.*' + self.tbs.get(numpy), re.S) + with six.assertRaisesRegex(self, AssertionError, pattern): + dummy_cupy_derived_unaccept_error(self) + + def test_numpy_derived_unaccept_error(self): + @testing.helper.numpy_cupy_raises(accept_error=ValueError) + def dummy_numpy_derived_unaccept_error(self, xp): + if xp is cupy: + raise Exception(self.tbs.get(cupy)) + elif xp is numpy: + raise ValueError(self.tbs.get(numpy)) + + # `Exception` is not derived from `ValueError`, therefore expect an + # error + pattern = re.compile( + self.tbs.get(cupy) + '.*' + self.tbs.get(numpy), re.S) + with six.assertRaisesRegex(self, AssertionError, pattern): + dummy_numpy_derived_unaccept_error(self) def test_forbidden_error(self): - cupy_error = Exception() - cupy_tb = 'xxxx' - numpy_error = Exception() - numpy_tb = 'yyyy' - # Use re.S mode to ignore new line characters - pattern = re.compile(cupy_tb + '.*' + numpy_tb, re.S) + @testing.helper.numpy_cupy_raises(accept_error=False) + def dummy_forbidden_error(self, xp): + raise Exception(self.tbs.get(xp)) + + pattern = re.compile( + self.tbs.get(cupy) + '.*' + self.tbs.get(numpy), re.S) with six.assertRaisesRegex(self, AssertionError, pattern): - helper._check_cupy_numpy_error( - self, cupy_error, cupy_tb, - numpy_error, numpy_tb, accept_error=False) + dummy_forbidden_error(self) class NumPyCuPyDecoratorBase(object): From 5bc3cda2460fc896c897471d850f3d2a2a3d99c3 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 1 Dec 2017 16:27:43 +0900 Subject: [PATCH 13/21] Merge pull request #720 from okuta/add-default-casting Add default casting option to ufunc --- cupy/core/core.pyx | 19 ++++--------------- cupy/core/elementwise.pxi | 17 +++++++++++++---- cupy/core/fusion.py | 2 +- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/cupy/core/core.pyx b/cupy/core/core.pyx index 032d935e238..ed713e18827 100644 --- a/cupy/core/core.pyx +++ b/cupy/core/core.pyx @@ -1852,29 +1852,18 @@ include "reduction.pxi" cdef _id = 'out0 = in0' -_elementwise_copy = create_ufunc( +elementwise_copy = create_ufunc( 'cupy_copy', ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', 'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'), - 'out0 = out0_type(in0)') + 'out0 = out0_type(in0)', default_casting='unsafe') # complex numbers requires out0 = complex(in0) - -def elementwise_copy(*args, **kwargs): - kwargs['casting'] = 'unsafe' - return _elementwise_copy(*args, **kwargs) - - -_elementwise_copy_where = 
create_ufunc( +elementwise_copy_where = create_ufunc( 'cupy_copy_where', ('??->?', 'b?->b', 'B?->B', 'h?->h', 'H?->H', 'i?->i', 'I?->I', 'l?->l', 'L?->L', 'q?->q', 'Q?->Q', 'e?->e', 'f?->f', 'd?->d', 'F?->F', 'D?->D'), - 'if (in1) out0 = in0') - - -def elementwise_copy_where(*args, **kwargs): - kwargs['casting'] = 'unsafe' - return _elementwise_copy_where(*args, **kwargs) + 'if (in1) out0 = in0', default_casting='unsafe') cdef _divmod_float = ''' diff --git a/cupy/core/elementwise.pxi b/cupy/core/elementwise.pxi index 929137f079e..98f9f32feda 100644 --- a/cupy/core/elementwise.pxi +++ b/cupy/core/elementwise.pxi @@ -707,7 +707,9 @@ class ufunc(object): nargs (int): Number of all arguments. """ - def __init__(self, name, nin, nout, ops, preamble='', doc=''): + + def __init__(self, name, nin, nout, ops, preamble='', doc='', + default_casting=None): self.name = name self.nin = nin self.nout = nout @@ -715,6 +717,10 @@ class ufunc(object): self._ops = ops self._preamble = preamble self.__doc__ = doc + if default_casting is None: + self._default_casting = 'same_kind' + else: + self._default_casting = default_casting _in_params = tuple( ParameterInfo('T in%d' % i, True) for i in range(nin)) @@ -764,7 +770,7 @@ class ufunc(object): out = kwargs.pop('out', None) dtype = kwargs.pop('dtype', None) # Note default behavior of casting is 'same_kind' on numpy>=1.10 - casting = kwargs.pop('casting', 'same_kind') + casting = kwargs.pop('casting', self._default_casting) if dtype is not None: dtype = numpy.dtype(dtype).type if kwargs: @@ -822,7 +828,8 @@ class ufunc(object): return ret -cpdef create_ufunc(name, ops, routine=None, preamble='', doc=''): +cpdef create_ufunc(name, ops, routine=None, preamble='', doc='', + default_casting=None): _ops = [] for t in ops: if not isinstance(t, tuple): @@ -840,4 +847,6 @@ cpdef create_ufunc(name, ops, routine=None, preamble='', doc=''): out_types = tuple([numpy.dtype(t).type for t in out_types]) _ops.append((in_types, out_types, rt)) - return ufunc(name, len(_ops[0][0]), len(_ops[0][1]), _ops, preamble, doc) + ret = ufunc(name, len(_ops[0][0]), len(_ops[0][1]), _ops, preamble, doc, + default_casting=default_casting) + return ret diff --git a/cupy/core/fusion.py b/cupy/core/fusion.py index 256f20ff5f1..39b0e27ad4f 100644 --- a/cupy/core/fusion.py +++ b/cupy/core/fusion.py @@ -713,7 +713,7 @@ def _create_ufunc(cupy_ufunc, numpy_ufunc): clip = ufunc(core._clip, math.misc.clip, numpy.clip) -copy = ufunc(core._elementwise_copy, +copy = ufunc(core.elementwise_copy, creation.from_data.copy, numpy.copy) bitwise_and = _create_ufunc(core.bitwise_and, numpy.bitwise_and) From 148701b7b31c33293a541d322be29bb90ab65f56 Mon Sep 17 00:00:00 2001 From: niboshi Date: Fri, 1 Dec 2017 17:59:05 +0900 Subject: [PATCH 14/21] Merge pull request #804 from kmaehashi/fix-helper-comment fix wrong comment in test helper --- cupy/testing/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cupy/testing/helper.py b/cupy/testing/helper.py index 915a13f9531..227bcd95d65 100644 --- a/cupy/testing/helper.py +++ b/cupy/testing/helper.py @@ -406,7 +406,7 @@ def numpy_cupy_equal(name='xp', sp_name=None): ``scipy.sparse`` or ``cupy.sparse`` module. If ``None``, no argument is given for the modules. - Decorated test fixture is required throw same errors + Decorated test fixture is required to return the same results even if ``xp`` is ``numpy`` or ``cupy``. 
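+ + A minimal sketch of a fixture using this decorator (the name + ``test_sum`` is illustrative); it is run once with ``xp = numpy`` and + once with ``xp = cupy``, and the two results are compared:: + + @testing.numpy_cupy_equal() + def test_sum(self, xp): + return xp.arange(3).sum()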
""" def decorator(impl): From 77ba34484f54e5ace8028c8b2e32e636b6c960b0 Mon Sep 17 00:00:00 2001 From: Ryosuke Okuta Date: Sun, 3 Dec 2017 19:39:12 +0900 Subject: [PATCH 15/21] Merge pull request #737 from niboshi/fix-matmul-value-error Fix matmul to raise ValueError on invalid shapes --- cupy/core/core.pyx | 16 +++++++++++-- tests/cupy_tests/math_tests/test_matmul.py | 27 ++++++++++++++++++++-- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/cupy/core/core.pyx b/cupy/core/core.pyx index ed713e18827..69538d5ad86 100644 --- a/cupy/core/core.pyx +++ b/cupy/core/core.pyx @@ -3347,6 +3347,11 @@ cpdef ndarray matmul(ndarray a, ndarray b, ndarray out=None): cdef Py_ssize_t batchCount cdef ndarray ap, bp, outp + orig_a_shape = a.shape + orig_b_shape = b.shape + if len(orig_a_shape) == 0 or len(orig_b_shape) == 0: + raise ValueError('Scalar operands are not allowed, use \'*\' instead') + ret_dtype = numpy.result_type(a.dtype, b.dtype) dtype = numpy.find_common_type((ret_dtype, 'f'), ()) @@ -3432,9 +3437,16 @@ cpdef ndarray matmul(ndarray a, ndarray b, ndarray out=None): *la, ka, n = a.shape *lb, m, kb = b.shape - assert ka == kb + if ka != kb: + raise ValueError( + 'shapes ({}) and ({}) not aligned'.format( + ','.join([str(_) for _ in orig_a_shape]), + ','.join([str(_) for _ in orig_b_shape]))) for la_, lb_ in zip(la, lb): - assert la_ == lb_ or la_ == 1 or lb_ == 1 + if not (la_ == lb_ or la_ == 1 or lb_ == 1): + raise ValueError( + 'operands could not be broadcast together with ' + 'remapped shapes') batchCount = 1 # batchCount = numpy.prod(la) for i in la: diff --git a/tests/cupy_tests/math_tests/test_matmul.py b/tests/cupy_tests/math_tests/test_matmul.py index 10ac237e697..676eb24de2c 100644 --- a/tests/cupy_tests/math_tests/test_matmul.py +++ b/tests/cupy_tests/math_tests/test_matmul.py @@ -104,6 +104,29 @@ def test_cupy_matmul(self, xp, dtype1, dtype2): if ((dtype1, dtype2) in self.skip_dtypes or (dtype2, dtype1) in self.skip_dtypes): return xp.array([]) - x1 = testing.shaped_arange(self.shape_pair[0], xp, dtype1) - x2 = testing.shaped_arange(self.shape_pair[1], xp, dtype2) + shape1, shape2 = self.shape_pair + x1 = testing.shaped_arange(shape1, xp, dtype1) + x2 = testing.shaped_arange(shape2, xp, dtype2) return xp.matmul(x1, x2) + + +@testing.parameterize( + *testing.product({ + 'shape_pair': [ + ((5, 3, 1), (3, 1, 4)), + ((3, 2, 3), (3, 2, 4)), + ((3, 2), ()), + ((), (3, 2)), + ((), ()), + ((3, 2), (1,)), + ], + })) +@testing.gpu +class TestMatmulInvalidShape(unittest.TestCase): + + @testing.numpy_cupy_raises(accept_error=ValueError) + def test_invalid_shape(self, xp): + shape1, shape2 = self.shape_pair + x1 = testing.shaped_arange(shape1, xp, numpy.float32) + x2 = testing.shaped_arange(shape2, xp, numpy.float32) + xp.matmul(x1, x2) From a83c50068650c86f82caf5e1a8630afad7a5e42c Mon Sep 17 00:00:00 2001 From: niboshi Date: Mon, 4 Dec 2017 11:11:59 +0900 Subject: [PATCH 16/21] Merge pull request #817 from okuta/fix-matmul-test Fix matmul test for old NumPy --- tests/cupy_tests/math_tests/test_matmul.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cupy_tests/math_tests/test_matmul.py b/tests/cupy_tests/math_tests/test_matmul.py index 676eb24de2c..c018c680e34 100644 --- a/tests/cupy_tests/math_tests/test_matmul.py +++ b/tests/cupy_tests/math_tests/test_matmul.py @@ -124,6 +124,7 @@ def test_cupy_matmul(self, xp, dtype1, dtype2): @testing.gpu class TestMatmulInvalidShape(unittest.TestCase): + @testing.with_requires('numpy>=1.10') 
@testing.numpy_cupy_raises(accept_error=ValueError) def test_invalid_shape(self, xp): shape1, shape2 = self.shape_pair From b0f9e66b909446ed859c4f52bb98adf8412872ab Mon Sep 17 00:00:00 2001 From: niboshi Date: Mon, 4 Dec 2017 18:03:33 +0900 Subject: [PATCH 17/21] Merge pull request #805 from toslunar/cast-env-seed Cast CUPY_SEED environment variable to uint64 --- cupy/random/generator.py | 2 + .../cupy_tests/random_tests/test_generator.py | 48 ++++++++++++++----- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/cupy/random/generator.py b/cupy/random/generator.py index 35c0ca5739c..d6a3cf95784 100644 --- a/cupy/random/generator.py +++ b/cupy/random/generator.py @@ -409,6 +409,8 @@ def get_random_state(): seed = os.getenv('CUPY_SEED') if seed is None: seed = os.getenv('CHAINER_SEED') + if seed is not None: + seed = numpy.uint64(int(seed)) rs = RandomState(seed) rs = _random_states.setdefault(dev.id, rs) return rs diff --git a/tests/cupy_tests/random_tests/test_generator.py b/tests/cupy_tests/random_tests/test_generator.py index ca5e35b7ca5..7277644cfcd 100644 --- a/tests/cupy_tests/random_tests/test_generator.py +++ b/tests/cupy_tests/random_tests/test_generator.py @@ -551,15 +551,12 @@ def test_thread_safe(self): class TestGetRandomState2(unittest.TestCase): def setUp(self): - self.rs_tmp = generator.RandomState - generator.RandomState = mock.Mock() self.rs_dict = generator._random_states generator._random_states = {} self.cupy_seed = os.getenv('CUPY_SEED') self.chainer_seed = os.getenv('CHAINER_SEED') def tearDown(self, *args): - generator.RandomState = self.rs_tmp generator._random_states = self.rs_dict if self.cupy_seed is None: os.environ.pop('CUPY_SEED', None) @@ -573,26 +570,55 @@ def tearDown(self, *args): def test_get_random_state_no_cupy_no_chainer_seed(self): os.environ.pop('CUPY_SEED', None) os.environ.pop('CHAINER_SEED', None) - generator.get_random_state() - generator.RandomState.assert_called_with(None) + rvs0 = self._get_rvs_reset() + rvs1 = self._get_rvs_reset() + + self._check_different(rvs0, rvs1) def test_get_random_state_no_cupy_with_chainer_seed(self): + rvs0 = self._get_rvs(generator.RandomState(5)) + os.environ.pop('CUPY_SEED', None) os.environ['CHAINER_SEED'] = '5' - generator.get_random_state() - generator.RandomState.assert_called_with('5') + rvs1 = self._get_rvs_reset() + + self._check_same(rvs0, rvs1) def test_get_random_state_with_cupy_no_chainer_seed(self): + rvs0 = self._get_rvs(generator.RandomState(6)) + os.environ['CUPY_SEED'] = '6' os.environ.pop('CHAINER_SEED', None) - generator.get_random_state() - generator.RandomState.assert_called_with('6') + rvs1 = self._get_rvs_reset() + + self._check_same(rvs0, rvs1) def test_get_random_state_with_cupy_with_chainer_seed(self): + rvs0 = self._get_rvs(generator.RandomState(7)) + os.environ['CUPY_SEED'] = '7' os.environ['CHAINER_SEED'] = '8' - generator.get_random_state() - generator.RandomState.assert_called_with('7') + rvs1 = self._get_rvs_reset() + + self._check_same(rvs0, rvs1) + + def _get_rvs(self, rs): + rvu = rs.rand(4) + rvn = rs.randn(4) + return rvu, rvn + + def _get_rvs_reset(self): + generator.reset_states() + return self._get_rvs(generator.get_random_state()) + + def _check_same(self, rvs0, rvs1): + for rv0, rv1 in zip(rvs0, rvs1): + testing.assert_array_equal(rv0, rv1) + + def _check_different(self, rvs0, rvs1): + for rv0, rv1 in zip(rvs0, rvs1): + for r0, r1 in zip(rv0, rv1): + self.assertNotEqual(r0, r1) class TestCheckAndGetDtype(unittest.TestCase): From 
f87a4dd2058296eb4697010e9c2ce4cf67c0713d Mon Sep 17 00:00:00 2001
From: niboshi
Date: Sun, 15 Oct 2017 04:22:04 +0900
Subject: [PATCH 18/21] Merge pull request #574 from unnonouno/remove-randint-dtype

Remove dtype option which was introduced in NumPy 1.11
---
 cupy/testing/random.py | 2 +-
 tests/cupy_tests/random_tests/test_generator.py | 11 +----------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/cupy/testing/random.py b/cupy/testing/random.py
index 9212fada578..9570e68a224 100644
--- a/cupy/testing/random.py
+++ b/cupy/testing/random.py
@@ -88,7 +88,7 @@ def _teardown_random():

 def generate_seed():
     assert _nest_count > 0, 'random is not set up'
-    return numpy.random.randint(0xffffffff, dtype=numpy.int64)
+    return numpy.random.randint(0x7fffffff)


 def fix_random():
diff --git a/tests/cupy_tests/random_tests/test_generator.py b/tests/cupy_tests/random_tests/test_generator.py
index 7277644cfcd..adadccb3b84 100644
--- a/tests/cupy_tests/random_tests/test_generator.py
+++ b/tests/cupy_tests/random_tests/test_generator.py
@@ -31,7 +31,6 @@ def __exit__(self, *_):


 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestRandomState(unittest.TestCase):

@@ -180,7 +179,6 @@ class TestRandomState8(TestRandomState):


 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestRandAndRandN(unittest.TestCase):

@@ -209,7 +207,6 @@ def test_randn_invalid_argument(self):


 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestInterval(unittest.TestCase):

@@ -260,7 +257,7 @@ def test_bound_1(self):

     @condition.repeat(3, 10)
     def test_bound_2(self):
-        vals = [self.rs.interval(2, None).get() for _ in range(10)]
+        vals = [self.rs.interval(2, None).get() for _ in range(20)]
         self.assertEqual(min(vals), 0)
         self.assertEqual(max(vals), 2)

@@ -292,7 +289,6 @@ def test_goodness_of_fit_2(self):
     {'a': numpy.array([0.0, 1.0, 2.0]), 'size': 2, 'p': [0.3, 0.3, 0.4]},
 )
 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestChoice1(unittest.TestCase):

@@ -328,7 +324,6 @@ def test_bound(self):
     {'a': [0, 1, 2], 'size': 2, 'p': [0.3, 0.3, 0.4]},
 )
 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestChoice2(unittest.TestCase):

@@ -361,7 +356,6 @@ def test_bound(self):


 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestChoiceChi(unittest.TestCase):

@@ -413,7 +407,6 @@ def test_choice_multinomial(self, xp, dtype):
     {'a': 3, 'size': 1, 'p': [0.1, 0.1, 0.7]},
 )
 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestChoiceFailure(unittest.TestCase):

@@ -432,7 +425,6 @@ def test_choice_invalid_value(self):
     {'a': numpy.array([0.0, 2.0, 4.0]), 'size': 2},
 )
 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestChoiceReplaceFalse(unittest.TestCase):

@@ -476,7 +468,6 @@ def test_reproduce(self):
     {'a': [1, 2, 3], 'size': 5},
 )
 @testing.fix_random()
-@testing.with_requires('numpy>=1.11.0')
 @testing.gpu
 class TestChoiceReplaceFalseFailure(unittest.TestCase):

From aa39eb602a5d7055628b75a56dec1970a92abf5c Mon Sep 17 00:00:00 2001
From: Yuya Unno
Date: Fri, 24 Nov 2017 12:14:49 +0900
Subject: [PATCH 19/21] Merge pull request #741 from niboshi/fix-random-choice-repro

Fix: RandomState.choice reproducibility
---
 cupy/random/distributions.py | 10 +-
 cupy/random/generator.py | 63 ++-
 cupy/random/sample.py | 26 +-
 .../cupy_tests/random_tests/test_generator.py | 471 ++++++++++--------
 4 files changed, 341 insertions(+), 229 deletions(-)

diff --git a/cupy/random/distributions.py b/cupy/random/distributions.py
index 844f92c32bc..86bab262b60 100644
--- a/cupy/random/distributions.py
+++ b/cupy/random/distributions.py
@@ -39,12 +39,12 @@ def gumbel(loc=0.0, scale=1.0, size=None, dtype=float):
     Returns:
         cupy.ndarray: Samples drawn from the Gumbel distribution.

-    .. seealso:: :func:`numpy.random.gumbel`
+    .. seealso::
+        :func:`cupy.random.RandomState.gumbel`
+        :func:`numpy.random.gumbel`

     """
-    rs = uniform(size=size, dtype=dtype)
-    # We use `1 - x` as input of `log` method to prevent overflow.
-    # It obeys numpy implementation.
-    return _gumbel_kernel(rs, loc, scale, rs)
+    rs = generator.get_random_state()
+    return rs.gumbel(loc, scale, size, dtype)


 def lognormal(mean=0.0, sigma=1.0, size=None, dtype=float):
diff --git a/cupy/random/generator.py b/cupy/random/generator.py
index d6a3cf95784..305fba476f1 100644
--- a/cupy/random/generator.py
+++ b/cupy/random/generator.py
@@ -14,6 +14,20 @@
 from cupy.cuda import curand


+_gumbel_kernel = None
+
+
+def _get_gumbel_kernel():
+    global _gumbel_kernel
+    if _gumbel_kernel is None:
+        _gumbel_kernel = core.ElementwiseKernel(
+            'T x, T loc, T scale', 'T y',
+            'y = loc - log(-log(1 - x)) * scale',
+            'gumbel_kernel'
+        )
+    return _gumbel_kernel
+
+
 class RandomState(object):
     """Portable container of a pseudo-random number generator.

@@ -327,12 +341,12 @@ def choice(self, a, size=None, replace=True, p=None):
         if p is not None:
             p = cupy.broadcast_to(p, (size, a_size))
             index = cupy.argmax(cupy.log(p) +
-                                cupy.random.gumbel(size=(size, a_size)),
+                                self.gumbel(size=(size, a_size)),
                                 axis=1)
             if not isinstance(shape, six.integer_types):
                 index = cupy.reshape(index, shape)
         else:
-            index = cupy.random.randint(0, a_size, size=shape)
+            index = self.randint(0, a_size, size=shape)
         # Align the dtype with NumPy
         index = index.astype(cupy.int64, copy=False)

@@ -362,6 +376,51 @@ def shuffle(self, a):
         curand.generate(self._generator, sample.data.ptr, sample.size)
         a[:] = a[cupy.argsort(sample)]

+    def gumbel(self, loc=0.0, scale=1.0, size=None, dtype=float):
+        """Returns an array of samples drawn from a Gumbel distribution.
+
+        .. seealso::
+            :func:`cupy.random.gumbel` for full documentation,
+            :meth:`numpy.random.RandomState.gumbel`
+        """
+        x = self.uniform(size=size, dtype=dtype)
+        # We use `1 - x` as input of `log` method to prevent overflow.
+        # It obeys numpy implementation.
+        _get_gumbel_kernel()(x, loc, scale, x)
+        return x
+
+    def randint(self, low, high=None, size=None, dtype='l'):
+        """Returns a scalar or an array of integer values over ``[low, high)``.
+
+        .. seealso::
+            :func:`cupy.random.randint` for full documentation,
+            :meth:`numpy.random.RandomState.randint`
+        """
+        if high is None:
+            lo = 0
+            hi = low
+        else:
+            lo = low
+            hi = high
+
+        if lo >= hi:
+            raise ValueError('low >= high')
+        if lo < cupy.iinfo(dtype).min:
+            raise ValueError(
+                'low is out of bounds for {}'.format(cupy.dtype(dtype).name))
+        if hi > cupy.iinfo(dtype).max + 1:
+            raise ValueError(
+                'high is out of bounds for {}'.format(cupy.dtype(dtype).name))
+
+        diff = hi - lo - 1
+        if diff > cupy.iinfo(cupy.int32).max - cupy.iinfo(cupy.int32).min + 1:
+            raise NotImplementedError(
+                'Sampling from a range whose extent is larger than int32 '
+                'range is currently not supported')
+        x = self.interval(diff, size).astype(dtype, copy=False)
+        cupy.add(x, lo, out=x)
+        return x
+

 def seed(seed=None):
     """Resets the state of the random number generator with a seed.
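The net effect of the generator.py changes above is that RandomState.choice now consumes only its own cuRAND generator (via self.gumbel and self.randint) rather than the module-level state, so two identically seeded RandomState instances yield identical choices. A minimal sketch of that guarantee, mirroring the test_reproduce test this series reworks (the concrete arguments are illustrative only; assumes a CUDA device is available):

    import cupy

    rs1 = cupy.random.RandomState(1)
    rs2 = cupy.random.RandomState(1)
    # Each instance draws from its own generator state, so the same
    # seed produces the same sample sequence.
    v1 = rs1.choice(a=5, size=3, replace=False)
    v2 = rs2.choice(a=5, size=3, replace=False)
    assert (v1 == v2).all()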
diff --git a/cupy/random/sample.py b/cupy/random/sample.py
index d768d5cc3f5..8d83622bb57 100644
--- a/cupy/random/sample.py
+++ b/cupy/random/sample.py
@@ -1,6 +1,5 @@
 import six

-import cupy
 from cupy import core
 from cupy.creation import basic
 from cupy.random import distributions
@@ -81,31 +80,8 @@ def randint(low, high=None, size=None, dtype='l'):
         If size is integer, it is the 1D-array of length ``size`` element.
         Otherwise, it is the array whose shape specified by ``size``.
     """
-    if high is None:
-        lo = 0
-        hi = low
-    else:
-        lo = low
-        hi = high
-
-    if lo >= hi:
-        raise ValueError('low >= high')
-    if lo < cupy.iinfo(dtype).min:
-        raise ValueError(
-            'low is out of bounds for {}'.format(cupy.dtype(dtype).name))
-    if hi > cupy.iinfo(dtype).max + 1:
-        raise ValueError(
-            'high is out of bounds for {}'.format(cupy.dtype(dtype).name))
-
-    diff = hi - lo - 1
-    if diff > cupy.iinfo(cupy.int32).max - cupy.iinfo(cupy.int32).min + 1:
-        raise NotImplementedError(
-            'Sampling from a range whose extent is larger than int32 range is '
-            'currently not supported')
     rs = generator.get_random_state()
-    x = rs.interval(diff, size).astype(dtype, copy=False)
-    cupy.add(x, lo, out=x)
-    return x
+    return rs.randint(low, high, size, dtype)


 def random_integers(low, high=None, size=None):
diff --git a/tests/cupy_tests/random_tests/test_generator.py b/tests/cupy_tests/random_tests/test_generator.py
index adadccb3b84..c0644d89001 100644
--- a/tests/cupy_tests/random_tests/test_generator.py
+++ b/tests/cupy_tests/random_tests/test_generator.py
@@ -1,5 +1,4 @@
 import mock
-import operator
 import os
 import threading
 import unittest
@@ -10,172 +9,200 @@
 import cupy
 from cupy import core
 from cupy import cuda
-from cupy.cuda import curand
 from cupy.random import generator
 from cupy import testing
 from cupy.testing import condition
 from cupy.testing import hypothesis


-class FunctionSwitcher(object):
+class RandomGeneratorTestCase(unittest.TestCase):

-    def __init__(self, f):
-        self.tmp = f
-        self.func_name = f.__name__
+    target_method = None

-    def __enter__(self):
-        setattr(curand, self.func_name, mock.Mock())
+    def setUp(self):
+        self.rs = generator.RandomState(seed=testing.generate_seed())
+
+    def _get_generator_func(self, *args, **kwargs):
+        assert isinstance(self.target_method, str), (
+            'target_method must be overridden')
+        f = getattr(self.rs, self.target_method)
+        return lambda: f(*args, **kwargs)

-    def __exit__(self, *_):
-        setattr(curand, self.func_name, self.tmp)
+    def _generate_check_repro(self, func, seed=0):
+        # Sample a random array while checking reproducibility
+        self.rs.seed(seed)
+        x = func()
+        self.rs.seed(seed)
+        y = func()
+        testing.assert_array_equal(
+            x, y,
+            'Randomly generated arrays with the same seed did not match')
+        return x
+
+    def generate(self, *args, **kwargs):
+        # Pick one sample from generator.
+        # Reproducibility is checked by repeating seed-and-sample cycle twice.
+        func = self._get_generator_func(*args, **kwargs)
+        return self._generate_check_repro(func, seed=0)
+
+    def generate_many(self, *args, **kwargs):
+        # Pick many samples from generator.
+        # Reproducibility is checked only for the first sample,
+        # because it's very slow to set seed every time.
+ _count = kwargs.pop('_count', None) + assert _count is not None, '_count is required' + func = self._get_generator_func(*args, **kwargs) + + if _count == 0: + return [] + + vals = [self._generate_check_repro(func, seed=0)] + for i in range(1, _count): + vals.append(func()) + return vals @testing.fix_random() @testing.gpu class TestRandomState(unittest.TestCase): - _multiprocess_can_split_ = True - args = (0.0, 1.0) - size = None - def setUp(self): self.rs = generator.RandomState(seed=testing.generate_seed()) - def check_lognormal(self, curand_func, dtype): - shape = core.get_size(self.size) - exp_size = six.moves.reduce(operator.mul, shape, 1) - if exp_size % 2 == 1: - exp_size += 1 - - curand_func.return_value = cupy.zeros(exp_size, dtype=dtype) - out = self.rs.lognormal(self.args[0], self.args[1], self.size, dtype) - gen, _, size, mean, sigma = curand_func.call_args[0] - self.assertIs(gen, self.rs._generator) - self.assertEqual(size, exp_size) - self.assertIs(mean, self.args[0]) - self.assertIs(sigma, self.args[1]) - self.assertEqual(out.shape, shape) + def check_seed(self, seed): + rs = self.rs - def test_lognormal_float(self): - with FunctionSwitcher(curand.generateLogNormalDouble): - self.check_lognormal(curand.generateLogNormalDouble, float) + rs.seed(seed) + xs1 = [rs.uniform() for _ in range(100)] - def test_lognormal_float32(self): - with FunctionSwitcher(curand.generateLogNormal): - self.check_lognormal(curand.generateLogNormal, numpy.float32) + rs.seed(seed) + xs2 = [rs.uniform() for _ in range(100)] - def test_lognormal_float64(self): - with FunctionSwitcher(curand.generateLogNormalDouble): - self.check_lognormal(curand.generateLogNormalDouble, numpy.float64) + rs.seed(seed) + rs.seed(None) + xs3 = [rs.uniform() for _ in range(100)] - def check_normal(self, curand_func, dtype): - shape = core.get_size(self.size) - exp_size = six.moves.reduce(operator.mul, shape, 1) - if exp_size % 2 == 1: - exp_size += 1 - - curand_func.return_value = cupy.zeros(exp_size, dtype=dtype) - out = self.rs.normal(self.args[0], self.args[1], self.size, dtype) - gen, _, size, loc, scale = curand_func.call_args[0] - self.assertIs(gen, self.rs._generator) - self.assertEqual(size, exp_size) - self.assertIs(loc, self.args[0]) - self.assertIs(scale, self.args[1]) - self.assertEqual(out.shape, shape) - - def test_normal_float32(self): - with FunctionSwitcher(curand.generateNormal): - self.check_normal(curand.generateNormal, numpy.float32) - - def test_normal_float64(self): - with FunctionSwitcher(curand.generateNormalDouble): - self.check_normal(curand.generateNormalDouble, numpy.float64) - - def check_random_sample(self, curand_func, dtype): - out = self.rs.random_sample(self.size, dtype) - curand_func.assert_called_once_with( - self.rs._generator, out.data.ptr, out.size) - - def test_random_sample_float32(self): - with FunctionSwitcher(curand.generateUniform): - self.check_random_sample(curand.generateUniform, numpy.float32) - - def test_random_sample_float64(self): - with FunctionSwitcher(curand.generateUniformDouble): - self.check_random_sample( - curand.generateUniformDouble, numpy.float64) - - def check_seed(self, curand_func, seed): - self.rs.seed(seed) - call_args_list = curand_func.call_args_list - self.assertEqual(1, len(call_args_list)) - call_args = call_args_list[0][0] - self.assertEqual(2, len(call_args)) - self.assertIs(self.rs._generator, call_args[0]) - self.assertEqual(numpy.uint64, call_args[1].dtype) - - def test_seed_none(self): - with 
FunctionSwitcher(curand.setPseudoRandomGeneratorSeed): - self.check_seed(curand.setPseudoRandomGeneratorSeed, None) + # Random state must be reproducible + assert xs1 == xs2 + # Random state must be initialized randomly with seed=None + assert xs1 != xs3 @testing.for_int_dtypes() def test_seed_not_none(self, dtype): - with FunctionSwitcher(curand.setPseudoRandomGeneratorSeed): - self.check_seed(curand.setPseudoRandomGeneratorSeed, dtype(0)) + self.check_seed(dtype(0)) @testing.for_dtypes([numpy.complex_]) def test_seed_invalid_type_complex(self, dtype): with self.assertRaises(TypeError): - with FunctionSwitcher(curand.setPseudoRandomGeneratorSeed): - self.check_seed(curand.setPseudoRandomGeneratorSeed, dtype(0)) + self.rs.seed(dtype(0)) @testing.for_float_dtypes() def test_seed_invalid_type_float(self, dtype): with self.assertRaises(TypeError): - with FunctionSwitcher(curand.setPseudoRandomGeneratorSeed): - self.check_seed(curand.setPseudoRandomGeneratorSeed, dtype(0)) + self.rs.seed(dtype(0)) @testing.gpu -class TestRandomState2(TestRandomState): +@testing.parameterize(*[ + {'args': (0.0, 1.0), 'size': None}, + {'args': (10.0, 20.0), 'size': None}, + {'args': (0.0, 1.0), 'size': 10}, + {'args': (0.0, 1.0), 'size': (1, 2, 3)}, + {'args': (0.0, 1.0), 'size': 3}, + {'args': (0.0, 1.0), 'size': (3, 3)}, + {'args': (0.0, 1.0), 'size': ()}, +]) +@testing.fix_random() +class TestLogNormal(RandomGeneratorTestCase): - args = (10.0, 20.0) - size = None + target_method = 'lognormal' + def check_lognormal(self, dtype): + vals = self.generate_many( + self.args[0], self.args[1], self.size, dtype, _count=10) -@testing.gpu -class TestRandomState3(TestRandomState): + shape = core.get_size(self.size) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == dtype + assert val.shape == shape + assert (0 <= val).all() + # TODO(niboshi): Distribution test + + def test_lognormal_float(self): + self.check_lognormal(float) + + def test_lognormal_float32(self): + self.check_lognormal(numpy.float32) - args = (0.0, 1.0) - size = 10 + def test_lognormal_float64(self): + self.check_lognormal(numpy.float64) @testing.gpu -class TestRandomState4(TestRandomState): +@testing.parameterize(*[ + {'args': (0.0, 1.0), 'size': None}, + {'args': (10.0, 20.0), 'size': None}, + {'args': (0.0, 1.0), 'size': 10}, + {'args': (0.0, 1.0), 'size': (1, 2, 3)}, + {'args': (0.0, 1.0), 'size': 3}, + {'args': (0.0, 1.0), 'size': (3, 3)}, + {'args': (0.0, 1.0), 'size': ()}, +]) +@testing.fix_random() +class TestNormal(RandomGeneratorTestCase): - args = (0.0, 1.0) - size = (1, 2, 3) + target_method = 'normal' + def check_normal(self, dtype): + vals = self.generate_many( + self.args[0], self.args[1], self.size, dtype, _count=10) -@testing.gpu -class TestRandomState6(TestRandomState): + shape = core.get_size(self.size) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == dtype + assert val.shape == shape + # TODO(niboshi): Distribution test - args = (0.0, 1.0) - size = 3 + def test_normal_float32(self): + self.check_normal(numpy.float32) + + def test_normal_float64(self): + self.check_normal(numpy.float64) @testing.gpu -class TestRandomState7(TestRandomState): +@testing.parameterize(*[ + {'size': None}, + {'size': 10}, + {'size': (1, 2, 3)}, + {'size': 3}, + {'size': ()}, +]) +@testing.fix_random() +class TestRandomSample(unittest.TestCase): + + def setUp(self): + self.rs = generator.RandomState(seed=testing.generate_seed()) - args = (0.0, 1.0) - size = (3, 3) + def check_random_sample(self, 
dtype): + vals = [self.rs.random_sample(self.size, dtype) for _ in range(10)] + shape = core.get_size(self.size) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == dtype + assert val.shape == shape + assert (0 <= val).all() + assert (val < 1).all() + # TODO(niboshi): Distribution test -@testing.gpu -class TestRandomState8(TestRandomState): + def test_random_sample_float32(self): + self.check_random_sample(numpy.float32) - args = (0.0, 1.0) - size = () + def test_random_sample_float64(self): + self.check_random_sample(numpy.float64) @testing.fix_random() @@ -185,22 +212,10 @@ class TestRandAndRandN(unittest.TestCase): def setUp(self): self.rs = generator.RandomState(seed=testing.generate_seed()) - def test_rand(self): - self.rs.random_sample = mock.Mock() - self.rs.rand(1, 2, 3, dtype=numpy.float32) - self.rs.random_sample.assert_called_once_with( - size=(1, 2, 3), dtype=numpy.float32) - def test_rand_invalid_argument(self): with self.assertRaises(TypeError): self.rs.rand(1, 2, 3, unnecessary='unnecessary_argument') - def test_randn(self): - self.rs.normal = mock.Mock() - self.rs.randn(1, 2, 3, dtype=numpy.float32) - self.rs.normal.assert_called_once_with( - size=(1, 2, 3), dtype=numpy.float32) - def test_randn_invalid_argument(self): with self.assertRaises(TypeError): self.rs.randn(1, 2, 3, unnecessary='unnecessary_argument') @@ -208,65 +223,92 @@ def test_randn_invalid_argument(self): @testing.fix_random() @testing.gpu -class TestInterval(unittest.TestCase): +class TestInterval(RandomGeneratorTestCase): - def setUp(self): - self.rs = cupy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'interval' def test_zero(self): - numpy.testing.assert_array_equal( - self.rs.interval(0, (2, 3)).get(), numpy.zeros((2, 3))) + shape = (2, 3) + vals = self.generate_many(0, shape, _count=10) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == shape + assert (val == 0).all() def test_shape_zero(self): - v = self.rs.interval(10, None) - self.assertEqual(v.dtype, numpy.int32) - self.assertEqual(v.shape, ()) + mx = 10 + vals = self.generate_many(mx, None, _count=10) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == () + assert (0 <= val).all() + assert (val <= mx).all() + # TODO(niboshi): Distribution test def test_shape_one_dim(self): - v = self.rs.interval(10, 10) - self.assertEqual(v.dtype, numpy.int32) - self.assertEqual(v.shape, (10,)) + mx = 10 + size = 20 + vals = self.generate_many(mx, size, _count=10) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == (size,) + assert (0 <= val).all() + assert (val <= mx).all() + # TODO(niboshi): Distribution test def test_shape_multi_dim(self): - v = self.rs.interval(10, (1, 2)) - self.assertEqual(v.dtype, numpy.int32) - self.assertEqual(v.shape, (1, 2)) + mx = 10 + shape = (1, 2) + vals = self.generate_many(mx, shape, _count=10) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == shape + assert (0 <= val).all() + assert (val <= mx).all() + # TODO(niboshi): Distribution test def test_int32_range(self): - v = self.rs.interval(0x00000000, 2) - self.assertEqual(v.dtype, numpy.int32) + v = self.generate(0x00000000, 2) + assert v.dtype == numpy.int32 - v = self.rs.interval(0x7fffffff, 2) - self.assertEqual(v.dtype, numpy.int32) + v = self.generate(0x7fffffff, 2) 
+ assert v.dtype == numpy.int32 def test_uint32_range(self): - v = self.rs.interval(0x80000000, 2) - self.assertEqual(v.dtype, numpy.uint32) + v = self.generate(0x80000000, 2) + assert v.dtype == numpy.uint32 - v = self.rs.interval(0xffffffff, 2) - self.assertEqual(v.dtype, numpy.uint32) + v = self.generate(0xffffffff, 2) + assert v.dtype == numpy.uint32 - @condition.repeat(3, 10) def test_bound_1(self): - vals = [self.rs.interval(10, (2, 3)).get() for _ in range(10)] + vals = self.generate_many(10, (2, 3), _count=10) for val in vals: - self.assertEqual(val.shape, (2, 3)) - self.assertEqual(min(_.min() for _ in vals), 0) - self.assertEqual(max(_.max() for _ in vals), 10) + assert isinstance(val, cupy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == (2, 3) + assert (0 <= val).all() + assert (val <= 10).all() - @condition.repeat(3, 10) def test_bound_2(self): - vals = [self.rs.interval(2, None).get() for _ in range(20)] - self.assertEqual(min(vals), 0) - self.assertEqual(max(vals), 2) + vals = self.generate_many(2, None, _count=20) + for val in vals: + assert isinstance(val, cupy.ndarray) + assert val.dtype == numpy.int32 + assert val.shape == () + assert (0 <= val).all() + assert (val <= 2).all() @condition.repeat(3, 10) def test_goodness_of_fit(self): mx = 5 trial = 100 - vals = [self.rs.interval(mx, None).get() - for _ in six.moves.xrange(trial)] + vals = self.generate_many(mx, None, _count=trial) + vals = [val.get() for val in vals] counts = numpy.histogram(vals, bins=numpy.arange(mx + 2))[0] expected = numpy.array([float(trial) / (mx + 1)] * (mx + 1)) self.assertTrue(hypothesis.chi_square_test(counts, expected)) @@ -274,7 +316,7 @@ def test_goodness_of_fit(self): @condition.repeat(3) def test_goodness_of_fit_2(self): mx = 5 - vals = self.rs.interval(mx, (5, 5)).get() + vals = self.generate(mx, (5, 5)).get() counts = numpy.histogram(vals, bins=numpy.arange(mx + 2))[0] expected = numpy.array([float(vals.size) / (mx + 1)] * (mx + 1)) self.assertTrue(hypothesis.chi_square_test(counts, expected)) @@ -290,14 +332,12 @@ def test_goodness_of_fit_2(self): ) @testing.fix_random() @testing.gpu -class TestChoice1(unittest.TestCase): +class TestChoice1(RandomGeneratorTestCase): - def setUp(self): - self.rs = cupy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'choice' def test_dtype_shape(self): - v = self.rs.choice(a=self.a, size=self.size, p=self.p) + v = self.generate(a=self.a, size=self.size, p=self.p) if isinstance(self.size, six.integer_types): expected_shape = (self.size,) else: @@ -311,13 +351,14 @@ def test_dtype_shape(self): @condition.repeat(3, 10) def test_bound(self): - vals = [self.rs.choice(a=self.a, size=self.size, p=self.p).get() - for _ in range(20)] + vals = self.generate_many( + a=self.a, size=self.size, p=self.p, _count=20) + vals = [val.get() for val in vals] size_ = self.size if isinstance(self.size, tuple) else (self.size,) for val in vals: self.assertEqual(val.shape, size_) - self.assertEqual(min(_.min() for _ in vals), 0) - self.assertEqual(max(_.max() for _ in vals), 2) + self.assertEqual(min(val.min() for val in vals), 0) + self.assertEqual(max(val.max() for val in vals), 2) @testing.parameterize( @@ -325,14 +366,12 @@ def test_bound(self): ) @testing.fix_random() @testing.gpu -class TestChoice2(unittest.TestCase): +class TestChoice2(RandomGeneratorTestCase): - def setUp(self): - self.rs = cupy.random.get_random_state() - self.rs.seed(testing.generate_seed()) + target_method = 'choice' def test_dtype_shape(self): - v = 
self.rs.choice(a=self.a, size=self.size, p=self.p)
+        v = self.generate(a=self.a, size=self.size, p=self.p)
         if isinstance(self.size, six.integer_types):
             expected_shape = (self.size,)
         else:
@@ -346,35 +385,34 @@ def test_dtype_shape(self):

     @condition.repeat(3, 10)
     def test_bound(self):
-        vals = [self.rs.choice(a=self.a, size=self.size, p=self.p).get()
-                for _ in range(20)]
+        vals = self.generate_many(
+            a=self.a, size=self.size, p=self.p, _count=20)
+        vals = [val.get() for val in vals]
         size_ = self.size if isinstance(self.size, tuple) else (self.size,)
         for val in vals:
             self.assertEqual(val.shape, size_)
-        self.assertEqual(min(_.min() for _ in vals), 0)
-        self.assertEqual(max(_.max() for _ in vals), 2)
+        self.assertEqual(min(val.min() for val in vals), 0)
+        self.assertEqual(max(val.max() for val in vals), 2)


 @testing.fix_random()
 @testing.gpu
-class TestChoiceChi(unittest.TestCase):
+class TestChoiceChi(RandomGeneratorTestCase):

-    def setUp(self):
-        self.rs = cupy.random.get_random_state()
-        self.rs.seed(testing.generate_seed())
+    target_method = 'choice'

     @condition.repeat(3, 10)
     def test_goodness_of_fit(self):
         trial = 100
-        vals = [self.rs.choice(3, 1, True, [0.3, 0.3, 0.4]).get()
-                for _ in six.moves.xrange(trial)]
+        vals = self.generate_many(3, 1, True, [0.3, 0.3, 0.4], _count=trial)
+        vals = [val.get() for val in vals]
         counts = numpy.histogram(vals, bins=numpy.arange(4))[0]
         expected = numpy.array([30, 30, 40])
         self.assertTrue(hypothesis.chi_square_test(counts, expected))

     @condition.repeat(3, 10)
     def test_goodness_of_fit_2(self):
-        vals = self.rs.choice(3, (5, 20), True, [0.3, 0.3, 0.4]).get()
+        vals = self.generate(3, (5, 20), True, [0.3, 0.3, 0.4]).get()
         counts = numpy.histogram(vals, bins=numpy.arange(4))[0]
         expected = numpy.array([30, 30, 40])
         self.assertTrue(hypothesis.chi_square_test(counts, expected))
@@ -426,14 +464,12 @@ def test_choice_invalid_value(self):
 )
 @testing.fix_random()
 @testing.gpu
-class TestChoiceReplaceFalse(unittest.TestCase):
+class TestChoiceReplaceFalse(RandomGeneratorTestCase):

-    def setUp(self):
-        self.rs = cupy.random.get_random_state()
-        self.rs.seed(testing.generate_seed())
+    target_method = 'choice'

     def test_dtype_shape(self):
-        v = self.rs.choice(a=self.a, size=self.size, replace=False)
+        v = self.generate(a=self.a, size=self.size, replace=False)
         if isinstance(self.size, six.integer_types):
             expected_shape = (self.size,)
         else:
@@ -447,7 +483,7 @@ def test_dtype_shape(self):

     @condition.repeat(3, 10)
     def test_bound(self):
-        val = self.rs.choice(a=self.a, size=self.size, replace=False).get()
+        val = self.generate(a=self.a, size=self.size, replace=False).get()
         size = self.size if isinstance(self.size, tuple) else (self.size,)
         self.assertEqual(val.shape, size)
         self.assertTrue((0 <= val).all())
@@ -455,12 +491,53 @@ def test_bound(self):
         val = numpy.asarray(val)
         self.assertEqual(numpy.unique(val).size, val.size)

-    def test_reproduce(self):
-        rs1 = cupy.random.RandomState(1)
-        v1 = rs1.choice(a=self.a, size=self.size, replace=False)
-        rs2 = cupy.random.RandomState(1)
-        v2 = rs2.choice(a=self.a, size=self.size, replace=False)
-        self.assertTrue((v1 == v2).all())
+
+@testing.gpu
+@testing.fix_random()
+class TestGumbel(RandomGeneratorTestCase):
+    # TODO(niboshi):
+    # Test soundness of distribution.
+    # Currently only reproducibility is checked.
+
+    target_method = 'gumbel'
+
+    def test_gumbel_1(self):
+        self.generate()
+
+    def test_gumbel_2(self):
+        self.generate(0.0, 1.0, size=(3, 2))
+
+
+@testing.gpu
+@testing.fix_random()
+class TestRandint(RandomGeneratorTestCase):
+    # TODO(niboshi):
+    # Test soundness of distribution.
+    # Currently only reproducibility is checked.
+
+    target_method = 'randint'
+
+    def test_randint_1(self):
+        self.generate(3)
+
+    def test_randint_2(self):
+        self.generate(3, 4, size=(3, 2))
+
+
+@testing.gpu
+@testing.fix_random()
+class TestUniform(RandomGeneratorTestCase):
+    # TODO(niboshi):
+    # Test soundness of distribution.
+    # Currently only reproducibility is checked.
+
+    target_method = 'uniform'
+
+    def test_uniform_1(self):
+        self.generate()
+
+    def test_uniform_2(self):
+        self.generate(-4.2, 2.4, size=(3, 2))


 @testing.parameterize(
From 7cca7e67e3062dd4137d79fa5d9150335b70b923 Mon Sep 17 00:00:00 2001
From: Yuya Unno
Date: Thu, 7 Dec 2017 11:29:25 +0900
Subject: [PATCH 20/21] Remove unused import
---
 tests/cupy_tests/random_tests/test_generator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/cupy_tests/random_tests/test_generator.py b/tests/cupy_tests/random_tests/test_generator.py
index c0644d89001..482b3a3da5c 100644
--- a/tests/cupy_tests/random_tests/test_generator.py
+++ b/tests/cupy_tests/random_tests/test_generator.py
@@ -1,4 +1,3 @@
-import mock
 import os
 import threading
 import unittest
From d543ced352ec33195ff14e07ac86b120a8bd6d5b Mon Sep 17 00:00:00 2001
From: Seiya Tokui
Date: Tue, 12 Dec 2017 13:33:44 +0900
Subject: [PATCH 21/21] Update to v2.2.0
---
 cupy/_version.py | 2 +-
 docker/python2/Dockerfile | 2 +-
 docker/python3/Dockerfile | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cupy/_version.py b/cupy/_version.py
index a33997dd100..04188a16d9f 100644
--- a/cupy/_version.py
+++ b/cupy/_version.py
@@ -1 +1 @@
-__version__ = '2.1.0'
+__version__ = '2.2.0'
diff --git a/docker/python2/Dockerfile b/docker/python2/Dockerfile
index 675123c8279..d844f54ef11 100644
--- a/docker/python2/Dockerfile
+++ b/docker/python2/Dockerfile
@@ -6,4 +6,4 @@ RUN apt-get update -y && \
     python-pip && \
     rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

-RUN pip install cupy==2.1.0
+RUN pip install cupy==2.2.0
diff --git a/docker/python3/Dockerfile b/docker/python3/Dockerfile
index 474f58956a8..da0b905f193 100644
--- a/docker/python3/Dockerfile
+++ b/docker/python3/Dockerfile
@@ -6,4 +6,4 @@ RUN apt-get update -y && \
     python3-pip && \
     rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

-RUN pip3 install cupy==2.1.0
+RUN pip3 install cupy==2.2.0
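With the version bump in place, the series can be smoke-tested end to end. A minimal sketch (assumes cupy 2.2.0 is installed with a working CUDA setup; the shapes and seed are illustrative only):

    import cupy

    assert cupy.__version__ == '2.2.0'

    # Patch 15: matmul raises ValueError on misaligned shapes
    # instead of failing an internal assert.
    try:
        cupy.matmul(cupy.ones((3, 2), 'f'), cupy.ones((3, 4), 'f'))
    except ValueError as e:
        print(e)  # shapes (3,2) and (3,4) not aligned

    # Patches 17 and 19: seeding the module-level state makes
    # subsequent draws reproducible.
    cupy.random.seed(0)
    x = cupy.random.gumbel(size=3)
    cupy.random.seed(0)
    y = cupy.random.gumbel(size=3)
    assert (x == y).all()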