/*
    Copyright (C) 2016 University of the Basque Country, UPV/EHU.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/** \file
 * Basic linear algebra routines in GPU. They can use cuBLAS if specified at
 * compilation time.
 */

/********************************************
 * Includes                                 *
 ********************************************/
#ifdef USE_CUBLAS
#include <cublas_v2.h>
#endif

#include "core/globals.h"
#include "cuPoisson.h"
#include "utils/utils.h"

#include "alg.h"

/********************************************
 * Private function prototypes              *
 ********************************************/
#ifndef USE_CUBLAS
// Element-wise addition/subtraction kernels. They are only declared (and
// defined, see the matching #ifndef further down) when cuBLAS is not used.
__global__
static void dev_daddvv( const double* v1,
                        const double* v2,
                        double*       res,
                        int           n );
__global__
static void dev_dsubvv( const double* v1,
                        const double* v2,
                        double*       res,
                        int           n );
#endif
// Reduction kernels. They are declared unconditionally because dnorm2v and
// ddotp launch them whether or not USE_CUBLAS is defined.
__global__
static void dev_dnorm2v( const double* v,
                         double*       res,
                         int           n );
__global__
static void dev_ddotp( const double* v1,
                       const double* v2,
                       double* res,
                       int n );
#ifndef USE_CUBLAS
// Scaled element-wise kernels; again only needed without cuBLAS.
__global__
static void dev_dsaddvv( double        c,
                         const double* v1,
                         const double* v2,
                         double*       res,
                         int           n );
__global__
static void dev_dssubvv( double        c,
                         const double* v1,
                         const double* v2,
                         double*       res,
                         int           n );
__global__
static void dev_dscalv( double        c,
                        const double* v,
                        double*       res,
                        int           n );
#endif
// Device-side atomic addition for doubles, used by the reduction kernels to
// accumulate their per-block results.
__device__
static double dev_atomic_add( double* address, double val );

/********************************************
 * Public functions                         *
 ********************************************/
/**	\ingroup algebra
 * Adds two double vectors of size \p n into a third vector using the current
 * GPU device: res[i] = v1[i] + v2[i].
 *
 * \p res may be exactly the same buffer as \p v1 and/or \p v2 (in-place
 * operation). Buffers that only partially overlap are not supported.
 * The operation is enqueued on the device; this function does not wait for
 * its completion.
 * \param[in] v1, v2 The input vectors in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the resulting vector will be stored.
 * \param[in] n The number of elements in the vectors. Must be positive.
 * \param[in] i_dev The device index computing the operation. With cuBLAS it
 * selects the cuBLAS handle; otherwise it is only range-checked.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t daddvv( const double* v1,
                    const double* v2,
                    double*       res,
                    int           n,
                    int           i_dev )
{
	assert( v1 != NULL && v2 != NULL && res != NULL );
	assert( n > 0 );
	assert( i_dev >= 0 && i_dev < global_info.num_devices );

#ifdef USE_CUBLAS
	double alpha;
	const double* addend;
	cublasStatus_t status;
	cublasHandle_t handle = global_info.cublas_handles[i_dev];

	// res must end up holding one operand before the other one is added to
	// it. If it already aliases an operand, copying the other operand over it
	// would destroy an input, so the copy is skipped in that case.
	if( res == v1 )
		addend = v2;
	else
	{
		addend = v1;
		if( res != v2 )
		{
			CUDA( cudaMemcpy( res, v2, n * sizeof( double ),
			                  cudaMemcpyDeviceToDevice ) )
		}
	}

	if( addend == res )
	{
		// v1, v2 and res are all the same buffer: res = 2 * res. A scaling
		// avoids calling axpy with identical input and output vectors.
		alpha = 2.0;
		status = cublasDscal( handle, n, &alpha, res, 1 );
	}
	else
	{
		alpha = 1.0;
		status = cublasDaxpy( handle, n, &alpha, addend, 1, res, 1 );
	}
	if( status != CUBLAS_STATUS_SUCCESS )
		return CUP_CUDA_ERROR;
#else
	int num_blocks,
	    block_size;

	set_grid_dims( n, &num_blocks, &block_size );
	dev_daddvv<<< num_blocks, block_size >>>( v1, v2, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )
#endif

	return CUP_SUCCESS;
}

/**	\ingroup algebra
 * Subtracts two double vectors of size \p n into a third vector using the
 * current GPU device: res[i] = v1[i] - v2[i].
 *
 * \p res may be exactly the same buffer as \p v1 and/or \p v2 (in-place
 * operation). Buffers that only partially overlap are not supported.
 * The operation is enqueued on the device; this function does not wait for
 * its completion.
 * \param[in] v1, v2 The input vectors in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the resulting vector will be stored.
 * \param[in] n The number of elements in the vectors. Must be positive.
 * \param[in] i_dev The device index computing the operation. With cuBLAS it
 * selects the cuBLAS handle; otherwise it is only range-checked.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t dsubvv( const double* v1,
                    const double* v2,
                    double*       res,
                    int           n,
                    int           i_dev )
{
	assert( v1 != NULL && v2 != NULL && res != NULL );
	assert( n > 0 );
	assert( i_dev >= 0 && i_dev < global_info.num_devices );

#ifdef USE_CUBLAS
	double alpha;
	cublasStatus_t status;
	cublasHandle_t handle = global_info.cublas_handles[i_dev];

	if( res == v1 )
	{
		// res already holds v1, so v2 must not be copied over it.
		if( res == v2 )
		{
			// All three buffers are the same one: the difference is zero.
			alpha = 0.0;
			status = cublasDscal( handle, n, &alpha, res, 1 );
		}
		else
		{
			alpha = -1.0;
			status = cublasDaxpy( handle, n, &alpha, v2, 1, res, 1 );
		}
	}
	else
	{
		// res = -v2 + v1. The copy is skipped when res already is v2.
		if( res != v2 )
		{
			CUDA( cudaMemcpy( res, v2, n * sizeof( double ),
			                  cudaMemcpyDeviceToDevice ) )
		}
		alpha = -1.0;
		if( cublasDscal( handle, n, &alpha, res, 1 )
		    != CUBLAS_STATUS_SUCCESS )
			return CUP_CUDA_ERROR;
		alpha = 1.0;
		status = cublasDaxpy( handle, n, &alpha, v1, 1, res, 1 );
	}
	if( status != CUBLAS_STATUS_SUCCESS )
		return CUP_CUDA_ERROR;
#else
	int num_blocks,
	    block_size;

	set_grid_dims( n, &num_blocks, &block_size );
	dev_dsubvv<<< num_blocks, block_size >>>( v1, v2, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )
#endif

	return CUP_SUCCESS;
}

/**	\ingroup algebra
 * Computes the square of the euclidean norm of a double vector of size \p n
 * using the current GPU device, i.e. the sum of v[i] * v[i].
 *
 * The work is enqueued on the default stream and this function does not wait
 * for it: the value at \p res is only valid once the device has finished.
 * \param[in] v The input vector in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the result (a single double) will be stored.
 * \param[in] n The number of elements in the vector. Must be positive.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t dnorm2v( const double* v, double* res, int n )
{
	int num_blocks,
	    block_size;

	assert( v != NULL && res != NULL );
	assert( n > 0 );

	set_grid_dims_reduction( n, &num_blocks, &block_size );

	// Every block of the kernel adds its partial sum to *res atomically, so
	// the accumulator has to start from zero.
	CUDA( cudaMemset( res, 0, sizeof( double ) ) )
	// One double of dynamic shared memory per thread of the block.
	dev_dnorm2v<<< num_blocks, block_size, block_size * sizeof( double ) >>>
	           ( v, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )

	return CUP_SUCCESS;
}

/**	\ingroup algebra
 * Computes the dot product of two double vectors of size \p n
 * using the current GPU device, i.e. the sum of v1[i] * v2[i].
 *
 * The work is enqueued on the default stream and this function does not wait
 * for it: the value at \p res is only valid once the device has finished.
 * \param[in] v1, v2 The input vectors in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the result (a single double) will be stored.
 * \param[in] n The number of elements in the vectors. Must be positive.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t ddotp( const double* v1, const double* v2, double* res, int n)
{
	int num_blocks,
	    block_size;

	assert( v1 != NULL && v2 != NULL && res != NULL );
	assert( n > 0 );

	set_grid_dims_reduction( n, &num_blocks, &block_size );

	// Every block of the kernel adds its partial sum to *res atomically, so
	// the accumulator has to start from zero.
	CUDA( cudaMemset( res, 0, sizeof( double ) ) )
	// One double of dynamic shared memory per thread of the block.
	dev_ddotp<<< num_blocks, block_size, block_size * sizeof( double ) >>>
	         ( v1, v2, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )

	return CUP_SUCCESS;
}

/**	\ingroup algebra
 * Adds two double vectors of size \p n into a third vector using the current
 * GPU device. The \p v2 vector is first multiplied by \p c:
 * res[i] = v1[i] + c * v2[i].
 *
 * \p res may be exactly the same buffer as \p v1 and/or \p v2 (in-place
 * operation). Buffers that only partially overlap are not supported.
 * The operation is enqueued on the device; this function does not wait for
 * its completion.
 * \param[in] c A factor that is multiplied to \p v2 before the sum.
 * \param[in] v1, v2 The input vectors in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the resulting vector will be stored.
 * \param[in] n The number of elements in the vectors. Must be positive.
 * \param[in] i_dev The device index computing the operation. With cuBLAS it
 * selects the cuBLAS handle; otherwise it is only range-checked.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t dsaddvv( double        c,
                     const double* v1,
                     const double* v2,
                     double*       res,
                     int           n,
                     int           i_dev )
{
	assert( v1 != NULL && v2 != NULL && res != NULL );
	assert( n > 0 );
	assert( i_dev >= 0 && i_dev < global_info.num_devices );

#ifdef USE_CUBLAS
	double alpha;
	cublasStatus_t status;
	cublasHandle_t handle = global_info.cublas_handles[i_dev];

	if( res == v2 )
	{
		// res already holds v2, so v1 must not be copied over it.
		if( res == v1 )
		{
			// All three buffers are the same one: res = (1 + c) * res.
			alpha = 1.0 + c;
			status = cublasDscal( handle, n, &alpha, res, 1 );
		}
		else
		{
			// Scale v2 in place, then add v1 to it.
			alpha = c;
			if( cublasDscal( handle, n, &alpha, res, 1 )
			    != CUBLAS_STATUS_SUCCESS )
				return CUP_CUDA_ERROR;
			alpha = 1.0;
			status = cublasDaxpy( handle, n, &alpha, v1, 1, res, 1 );
		}
	}
	else
	{
		// res = v1, then res += c * v2. The copy is skipped when res already
		// is v1.
		if( res != v1 )
		{
			CUDA( cudaMemcpy( res, v1, n * sizeof( double ),
			                  cudaMemcpyDeviceToDevice ) )
		}
		alpha = c;
		status = cublasDaxpy( handle, n, &alpha, v2, 1, res, 1 );
	}
	if( status != CUBLAS_STATUS_SUCCESS )
		return CUP_CUDA_ERROR;
#else
	int num_blocks,
	    block_size;

	set_grid_dims( n, &num_blocks, &block_size );
	dev_dsaddvv<<< num_blocks, block_size >>>( c, v1, v2, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )
#endif

	return CUP_SUCCESS;
}

/**	\ingroup algebra
 * Subtracts two double vectors of size \p n into a third vector using the
 * current GPU device. The \p v2 vector is first multiplied by \p c:
 * res[i] = v1[i] - c * v2[i].
 *
 * \p res may be exactly the same buffer as \p v1 and/or \p v2 (in-place
 * operation). Buffers that only partially overlap are not supported.
 * The operation is enqueued on the device; this function does not wait for
 * its completion.
 * \param[in] c A factor that is multiplied to \p v2 before the subtraction.
 * \param[in] v1, v2 The input vectors in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the resulting vector will be stored.
 * \param[in] n The number of elements in the vectors. Must be positive.
 * \param[in] i_dev The device index computing the operation. With cuBLAS it
 * selects the cuBLAS handle; otherwise it is only range-checked.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t dssubvv( double        c,
                     const double* v1,
                     const double* v2,
                     double*       res,
                     int           n,
                     int           i_dev )
{
	assert( v1 != NULL && v2 != NULL && res != NULL );
	assert( n > 0 );
	assert( i_dev >= 0 && i_dev < global_info.num_devices );

#ifdef USE_CUBLAS
	double alpha;
	cublasStatus_t status;
	cublasHandle_t handle = global_info.cublas_handles[i_dev];

	if( res == v1 )
	{
		// res already holds v1, so v2 must not be copied over it.
		if( res == v2 )
		{
			// All three buffers are the same one: res = (1 - c) * res.
			alpha = 1.0 - c;
			status = cublasDscal( handle, n, &alpha, res, 1 );
		}
		else
		{
			alpha = -c;
			status = cublasDaxpy( handle, n, &alpha, v2, 1, res, 1 );
		}
	}
	else
	{
		// res = -c * v2, then res += v1. The copy is skipped when res already
		// is v2.
		if( res != v2 )
		{
			CUDA( cudaMemcpy( res, v2, n * sizeof( double ),
			                  cudaMemcpyDeviceToDevice ) )
		}
		alpha = -c;
		if( cublasDscal( handle, n, &alpha, res, 1 )
		    != CUBLAS_STATUS_SUCCESS )
			return CUP_CUDA_ERROR;
		alpha = 1.0;
		status = cublasDaxpy( handle, n, &alpha, v1, 1, res, 1 );
	}
	if( status != CUBLAS_STATUS_SUCCESS )
		return CUP_CUDA_ERROR;
#else
	int num_blocks,
	    block_size;

	set_grid_dims( n, &num_blocks, &block_size );
	dev_dssubvv<<< num_blocks, block_size >>>( c, v1, v2, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )
#endif
	return CUP_SUCCESS;
}

/**	\ingroup algebra
 * Multiplies a vector of size \p n by a constant, \p c, on the
 * current GPU device: res[i] = c * v[i].
 *
 * \p res may be exactly the same buffer as \p v (in-place scaling). Buffers
 * that only partially overlap are not supported. All the work is enqueued on
 * \p stream; this function does not wait for its completion.
 * \param[in] c The multiplication factor.
 * \param[in] v The input vector in device memory.
 * \param[out] res A pointer pointing to an allocated memory space in device
 * memory where the resulting vector will be stored.
 * \param[in] n The number of elements in the vectors. Must be positive.
 * \param[in] i_dev The device index computing the operation. With cuBLAS it
 * selects the cuBLAS handle; otherwise it is only range-checked.
 * \param[in] stream The CUDA stream the operation must run on.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR.
 */
cup_error_t dscalv( double        c,
                    const double* v,
                    double*       res,
                    int           n,
                    int           i_dev,
                    cudaStream_t  stream )
{
	assert( v != NULL && res != NULL );
	assert( n > 0 );
	assert( i_dev >= 0 && i_dev < global_info.num_devices );

#ifdef USE_CUBLAS
	double alpha;
	cublasHandle_t handle = global_info.cublas_handles[i_dev];

	// NOTE(review): the handle stays bound to `stream` after this call, so
	// later cuBLAS calls made through the same handle also run on it.
	if( cublasSetStream( handle, stream ) != CUBLAS_STATUS_SUCCESS )
		return CUP_CUDA_ERROR;
	// The copy is issued on the same stream as the scaling, so both are
	// ordered with respect to each other whatever kind of stream is given.
	// It is skipped for in-place scaling.
	if( res != v )
	{
		CUDA( cudaMemcpyAsync( res, v, n * sizeof( double ),
		                       cudaMemcpyDeviceToDevice, stream ) )
	}
	alpha = c;
	if( cublasDscal( handle, n, &alpha, res, 1 ) != CUBLAS_STATUS_SUCCESS )
		return CUP_CUDA_ERROR;
#else
	int num_blocks,
	    block_size;

	set_grid_dims( n, &num_blocks, &block_size );
	dev_dscalv<<< num_blocks, block_size, 0, stream >>>( c, v, res, n );
	// A kernel launch returns no status by itself; query it so that a failed
	// launch (e.g. invalid configuration) is not reported as CUP_SUCCESS.
	CUDA( cudaGetLastError() )
#endif
	return CUP_SUCCESS;
}

/********************************************
 * Private functions                        *
 ********************************************/
#ifndef USE_CUBLAS
/**	\ingroup algebra
 * Kernel called by daddvv: res[k] = v1[k] + v2[k] for every k below \p n.
 *
 * Uses the x dimension of the launch only. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers the whole vector.
 */
__global__ static
void dev_daddvv( const double* v1,
                 const double* v2,
                 double*       res,
                 int           n )
{
	const int step = gridDim.x * blockDim.x;
	int k = threadIdx.x + blockIdx.x*blockDim.x;

	while( k < n )
	{
		res[k] = v1[k] + v2[k];
		k += step;
	}
}

/**	\ingroup kernel algebra
 * Kernel called by dsubvv: res[k] = v1[k] - v2[k] for every k below \p n.
 *
 * Uses the x dimension of the launch only. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers the whole vector.
 */
__global__ static
void dev_dsubvv( const double* v1,
                 const double* v2,
                 double*       res,
                 int           n )
{
	const int step = gridDim.x * blockDim.x;
	int k = threadIdx.x + blockIdx.x*blockDim.x;

	while( k < n )
	{
		res[k] = v1[k] - v2[k];
		k += step;
	}
}
#endif

/**	\ingroup algebra
 * Kernel called by dnorm2v. Adds the sum of v[i] * v[i] to \p *res.
 *
 * Launch requirements: one-dimensional grid and blocks, and
 * blockDim.x * sizeof( double ) bytes of dynamic shared memory. Any block
 * size is accepted (it does not need to be a power of two). \p *res must be
 * zero before the launch, because every block adds its partial sum to it
 * with dev_atomic_add.
 * \param[in] v The input vector in device memory.
 * \param[in,out] res The accumulator in device memory.
 * \param[in] n The number of elements in the vector.
 */
__global__ static
void __launch_bounds__(1024,1) dev_dnorm2v( const double* v,
                                            double*       res,
                                            int           n )
{
	extern __shared__ double sh_tmp[];

	const int lid = threadIdx.x;
	// 64-bit indices: for n close to INT_MAX, "i += stride" would overflow
	// an int before the loop condition could stop the iteration.
	const long long stride = (long long) gridDim.x * blockDim.x;
	long long i;
	int active,
	    half;
	double acc = 0.0;

	// Per-thread partial sum, accumulated in a register.
	for( i = (long long) blockIdx.x * blockDim.x + lid; i < n; i += stride )
		acc += v[i] * v[i];
	sh_tmp[lid] = acc;
	__syncthreads();

	// Tree reduction in shared memory. Every step is followed by a block
	// barrier: threads of a warp are not guaranteed to run in lockstep, so
	// the last steps cannot rely on implicit warp synchronization. The upper
	// part [half, active) is folded onto the lower one; rounding half up
	// keeps the middle element when `active` is odd.
	for( active = blockDim.x; active > 1; active = half )
	{
		half = ( active + 1 ) >> 1;
		if( lid + half < active )
			sh_tmp[lid] += sh_tmp[lid + half];
		__syncthreads();
	}

	// A single thread publishes the result of the block.
	if( lid == 0 )
		dev_atomic_add( res, sh_tmp[0] );

	return;
}

/**	\ingroup algebra
 * Kernel called by ddotp. Adds the sum of v1[i] * v2[i] to \p *res.
 *
 * Launch requirements: one-dimensional grid and blocks, and
 * blockDim.x * sizeof( double ) bytes of dynamic shared memory. Any block
 * size is accepted (it does not need to be a power of two). \p *res must be
 * zero before the launch, because every block adds its partial sum to it
 * with dev_atomic_add.
 * \param[in] v1, v2 The input vectors in device memory.
 * \param[in,out] res The accumulator in device memory.
 * \param[in] n The number of elements in the vectors.
 */
__global__ static
void __launch_bounds__(1024,1) dev_ddotp( const double* v1,
                                          const double* v2,
                                          double*       res,
                                          int           n )
{
	extern __shared__ double sh_tmp[];

	const int lid = threadIdx.x;
	// 64-bit indices: for n close to INT_MAX, "i += stride" would overflow
	// an int before the loop condition could stop the iteration.
	const long long stride = (long long) gridDim.x * blockDim.x;
	long long i;
	int active,
	    half;
	double acc = 0.0;

	// Per-thread partial sum, accumulated in a register.
	for( i = (long long) blockIdx.x * blockDim.x + lid; i < n; i += stride )
		acc += v1[i] * v2[i];
	sh_tmp[lid] = acc;
	__syncthreads();

	// Tree reduction in shared memory. Every step is followed by a block
	// barrier: threads of a warp are not guaranteed to run in lockstep, so
	// the last steps cannot rely on implicit warp synchronization. The upper
	// part [half, active) is folded onto the lower one; rounding half up
	// keeps the middle element when `active` is odd.
	for( active = blockDim.x; active > 1; active = half )
	{
		half = ( active + 1 ) >> 1;
		if( lid + half < active )
			sh_tmp[lid] += sh_tmp[lid + half];
		__syncthreads();
	}

	// A single thread publishes the result of the block.
	if( lid == 0 )
		dev_atomic_add( res, sh_tmp[0] );

	return;
}

#ifndef USE_CUBLAS
/**	\ingroup algebra
 * Kernel called by dsaddvv: res[k] = v1[k] + c * v2[k] for every k below
 * \p n.
 *
 * Uses the x dimension of the launch only. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers the whole vector.
 */
__global__ static
void dev_dsaddvv( double        c,
                  const double* v1,
                  const double* v2,
                  double*       res,
                  int           n )
{
	const int step = gridDim.x * blockDim.x;
	int k = threadIdx.x + blockIdx.x*blockDim.x;

	while( k < n )
	{
		res[k] = v1[k] + c*v2[k];
		k += step;
	}
}

/**	\ingroup algebra
 * Kernel called by dssubvv: res[k] = v1[k] - c * v2[k] for every k below
 * \p n.
 *
 * Uses the x dimension of the launch only. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers the whole vector.
 */
__global__ static
void dev_dssubvv( double        c,
                  const double* v1,
                  const double* v2,
                  double*       res,
                  int           n )
{
	const int step = gridDim.x * blockDim.x;
	int k = threadIdx.x + blockIdx.x*blockDim.x;

	while( k < n )
	{
		res[k] = v1[k] - c*v2[k];
		k += step;
	}
}

/**	\ingroup algebra
 * Kernel called by dscalv: res[k] = c * v[k] for every k below \p n.
 *
 * Uses the x dimension of the launch only. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers the whole vector.
 */
__global__ static
void dev_dscalv( double        c,
                 const double* v,
                 double*       res,
                 int           n )
{
	const int step = gridDim.x * blockDim.x;
	int k = threadIdx.x + blockIdx.x*blockDim.x;

	while( k < n )
	{
		res[k] = c * v[k];
		k += step;
	}
}
#endif

/**	\ingroup algebra
 * Atomically adds \p val to the double stored at \p address.
 *
 * Devices of compute capability 6.0 and above provide a hardware atomicAdd
 * for doubles, which is used when the code is compiled for them. For older
 * architectures the addition is emulated with a compare-and-swap loop based
 * on atomicCAS, as proposed in the CUDA programming guide; this fallback is
 * much slower than a hardware atomic add.
 * \param[in,out] address A global memory address where the addition must be
 * performed.
 * \param[in] val The value to be added.
 * \return The value at \p address before the addition.
 */
__device__ static
double dev_atomic_add( double* address, double val )
{
#if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600
    return atomicAdd( address, val );
#else
    unsigned long long int* address_as_ull = (unsigned long long int*) address,
                            old = *address_as_ull,
                            assumed;

    do
    {
    	assumed = old;
    	// Store assumed + val only if nobody changed the value in between;
    	// atomicCAS returns what was actually found at the address.
    	old = atomicCAS( address_as_ull, assumed,
    	                 __double_as_longlong( val +
    	                                       __longlong_as_double( assumed )
    	                                     ) );
    }
    // The comparison is done on the bit patterns, not on the doubles, so the
    // loop also terminates when the stored value is a NaN (NaN != NaN).
    while( assumed != old );
    return __longlong_as_double( old );
#endif
}
