/*
    Copyright (C) 2016 University of the Basque Country, UPV/EHU.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/** \file
 * General utility functions. Some of them could be moved to some more specific
 * utility files.
 */

/********************************************
 * Includes                                 *
 ********************************************/
#include <limits.h>

#include "core/globals.h"
#include "cuPoisson.h"

#include "utils.h"

/********************************************
 * Private function prototypes              *
 ********************************************/
#ifndef NDEBUG
// Debug-only helpers. In this file they are referenced only from assert()
// expressions, so they are declared (and defined) only when NDEBUG is not set.

// Returns non-zero when the fields of a distribution are mutually consistent.
static int check_distribution( struct distribution* distribution );
// Returns the total number of elements covered by the blocks of a distribution.
static int distribution_size( struct distribution* distribution );
#endif

/********************************************
 * Public functions                         *
 ********************************************/
/** \ingroup utils
 * Computes the CUDA grid dimensions for a particular size of threads.
 * The block size is the maximum number of threads per block stored for
 * device 0 in \c global_info. The number of blocks is \p n divided by that
 * block size, rounded up, and capped at 1024.
 * \param[in] n The number of threads. Must be positive.
 * \param[out] num_blocks The number of blocks. Must not be NULL.
 * \param[out] block_size The block size. Must not be NULL.
 */
void set_grid_dims( int n, int* num_blocks, int* block_size )
{
	// Both output pointers are written below, so both have to be checked.
	assert( num_blocks != NULL && block_size != NULL );
	assert( n > 0 );

	*block_size = global_info.device_info[0].maxThreadsPerBlock;

	// Rounded up division: a partially filled block still needs to be launched.
	*num_blocks = n / (*block_size);
	if( n % (*block_size) != 0 )
		(*num_blocks)++;

	// Upper limit on the number of blocks.
	if( *num_blocks > 1024 )
		*num_blocks = 1024;
}

/** \ingroup utils
 * Computes the CUDA grid dimensions for a particular size of threads, when the
 * kernel to be launched involves a reduction and atomic operations in global
 * memory.
 * The block size is the maximum number of threads per block stored for
 * device 0 in \c global_info and the number of blocks is always 8; \p n is
 * only validated and does not influence the result.
 * \param[in] n The number of threads. Must be positive.
 * \param[out] num_blocks The number of blocks. Must not be NULL.
 * \param[out] block_size The block size. Must not be NULL.
 */
void set_grid_dims_reduction( int n, int* num_blocks, int* block_size )
{
	// Both output pointers are written below, so both have to be checked.
	assert( num_blocks != NULL && block_size != NULL );
	assert( n > 0 );

	*block_size = global_info.device_info[0].maxThreadsPerBlock;
	*num_blocks = 8;
}

/**
 * Split \p num_elements elements into at most \p num_blocks blocks.
 * Every block has the same size except, possibly, the last one: when
 * \p num_elements is not a multiple of the block size, the last block
 * receives the (fewer) remaining elements.
 * If there are not enough elements to fill the trailing blocks, those blocks
 * are dropped and the resulting number of blocks is lower than \p num_blocks.
 * \param[in] num_elements The number of elements to distribute.
 * \param[in] num_blocks The number of blocks.
 * \param[out] distribution The output distribution.
 */
void distribute_blocks( int num_elements,
                        int num_blocks,
                        struct distribution* distribution )
{
	int size,
	    full,
	    rest;

	assert( num_elements > 0 );
	assert( num_blocks > 0 );
	assert( distribution != NULL );

	size = div_up( num_elements, num_blocks );
	full = num_elements / size;
	rest = num_elements - full * size;  // Elements left for the last block.

	distribution->block_size      = size;
	distribution->num_full_blocks = full;
	distribution->rest_block_size = rest;
	// The partial block, if any, counts as one more block.
	distribution->num_blocks      = ( rest > 0 ) ? full + 1 : full;

	assert( distribution->num_blocks <= num_blocks );
	assert( distribution_size( distribution ) == num_elements );
}

/**
 * Split \p num_elements elements into exactly \p num_blocks blocks whose sizes
 * differ by at most one element.
 * The first \c num_full_blocks blocks have size \c block_size and the
 * remaining ones have size \c block_size - 1.
 * \param[in] num_elements The number of elements to distribute.
 * \param[in] num_blocks The number of blocks.
 * \param[out] distribution The output distribution.
 */
void distribute_blocks_fairly( int num_elements,
                               int num_blocks,
                               struct distribution* distribution )
{
	int remainder;

	assert( num_elements > 0 );
	assert( num_blocks > 0 );
	assert( distribution != NULL );

	remainder = num_elements % num_blocks;

	distribution->num_blocks      = num_blocks;
	distribution->block_size      = div_up( num_elements, num_blocks );
	// A zero remainder means an even split: every block is a full block.
	distribution->num_full_blocks = ( remainder != 0 ) ? remainder : num_blocks;
	distribution->rest_block_size = distribution->block_size - 1;

	assert( distribution_size( distribution ) == num_elements );
}

/**
 * Split \p num_elements elements into at most \p num_blocks blocks that all
 * have the same size.
 * A partially filled last block is counted as a full one, so the distribution
 * may cover more than \p num_elements elements.
 * If there are fewer elements than blocks, the trailing blocks are dropped
 * and the resulting number of blocks is lower than \p num_blocks.
 * \param[in] num_elements The number of elements to distribute.
 * \param[in] num_blocks The number of blocks.
 * \param[out] distribution The output distribution.
 */
void distribute_blocks_padded( int num_elements,
                               int num_blocks,
                               struct distribution* distribution )
{
	int size,
	    count;

	assert( num_elements > 0 );
	assert( num_blocks > 0 );
	assert( distribution != NULL );

	size  = div_up( num_elements, num_blocks );
	count = num_elements / size;
	if( num_elements % size != 0 )
		count++;  // The leftover elements get a whole block of their own.

	distribution->block_size      = size;
	distribution->num_full_blocks = count;
	distribution->num_blocks      = count;
	distribution->rest_block_size = 0;  // No smaller blocks in this layout.

	assert( distribution->num_blocks <= num_blocks );
	assert( distribution_size( distribution ) >= num_elements );
}

/**
 * Compute where a block starts inside a distribution.
 * Full blocks come first, followed by the blocks of size \c rest_block_size.
 * For an index past the last block, the offset just after the last block is
 * returned.
 * \param[in] distribution The distribution.
 * \param[in] index The index of the block whose offset is computed.
 * \return The offset of the block with the specified index in the distribution.
 * \see distribute_blocks
 */
unsigned int get_block_offset( const struct distribution* distribution,
                               int                        index )
{
	int full_blocks,
	    rest_index;

	assert( distribution != NULL && index >= 0 );

	full_blocks = distribution->num_full_blocks;

	// Inside the run of full blocks the offset is a plain multiple.
	if( index < full_blocks )
		return index * distribution->block_size;

	// Past the full blocks: clamp the index to the number of blocks so that
	// out-of-range indices map to the end of the distribution.
	rest_index = ( index < distribution->num_blocks ) ? index
	                                                  : distribution->num_blocks;

	return full_blocks * distribution->block_size +
	       (rest_index - full_blocks) * distribution->rest_block_size;
}

/**
 * Compute how many elements a block of a distribution holds.
 * \param[in] distribution The distribution.
 * \param[in] index The index of the block whose size is computed.
 * \return \c block_size for the full blocks, \c rest_block_size for the
 *         blocks after them, and 0 for an index past the last block.
 * \see distribute_blocks
 */
unsigned int get_block_size( const struct distribution* distribution,
                             int                        index )
{
	assert( distribution != NULL && index >= 0 );

	return ( index < distribution->num_full_blocks ) ? distribution->block_size
	     : ( index < distribution->num_blocks )      ? distribution->rest_block_size
	     :                                             0;
}

/**
 * Rounded up integer division.
 * Works for any combination of signs: the result is the ceiling of the exact
 * quotient, e.g. div_up( 7, 2 ) == 4 and div_up( -7, 2 ) == -3.
 * \param[in] dividend
 * \param[in] divisor Must not be zero.
 * \return The lowest integer greater than or equal to \p dividend /
 *         \p divisor (considering a real division).
 */
int div_up( int dividend, int divisor )
{
	int result,
	    remainder;

	assert( divisor != 0 );

	result    = dividend / divisor;
	remainder = dividend % divisor;

	// Integer division truncates towards zero. That is already the ceiling
	// when the exact quotient is negative, so only a positive inexact quotient
	// has to be bumped. The remainder carries the sign of the dividend, hence
	// the quotient is positive exactly when remainder and divisor agree in sign.
	if( remainder != 0 && ( (remainder < 0) == (divisor < 0) ) )
		result++;

	return result;
}

/**
 * Convert a string representing a data size into an integer.
 * If the string is numeric it is converted as is. If it is suffixed with any
 * of the allowed modifiers (B, K, M and G) the size is multiplied by
 * 2**0, 2**10, 2**20 or 2**30, respectively. Lowercase modifiers are also
 * allowed.
 * The conversion fails when the string contains no digits, when the number is
 * negative, when the suffix is unknown or longer than one character, or when
 * the value (before or after applying the suffix) does not fit in a long.
 * A parsed value equal to LONG_MAX is treated as an overflow.
 * \param[in] str_size A string representing a size. Must not be NULL.
 * \return The amount of bytes corresponding to the input string on success,
 * -1 otherwise.
 */
long size2int( const char* str_size )
{
	char* endptr;
	long size;
	int shift;

	assert( str_size != NULL );

	size = strtol( str_size, &endptr, 10 );
	if( endptr == str_size )  // No digits were consumed (e.g. "" or "K").
		return -1L;
	if( size < 0 || size == LONG_MAX )
		return -1L;

	if( *endptr == '\0' )  // No suffix.
		return size;
	if( endptr[1] != '\0' ) // Too long suffix.
		return -1L;

	switch( *endptr )
	{
	case 'b':
	case 'B':
		shift = 0;
		break;
	case 'k':
	case 'K':
		shift = 10;
		break;
	case 'm':
	case 'M':
		shift = 20;
		break;
	case 'g':
	case 'G':
		shift = 30;
		break;
	default:
		return -1L;
	}

	// Refuse values that would overflow a long once scaled, instead of
	// shifting bits out of a signed integer.
	if( size > ( LONG_MAX >> shift ) )
		return -1L;

	return size << shift;
}

/**
 * Look for the first occurrence of \p x in \p vector.
 * \param[in] x The element to be found.
 * \param[in] vector The vector where \p x must be found. Must not be NULL.
 * \param[in] length The vector length. Must be positive.
 * \return The lowest index \c i for which \c vector[i] == \p x, or -1 if
 * \p x does not appear in \p vector.
 */
int find_first( int x, const int* vector, int length )
{
	int pos = 0;

	assert( vector != NULL );
	assert( length > 0 );

	// Advance until a match is found or the vector is exhausted.
	while( pos < length && vector[pos] != x )
		pos++;

	return ( pos < length ) ? pos : -1;
}

/**
 * Dump data in a device buffer to a file. Helpful for debugging. The filename
 * will be "dump" suffixed by an id (useful for MPI executions).
 * The function is best-effort: if the host buffer cannot be allocated, the
 * device-to-host copy fails or the file cannot be opened, it returns without
 * reporting the error. The temporary host buffer is released on every path.
 * \param[in] data A pointer to the device buffer.
 * \param[in] num_bytes The number of bytes to dump.
 * \param[in] id An identifier that would be appended to the filename.
 */
void dump_device_buffer( const void* data, int num_bytes, int id )
{
	char* hdata;
	FILE* fp;
	char filename[80];

	hdata = (char*) malloc( num_bytes );
	if( hdata == NULL )
		return;

	// If the copy fails the host buffer holds garbage: do not dump it.
	if( cudaMemcpy( hdata, data, num_bytes, cudaMemcpyDeviceToHost ) != cudaSuccess )
	{
		free( hdata );
		return;
	}

	snprintf( filename, sizeof( filename ), "dump%d", id );
	fp = fopen( filename, "wb" );
	if( fp != NULL )
	{
		fwrite( hdata, 1, num_bytes, fp );
		fclose( fp );
	}

	// Released whether or not the file could be opened.
	free( hdata );
}

/********************************************
 * Private functions                        *
 ********************************************/
#ifndef NDEBUG
/**
 * Sanity check of a distribution, meant to be used inside assertions.
 * A distribution is accepted when its block size is positive, its rest block
 * size lies in [0, block size), and its block counts satisfy
 * 0 <= number of full blocks <= number of blocks.
 * \param[in] distribution The distribution to check. Must not be NULL.
 * \return 1 if the distribution is consistent, 0 otherwise.
 */
int check_distribution( struct distribution* distribution )
{
	int sizes_ok,
	    counts_ok;

	assert( distribution != NULL );

	sizes_ok  = distribution->block_size > 0 &&
	            distribution->rest_block_size >= 0 &&
	            distribution->rest_block_size < distribution->block_size;

	counts_ok = distribution->num_blocks >= 0 &&
	            distribution->num_full_blocks >= 0 &&
	            distribution->num_full_blocks <= distribution->num_blocks;

	return sizes_ok && counts_ok;
}

/**
 * Count the elements covered by a distribution: the full blocks contribute
 * \c block_size elements each and the remaining blocks contribute
 * \c rest_block_size elements each.
 * \param[in] distribution The distribution. Must not be NULL and must pass
 *            check_distribution().
 * \return The total number of elements covered by all the blocks.
 */
static int distribution_size( struct distribution* distribution )
{
	int full_part,
	    rest_part;

	assert( distribution != NULL );
	assert( check_distribution( distribution ) );

	full_part = distribution->num_full_blocks * distribution->block_size;
	rest_part = ( distribution->num_blocks - distribution->num_full_blocks ) *
	            distribution->rest_block_size;

	return full_part + rest_part;
}
#endif
