/*
    Copyright (C) 2016 University of the Basque Country, UPV/EHU.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/** \file
 * Library initialization functions.
 */

/********************************************
 * Includes                                 *
 ********************************************/
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#ifdef USE_CUBLAS
#include <cublas_v2.h>
#endif

#include "globals.h"
#include "cuPoisson.h"
#include "utils/utils.h"


/********************************************
 * Global variables                         *
 ********************************************/
// Initialization flag: starts at 0, is set to 1 at the end of a successful
// cup_init() and is the value reported by cup_is_init().
int glob_is_init = 0;
// Library-wide state. In this file cup_init() and load_configuration() fill
// in the process id, the configuration (message outputs, verbosity, MPI
// options), the list of selected devices and their properties and, when
// built with USE_CUBLAS, the per-device cuBLAS handles.
struct cup_global_info global_info;

/********************************************
 * Private function prototypes              *
 ********************************************/
// Enables peer-to-peer access between the selected devices, where possible.
static cup_error_t enable_P2P();
// Reads the CUP_* environment variables into global_info.config.
static cup_error_t load_configuration();
// Reads an integer from the environment, with a default.
static int get_int_env( const char* name, int def );
// Reads a size from the environment (converted with size2int()), with a
// textual default.
static int get_size_env( const char* name, const char* def );

/********************************************
 * Exported functions                       *
 ********************************************/
extern "C" {

/**	\ingroup public
 * Initializes the cuPoisson library.
 * It loads the configuration, sets up the message outputs, checks how many
 * CUDA capable devices are accessible, selects the devices to use and stores
 * their numbers and properties in the global library state.
 * \param[in] num_devices The number of devices the library must use.
 * If \p num_devices <= 0, all available valid devices will be used and the
 * \p devices parameter is ignored (it should be NULL).
 * \param[in] devices A list specifying the numbers of the devices that the
 * library must use; it must hold at least \p num_devices entries. If NULL,
 * the library will use the first \p num_devices valid devices.
 * \return CUP_SUCCESS on success. CUP_INITIALIZED if the library was already
 * initialized. CUP_NO_DEVICES if no device was found or a requested device
 * number does not exist. CUP_NO_VALID_DEVICES if not enough valid devices
 * were found or a requested device is not a valid one. Failures detected by
 * the THROW_CUP_ERROR and CUDA macros (defined elsewhere) are presumably
 * returned to the caller as well.
 */
cup_error_t cup_init( int num_devices, int* devices )
{
	int  i_dev,
	     i_valid,
	     total_devices,
	     num_valid_devices;
	int* valid_devices = NULL;
	struct  cudaDeviceProp* device_info = NULL;
	FILE* msg_output;
	char selected_devices[80];

	if( cup_is_init() )
		return CUP_INITIALIZED;

	global_info.pid = getpid();

	THROW_CUP_ERROR( load_configuration() );

	// Currently all messages, except errors, are written to the same file.
	// In the future a configuration file could be used to specify this info.
	msg_output = stderr;
	global_info.config.msg_output[ERROR] = stderr;
#if DEBUG != 0 || VERBOSE != 0
	char log_file_name[32];
	snprintf( log_file_name,
	          sizeof( log_file_name ),
	          "%s_%d.log",
	          LOG_FILE_NAME,
	          global_info.pid );
	msg_output = fopen( log_file_name, "a" );

	if( msg_output == NULL )
	{
		// Fall back to stderr and lower the verbosity so it is not flooded.
		msg_output = stderr;
		global_info.config.verbose_level = 1;	// Just error and debug info.
		print_msg( ERROR, "Log file %s could not be opened: %s",
				   log_file_name, strerror( errno ));
	}

#endif
	// Initialize all descriptors, otherwise the closing step in cup_finish()
	// crashes.
	global_info.config.msg_output[TRACE] = msg_output;
	global_info.config.msg_output[WARNING] = msg_output;
	global_info.config.msg_output[INFO] = msg_output;

	CUDA( cudaGetDeviceCount( &total_devices ) );
	if( total_devices == 0 )
	{
		print_msg( ERROR, "No devices found." );
		return CUP_NO_DEVICES;
	}
	else
		print_msg( INFO, "%d devices found.", total_devices );

	THROW_CUP_ERROR( cup_get_valid_devices( &valid_devices, &num_valid_devices ) );

	if( num_valid_devices == 0 )
	{
		print_msg( ERROR, "No valid devices found." );
		free( valid_devices );
		return CUP_NO_VALID_DEVICES;
	}
	else if( num_devices <= 0 )
	{
		// "Use everything": the caller's list has no known length in this
		// mode, so it is ignored instead of being read out of bounds.
		num_devices = num_valid_devices;
		devices = NULL;
	}
	else if( num_valid_devices < num_devices )
	{
		print_msg( ERROR,
		           "Not enough valid devices found. %d required and %d found.",
		           num_devices,
		           num_valid_devices );
		free( valid_devices );
		return CUP_NO_VALID_DEVICES;
	}
	if( devices == NULL )
		devices = valid_devices;

	MALLOC( device_info, num_devices, struct cudaDeviceProp );

	for( i_dev = 0; i_dev < num_devices; i_dev++ )
	{
		// Device numbers are 0-based: the valid range is [0, total_devices).
		if( devices[i_dev] < 0 || devices[i_dev] >= total_devices )
		{
			print_msg( ERROR, "Device %d not found.", devices[i_dev] );
			free( device_info );
			free( valid_devices );
			return CUP_NO_DEVICES;
		}
		// The requested device must be one of the valid ones.
		for( i_valid = 0; i_valid < num_valid_devices; i_valid++ )
			if( devices[i_dev] == valid_devices[i_valid] )
				break;
		if( i_valid == num_valid_devices )
		{
			print_msg( ERROR, "Device %d not valid.", devices[i_dev] );
			free( device_info );
			free( valid_devices );
			return CUP_NO_VALID_DEVICES;
		}
		CUDA( cudaGetDeviceProperties( device_info + i_dev, devices[i_dev] ) );
	}

	global_info.num_devices = num_devices;
	global_info.device_info = device_info;
	MALLOC( global_info.devices, num_devices, int );
	// Copy before releasing valid_devices: devices may alias it.
	memcpy( global_info.devices, devices, num_devices * sizeof( int ) );
	free( valid_devices );

	THROW_CUP_ERROR( enable_P2P() );

#ifdef USE_CUBLAS
	global_info.use_cublas = 1;
	MALLOC( global_info.cublas_handles, num_devices, cublasHandle_t );
	for( i_dev = 0; i_dev < num_devices; i_dev++ )
	{
		CUDA( cudaSetDevice( global_info.devices[i_dev] ) );
		if( cublasCreate( global_info.cublas_handles + i_dev )
			!= CUBLAS_STATUS_SUCCESS)
		{
			print_msg( WARNING,
					   "Could not initialize cuBLAS. Process will continue "
					   "without cuBLAS support." );
			// Roll back the handles created so far and disable cuBLAS.
			for( i_dev--; i_dev >= 0; i_dev-- )
				cublasDestroy( global_info.cublas_handles[i_dev] );
			free( global_info.cublas_handles );
			global_info.cublas_handles = NULL;
			global_info.use_cublas = 0;
			break;
		}
	}
	CUDA( cudaSetDevice( global_info.devices[0] ) );
#endif

	print_msg( INFO, "Library initialized." );

	// Build a comma separated list of the selected device numbers. snprintf
	// is always given the remaining room, so a long list is truncated rather
	// than overflowing the buffer.
	snprintf( selected_devices,
	          sizeof( selected_devices ),
	          "%d",
	          global_info.devices[0] );
	for( i_dev = 1; i_dev < global_info.num_devices; i_dev++ )
		snprintf( selected_devices + strlen( selected_devices ),
		          sizeof( selected_devices ) - strlen( selected_devices ),
		          ", %d",
		          global_info.devices[i_dev] );

	print_msg( INFO,
	           "%d of %d devices are valid (CC >= 1.3). %d will be used: %s",
	           num_valid_devices,
	           total_devices,
	           num_devices,
	           selected_devices );

	glob_is_init = 1;
	return CUP_SUCCESS;
}

/**	\ingroup public
 * Reports whether the library has been initialized.
 * \return The value of the global initialization flag: nonzero once
 * cup_init() has completed successfully, zero before that.
 */
int cup_is_init()
{
	const int initialized = glob_is_init;

	return initialized;
}

/**	\ingroup public
 * Finalizes the library.
 * Destroys the cuBLAS handles (when in use), resets every selected device,
 * releases the global device tables and closes the message streams that are
 * not standard streams. The library is marked as not initialized afterwards,
 * so no more operations can be done with it until cup_init() is called again.
 * \return CUP_SUCCESS if the library was initialized,
 * CUP_NOT_INITIALIZED otherwise.
 */
cup_error_t cup_finish()
{
	int   i,
	      j,
	      fd,
	      count,
	      fds[NUM_PRINT_MODES];
	FILE* streams[NUM_PRINT_MODES];
	FILE* closing;

	CHECK_INIT();

#ifdef USE_CUBLAS
	if( global_info.use_cublas )
	{
		for( i = 0; i < global_info.num_devices; i++ )
			cublasDestroy( global_info.cublas_handles[i] );
		free( global_info.cublas_handles );
		global_info.cublas_handles = NULL;
		global_info.use_cublas = 0;
	}
#endif

	for( i = 0; i < global_info.num_devices; i++ )
	{
		CUDA( cudaSetDevice( global_info.devices[i] ) );
		CUDA( cudaDeviceReset() );
	}

	free( global_info.devices );
	free( global_info.device_info );
	global_info.devices = NULL;
	global_info.device_info = NULL;
	global_info.num_devices = 0;

	for ( i = 0; i < NUM_PRINT_MODES; i++ )
	{
		fds[i] = -1;
		streams[i] = NULL;
	}

	// Several print modes may share one stream. Collect every distinct
	// descriptor once, together with the stream it belongs to, so that each
	// stream is closed exactly once and the right FILE is passed to fclose().
	count = 0;
	for ( i = 0; i < NUM_PRINT_MODES; i++ )
	{
		if( global_info.config.msg_output[i] == NULL )
			continue;

		fd = fileno( global_info.config.msg_output[i] );
		if( find_first( fd, fds, NUM_PRINT_MODES ) < 0 )
		{
			fds[count] = fd;
			streams[count] = global_info.config.msg_output[i];
			count++;
		}
	}

	// Close descriptor if it is not standard and it is open.
	for ( i = 0; i < count; i++ )
		if( fds[i] > 2 && (fcntl( fds[i], F_GETFL ) != -1 || errno != EBADF) )
		{
			closing = streams[i];
			// Redirect the modes using this stream to stderr first, so that
			// no closed stream stays reachable through the configuration.
			for ( j = 0; j < NUM_PRINT_MODES; j++ )
				if( global_info.config.msg_output[j] == closing )
					global_info.config.msg_output[j] = stderr;
			fclose( closing );
		}

	// Allow a later cup_init() and make a repeated cup_finish() harmless.
	glob_is_init = 0;

	return CUP_SUCCESS;
}

} // extern "C"

/********************************************
 * Private functions                        *
 ********************************************/
/**
 * Enables P2P access between all the device pairs, if possible.
 * Every unordered pair of selected devices is visited once and peer access
 * is enabled in each direction that cudaDeviceCanAccessPeer() reports as
 * possible. Devices whose major compute capability is below 2 are skipped.
 * The current device is changed as a side effect whenever peer access is
 * enabled. When CUDART_VERSION < 4000 the function does nothing.
 * \return CUP_SUCCESS, CUP_CUDA_ERROR (the latter presumably through the
 * CUDA macro, which is defined elsewhere).
 */
static cup_error_t enable_P2P()
{
#if CUDART_VERSION >= 4000 // P2P is possible from CUDA 4.0 on.
	int  nd = global_info.num_devices,
	   * devices = global_info.devices;

	int i_dev, j_dev,
	    can_access;

	for ( i_dev = 0; i_dev < nd - 1; i_dev++ )
	{
		if( global_info.device_info[i_dev].major >= 2)
		{
			for ( j_dev = i_dev + 1; j_dev < nd; j_dev++ )
			{
				if( global_info.device_info[j_dev].major >= 2)
				{
					// i_dev and j_dev index the selected-device list; the
					// CUDA runtime must always be given the device numbers
					// stored in that list, not the list positions.

					// Direction i -> j.
					CUDA( cudaDeviceCanAccessPeer( &can_access,
					                               devices[i_dev],
					                               devices[j_dev] ) );
					if( can_access )
					{
						CUDA( cudaSetDevice( devices[i_dev] ) );
						CUDA( cudaDeviceEnablePeerAccess( devices[j_dev],
						                                  0 ) );
					}

					// Direction j -> i.
					CUDA( cudaDeviceCanAccessPeer( &can_access,
					                               devices[j_dev],
					                               devices[i_dev] ) );
					if( can_access )
					{
						CUDA( cudaSetDevice( devices[j_dev] ) );
						CUDA( cudaDeviceEnablePeerAccess( devices[i_dev],
						                                  0 ) );
					}
				}
			}
		}
	}
#endif
	return CUP_SUCCESS;
}

/**
 * Loads configuration values for the library.
 * The values are read from environment variables and stored in
 * global_info.config; a variable that is not set takes its default.
 * \return CUP_SUCCESS.
 */
static cup_error_t load_configuration()
{
	int   ienv;

	// CUP_MPI_NONBLOCKING
	// If nonzero, then the calls to execute MPI solvers are nonblocking. This
	// requires the MPI implementation to provide MPI_THREAD_SERIALIZED
	// level of thread support, otherwise will default to 0.
	// Default value: 0.
	ienv = get_int_env( "CUP_MPI_NONBLOCKING", 0 );
	global_info.config.mpi_nonblocking = ienv != 0;

	// CUP_MPI_SEGMENT_SIZE
	// The minimum size of a segment used in the pipelined
	// communication between MPI processes. If zero, the
	// communication will not be pipelined. Negative values are
	// treated as zero.
	// Default value: 131072 (128 KiB)
	ienv = get_size_env( "CUP_MPI_SEGMENT_SIZE", "128K" );
	global_info.config.mpi_segment_size = MAX( ienv, 0 );

	// CUP_VERBOSE_LEVEL
	// Sets the verbosity of the log output of the library.
	// 0: Just error messages.
	// 1: Also trace messages.
	// 2: Also warning messages.
	// 3: Also information messages.
	// Default value: 2
	// The upper bound is 3 so that the documented level 3 can actually be
	// selected (assumes ENCLOSE( x, lo, hi ) limits x to [lo, hi]).
	ienv = get_int_env( "CUP_VERBOSE_LEVEL", 2 );
	global_info.config.verbose_level = ENCLOSE( ienv, 0, 3 );

	return CUP_SUCCESS;
}

/**
 * Get integer valued environment variable.
 * The value is parsed as a base 10 integer; leading white space and any
 * trailing characters after the number are ignored.
 * \param[in] name The name of the environment variable.
 * \param[in] def Default value for the environment variable.
 * \return The integer value of the \p name environment value, or \p def if
 * it is not set, does not start with a number, or does not fit in an int.
 */
static int get_int_env( const char* name, int def )
{
	const char* env;
	char*       end;
	long        value;

	env = getenv( name );
	if( env == NULL )
		return def;

	// strtol reports what atoi cannot: "no digits" (end == env) and
	// overflow (ERANGE); atoi has undefined behaviour on the latter.
	errno = 0;
	value = strtol( env, &end, 10 );

	if( end == env
	    || errno == ERANGE
	    || value < INT_MIN
	    || value > INT_MAX )
		return def;

	return (int) value;
}

/**
 * Get size valued environment variable.
 * The text of the variable, or the default text when the variable is not
 * set, is converted with size2int().
 * \param[in] name The name of the environment variable.
 * \param[in] def Default value for the environment variable, given as text
 * in the same format as the variable itself.
 * \return The result of size2int() applied to the \p name environment
 * value, or to \p def if it is not set.
 */
static int get_size_env( const char* name, const char* def )
{
	const char* text = getenv( name );

	// Fall back to the textual default when the variable is absent.
	if( text == NULL )
		text = def;

	return size2int( text );
}
