/*
    Copyright (C) 2016 University of the Basque Country, UPV/EHU.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef FFT_UTILS_MPI_H_
#define FFT_UTILS_MPI_H_
/********************************************
 * Includes                                 *
 ********************************************/
#include "fft_utils_ser.h"
#include "cuPoisson.h"

/********************************************
 * Macros                                   *
 ********************************************/
// Tile size constant.
// NOTE(review): presumably the edge length (in elements) of the tiles used by
// the data shuffle/transposition code; its users are not visible in this
// header, so confirm against the implementation before changing the value.
#define SHUFFLE_TILE_SIZE 16

/********************************************
 * Data Definitions                         *
 ********************************************/
// Cache of frequently needed element counts (points per segment, per block...)
// so that they do not have to be recomputed every time they are used.
struct mpi_fft_np
{
	// Number of points in a real segment.
	int real_seg;

	// Number of points in the last real segment.
	int real_last_seg;

	// Number of points in a complex segment, in the initial distribution.
	int c2d_seg;

	// Same as c2d_seg, but for the last segment.
	int c2d_last_seg;

	// Number of points in a complex segment, in the transposed distribution.
	// The value belongs to this MPI process and can differ from the one held
	// by other processes (i.e. the last process).
	int c1d_seg;

	// Same as c1d_seg, but for the last segment.
	int c1d_last_seg;

	// Like the c1d_* counts above, but for the complete block instead of a
	// single segment. The standard block is considered; the particular
	// (possibly different) last block is ignored.
	int c1d_block;
};

// Working data of the MPI FFT solver: exchange buffers, device buffers,
// extra cuFFT plans, CUDA streams/events, the two data distributions and the
// segment bookkeeping derived from them.
struct mpi_fft_solver_data
{
	// Buffers for MPI data exchange.
	cufftDoubleComplex* buffer2d[2];
	// NOTE(review): presumably the exchange buffers for the transposed (1D)
	// distribution; confirm against the implementation.
	cufftDoubleComplex* buffer1d[2];

	// Buffer to store the whole block in the transposed distribution for the
	// pipelined solver.
	cufftDoubleComplex* dev_data_1d;

	// Buffer for the transposed array.
	cufftDoubleComplex* dev_transposed[4];

	// Auxiliary buffer for the pipelined solver.
	cufftDoubleComplex* dev_aux[2];

	// Extra plans for the differently sized last 2D FFT computations.
	cufftHandle plan_2d_fw_last;
	cufftHandle plan_2d_inv_last;

	cudaStream_t streams[4];
	cudaEvent_t  events[4];

	struct distribution distr_2d;
	struct distribution distr_1d;

	// The number of points in dim. 2 if padding is needed.
	int padded_np2;

	// The following four values can be computed from distr_2d / distr_1d and
	// are stored only for convenience.
	unsigned int offset_2d; // Derived from distr_2d.
	unsigned int size_2d;   // Derived from distr_2d.
	unsigned int offset_1d; // Derived from distr_1d.
	unsigned int size_1d;   // Derived from distr_1d.

	// The segment size, in number of planes.
	int seg_size;
	// The size of the last segment, in number of planes.
	int last_seg_size;
	// The number of segments in the block.
	int num_segs;

	// To be used in the MPI Alltoall call.
	int* counts_2d[2];
	int* displs_2d[2];
	int* counts_1d[2];
	int* displs_1d[2];

	// Auxiliary data with frequently computed values.
	struct mpi_fft_np np;
};

/********************************************
 * Public function prototypes               *
 ********************************************/

// NOTE(review): the prototypes below mix `cup_mpi_solver*` and
// `struct cup_mpi_solver*`. Both spellings are kept as they are; this only
// compiles if the project declares both a typedef and a struct tag with that
// name. Confirm in cuPoisson.h.
//
// All functions returning cup_error_t report their status through the return
// value; the concrete error codes are declared outside this header.
// Sets up the FFT parameters of the given solver.
// NOTE(review): presumably fills the distribution/segment data of
// struct mpi_fft_solver_data; confirm against the implementation.
cup_error_t mpi_set_fft_parms( cup_mpi_solver* mpi_solver );
// Sets up the FFT plans of the given solver.
// NOTE(review): presumably creates the cuFFT plans, including the extra
// "last segment" plans; confirm against the implementation.
cup_error_t mpi_set_fft_plans( cup_mpi_solver* mpi_solver );
// Exchanges FFT data between MPI processes.
//   direction: selects the direction of the exchange. The accepted values are
//              not visible in this header; confirm against the implementation.
cup_error_t mpi_exchange_fft_data( struct cup_mpi_solver* mpi_solver,
                                   int direction );
// Solves the Poisson equation on `data`. Returns nothing, so failures cannot
// be reported to the caller through the return value.
// NOTE(review): `data` is presumably a device pointer; confirm.
void mpi_solve_poisson_in_GPU( cup_mpi_solver* mpi_solver,
                               cufftDoubleComplex* data );
// Pipeline stages. Each one operates on the segment selected by i_seg.
// NOTE(review): i_seg is presumably a zero-based index below num_segs, "fw" /
// "inv" presumably mean forward / inverse transform, and "h2d" / "d2h"
// presumably mean host-to-device / device-to-host copies; confirm.
cup_error_t mpi_pipeline_fft2d_fw( struct cup_mpi_solver* mpi_solver, int i_seg );
cup_error_t mpi_pipeline_alltoall_fw( struct cup_mpi_solver* mpi_solver, int i_seg );
cup_error_t mpi_pipeline_h2d_fw( struct cup_mpi_solver* mpi_solver, int i_seg );
cup_error_t mpi_pipeline_d2h_inv( struct cup_mpi_solver* mpi_solver, int i_seg );
cup_error_t mpi_pipeline_alltoall_inv( struct cup_mpi_solver* mpi_solver,
                                       int i_seg );
cup_error_t mpi_pipeline_fft2d_inv( struct cup_mpi_solver* mpi_solver,
                                    int i_seg );

#endif /* FFT_UTILS_MPI_H_ */
