/*************************************************************************/
/*                                                                       */
/*  Copyright (c) 1994 Stanford University                               */
/*                                                                       */
/*  All rights reserved.                                                 */
/*                                                                       */
/*  Permission is given to use, copy, and modify this software for any   */
/*  non-commercial purpose as long as this copyright notice is not       */
/*  removed.  All other uses, including redistribution in whole or in    */
/*  part, are forbidden without prior written permission.                */
/*                                                                       */
/*  This software is provided with absolutely no warranty and no         */
/*  support.                                                             */
/*                                                                       */
/*************************************************************************/

///////////////////////////////////////////////////////////////////////////
// This port of the SPLASH FFT benchmark on the ALMOS-MKH OS has been
// done by Alain Greiner (august 2018).
//
// This application performs the 1D fast Fourier transform for an array
// of N complex points, using the Cooley-Tukey FFT method.
// The N data points are seen as a 2D array (rootN rows * rootN columns).
// Each thread handles (rootN / nthreads) rows.
// The N input data points can be initialised in three different modes:
// - CONSTANT : all data points have the same [1,0] value
// - COSIN    : data point n has [cos(2*PI*n/N) , sin(2*PI*n/N)] values
// - RANDOM   : data points have pseudo random values
//
// The main parameters for this generic application are the following:      
//  - M : N = 2**M = number of data points / M must be an even number. 
//  - T : nthreads = ncores defined by the hardware / must be power of 2. 
// The number of threads cannot be larger than the number of rows.
//
// This application uses 3 shared data arrays, that are dynamically
// allocated and distributed in clusters, with one sub-buffer per cluster:
// - data[N] contains N input data points,
// - trans[N] contains N intermediate data points,
// - twid[N] contains N coefs : exp(-2*PI*sqrt(-1)*i*j/N) / i and j in [0,rootN-1]
// Each sub-buffer contains (N/nclusters) entries, with 2 double per entry.
// These distributed buffers are allocated and initialised in parallel
// by the working threads running on core 0 in each cluster.
//
// Each working thread also allocates a private upriv[rootN-1] buffer,
// that contains all coefs required for a rootN points FFT. 
//
// There is one working thread per core.
// The actual number of cores and cluster in a given hardware architecture
// is obtained by the get_config() syscall (x_size, y_size, ncores).
// The max number of clusters is bounded by (X_MAX * Y_MAX).
// The max number of cores per cluster is bounded by CORES_MAX.
//
// Several configuration parameters can be defined below:
//  - PRINT_ARRAY : Print out complex data points arrays. 
//  - CHECK       : Perform both FFT and inverse FFT to check output/input.
//  - DEBUG_MAIN  : Display intermediate results in main()
//  - DEBUG_WORK  : Display intermediate results in work()
//  - DEBUG_FFT1D : Display intermediate results in FFT1D()
//  - DEBUG_ROW   : Display intermediate results in FFTRow()
//
// Regarding final instrumentation:
// - The sequential initialisation time (init_time) is computed
//   by the main thread in the main() function.
// - The parallel execution time (parallel_time[i]) is computed by each
//   working thread(i) in the work() function.
// - The synchronisation time related to the barriers (sync_time[i])
//   is computed by each thread(i) in the work() function.
// The results are displayed on the TXT terminal, and registered on disk.
///////////////////////////////////////////////////////////////////////////

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>
#include <almosmkh.h>
#include <hal_macros.h>

// constants

#define PI                      3.14159265359
#define PAGE_SIZE               4096
#define X_MAX                   16              // max number of clusters in a row
#define Y_MAX                   16              // max number of clusters in a column
#define CORES_MAX               4               // max number of cores in a cluster

// Derived bounds: the expansions are fully parenthesized, so that they
// keep their value inside any enclosing expression (division, modulo, ...).
#define CLUSTERS_MAX            (X_MAX * Y_MAX)                // max number of clusters
#define THREADS_MAX             (CLUSTERS_MAX * CORES_MAX)     // max number of threads

// supported values for the data[] array initialisation mode
#define RANDOM                  0
#define COSIN                   1
#define CONSTANT                2

// parameters (compile-time configuration of the benchmark)

#define DEFAULT_M               18              // default log2(N) : 2^18 = 256 K complex points
#define USE_DQT_BARRIER         1               // use DQT barrier if non zero
#define MODE                    COSIN           // DATA array initialisation mode (RANDOM / COSIN / CONSTANT)
#define CHECK                   0               // if non zero : also run the inverse FFT and compare checksums
#define DEBUG_MAIN              1               // trace main() function (detailed if odd)
#define DEBUG_WORK              0               // trace work() function (detailed if odd)
#define DEBUG_FFT1D             0               // trace FFT1D() function (detailed if odd)
#define DEBUG_ROW               0               // trace FFTRow() function (detailed if odd)
#define PRINT_ARRAY             0               // if non zero : print the complex data points arrays
#define DISPLAY_SCHED_AND_VMM   0               // display scheduler and VMM state from the work() function

// Macro to swap the values of two variables, through a temporary of type double.
// Each argument is expanded twice, so both must be side-effect-free lvalues.
// The expansion is a bare brace block declaring a local variable named <tmp>.
#define SWAP(a,b) { double tmp; tmp = a; a = b; b = tmp; }

/////////////////////////////////////////////////////////////////////////////////////
//             FFT global variables
/////////////////////////////////////////////////////////////////////////////////////

// Arguments of the work() function.
// One instance per (cluster, core) pair is filled by main()
// before the <work> threads are created.
typedef struct work_args_s
{
    unsigned int        tid;               // thread continuous index : (cid * ncores) + lid
    unsigned int        lid;               // core local index
    unsigned int        cid;               // cluster continuous index : (x * y_size) + y
    pthread_barrier_t * parent_barrier;    // parent barrier to signal completion 
}
work_args_t;

// global sizes, computed once by main() and then only read by the <work> threads
unsigned int   nthreads;                   // total number of threads (one thread per core)
unsigned int   nclusters;                  // total number of clusters
unsigned int   M = DEFAULT_M;              // log2(number of points)
unsigned int   N;                          // number of points (N = 2^M)         
unsigned int   rootN;                      // rootN = 2^(M/2)
unsigned int   rows_per_thread;            // number of data "rows" handled by a single thread
unsigned int   points_per_cluster;         // number of data points per cluster 

// arrays of pointers on distributed buffers (one sub-buffer per cluster) 
// Each complex point uses two consecutive doubles : [real , imaginary].
double *       data[CLUSTERS_MAX];         // original time-domain data
double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N) 
double *       bloup[CLUSTERS_MAX];        // used as auxiliary space for DFT
                                           // NOTE(review): no allocation of bloup[] is visible
                                           // in main() or work() - confirm before using it

// instrumentation counters
unsigned int   pgfault_nr[THREADS_MAX];    // total number of page faults (per thread)
unsigned int   pgfault_cost[THREADS_MAX];  // total page faults cost (per thread)
unsigned int   pgfault_max[THREADS_MAX];   // max page faults cost (per thread)
unsigned int   parallel_time[THREADS_MAX]; // total computation time (per thread)
unsigned int   sync_time[THREADS_MAX];     // cumulated waiting time in barriers (per thread)
unsigned int   init_time;                  // initialisation time (in main)

// synchronisation barrier (all threads)
pthread_barrier_t      barrier;
pthread_barrierattr_t  barrier_attr;

/////////////////////////////////////////////////////////////////////////////////////
//             Global variables required by parallel_pthread_create()
/////////////////////////////////////////////////////////////////////////////////////

// 2D arrays of input arguments for the <work> threads
// These arrays are initialised by the application main thread,
// and indexed by [cluster identifier cxy][core local index lid].

work_args_t       work_args[CLUSTERS_MAX][CORES_MAX];  // work function arguments 
work_args_t     * work_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments 

// 1D array of barriers to allow the <work> threads to signal termination 
// this array is initialised in each cluster by the <build[cxy][0]> thread
// NOTE(review): no <build> thread is visible in this file; presumably it is
// created by pthread_parallel_create(), which receives this array - confirm.
  
pthread_barrier_t parent_barriers[CLUSTERS_MAX];        // termination barrier

/////////////////////////////////////////////////////////////////////////////////////
//           functions declaration
/////////////////////////////////////////////////////////////////////////////////////

// entry function executed in parallel by all <work> threads
void work( work_args_t * args );

// returns the sum of the real and imaginary parts of all points in the global data[] array
double CheckSum( void );

// thread <tid> initialises its rows of the distributed <data> array, as defined by <mode>
void InitD( double    ** data , 
            unsigned int mode,
            unsigned int tid );

// thread <tid> initialises its rows of the distributed <twid> array
void InitT( double    ** twid,
            unsigned int tid );

// initialises the private <coefs> array of a thread
void InitU( double * coefs );

// returns the bit reverse of <k>, computed on (M/2) bits
unsigned int BitReverse( unsigned int k );

// performs the in place (direct or inverse) FFT on the distributed <x> array
void FFT1D( int          direction,
            double    ** x,
            double    ** tmp,
            double     * upriv, 
            double    ** twid,
            unsigned int tid,
            unsigned int MyFirst,
            unsigned int MyLast );

// the helper functions below are defined later in the file
void TwiddleOneCol( int          direction,
                    unsigned int j,
                    double    ** u,
                    double    ** x,
                    unsigned int offset_x );

void Scale( double    ** x,
            unsigned int offset_x );

void Transpose( double    ** src, 
                double    ** dest,
                unsigned int MyFirst,
                unsigned int MyLast );

void Copy( double    ** src,
           double    ** dest,
           unsigned int MyFirst,
           unsigned int MyLast );

void Reverse( double    ** x, 
              unsigned int offset_x );

void FFTRow( int          direction,
                double     * u,
                double    ** x,
                unsigned int offset_x );

void PrintArray( double ** x,
                 unsigned int size );

// computes the DFT of <size> points from the <src> to the <dst> distributed buffer
void SimpleDft( int          direction,
                unsigned int size,
                double    ** src,
                unsigned int src_offset,
                double    ** dst,
                unsigned int dst_offset );

///////////////////////////////////////////////////////////////////
// This main() function executes the sequential initialisation,
// launches the parallel execution, and makes the instrumentation.
// - It gets and checks the hardware configuration, and computes
//   the global sizes (N, rootN, nthreads, nclusters, ...).
// - It opens the instrumentation file, initialises the global
//   barrier, and builds the arguments of the <work> threads.
// - It creates the <work> threads with pthread_parallel_create(),
//   and resumes when they have completed.
// - It displays the instrumentation results on the terminal,
//   and saves them to the instrumentation file.
// All exit paths (including errors) call exit( 0 ).
///////////////////////////////////////////////////////////////////
void main ( void )
{
    int                 error;

    unsigned int        x_size;            // number of clusters per row 
    unsigned int        y_size;            // number of clusters per column
    unsigned int        ncores;            // max number of cores per cluster

    unsigned int        x;                 // current index for cluster X coordinate
    unsigned int        y;                 // current index for cluster Y coordinate
    unsigned int        lid;               // current index for core in a cluster
    unsigned int        tid;               // continuous thread index
    unsigned int        cid;               // cluster continuous index
    unsigned int        cxy;               // hardware specific cluster identifier

    char                name[64];          // instrumentation file name
    char                path[128];         // instrumentation path name
    char                string[256];       // one formatted line of instrumentation results
    int                 ret;               // return value of the last file access

    unsigned long long  start_init_cycle; 
    unsigned long long  end_init_cycle;

#if DEBUG_MAIN
    unsigned long long  debug_cycle;
#endif

#if CHECK
    double              ck1;               // for input/output checking
    double              ck3;               // for input/output checking
#endif
    
    int                 pid = getpid();

    // get FFT application start cycle
    get_cycle( &start_init_cycle );

    // get platform parameters
    if( get_config( &x_size , &y_size , &ncores ) )
    {
        printf("\n[fft error] cannot get hardware configuration\n");
        exit( 0 );
    }

    // check ncores
    if( (ncores != 1) && (ncores != 2) && (ncores != 4) )
    {
        printf("\n[fft error] number of cores per cluster must be 1/2/4\n");
        exit( 0 );
    }

    // check x_size
    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && (x_size != 8) && (x_size != 16) )
    {
        printf("\n[fft error] x_size must be 1/2/4/8/16\n");
        exit( 0 );
    }

    // check y_size
    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && (y_size != 8) && (y_size != 16) )
    {
        printf("\n[fft error] y_size must be 1/2/4/8/16\n");
        exit( 0 );
    }

    // check M : the N points are handled as a (rootN * rootN) 2D array,
    // which requires N == rootN * rootN, i.e. an even M
    if( M & 1 )
    {
        printf("\n[fft error] M must be an even number\n");
        exit( 0 );
    }

    // compute nthreads and nclusters
    nthreads  = x_size * y_size * ncores;
    nclusters = x_size * y_size;

    // compute covering DQT size and level : root_level = log2( max(x_size,y_size) )
    unsigned int z = (x_size > y_size) ? x_size : y_size;
    unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4;

    // compute various constants depending on N and T 
    // (unsigned shifts, to avoid a signed overflow for large M values)
    N                  = 1U << M;
    rootN              = 1U << (M / 2);
    rows_per_thread    = rootN / nthreads;
    points_per_cluster = N / nclusters;
 
    // check N versus T
    if( rootN < nthreads )
    {
        printf("\n[fft error] sqrt(N) must be larger than T\n");
        exit( 0 );
    }

    printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n",
    N, nthreads, pid, (unsigned int)start_init_cycle );

    // build instrumentation file name
    if( USE_DQT_BARRIER )
    {
        snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
    }
    else
    {
        snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
    }

    // build pathname
    snprintf( path , 128 , "/home/%s", name );

    // open instrumentation file
    FILE * f = fopen( path , NULL );
    if ( f == NULL ) 
    { 
        printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
        exit( 0 );
    }

#if DEBUG_MAIN
get_cycle( &debug_cycle );
printf("\n[fft] main open file <%s> at cycle %d\n",
path, (unsigned int)debug_cycle );
#endif

// NOTE(review): the distributed buffers are allocated and initialised later,
// by the <work> threads. When CHECK or PRINT_ARRAY is set, the calls below
// access data[] / twid[] / bloup[] sub-buffers that are not allocated yet
// at this point - to be confirmed and fixed before enabling these options.

#if CHECK 
ck1 = CheckSum();
#endif

#if PRINT_ARRAY 
printf("\nData values / base = %x\n", &data[0][0] );
PrintArray( data , N );

printf("\nTwiddle values / base = %x\n", &twid[0][0] );
PrintArray( twid , N );

SimpleDft( 1 , N , data , 0 , bloup , 0 );

printf("\nExpected results / base = %x\n", &bloup[0][0] );
PrintArray( bloup , N );
#endif

    // initialise barrier synchronizing all <work> threads
    if( USE_DQT_BARRIER )
    {
        barrier_attr.x_size   = x_size;
        barrier_attr.y_size   = y_size;
        barrier_attr.nthreads = ncores;
        error = pthread_barrier_init( &barrier, &barrier_attr , nthreads );
    }
    else
    {
        error = pthread_barrier_init( &barrier, NULL , nthreads );
    }

    if( error )
    {
        printf("\n[fft error] cannot initialize barrier\n");
        exit( 0 );
    }

#if DEBUG_MAIN
get_cycle( &debug_cycle );
printf("\n[fft] main completes barrier init at cycle %d\n",
(unsigned int)debug_cycle );
#endif

    // build array of arguments for the <work> threads
    for (x = 0 ; x < x_size ; x++)
    {
        for (y = 0 ; y < y_size ; y++)
        {
            // compute cluster identifier
            cxy = HAL_CXY_FROM_XY( x , y );

            // compute cluster continuous index
            cid = (x * y_size) + y;

            for ( lid = 0 ; lid < ncores ; lid++ )
            {
                // compute work thread continuous index
                tid = (cid * ncores) + lid;
                
                // initialize 2D array of arguments
                work_args[cxy][lid].tid            = tid;
                work_args[cxy][lid].lid            = lid;
                work_args[cxy][lid].cid            = cid;
                work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];

                // initialize 2D array of pointers
                work_ptrs[cxy][lid] = &work_args[cxy][lid];
            }
        }
    }

    // register sequential time
    get_cycle( &end_init_cycle );
    init_time = (unsigned int)(end_init_cycle - start_init_cycle);

#if DEBUG_MAIN
printf("\n[fft] main completes <work> threads arguments at cycle %d\n",
(unsigned int)end_init_cycle );
#endif

    // create and execute the working threads
    if( pthread_parallel_create( root_level,
                                 &work,
                                 &work_ptrs[0][0],
                                 &parent_barriers[0] ) )
    {
        printf("\n[fft error] creating threads\n");
        exit( 0 );
    }

#if DEBUG_MAIN
get_cycle( &debug_cycle );
printf("\n[fft] main resume for instrumentation at cycle %d\n",
(unsigned int)debug_cycle) ;
#endif

#if PRINT_ARRAY 
printf("\nData values after FFT:\n");
PrintArray( data , N );
#endif

#if CHECK
ck3 = CheckSum();
printf("\n*** Results ***\n");
printf("Checksum difference is %f (%f, %f)\n", ck1 - ck3, ck1, ck3);
if (fabs(ck1 - ck3) < 0.001)  printf("Results OK\n");
else                          printf("Results KO\n");
#endif

    // display header on terminal, and save to file
    printf("\n----- %s -----\n", name );

    ret = fprintf( f , "\n----- %s -----\n", name );
    if( ret < 0 )
    {
        printf("\n[fft error] cannot write header to file <%s>\n", path );
        exit(0);
    }

    // initializes global (all threads) instrumentation values
    unsigned int time_para      = parallel_time[0];
    unsigned int time_sync      = sync_time[0];
    unsigned int pgfaults_nr    = 0;
    unsigned int pgfaults_cost  = 0;
    unsigned int pgfaults_max   = pgfault_max[0];

    // loop on threads to compute global instrumentation results
    for (tid = 0 ; tid < nthreads ; tid++) 
    {
        // average page fault cost for this thread : forced to 0 when the
        // thread registered no page fault, to avoid a division by zero
        unsigned int pgfault_avg = (pgfault_nr[tid] != 0) ?
                                   (pgfault_cost[tid] / pgfault_nr[tid]) : 0;

        snprintf( string , 256 ,
        "- tid %d : Seq %d / Para %d / Sync %d / Pgfaults %d ( cost %d / max %d )\n",
        tid, init_time, parallel_time[tid], sync_time[tid], 
        pgfault_nr[tid], pgfault_avg , pgfault_max[tid] );

        // save to instrumentation file (the returned value must be
        // registered, as it is checked just below)
        ret = fprintf( f , "%s" , string );
        if( ret < 0 )
        {
            printf("\n[fft error] cannot save thread %d results to file <%s>\n", tid, path );
            printf("%s", string );
            exit(0);
        }

        // compute global values : max for times, sum for page faults
        if (parallel_time[tid] > time_para)    time_para      = parallel_time[tid];
        if (sync_time[tid]     > time_sync)    time_sync      = sync_time[tid];

        pgfaults_nr   += pgfault_nr[tid];
        pgfaults_cost += pgfault_cost[tid];

        if (pgfault_max[tid]   > pgfaults_max) pgfaults_max   = pgfault_max[tid];
    }

    // global average page fault cost (0 when no page fault at all)
    unsigned int pgfaults_avg = (pgfaults_nr != 0) ? (pgfaults_cost / pgfaults_nr) : 0;

    // display global values on terminal and save to file
    snprintf( string , 256 ,
    "\nSeq %d / Para %d / Sync %d / Pgfaults %d ( cost %d / max %d )\n",
    init_time, time_para, time_sync, pgfaults_nr, pgfaults_avg, pgfaults_max );

    printf("%s", string );

    // save global values to file
    ret = fprintf( f , "%s", string );

    if( ret < 0 )
    {
        printf("\n[fft error] cannot save global results to file <%s>\n", path );
        exit(0);
    }

    // close instrumentation file
    ret = fclose( f );

    if( ret < 0 )
    {
        printf("\n[fft error] cannot close file <%s>\n", path );
        exit(0);
    }

#if DEBUG_MAIN
get_cycle( &debug_cycle );
printf("\n[fft] main exit <%s> at cycle %d\n",
path, (unsigned int)debug_cycle );
#endif

    exit( 0 );

} // end main()

/////////////////////////////////////////////////////////////////
// This function is executed in parallel by all <work> threads.
// - The thread running on core 0 of each cluster allocates the
//   local data[cid], trans[cid] and twid[cid] sub-buffers.
// - All threads initialise their rows of data[] and twid[],
//   allocate and initialise a private upriv[] coefs array,
//   and execute the forward FFT (plus the inverse FFT if CHECK).
// - Each thread registers its instrumentation counters, signals
//   completion on the parent barrier, and exits.
// @ args : pointer on the arguments (tid, lid, cid, parent_barrier).
//
// NOTE(review): the allocation failure paths only wait on the
// parent barrier before exiting. The global <barrier> is
// initialised for <nthreads> threads in main(), so the other
// threads presumably stay blocked on it in that case - confirm.
/////////////////////////////////////////////////////////////////
void work( work_args_t * args ) 
{
    unsigned int        tid;              // this thread continuous index
    unsigned int        lid;              // core local index 
    unsigned int        cid;              // cluster continuous index
    pthread_barrier_t * parent_barrier;   // pointer on parent barrier

    unsigned int        MyFirst;          // index first row allocated to thread
    unsigned int        MyLast;           // index last row allocated to thread
    double            * upriv;            // private array of FFT coefs

    unsigned long long  parallel_start;
    unsigned long long  parallel_stop;
    unsigned long long  barrier_start;
    unsigned long long  barrier_stop;

    // get thread arguments
    tid            = args->tid; 
    lid            = args->lid;              
    cid            = args->cid;              
    parent_barrier = args->parent_barrier;

    get_cycle( &parallel_start );

#if DEBUG_WORK
printf("\n[fft] %s : thread %d enter / cycle %d\n",
__FUNCTION__, tid, (unsigned int)parallel_start );
#endif

    // thread on core 0 allocates memory from the local cluster
    // for the distributed data[], trans[], twid[] buffers
    if( lid == 0 )
    {
        // (N / nclusters) complex points, with 2 doubles per point
        unsigned int data_size = (N / nclusters) * 2 * sizeof(double);

        data[cid] = (double *)malloc( data_size ); 
        if( data[cid] == NULL )
        {
            printf("\n[fft_error] in work : cannot allocate data[%d] buffer\n", cid );
            pthread_barrier_wait( parent_barrier );
            pthread_exit( NULL );
        }
        
        trans[cid] = (double *)malloc( data_size ); 
        if( trans[cid] == NULL )
        {
            printf("\n[fft_error] in work : cannot allocate trans[%d] buffer\n", cid );
            pthread_barrier_wait( parent_barrier );
            pthread_exit( NULL );
        }
        
        twid[cid] = (double *)malloc( data_size ); 
        if( twid[cid] == NULL )
        {
            printf("\n[fft_error] in work : cannot allocate twid[%d] buffer\n", cid );
            pthread_barrier_wait( parent_barrier );
            pthread_exit( NULL );
        }
    }

    // BARRIER to wait distributed buffers allocation
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );
    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);

#if DEBUG_WORK
printf("\n[fft] %s : thread %d exit barrier for buffer allocation / cycle %d\n",
__FUNCTION__, tid, (unsigned int)barrier_stop );
#endif

#if DISPLAY_SCHED_AND_VMM
    unsigned int x_size;
    unsigned int y_size;
    unsigned int ncores;
    get_config( &x_size , &y_size , &ncores );
    unsigned int x   = cid / y_size;
    unsigned int y   = cid % y_size;
    unsigned int cxy = HAL_CXY_FROM_XY( x , y );
display_sched( cxy , lid );
if( lid == 0 ) display_vmm( cxy , getpid() , 0 );
#endif

    // all threads contribute to data[] distributed array initialisation
    InitD( data , MODE , tid ); 

    // all threads contribute to twid[] distributed array initialisation
    InitT( twid , tid );
    
    // BARRIER to wait distributed buffers initialisation
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );
    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);

#if DEBUG_WORK
printf("\n[fft] %s : thread %d exit barrier for buffer initialisation / cycle %d\n",
__FUNCTION__, tid, (unsigned int)barrier_stop );
#endif

    // all threads allocate memory from the local cluster
    // for the private upriv[] buffer : (rootN - 1) complex coefs
    upriv = (double *)malloc( (rootN - 1) * 2 * sizeof(double) );
    if( upriv == NULL )
    {
        printf("\n[fft_error] in work : cannot allocate upriv buffer for thread %d\n", tid );
        pthread_barrier_wait( parent_barrier );
        pthread_exit( NULL );
    }

    // all threads initialise the private upriv[] array 
    InitU( upriv );

    // all threads compute first and last rows handled by the thread
    MyFirst = rootN * tid / nthreads;
    MyLast  = rootN * (tid + 1) / nthreads;

    // all threads perform forward FFT 
    FFT1D( 1 , data , trans , upriv , twid , tid , MyFirst , MyLast );

#if CHECK 
get_cycle( &barrier_start );
pthread_barrier_wait( &barrier );
get_cycle( &barrier_stop );
sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
FFT1D( -1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
#endif

    get_cycle( &parallel_stop );

    // register parallel time in instrumentation counters
    parallel_time[tid] = (unsigned int)(parallel_stop - parallel_start);

    // get work thread info for page faults
    thread_info_t info;
    get_thread_info( &info );
    
    // register page faults in instrumentation counters :
    // each counter is the sum of the false / local / global values
    pgfault_nr[tid]   = info.false_pgfault_nr + 
                        info.local_pgfault_nr + 
                        info.global_pgfault_nr;
    pgfault_cost[tid] = info.false_pgfault_cost + 
                        info.local_pgfault_cost + 
                        info.global_pgfault_cost;
    pgfault_max[tid]  = info.false_pgfault_max + 
                        info.local_pgfault_max + 
                        info.global_pgfault_max;
#if DEBUG_WORK
printf("\n[fft] %s : thread %d completes fft / p_start %d / p_stop %d\n", 
__FUNCTION__, tid, (unsigned int)parallel_start, (unsigned int)parallel_stop );
#endif

    //  work thread signals completion to main
    pthread_barrier_wait( parent_barrier );

#if DEBUG_WORK
printf("\n[fft] %s : thread %d exit\n", 
__FUNCTION__, tid );
#endif

#if DISPLAY_SCHED_AND_VMM
printf("\n[fft] %s : thread %d exit\n", __FUNCTION__, tid );
if( lid == 0 ) display_vmm( cxy , getpid() , 0 );
#endif

    //  work thread exit 
    pthread_exit( NULL );

}  // end work()

////////////////////////////////////////////////////////////////////////////////////////
// This function computes a direct or inverse DFT of <size> complex points, using the
// straightforward double loop (one full sum on the input points per output point).
// Input points are read from the <src> distributed buffer, starting at <src_offset>,
// and output points are written to the <dst> distributed buffer, starting at
// <dst_offset>. A global point index p is located in sub-buffer
// (p / points_per_cluster), at offset (p % points_per_cluster), and each point
// uses two consecutive doubles [real , imaginary].
// For the inverse transform (direction == -1), each result is divided by <size>.
////////////////////////////////////////////////////////////////////////////////////////
void SimpleDft( int             direction,      // 1 direct / -1 reverse
                unsigned int    size,           // number of points
                double       ** src,            // source distributed buffer
                unsigned int    src_offset,     // offset in source array
                double       ** dst,            // destination distributed buffer
                unsigned int    dst_offset )    // offset in destination array
{
    unsigned int  out;            // index of the output point
    unsigned int  in;             // index of the input point

    for ( out = 0 ; out < size ; out++ )
    {
        double  sum_re = 0;       // real part of the accumulator
        double  sum_im = 0;       // imaginary part of the accumulator

        for ( in = 0 ; in < size ; in++ )
        {
            // rotation coefficient : exp( -direction * sqrt(-1) * angle )
            double  angle = (double)(2*PI*in*out) / size;
            double  w_re  =  cos( angle );
            double  w_im  = -sin( angle ) * direction;

            // locate and read the input point
            unsigned int  from     = src_offset + in;
            unsigned int  from_cid = from / (points_per_cluster);
            unsigned int  from_off = from % (points_per_cluster);
            double        p_re     = src[from_cid][2*from_off];
            double        p_im     = src[from_cid][2*from_off+1];

            // complex multiply-accumulate
            sum_re += ((w_re*p_re) - (w_im*p_im));
            sum_im += ((w_re*p_im) + (w_im*p_re));
        }

        // the inverse DFT is scaled by the number of points
        if ( direction == -1 )
        {
            sum_re /= size;
            sum_im /= size;
        }

        // locate and write the output point
        unsigned int  to     = dst_offset + out;
        unsigned int  to_cid = to / (points_per_cluster);
        unsigned int  to_off = to % (points_per_cluster);
        dst[to_cid][2*to_off]   = sum_re;
        dst[to_cid][2*to_off+1] = sum_im;
    }

}  // end SimpleDft()

////////////////////////////////////////////////////////////////////////////////
// This function returns the sum of the real and imaginary parts of the
// (rootN * rootN) points contained in the global distributed data[] array,
// scanned row by row.
////////////////////////////////////////////////////////////////////////////////
double CheckSum( void )
{
    double        total = 0.0;
    unsigned int  point = 0;        // global index of the current point
    unsigned int  row;
    unsigned int  col;

    for ( row = 0 ; row < rootN ; row++ )
    {
        for ( col = 0 ; col < rootN ; col++ , point++ )
        {
            // locate the point in the distributed buffer
            unsigned int  cluster = point / (points_per_cluster);
            unsigned int  offset  = point % (points_per_cluster);

            total += data[cluster][2*offset] + data[cluster][2*offset+1];
        }
    }

    return total;
}

//////////////////////////////////////////////////////////////////////////////////////
// Each working thread <tid> initialises <rows_per_thread> consecutive rows,
// starting at row (tid * rows_per_thread), in the shared - and distributed -
// <data> array. The value of each point depends on <mode>:
// - RANDOM   : both parts are set to rand() / 65536
// - COSIN    : [cos(phi) , sin(phi)] with phi = 2*PI*index/N
// - CONSTANT : [1.0 , 0.0]
// Any other <mode> value leaves the points unchanged.
//////////////////////////////////////////////////////////////////////////////////////
void InitD(double      ** data,
           unsigned int   mode,
           unsigned int   tid ) 
{
    unsigned int    first_row = tid * rows_per_thread;          // first row (included)
    unsigned int    last_row  = first_row + rows_per_thread;    // last row (excluded)
    unsigned int    point     = first_row * rootN;              // global index of current point
    unsigned int    row;
    unsigned int    col;

    for ( row = first_row ; row < last_row ; row++ )
    {
        for ( col = 0 ; col < rootN ; col++ , point++ )
        {
            // locate the point in the distributed buffer
            unsigned int  cluster = point / (points_per_cluster);
            unsigned int  offset  = point % (points_per_cluster);

            switch ( mode )
            {
                case RANDOM :      // complex input signal is random
                {
                    data[cluster][2*offset]   = ( (double)rand() ) / 65536;
                    data[cluster][2*offset+1] = ( (double)rand() ) / 65536;
                    break;
                }
                case COSIN :       // complex input signal is cos(phi) / sin(phi)
                {
                    double phi = (double)( 2 * PI * point) / N;
                    data[cluster][2*offset]   = cos( phi );
                    data[cluster][2*offset+1] = sin( phi );
                    break;
                }
                case CONSTANT :    // complex input signal is constant
                {
                    data[cluster][2*offset]   = 1.0;
                    data[cluster][2*offset+1] = 0.0;
                    break;
                }
                default :          // unknown mode : nothing to do
                {
                    break;
                }
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////////////////////
// Each working thread <tid> initialises <rows_per_thread> consecutive rows,
// starting at row (tid * rows_per_thread), in the shared - and distributed -
// <twid> array. The point at (row , col) is set to [cos(phi) , -sin(phi)],
// with phi = 2*PI*col*row/N.
///////////////////////////////////////////////////////////////////////////////////////
void InitT( double      ** twid,
            unsigned int   tid )
{
    unsigned int    first_row = tid * rows_per_thread;          // first row (included)
    unsigned int    last_row  = first_row + rows_per_thread;    // last row (excluded)
    unsigned int    point     = first_row * rootN;              // global index of current point
    unsigned int    row;
    unsigned int    col;

    for ( row = first_row ; row < last_row ; row++ )
    {
        for ( col = 0 ; col < rootN ; col++ , point++ )
        {
            // locate the point in the distributed buffer
            unsigned int  cluster = point / (points_per_cluster);
            unsigned int  offset  = point % (points_per_cluster);

            // twiddle factor for this (row , col) position
            double        angle   = (double)(2.0 * PI * col * row) / N;

            twid[cluster][2*offset]   = cos( angle );
            twid[cluster][2*offset+1] = -sin( angle );
        }
    }
}

///////////////////////////////////////////////////////////////////////////////////////
// Each working thread initialises the private <upriv> array / (rootN - 1) entries.
// For each power of two n1 = 2**q smaller than rootN, the n1 coefs
// [cos(phi) , -sin(phi)], with phi = 2*PI*j/(2*n1) and j in [0,n1-1],
// are stored in entries (n1 - 1) to (2*n1 - 2).
// Only entries 0 to (rootN - 2) are written : the <upriv> buffer is allocated
// by the work() function with exactly (rootN - 1) entries of two doubles,
// so entry (rootN - 1) must never be accessed.
///////////////////////////////////////////////////////////////////////////////////////
void InitU( double * upriv ) 
{
    unsigned int    q;        // current stage
    unsigned int    j;        // coef index in current stage
    unsigned int    base;     // index of first entry for current stage
    unsigned int    n1;       // number of coefs in current stage
    double          phi;

    for (q = 0 ; ((1U << q) < rootN) ; q++) 
    {  
        n1 = 1U << q;    // n1 == 2**q
        base = n1 - 1;
        for (j = 0; (j < n1) ; j++) 
        {
            // never write beyond the last valid entry (rootN - 2)
            if (base + j >= rootN - 1) return;

            phi = (double)(2.0 * PI * j) / (2 * n1);
            upriv[2*(base+j)]   = cos( phi );
            upriv[2*(base+j)+1] = -sin( phi );
        }
    }
}

////////////////////////////////////////////////////////////////////////////////////////
// This function returns the value obtained by reversing the order of the (M/2)
// least significant bits of <k>. The other bits of <k> are ignored, and the
// returned value has no bit set above bit (M/2 - 1).
////////////////////////////////////////////////////////////////////////////////////////
unsigned int BitReverse( unsigned int k ) 
{
    unsigned int nbits     = M/2;     // number of bits to be reversed
    unsigned int remaining = k;       // bits of k not yet consumed
    unsigned int reversed  = 0;       // result under construction
    unsigned int bit;

    for (bit = 0 ; bit < nbits ; bit++) 
    {
        // push the current LSB of the input on the right of the result
        reversed    = (reversed << 1) | (remaining & 0x1);
        remaining >>= 1;
    }
    return reversed;
}

////////////////////////////////////////////////////////////////////////////////////////
// This function performs the in place (direct or inverse) FFT on the N data points
// contained in the distributed buffers x[nclusters][points_per_cluster].
// It handles the (N) points 1D array as a (rootN*rootN) points 2D array, and
// the calling thread processes the rows MyFirst (included) to MyLast (excluded):
// 1) it transposes its rows from x to tmp.
// 2) it makes one FFT per row of tmp, and applies the twiddle factors.
// 3) it transposes its rows from tmp to x.
// 4) it makes one FFT per row of x (and scales the row for an inverse FFT).
// 5) it transposes its rows from x to tmp.
// 6) it copies its rows from tmp to x.
// The thread waits on the global <barrier> after each one of the steps 1 to 5.
// The number of cycles spent in these five barriers is accumulated in
// sync_time[tid] : it is reset by the first barrier, and each barrier is
// counted exactly once.
// It calls FFTRow() 2*(MyLast - MyFirst) times to perform the in place FFT
// on the rootN points contained in a row.
////////////////////////////////////////////////////////////////////////////////////////
void FFT1D( int              direction,       // direct 1 / inverse -1
            double       **  x,               // input & output distributed data points array
            double       **  tmp,             // auxiliary distributed data points array
            double        *  upriv,           // local array containing coefs for rootN FFT
            double       **  twid,            // distributed arrays containing N twiddle factors
            unsigned int     tid,             // thread continuous index
            unsigned int     MyFirst,         // first row allocated to the thread (included)
            unsigned int     MyLast )         // last row allocated to the thread (excluded)
{
    unsigned int j;
    unsigned long long barrier_start;
    unsigned long long barrier_stop;

#if DEBUG_FFT1D
unsigned long long cycle;
get_cycle( &cycle );
printf("\n[fft] %s : thread %d enter / first %d / last %d / cycle %d\n",
__FUNCTION__, tid, MyFirst, MyLast, (unsigned int)cycle );
#endif

    // transpose (rootN/nthreads) rows from x to tmp 
    Transpose( x , tmp , MyFirst , MyLast );

#if( DEBUG_FFT1D & 1 )
get_cycle( &cycle );
printf("\n[fft] %s : thread %d after first transpose / cycle %d\n",
__FUNCTION__, tid, (unsigned int)cycle );
if( PRINT_ARRAY ) PrintArray( tmp , N );
#endif

    // BARRIER : first one, (re)initialises the synchronisation time
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );
    sync_time[tid] = (unsigned int)(barrier_stop - barrier_start);

#if( DEBUG_FFT1D & 1 )
get_cycle( &cycle );
printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n",
__FUNCTION__, tid, (unsigned int)cycle );
#endif

    // do FFTs on rows of tmp (i.e. columns of x) and apply twiddle factor
    for (j = MyFirst; j < MyLast; j++) 
    {
        FFTRow( direction , upriv , tmp , j * rootN );

        TwiddleOneCol( direction , j , twid , tmp , j * rootN );
    }  

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, tid);
if( PRINT_ARRAY ) PrintArray( tmp , N );
#endif

    // BARRIER
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, tid);
#endif

    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);

    // transpose tmp to x
    Transpose( tmp , x , MyFirst , MyLast );

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);
if( PRINT_ARRAY ) PrintArray( x , N );
#endif

    // BARRIER
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);
#endif

    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);

    // do FFTs on rows of x and apply the scaling factor 
    for (j = MyFirst; j < MyLast; j++) 
    {
        FFTRow( direction , upriv , x , j * rootN );
        if (direction == -1) Scale( x , j * rootN );
    }

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, tid);
if( PRINT_ARRAY ) PrintArray( x , N );
#endif

    // BARRIER
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, tid);
#endif
    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);

    // transpose x to tmp
    Transpose( x , tmp , MyFirst , MyLast );

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);
// the result of the third transpose is in tmp (x is only updated by the final copy)
if( PRINT_ARRAY ) PrintArray( tmp , N );
#endif

    // BARRIER
    get_cycle( &barrier_start );
    pthread_barrier_wait( &barrier );
    get_cycle( &barrier_stop );

#if( DEBUG_FFT1D & 1 )
printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);
#endif

    // the waiting time of this last barrier is accumulated once, as for the others
    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);

    // copy tmp to x
    Copy( tmp , x , MyFirst , MyLast );

#if DEBUG_FFT1D
printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, tid);
if( PRINT_ARRAY ) PrintArray( x , N );
#endif

}  // end FFT1D()

/////////////////////////////////////////////////////////////////////////////////////
// This function multiplies, in place, the rootN complex points of one row of the
// distributed x[] array (points offset_x to offset_x + rootN - 1) by the
// twiddle factors found in row <j> of the distributed u[] array.
// The imaginary part of each factor is multiplied by <direction> before use.
/////////////////////////////////////////////////////////////////////////////////////
void TwiddleOneCol( int             direction, 
                    unsigned int    j,              // y coordinate in 2D view of coef array
                    double       ** u,              // coef array base address
                    double       ** x,              // data array base address
                    unsigned int    offset_x )      // first point in N points data array
{
    unsigned int n;

    for (n = 0 ; n < rootN ; n++)  // loop on the rootN points
    {
        unsigned int   coef_index = j * rootN + n;     // absolute index of the coef
        unsigned int   data_index = offset_x + n;      // absolute index of the point

        // locate coef and data in the distributed buffers
        const double * coef = &u[coef_index / (points_per_cluster)]
                                [2 * (coef_index % (points_per_cluster))];
        double       * data = &x[data_index / (points_per_cluster)]
                                [2 * (data_index % (points_per_cluster))];

        // read both operands before writing the result
        double         w_re = coef[0];
        double         w_im = direction * coef[1];
        double         d_re = data[0];
        double         d_im = data[1];

        // complex product : data = coef * data
        data[0] = w_re*d_re - w_im * d_im;
        data[1] = w_re*d_im + w_im * d_re;
    }
}  // end TwiddleOneCol()

/////////////////////////////////////////////////////////////////////////////////////
// This function divides by N, in place, the real and imaginary parts of the rootN
// points of one row of the distributed x[] array, starting at point <offset_x>.
/////////////////////////////////////////////////////////////////////////////////////
void Scale( double      ** x,           // data array base address 
            unsigned int   offset_x )   // first point of the row to be scaled
{
    unsigned int n;

    for (n = 0 ; n < rootN ; n++) 
    {
        unsigned int   index = offset_x + n;     // absolute index of the point
        double       * point = &x[index / (points_per_cluster)]
                                 [2 * (index % (points_per_cluster))];

        point[0] /= N;     // real part
        point[1] /= N;     // imaginary part
    }
}

/////////////////////////////////////////////////////////////////////////////////////
// This function handles the N points as a (rootN * rootN) 2D array, and copies
// each point (row , column) of the <src> distributed buffers to the point
// (column , row) of the <dest> distributed buffers, for all rows between
// MyFirst (included) and MyLast (excluded).
/////////////////////////////////////////////////////////////////////////////////////
void Transpose( double      ** src,      // source buffer (array of pointers)
                double      ** dest,     // destination buffer (array of pointers)
                unsigned int   MyFirst,  // first row allocated to the thread (included)
                unsigned int   MyLast )  // end of rows allocated to the thread (excluded)
{
    unsigned int r;     // row index in source
    unsigned int p;     // point index in the source row

    for ( r = MyFirst ; r < MyLast ; r++ )       // loop on the rows
    {
        for ( p = 0 ; p < rootN ; p++ )          // loop on points in row
        {
            // absolute indexes in the source and destination N points arrays
            unsigned int   from = r * rootN + p;
            unsigned int   to   = p * rootN + r;

            // cluster buffer and offset for both points
            const double * in   = &src[from / (points_per_cluster)]
                                      [2 * (from % (points_per_cluster))];
            double       * out  = &dest[to / (points_per_cluster)]
                                       [2 * (to % (points_per_cluster))];

            out[0] = in[0];     // real part
            out[1] = in[1];     // imaginary part
        }
    }
}  // end Transpose()

/////////////////////////////////////////////////////////////////////////////////////
// This function copies, without any reordering, all the points of the rows
// MyFirst (included) to MyLast (excluded) from the <src> distributed buffers
// to the same location in the <dest> distributed buffers.
/////////////////////////////////////////////////////////////////////////////////////
void Copy( double      ** src,      // source buffer (array of pointers)
           double      ** dest,     // destination buffer (array of pointers)
           unsigned int   MyFirst,  // first row allocated to the thread (included)
           unsigned int   MyLast )  // end of rows allocated to the thread (excluded)
{
    unsigned int r;     // row index
    unsigned int p;     // point index in the row

    for ( r = MyFirst ; r < MyLast ; r++ )       // loop on the rows
    {
        for ( p = 0 ; p < rootN ; p++ )          // loop on points in row
        {
            // same cluster and same offset in both buffers
            unsigned int   index   = r * rootN + p;
            unsigned int   cluster = index / (points_per_cluster);
            unsigned int   slot    = index % (points_per_cluster);
            const double * in      = &src[cluster][2*slot];
            double       * out     = &dest[cluster][2*slot];

            out[0] = in[0];     // real part
            out[1] = in[1];     // imaginary part
        }
    }
}  // end Copy()

/////////////////////////////////////////////////////////////////////////////////////
// This function reorders, in place, the rootN points of one row of the distributed
// x[] array, starting at point <offset_x> : the point at position k in the row
// is exchanged with the point at position BitReverse(k).
// Each pair is swapped only once (when the reversed index is the larger one).
/////////////////////////////////////////////////////////////////////////////////////
void Reverse( double      ** x, 
              unsigned int   offset_x )
{
    unsigned int lo;            // current position in the row
    unsigned int hi;            // bit-reversed position
    unsigned int cluster_lo;
    unsigned int slot_lo;
    unsigned int cluster_hi;
    unsigned int slot_hi;

    for (lo = 0 ; lo < rootN ; lo++) 
    {
        hi = BitReverse( lo );

        // nothing to do for a fixed point, or for a pair already exchanged
        if (hi <= lo) continue;

        cluster_hi = (offset_x + hi) / (points_per_cluster);
        slot_hi    = (offset_x + hi) % (points_per_cluster);
        cluster_lo = (offset_x + lo) / (points_per_cluster);
        slot_lo    = (offset_x + lo) % (points_per_cluster);

        // exchange real parts, then imaginary parts
        SWAP(x[cluster_hi][2*slot_hi]  , x[cluster_lo][2*slot_lo]);
        SWAP(x[cluster_hi][2*slot_hi+1], x[cluster_lo][2*slot_lo+1]);
    }
}

/////////////////////////////////////////////////////////////////////////////
// This function makes the in-place FFT on all points contained in a row
// (i.e. rootN points) of the x[nclusters][points_per_cluster] array.
// - The rootN points are first reordered by the Reverse() function.
// - Then (M/2) butterfly stages are applied. At stage q, the row is split
//   in subsets of L = 2**q points, and each subset requires L/2 butterflies:
//       d[x1] <= d[x1] + omega * d[x2]
//       d[x2] <= d[x1] - omega * d[x2]
//   where x2 = x1 + L/2, and omega is read in the u[] array, starting at
//   entry (L/2 - 1). The imaginary part of omega is multiplied by <direction>.
/////////////////////////////////////////////////////////////////////////////
void FFTRow( int            direction,  // 1 direct / -1 inverse
                double       * u,          // private coefs array 
                double      ** x,          // array of pointers on distributed buffers
                unsigned int   offset_x )  // absolute offset in the x array
{
    unsigned int     j;
    unsigned int     k;
    unsigned int     q;
    unsigned int     L;
    unsigned int     r;
    unsigned int     Lstar;
    double * u1; 

    unsigned int     offset_x1;     // index first butterfly input
    unsigned int     offset_x2;     // index second butterfly input

    double           omega_r;       // real part butterfy coef
    double           omega_c;       // imag part butterfly coef

    double           tau_r;         // real part of omega * d[x2]
    double           tau_c;         // imag part of omega * d[x2]

    double           d1_r;          // real part first butterfly input
    double           d1_c;          // imag part first butterfly input
    double           d2_r;          // real part second butterfly input
    double           d2_c;          // imag part second butterfly input

    unsigned int     c_id_1;        // cluster index for first butterfly input
    unsigned int     c_offset_1;    // offset for first butterfly input
    unsigned int     c_id_2;        // cluster index for second butterfly input
    unsigned int     c_offset_2;    // offset for second butterfly input

#if DEBUG_ROW
unsigned int p;
printf("\n[fft] ROW data in / %d points / offset = %d\n", rootN , offset_x );

for ( p = 0 ; p < rootN ; p++ )
{
    unsigned int index    = offset_x + p;
    unsigned int c_id     = index / (points_per_cluster);
    unsigned int c_offset = index % (points_per_cluster);
    printf("%f , %f | ", x[c_id][2*c_offset] , x[c_id][2*c_offset+1] );
}
printf("\n");
#endif

    // This makes the rootN input points reordering
    Reverse( x , offset_x );  

#if DEBUG_ROW
printf("\n[fft] ROW data after reverse / %d points / offset = %d\n", rootN , offset_x );

for ( p = 0 ; p < rootN ; p++ )
{
    unsigned int index    = offset_x + p;
    unsigned int c_id     = index / (points_per_cluster);
    unsigned int c_offset = index % (points_per_cluster);
    printf("%f , %f | ", x[c_id][2*c_offset] , x[c_id][2*c_offset+1] );
}
printf("\n");
#endif

    // This implements the multi-stages, in place Butterfly network
    for (q = 1; q <= M/2 ; q++)     // loop on stages
    {
        L = 1 << q;       // number of points per subset for current stage
        r = rootN / L;    // number of subsets
        Lstar = L / 2;    // number of butterflies per subset
        u1 = &u[2 * (Lstar - 1)];   // first coef for current stage
        for (k = 0; k < r; k++)     // loop on the subsets
        {
            offset_x1  = offset_x + (k * L);            // index first point
            offset_x2  = offset_x + (k * L + Lstar);    // index second point

#if (DEBUG_ROW & 1)
printf("\n ### q = %d / k = %d / x1 = %d / x2 = %d\n", q , k , offset_x1 , offset_x2 );
#endif
            // makes all in-place butterfly(s) for subset
            for (j = 0; j < Lstar; j++) 
            {
                // get coef
                omega_r = u1[2*j];
                omega_c = direction * u1[2*j+1];

                // get d[x1] address and value
                c_id_1      = (offset_x1 + j) / (points_per_cluster);
                c_offset_1  = (offset_x1 + j) % (points_per_cluster);
                d1_r        = x[c_id_1][2*c_offset_1];
                d1_c        = x[c_id_1][2*c_offset_1+1];

                // get d[x2] address and value
                c_id_2      = (offset_x2 + j) / (points_per_cluster);
                c_offset_2  = (offset_x2 + j) % (points_per_cluster);
                d2_r        = x[c_id_2][2*c_offset_2];
                d2_c        = x[c_id_2][2*c_offset_2+1];

#if (DEBUG_ROW & 1)
printf("\n ### d1_in = (%f , %f) / d2_in = (%f , %f) / coef = (%f , %f)\n", 
                d1_r , d1_c , d2_r , d2_c , omega_r , omega_c);
#endif
                // tau = omega * d[x2]
                tau_r = omega_r * d2_r - omega_c * d2_c;
                tau_c = omega_r * d2_c + omega_c * d2_r;

                // set new value for d[x1] = d[x1] + omega * d[x2]
                x[c_id_1][2*c_offset_1]   = d1_r + tau_r;
                x[c_id_1][2*c_offset_1+1] = d1_c + tau_c;

                // set new value for d[x2] = d[x1] - omega * d[x2]
                x[c_id_2][2*c_offset_2]   = d1_r - tau_r;
                x[c_id_2][2*c_offset_2+1] = d1_c - tau_c;

#if (DEBUG_ROW & 1)
// display the values actually written : d2_out is (d1 - tau)
printf("\n ### d1_out = (%f , %f) / d2_out = (%f , %f)\n", 
                d1_r + tau_r , d1_c + tau_c , d1_r - tau_r , d1_c - tau_c );
#endif
            }
        }
    }

#if DEBUG_ROW
printf("\n[fft] ROW data out / %d points / offset = %d\n", rootN , offset_x );
for ( p = 0 ; p < rootN ; p++ )
{
    unsigned int index    = offset_x + p;
    unsigned int c_id     = index / (points_per_cluster);
    unsigned int c_offset = index % (points_per_cluster);
    printf("%f , %f | ", x[c_id][2*c_offset] , x[c_id][2*c_offset+1] );
}
printf("\n");
#endif

}  // end FFTRow()

/////////////////////////////////////////////////////////////////////////////////////
// This function displays the <size> first complex points of a distributed array
// (real part, then imaginary part), with four points per line.
/////////////////////////////////////////////////////////////////////////////////////
void PrintArray( double       ** array,
                 unsigned int    size ) 
{
    unsigned int  n;

    // float display
    for (n = 0 ; n < size ; n++) 
    {
        // locate the point in the distributed buffers
        const double * point = &array[n / (points_per_cluster)]
                                     [2 * (n % (points_per_cluster))];

        printf(" %f  %f |", point[0], point[1]);

        // new line after each group of four points
        if ( (n+1) % 4 == 0)  printf("\n");
    }
    printf("\n");
}


// Local Variables:
// tab-width: 4
// c-basic-offset: 4
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=4:softtabstop=4